datacontract-cli 0.10.14__py3-none-any.whl → 0.10.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic. Click here for more details.

Files changed (68)
  1. datacontract/breaking/breaking.py +227 -9
  2. datacontract/breaking/breaking_rules.py +24 -0
  3. datacontract/catalog/catalog.py +1 -1
  4. datacontract/cli.py +99 -32
  5. datacontract/data_contract.py +26 -4
  6. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  7. datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
  8. datacontract/engines/soda/check_soda_execute.py +5 -3
  9. datacontract/engines/soda/connections/duckdb.py +1 -0
  10. datacontract/engines/soda/connections/kafka.py +12 -12
  11. datacontract/export/avro_idl_converter.py +1 -2
  12. datacontract/export/bigquery_converter.py +4 -3
  13. datacontract/export/data_caterer_converter.py +1 -1
  14. datacontract/export/dbml_converter.py +2 -4
  15. datacontract/export/dbt_converter.py +2 -3
  16. datacontract/export/exporter.py +1 -1
  17. datacontract/export/exporter_factory.py +3 -2
  18. datacontract/export/go_converter.py +3 -2
  19. datacontract/export/great_expectations_converter.py +202 -40
  20. datacontract/export/html_export.py +1 -1
  21. datacontract/export/jsonschema_converter.py +3 -2
  22. datacontract/export/odcs_v2_exporter.py +1 -1
  23. datacontract/export/odcs_v3_exporter.py +1 -1
  24. datacontract/export/pandas_type_converter.py +40 -0
  25. datacontract/export/protobuf_converter.py +1 -1
  26. datacontract/export/rdf_converter.py +4 -5
  27. datacontract/export/sodacl_converter.py +6 -2
  28. datacontract/export/spark_converter.py +7 -6
  29. datacontract/export/sql_converter.py +1 -2
  30. datacontract/export/sqlalchemy_converter.py +1 -2
  31. datacontract/export/terraform_converter.py +1 -1
  32. datacontract/imports/avro_importer.py +1 -1
  33. datacontract/imports/bigquery_importer.py +1 -1
  34. datacontract/imports/dbml_importer.py +2 -2
  35. datacontract/imports/dbt_importer.py +3 -2
  36. datacontract/imports/glue_importer.py +5 -3
  37. datacontract/imports/iceberg_importer.py +5 -6
  38. datacontract/imports/importer.py +1 -0
  39. datacontract/imports/importer_factory.py +7 -1
  40. datacontract/imports/jsonschema_importer.py +3 -2
  41. datacontract/imports/odcs_v2_importer.py +2 -2
  42. datacontract/imports/odcs_v3_importer.py +2 -2
  43. datacontract/imports/parquet_importer.py +81 -0
  44. datacontract/imports/spark_importer.py +2 -1
  45. datacontract/imports/sql_importer.py +1 -1
  46. datacontract/imports/unity_importer.py +3 -3
  47. datacontract/integration/opentelemetry.py +0 -1
  48. datacontract/lint/lint.py +2 -1
  49. datacontract/lint/linters/description_linter.py +1 -0
  50. datacontract/lint/linters/example_model_linter.py +1 -0
  51. datacontract/lint/linters/field_pattern_linter.py +1 -0
  52. datacontract/lint/linters/field_reference_linter.py +1 -0
  53. datacontract/lint/linters/notice_period_linter.py +1 -0
  54. datacontract/lint/linters/quality_schema_linter.py +1 -0
  55. datacontract/lint/linters/valid_constraints_linter.py +1 -0
  56. datacontract/lint/resolve.py +1 -1
  57. datacontract/lint/schema.py +1 -1
  58. datacontract/model/data_contract_specification.py +11 -5
  59. datacontract/model/run.py +21 -12
  60. datacontract/templates/index.html +6 -6
  61. datacontract/web.py +2 -3
  62. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.15.dist-info}/METADATA +97 -52
  63. datacontract_cli-0.10.15.dist-info/RECORD +105 -0
  64. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.15.dist-info}/WHEEL +1 -1
  65. datacontract_cli-0.10.14.dist-info/RECORD +0 -103
  66. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.15.dist-info}/LICENSE +0 -0
  67. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.15.dist-info}/entry_points.txt +0 -0
  68. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.15.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import json
3
3
  import fastjsonschema
4
4
 
5
5
  from datacontract.imports.importer import Importer
6
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
6
+ from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Field, Model
7
7
  from datacontract.model.exceptions import DataContractException
8
8
 
9
9
 
@@ -111,7 +111,8 @@ def schema_to_args(property_schema, is_required: bool = None) -> dict:
111
111
  nested_properties = property_schema.get("properties")
112
112
  if nested_properties is not None:
113
113
  # recursive call for complex nested properties
114
- field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
114
+ required = property_schema.get("required", [])
115
+ field_kwargs["fields"] = jsonschema_to_args(nested_properties, required)
115
116
 
116
117
  return field_kwargs
117
118
 
@@ -6,16 +6,16 @@ import yaml
6
6
 
7
7
  from datacontract.imports.importer import Importer
8
8
  from datacontract.model.data_contract_specification import (
9
+ DATACONTRACT_TYPES,
9
10
  Availability,
10
11
  Contact,
11
12
  DataContractSpecification,
13
+ Field,
12
14
  Info,
13
15
  Model,
14
- Field,
15
16
  Retention,
16
17
  ServiceLevel,
17
18
  Terms,
18
- DATACONTRACT_TYPES,
19
19
  )
20
20
  from datacontract.model.exceptions import DataContractException
21
21
 
@@ -8,16 +8,16 @@ import yaml
8
8
  from datacontract.imports.importer import Importer
9
9
  from datacontract.lint.resources import read_resource
10
10
  from datacontract.model.data_contract_specification import (
11
+ DATACONTRACT_TYPES,
11
12
  Availability,
12
13
  DataContractSpecification,
14
+ Field,
13
15
  Info,
14
16
  Model,
15
- Field,
16
17
  Retention,
17
18
  Server,
18
19
  ServiceLevel,
19
20
  Terms,
20
- DATACONTRACT_TYPES,
21
21
  )
22
22
  from datacontract.model.exceptions import DataContractException
23
23
 
@@ -0,0 +1,81 @@
1
+ import os.path
2
+
3
+ import pyarrow
4
+ from pyarrow import parquet
5
+
6
+ from datacontract.imports.importer import Importer
7
+ from datacontract.model.data_contract_specification import (
8
+ DataContractSpecification,
9
+ Field,
10
+ Model,
11
+ )
12
+ from datacontract.model.exceptions import DataContractException
13
+
14
+
15
+ class ParquetImporter(Importer):
16
+ def import_source(
17
+ self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
18
+ ) -> DataContractSpecification:
19
+ return import_parquet(data_contract_specification, source)
20
+
21
+
22
+ def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
23
+ # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
24
+ schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")
25
+
26
+ fields: dict[str, Field] = {}
27
+
28
+ arrow_schema = parquet.read_schema(source)
29
+ for field_name in arrow_schema.names:
30
+ parquet_field = arrow_schema.field(field_name)
31
+
32
+ field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")
33
+
34
+ if not parquet_field.nullable:
35
+ field.required = True
36
+
37
+ fields[field_name] = field
38
+
39
+ data_contract_specification.models[schema_name] = Model(fields=fields)
40
+
41
+ return data_contract_specification
42
+
43
+
44
+ def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
45
+ if pyarrow.types.is_boolean(pyarrow_field.type):
46
+ return Field(type="boolean")
47
+ if pyarrow.types.is_int32(pyarrow_field.type):
48
+ return Field(type="int")
49
+ if pyarrow.types.is_int64(pyarrow_field.type):
50
+ return Field(type="long")
51
+ if pyarrow.types.is_integer(pyarrow_field.type):
52
+ return Field(type="number")
53
+ if pyarrow.types.is_float32(pyarrow_field.type):
54
+ return Field(type="float")
55
+ if pyarrow.types.is_float64(pyarrow_field.type):
56
+ return Field(type="double")
57
+ if pyarrow.types.is_decimal(pyarrow_field.type):
58
+ return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
59
+ if pyarrow.types.is_timestamp(pyarrow_field.type):
60
+ return Field(type="timestamp")
61
+ if pyarrow.types.is_date(pyarrow_field.type):
62
+ return Field(type="date")
63
+ if pyarrow.types.is_null(pyarrow_field.type):
64
+ return Field(type="null")
65
+ if pyarrow.types.is_binary(pyarrow_field.type):
66
+ return Field(type="bytes")
67
+ if pyarrow.types.is_string(pyarrow_field.type):
68
+ return Field(type="string")
69
+ if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
70
+ return Field(type="map")
71
+ if pyarrow.types.is_struct(pyarrow_field.type):
72
+ return Field(type="struct")
73
+ if pyarrow.types.is_list(pyarrow_field.type):
74
+ return Field(type="array")
75
+
76
+ raise DataContractException(
77
+ type="schema",
78
+ name=f"Parse {file_format} schema",
79
+ reason=f"{pyarrow_field.type} currently not supported.",
80
+ engine="datacontract",
81
+ )
@@ -1,9 +1,10 @@
1
1
  from pyspark.sql import DataFrame, SparkSession, types
2
+
2
3
  from datacontract.imports.importer import Importer
3
4
  from datacontract.model.data_contract_specification import (
4
5
  DataContractSpecification,
5
- Model,
6
6
  Field,
7
+ Model,
7
8
  Server,
8
9
  )
9
10
 
@@ -1,7 +1,7 @@
1
1
  from simple_ddl_parser import parse_from_file
2
2
 
3
3
  from datacontract.imports.importer import Importer
4
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
4
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
5
5
 
6
6
 
7
7
  class SqlImporter(Importer):
@@ -2,13 +2,13 @@ import json
2
2
  import os
3
3
  from typing import List, Optional
4
4
 
5
- from pyspark.sql import types
6
5
  from databricks.sdk import WorkspaceClient
7
- from databricks.sdk.service.catalog import TableInfo, ColumnInfo
6
+ from databricks.sdk.service.catalog import ColumnInfo, TableInfo
7
+ from pyspark.sql import types
8
8
 
9
9
  from datacontract.imports.importer import Importer
10
10
  from datacontract.imports.spark_importer import _field_from_struct_type
11
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
11
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
12
12
  from datacontract.model.exceptions import DataContractException
13
13
 
14
14
 
@@ -12,7 +12,6 @@ from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExpo
12
12
 
13
13
  from datacontract.model.run import Run
14
14
 
15
-
16
15
  # Publishes metrics of a test run.
17
16
  # Metric contains the values:
18
17
  # 0 == test run passed,
datacontract/lint/lint.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import abc
2
2
  from dataclasses import dataclass, field
3
3
  from enum import Enum
4
- from typing import Sequence, Any, cast
4
+ from typing import Any, Sequence, cast
5
5
 
6
6
  from datacontract.model.run import Check
7
+
7
8
  from ..model.data_contract_specification import DataContractSpecification
8
9
 
9
10
  """This module contains linter definitions for linting a data contract.
@@ -1,4 +1,5 @@
1
1
  from datacontract.model.data_contract_specification import DataContractSpecification
2
+
2
3
  from ..lint import Linter, LinterResult
3
4
 
4
5
 
@@ -5,6 +5,7 @@ import json
5
5
  import yaml
6
6
 
7
7
  from datacontract.model.data_contract_specification import DataContractSpecification, Example
8
+
8
9
  from ..lint import Linter, LinterResult
9
10
 
10
11
 
@@ -1,6 +1,7 @@
1
1
  import re
2
2
 
3
3
  from datacontract.model.data_contract_specification import DataContractSpecification
4
+
4
5
  from ..lint import Linter, LinterResult
5
6
 
6
7
 
@@ -1,4 +1,5 @@
1
1
  from datacontract.model.data_contract_specification import DataContractSpecification
2
+
2
3
  from ..lint import Linter, LinterResult
3
4
 
4
5
 
@@ -1,6 +1,7 @@
1
1
  import re
2
2
 
3
3
  from datacontract.model.data_contract_specification import DataContractSpecification
4
+
4
5
  from ..lint import Linter, LinterResult
5
6
 
6
7
 
@@ -1,6 +1,7 @@
1
1
  import yaml
2
2
 
3
3
  from datacontract.model.data_contract_specification import DataContractSpecification, Model
4
+
4
5
  from ..lint import Linter, LinterResult
5
6
 
6
7
 
@@ -1,4 +1,5 @@
1
1
  from datacontract.model.data_contract_specification import DataContractSpecification, Field
2
+
2
3
  from ..lint import Linter, LinterResult
3
4
 
4
5
 
@@ -199,7 +199,7 @@ def _resolve_data_contract_from_str(
199
199
 
200
200
  if is_open_data_contract_standard(yaml_dict):
201
201
  # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
202
- data_contract_specification = DataContractSpecification(dataContractSpecification="0.9.3")
202
+ data_contract_specification = DataContractSpecification(dataContractSpecification="1.1.0")
203
203
  return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str)
204
204
 
205
205
  _validate_data_contract_specification_schema(yaml_dict, schema_location)
@@ -1,6 +1,6 @@
1
1
  import json
2
2
  import os
3
- from typing import Dict, Any
3
+ from typing import Any, Dict
4
4
 
5
5
  import requests
6
6
 
@@ -1,5 +1,5 @@
1
1
  import os
2
- from typing import List, Dict, Optional, Any
2
+ from typing import Any, Dict, List, Optional
3
3
 
4
4
  import pydantic as pyd
5
5
  import yaml
@@ -147,6 +147,7 @@ class Field(pyd.BaseModel):
147
147
  format: str = None
148
148
  required: bool = None
149
149
  primary: bool = None
150
+ primaryKey: bool = None
150
151
  unique: bool | None = None
151
152
  references: str = None
152
153
  description: str | None = None
@@ -282,9 +283,14 @@ class DataContractSpecification(pyd.BaseModel):
282
283
  terms: Terms = None
283
284
  models: Dict[str, Model] = {}
284
285
  definitions: Dict[str, Definition] = {}
285
- # schema: Dict[str, str]
286
- examples: List[Example] = []
287
- quality: Quality = None
286
+ examples: List[Example] = pyd.Field(
287
+ default_factory=list,
288
+ deprecated="Removed in Data Contract Specification " "v1.1.0. Use models.examples instead.",
289
+ )
290
+ quality: Quality = pyd.Field(
291
+ default=None,
292
+ deprecated="Removed in Data Contract Specification v1.1.0. Use " "model-level and field-level quality instead.",
293
+ )
288
294
  servicelevels: Optional[ServiceLevel] = None
289
295
  links: Dict[str, str] = {}
290
296
  tags: List[str] = []
@@ -292,7 +298,7 @@ class DataContractSpecification(pyd.BaseModel):
292
298
  @classmethod
293
299
  def from_file(cls, file):
294
300
  if not os.path.exists(file):
295
- raise (f"The file '{file}' does not exist.")
301
+ raise FileNotFoundError(f"The file '{file}' does not exist.")
296
302
  with open(file, "r") as file:
297
303
  file_content = file.read()
298
304
  return DataContractSpecification.from_string(file_content)
datacontract/model/run.py CHANGED
@@ -1,15 +1,24 @@
1
1
  import logging
2
2
  from datetime import datetime, timezone
3
+ from enum import Enum
3
4
  from typing import List, Optional
4
5
  from uuid import UUID, uuid4
5
6
 
6
7
  from pydantic import BaseModel
7
8
 
8
9
 
10
+ class ResultEnum(str, Enum):
11
+ passed = "passed"
12
+ warning = "warning"
13
+ failed = "failed"
14
+ error = "error"
15
+ unknown = "unknown"
16
+
17
+
9
18
  class Check(BaseModel):
10
19
  type: str
11
20
  name: Optional[str]
12
- result: str # passed, failed, warning, unknown
21
+ result: ResultEnum
13
22
  engine: str
14
23
  reason: Optional[str] = None
15
24
  model: Optional[str] = None
@@ -33,29 +42,29 @@ class Run(BaseModel):
33
42
  server: Optional[str] = None
34
43
  timestampStart: datetime
35
44
  timestampEnd: datetime
36
- result: str = "unknown" # passed, warning, failed, error, unknown
45
+ result: ResultEnum = ResultEnum.unknown
37
46
  checks: List[Check]
38
47
  logs: List[Log]
39
48
 
40
49
  def has_passed(self):
41
50
  self.calculate_result()
42
- return self.result == "passed"
51
+ return self.result == ResultEnum.passed
43
52
 
44
53
  def finish(self):
45
54
  self.timestampEnd = datetime.now(timezone.utc)
46
55
  self.calculate_result()
47
56
 
48
57
  def calculate_result(self):
49
- if any(check.result == "error" for check in self.checks):
50
- self.result = "error"
51
- elif any(check.result == "failed" for check in self.checks):
52
- self.result = "failed"
53
- elif any(check.result == "warning" for check in self.checks):
54
- self.result = "warning"
55
- elif any(check.result == "passed" for check in self.checks):
56
- self.result = "passed"
58
+ if any(check.result == ResultEnum.error for check in self.checks):
59
+ self.result = ResultEnum.error
60
+ elif any(check.result == ResultEnum.failed for check in self.checks):
61
+ self.result = ResultEnum.failed
62
+ elif any(check.result == ResultEnum.warning for check in self.checks):
63
+ self.result = ResultEnum.warning
64
+ elif any(check.result == ResultEnum.passed for check in self.checks):
65
+ self.result = ResultEnum.passed
57
66
  else:
58
- self.result = "unknown"
67
+ self.result = ResultEnum.unknown
59
68
 
60
69
  def log_info(self, message: str):
61
70
  logging.info(message)
@@ -78,17 +78,17 @@
78
78
 
79
79
  <li class="col-span-1 rounded-lg bg-white shadow hover:bg-gray-50"
80
80
  data-search="{{
81
- contract.spec.info.title|lower }} {{
82
- contract.spec.info.owner|lower if contract.spec.info.owner else '' }} {{
83
- contract.spec.info.description|lower }} {%
81
+ contract.spec.info.title|lower|e }} {{
82
+ contract.spec.info.owner|lower|e if contract.spec.info.owner else '' }} {{
83
+ contract.spec.info.description|lower|e }} {%
84
84
  for model_name, model in contract.spec.models.items() %}
85
- {{ model.description|lower }} {%
85
+ {{ model.description|lower|e }} {%
86
86
  for field_name, field in model.fields.items() %}
87
- {{ field_name|lower }} {{ field.description|lower if field.description else '' }} {%
87
+ {{ field_name|lower|e }} {{ field.description|lower|e if field.description else '' }} {%
88
88
  endfor %}
89
89
  {% endfor %}
90
90
  ">
91
- <a href="{{contract.html_link}}" >
91
+ <a href="{{contract.html_link|e}}" >
92
92
  <div class="flex w-full justify-between space-x-1 p-6 pb-4">
93
93
  <div class="flex-1 truncate">
94
94
  <div class="flex items-center space-x-3">
datacontract/web.py CHANGED
@@ -1,11 +1,10 @@
1
- from typing import Annotated, Union, Optional
1
+ from typing import Annotated, Optional, Union
2
2
 
3
3
  import typer
4
4
  from fastapi import FastAPI, File
5
- from fastapi.responses import HTMLResponse
5
+ from fastapi.responses import HTMLResponse, PlainTextResponse
6
6
 
7
7
  from datacontract.data_contract import DataContract, ExportFormat
8
- from fastapi.responses import PlainTextResponse
9
8
 
10
9
  app = FastAPI()
11
10