datacontract-cli 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/breaking/breaking_rules.py +4 -0
- datacontract/catalog/catalog.py +76 -0
- datacontract/cli.py +39 -3
- datacontract/data_contract.py +12 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +1 -2
- datacontract/engines/soda/check_soda_execute.py +9 -15
- datacontract/engines/soda/connections/duckdb.py +83 -14
- datacontract/engines/soda/connections/kafka.py +108 -105
- datacontract/export/avro_idl_converter.py +1 -2
- datacontract/export/dbt_converter.py +1 -2
- datacontract/export/great_expectations_converter.py +1 -2
- datacontract/export/html_export.py +3 -2
- datacontract/export/jsonschema_converter.py +1 -2
- datacontract/export/odcs_converter.py +1 -2
- datacontract/export/rdf_converter.py +1 -1
- datacontract/export/sodacl_converter.py +1 -2
- datacontract/export/terraform_converter.py +1 -2
- datacontract/imports/avro_importer.py +1 -2
- datacontract/imports/glue_importer.py +183 -0
- datacontract/imports/sql_importer.py +20 -9
- datacontract/integration/publish_opentelemetry.py +3 -6
- datacontract/lint/linters/example_model_linter.py +1 -2
- datacontract/lint/linters/field_pattern_linter.py +1 -2
- datacontract/lint/linters/notice_period_linter.py +1 -2
- datacontract/lint/linters/quality_schema_linter.py +1 -2
- datacontract/lint/resolve.py +9 -6
- datacontract/model/data_contract_specification.py +2 -0
- datacontract/templates/datacontract.html +76 -21
- datacontract/templates/index.html +168 -0
- datacontract/templates/style/output.css +113 -4
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/METADATA +180 -102
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/RECORD +36 -33
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/top_level.txt +0 -0
datacontract/engines/soda/connections/kafka.py

```diff
@@ -1,40 +1,36 @@
 import os
-
-import pyspark.sql.functions as fn
 from pyspark.sql import SparkSession
+from pyspark.sql.functions import col, expr, from_json
 from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.functions import from_json, col
 from pyspark.sql.types import (
     StructType,
-    DataType,
-    NullType,
-    ArrayType,
-    BinaryType,
-    DateType,
-    TimestampNTZType,
-    TimestampType,
-    BooleanType,
-    LongType,
-    IntegerType,
-    DoubleType,
-    DecimalType,
-    StringType,
     StructField,
+    StringType,
+    DecimalType,
+    DoubleType,
+    IntegerType,
+    LongType,
+    BooleanType,
+    TimestampType,
+    TimestampNTZType,
+    DateType,
+    BinaryType,
+    ArrayType,
+    NullType,
+    DataType,
 )
 
 from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Server, Field
 from datacontract.model.exceptions import DataContractException
 
 
-def create_spark_session(tmp_dir) -> SparkSession:
-
-    # TODO: add protobuf library
+def create_spark_session(tmp_dir: str) -> SparkSession:
+    """Create and configure a Spark session."""
     spark = (
         SparkSession.builder.appName("datacontract")
-        .config("spark.sql.warehouse.dir", tmp_dir)
-        .config("spark.streaming.stopGracefullyOnShutdown",
+        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", "true")
         .config(
             "spark.jars.packages",
             "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
@@ -47,106 +43,113 @@ def create_spark_session(tmp_dir) -> SparkSession:
 
 
 def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
-
-    topic = server.topic
-    auth_options = get_auth_options()
-
-    # read full kafka topic
+    """Read and process data from a Kafka topic based on the server configuration."""
     df = (
         spark.read.format("kafka")
-        .options(**auth_options)
-        .option("kafka.bootstrap.servers", host)
-        .option("subscribe", topic)
+        .options(**get_auth_options())
+        .option("kafka.bootstrap.servers", server.host)
+        .option("subscribe", server.topic)
         .option("startingOffsets", "earliest")
         .load()
     )
-
+
     model_name, model = next(iter(data_contract.models.items()))
-    if server.format == "avro":
-        avro_schema = to_avro_schema_json(model_name, model)
-
-        # Parse out the extra bytes from the Avro data
-        # A Kafka message contains a key and a value. Data going through a Kafka topic in Confluent Cloud has five bytes added to the beginning of every Avro value. If you are using Avro format keys, then five bytes will be added to the beginning of those as well. For this example, we’re assuming string keys. These bytes consist of one magic byte and four bytes representing the schema ID of the schema in the registry that is needed to decode that data. The bytes need to be removed so that the schema ID can be determined and the Avro data can be parsed. To manipulate the data, we need a couple of imports:
-        df2 = df.withColumn("fixedValue", fn.expr("substring(value, 6, length(value)-5)"))
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(col("avro.*"))
-    elif server.format == "json":
-        # TODO A good warning when the conversion to json fails
-        struct_type = to_struct_type(model.fields)
-        df2 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_json(df2.value, struct_type, options).alias("json")).select(col("json.*"))
-    else:
-        raise DataContractException(
-            type="test",
-            name="Configuring Kafka checks",
-            result="warning",
-            reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
-            engine="datacontract",
-        )
 
-
-
-
+    match server.format:
+        case "avro":
+            process_avro_format(df, model_name, model)
+        case "json":
+            process_json_format(df, model_name, model)
+        case _:
+            raise DataContractException(
+                type="test",
+                name="Configuring Kafka checks",
+                result="warning",
+                reason=f"Kafka format '{server.format}' is not supported. " f"Skip executing tests.",
+                engine="datacontract",
+            )
+
+
+def process_avro_format(df, model_name, model):
+    avro_schema = to_avro_schema_json(model_name, model)
+    df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
+    options = {"mode": "PERMISSIVE"}
+    df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(
+        col("avro.*")
+    ).createOrReplaceTempView(model_name)
+
+
+def process_json_format(df, model_name, model):
+    struct_type = to_struct_type(model.fields)
+    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
+        from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
+    ).select(col("json.*")).createOrReplaceTempView(model_name)
 
 
 def get_auth_options():
+    """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
+
     if kafka_sasl_username is None:
-
-
-
-
-
-
-        "kafka.
-
-
+        return {}
+
+    return {
+        "kafka.sasl.mechanism": "PLAIN",
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": (
+            f"org.apache.kafka.common.security.plain.PlainLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+    }
 
 
 def to_struct_type(fields):
-    struct_fields = []
-    for field_name, field in fields.items():
-        struct_fields.append(to_struct_field(field_name, field))
-    return StructType(struct_fields)
+    """Convert field definitions to Spark StructType."""
+    return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])
 
 
 def to_struct_field(field_name: str, field: Field) -> StructField:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Map field definitions to Spark StructField using match-case."""
+    match field.type:
+        case "string" | "varchar" | "text":
+            data_type = StringType()
+        case "number" | "decimal" | "numeric":
+            data_type = DecimalType()
+        case "float" | "double":
+            data_type = DoubleType()
+        case "integer" | "int":
+            data_type = IntegerType()
+        case "long" | "bigint":
+            data_type = LongType()
+        case "boolean":
+            data_type = BooleanType()
+        case "timestamp" | "timestamp_tz":
+            data_type = TimestampType()
+        case "timestamp_ntz":
+            data_type = TimestampNTZType()
+        case "date":
+            data_type = DateType()
+        case "time":
+            data_type = DataType()  # Specific handling for time type
+        case "object" | "record" | "struct":
+            data_type = StructType(
+                [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+            )
+        case "binary":
+            data_type = BinaryType()
+        case "array":
+            element_type = (
+                StructType(
+                    [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+                )
+                if field.fields
+                else DataType()
+            )
+            data_type = ArrayType(element_type)
+        case "null":
+            data_type = NullType()
+        case _:
+            data_type = DataType()  # Fallback generic DataType
 
     return StructField(field_name, data_type, nullable=not field.required)
```
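Both the removed and the added Avro path strip the first five bytes of every message value before decoding, as the deleted inline comment explains: Confluent Cloud prepends one magic byte and a four-byte schema-registry ID to each Avro-encoded value. A minimal sketch of that framing in plain Python (illustrative only, not part of the package; `raw_value` is a made-up example):

```python
# Confluent wire format for an Avro-encoded Kafka value:
#   byte 0     -> magic byte (0x00)
#   bytes 1-4  -> schema ID in the schema registry (big-endian integer)
#   bytes 5-   -> the Avro payload that from_avro() actually decodes
raw_value = b"\x00\x00\x00\x00\x2a" + b"<avro payload>"  # hypothetical message value

schema_id = int.from_bytes(raw_value[1:5], "big")  # 42 in this example
avro_payload = raw_value[5:]  # what expr("substring(value, 6, length(value)-5)") keeps
```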
datacontract/export/avro_idl_converter.py

```diff
@@ -4,8 +4,7 @@ from enum import Enum
 from io import StringIO
 
 from datacontract.lint.resolve import inline_definitions_into_data_contract
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
 
```
datacontract/export/dbt_converter.py

```diff
@@ -3,8 +3,7 @@ from typing import Dict
 import yaml
 
 from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
 def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
```
datacontract/export/great_expectations_converter.py

```diff
@@ -3,8 +3,7 @@ from typing import Dict, List, Any
 
 import yaml
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Field, Quality
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Quality
 
 
 def to_great_expectations(data_contract_spec: DataContractSpecification, model_key: str) -> str:
```
datacontract/export/html_export.py

```diff
@@ -22,6 +22,7 @@ def to_html(data_contract_spec: DataContractSpecification) -> str:
     )
 
     # Load the required template
+    # needs to be included in /MANIFEST.in
     template = env.get_template("datacontract.html")
 
     if data_contract_spec.quality is not None and isinstance(data_contract_spec.quality.specification, str):
@@ -40,9 +41,9 @@ def to_html(data_contract_spec: DataContractSpecification) -> str:
 
     datacontract_yaml = data_contract_spec.to_yaml()
 
-    tz = pytz.timezone(
+    tz = pytz.timezone("UTC")
     now = datetime.datetime.now(tz)
-    formatted_date = now.strftime(
+    formatted_date = now.strftime("%d %b %Y %H:%M:%S UTC")
     datacontract_cli_version = get_version()
 
     # Render the template with necessary data
```
datacontract/export/jsonschema_converter.py

```diff
@@ -1,8 +1,7 @@
 import json
 from typing import Dict
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
 def to_jsonschemas(data_contract_spec: DataContractSpecification):
```
datacontract/export/odcs_converter.py

```diff
@@ -2,8 +2,7 @@ from typing import Dict
 
 import yaml
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
 def to_odcs_yaml(data_contract_spec: DataContractSpecification):
```
datacontract/export/rdf_converter.py

```diff
@@ -141,7 +141,7 @@ def add_info(contract, info, graph, dc, dcx):
     graph.add((bnode_info, dc.version, Literal(info.version)))
 
     # add owner
-    owner =
+    owner = Literal(info.owner)
     graph.add((bnode_info, dc.owner, owner))
 
     # add contact
```
datacontract/export/sodacl_converter.py

```diff
@@ -1,8 +1,7 @@
 import yaml
 
 from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification
 
 
 def to_sodacl_yaml(
```
datacontract/export/terraform_converter.py

```diff
@@ -1,7 +1,6 @@
 import re
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 
 
 def to_terraform(data_contract_spec: DataContractSpecification, server_id: str = None) -> str:
```
datacontract/imports/avro_importer.py

```diff
@@ -1,7 +1,6 @@
 import avro.schema
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
 
```
datacontract/imports/glue_importer.py (new file)

```diff
@@ -0,0 +1,183 @@
+import boto3
+from typing import List
+
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+def get_glue_database(datebase_name: str):
+    """Get the details Glue database.
+
+    Args:
+        database_name (str): glue database to request.
+
+    Returns:
+        set: catalogid and locationUri
+    """
+
+    glue = boto3.client("glue")
+    try:
+        response = glue.get_database(Name=datebase_name)
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Database not found {datebase_name}.")
+        return (None, None)
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return (None, None)
+
+    return (response["Database"]["CatalogId"], response["Database"].get("LocationUri", "None"))
+
+
+def get_glue_tables(database_name: str) -> List[str]:
+    """Get the list of tables in a Glue database.
+
+    Args:
+        database_name (str): glue database to request.
+
+    Returns:
+        List[string]: List of table names
+    """
+
+    glue = boto3.client("glue")
+
+    # Set the paginator
+    paginator = glue.get_paginator("get_tables")
+
+    # Initialize an empty list to store the table names
+    table_names = []
+    try:
+        # Paginate through the tables
+        for page in paginator.paginate(DatabaseName=database_name, PaginationConfig={"PageSize": 100}):
+            # Add the tables from the current page to the list
+            table_names.extend([table["Name"] for table in page["TableList"] if "Name" in table])
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Database {database_name} not found.")
+        return []
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return []
+
+    return table_names
+
+
+def get_glue_table_schema(database_name: str, table_name: str):
+    """Get the schema of a Glue table.
+
+    Args:
+        database_name (str): Glue database name.
+        table_name (str): Glue table name.
+
+    Returns:
+        dict: Table schema
+    """
+
+    glue = boto3.client("glue")
+
+    # Get the table schema
+    try:
+        response = glue.get_table(DatabaseName=database_name, Name=table_name)
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Table {table_name} not found in database {database_name}.")
+        return {}
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return {}
+
+    table_schema = response["Table"]["StorageDescriptor"]["Columns"]
+
+    # when using hive partition keys, the schema is stored in the PartitionKeys field
+    if response["Table"].get("PartitionKeys") is not None:
+        for pk in response["Table"]["PartitionKeys"]:
+            table_schema.append(
+                {
+                    "Name": pk["Name"],
+                    "Type": pk["Type"],
+                    "Hive": True,
+                    "Comment": "Partition Key",
+                }
+            )
+
+    return table_schema
+
+
+def import_glue(data_contract_specification: DataContractSpecification, source: str):
+    """Import the schema of a Glue database."""
+
+    catalogid, location_uri = get_glue_database(source)
+
+    # something went wrong
+    if catalogid is None:
+        return data_contract_specification
+
+    tables = get_glue_tables(source)
+
+    data_contract_specification.servers = {
+        "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
+    }
+
+    for table_name in tables:
+        if data_contract_specification.models is None:
+            data_contract_specification.models = {}
+
+        table_schema = get_glue_table_schema(source, table_name)
+
+        fields = {}
+        for column in table_schema:
+            field = Field()
+            field.type = map_type_from_sql(column["Type"])
+
+            # hive partitons are required, but are not primary keys
+            if column.get("Hive"):
+                field.required = True
+
+            field.description = column.get("Comment")
+
+            fields[column["Name"]] = field
+
+        data_contract_specification.models[table_name] = Model(
+            type="table",
+            fields=fields,
+        )
+
+    return data_contract_specification
+
+
+def map_type_from_sql(sql_type: str):
+    if sql_type is None:
+        return None
+
+    if sql_type.lower().startswith("varchar"):
+        return "varchar"
+    if sql_type.lower().startswith("string"):
+        return "string"
+    if sql_type.lower().startswith("text"):
+        return "text"
+    elif sql_type.lower().startswith("byte"):
+        return "byte"
+    elif sql_type.lower().startswith("short"):
+        return "short"
+    elif sql_type.lower().startswith("integer"):
+        return "integer"
+    elif sql_type.lower().startswith("long"):
+        return "long"
+    elif sql_type.lower().startswith("bigint"):
+        return "long"
+    elif sql_type.lower().startswith("float"):
+        return "float"
+    elif sql_type.lower().startswith("double"):
+        return "double"
+    elif sql_type.lower().startswith("boolean"):
+        return "boolean"
+    elif sql_type.lower().startswith("timestamp"):
+        return "timestamp"
+    elif sql_type.lower().startswith("date"):
+        return "date"
+    else:
+        return "variant"
```
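For context, the new importer builds a contract skeleton from an existing Glue database: it records the catalog as a `production` server and adds one model per table. A hypothetical direct call might look like the sketch below (the database name is a placeholder; it assumes a default-constructed `DataContractSpecification` and AWS credentials available in the environment, with the CLI wiring the same function up behind its `import` command):

```python
from datacontract.imports.glue_importer import import_glue
from datacontract.model.data_contract_specification import DataContractSpecification

# Placeholder database name; boto3 resolves AWS credentials from the environment.
spec = import_glue(DataContractSpecification(), "my_glue_database")
print(spec.to_yaml())  # one model per Glue table, plus a "production" glue server
```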
datacontract/imports/sql_importer.py

```diff
@@ -1,7 +1,6 @@
 from simple_ddl_parser import parse_from_file
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
 def import_sql(data_contract_specification: DataContractSpecification, format: str, source: str):
@@ -45,19 +44,31 @@ def map_type_from_sql(sql_type: str):
     if sql_type is None:
         return None
 
-
+    sql_type_normed = sql_type.lower().strip()
+
+    if sql_type_normed.startswith("varchar"):
         return "varchar"
-
+    elif sql_type_normed.startswith("string"):
         return "string"
-
+    elif sql_type_normed.startswith("text"):
         return "text"
-    elif
+    elif sql_type_normed.startswith("int"):
         return "integer"
-    elif
+    elif sql_type_normed.startswith("float"):
         return "float"
-    elif
+    elif sql_type_normed.startswith("bool"):
         return "boolean"
-    elif
+    elif sql_type_normed.startswith("timestamp"):
         return "timestamp"
+    elif sql_type_normed == "date":
+        return "date"
+    elif sql_type_normed == "smalldatetime":
+        return "timestamp_ntz"
+    elif sql_type_normed == "datetime":
+        return "timestamp_ntz"
+    elif sql_type_normed == "datetime2":
+        return "timestamp_ntz"
+    elif sql_type_normed == "datetimeoffset":
+        return "timestamp_tz"
     else:
         return "variant"
```
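The reworked `map_type_from_sql` normalizes the type name once and adds explicit branches for SQL Server date/time types. A few illustrative expectations derived from the branches above (a sketch, not a test shipped with the package):

```python
from datacontract.imports.sql_importer import map_type_from_sql

assert map_type_from_sql("VARCHAR(255)") == "varchar"         # prefix match after lower()/strip()
assert map_type_from_sql("datetime2") == "timestamp_ntz"      # exact match, no time zone
assert map_type_from_sql("DATETIMEOFFSET") == "timestamp_tz"  # exact match, with time zone
assert map_type_from_sql("geography") == "variant"            # unrecognized types fall through
```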
datacontract/integration/publish_opentelemetry.py

```diff
@@ -4,14 +4,11 @@ import os
 from importlib import metadata
 
 from opentelemetry import metrics
-from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import \
-    OTLPMetricExporter as OTLPgRPCMetricExporter
-from opentelemetry.exporter.otlp.proto.http.metric_exporter import \
-    OTLPMetricExporter
+from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPgRPCMetricExporter
+from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
 from opentelemetry.metrics import Observation
 from opentelemetry.sdk.metrics import MeterProvider
-from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, \
-    PeriodicExportingMetricReader
+from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
 
 from datacontract.model.run import Run
 
```
datacontract/lint/linters/example_model_linter.py

```diff
@@ -4,8 +4,7 @@ import json
 
 import yaml
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Example
+from datacontract.model.data_contract_specification import DataContractSpecification, Example
 from ..lint import Linter, LinterResult
 
 
```