datacontract-cli 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (40)
  1. datacontract/cli.py +26 -24
  2. datacontract/data_contract.py +69 -152
  3. datacontract/engines/fastjsonschema/s3/s3_read_files.py +13 -1
  4. datacontract/engines/soda/check_soda_execute.py +11 -0
  5. datacontract/engines/soda/connections/bigquery.py +8 -1
  6. datacontract/engines/soda/connections/kafka.py +3 -0
  7. datacontract/export/__init__.py +0 -0
  8. datacontract/export/avro_converter.py +28 -21
  9. datacontract/export/avro_idl_converter.py +29 -22
  10. datacontract/export/bigquery_converter.py +15 -0
  11. datacontract/export/dbml_converter.py +9 -0
  12. datacontract/export/dbt_converter.py +26 -1
  13. datacontract/export/exporter.py +87 -0
  14. datacontract/export/exporter_factory.py +52 -0
  15. datacontract/export/go_converter.py +6 -0
  16. datacontract/export/great_expectations_converter.py +10 -0
  17. datacontract/export/html_export.py +6 -0
  18. datacontract/export/jsonschema_converter.py +24 -16
  19. datacontract/export/odcs_converter.py +24 -1
  20. datacontract/export/protobuf_converter.py +6 -0
  21. datacontract/export/pydantic_converter.py +6 -0
  22. datacontract/export/rdf_converter.py +9 -0
  23. datacontract/export/sodacl_converter.py +7 -1
  24. datacontract/export/sql_converter.py +32 -2
  25. datacontract/export/sql_type_converter.py +4 -5
  26. datacontract/export/terraform_converter.py +6 -0
  27. datacontract/imports/bigquery_importer.py +30 -4
  28. datacontract/imports/glue_importer.py +13 -3
  29. datacontract/imports/odcs_importer.py +192 -0
  30. datacontract/imports/unity_importer.py +138 -0
  31. datacontract/model/data_contract_specification.py +2 -0
  32. datacontract/templates/partials/server.html +64 -32
  33. datacontract/templates/style/output.css +9 -0
  34. datacontract/web.py +56 -2
  35. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/METADATA +232 -96
  36. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/RECORD +40 -35
  37. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/LICENSE +0 -0
  38. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/WHEEL +0 -0
  39. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/entry_points.txt +0 -0
  40. {datacontract_cli-0.10.6.dist-info → datacontract_cli-0.10.8.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -5,6 +5,7 @@ from typing import Iterable, Optional
 from typing import List
 
 import typer
+import uvicorn
 from click import Context
 from rich import box
 from rich.console import Console
@@ -12,8 +13,9 @@ from rich.table import Table
 from typer.core import TyperGroup
 from typing_extensions import Annotated
 
+from datacontract import web
 from datacontract.catalog.catalog import create_index_html, create_data_contract_html
-from datacontract.data_contract import DataContract
+from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
 from datacontract.publish.publish import publish_to_datamesh_manager
 
@@ -141,28 +143,6 @@ def test(
     _handle_result(run)
 
 
-class ExportFormat(str, Enum):
-    jsonschema = "jsonschema"
-    pydantic_model = "pydantic-model"
-    sodacl = "sodacl"
-    dbt = "dbt"
-    dbt_sources = "dbt-sources"
-    dbt_staging_sql = "dbt-staging-sql"
-    odcs = "odcs"
-    rdf = "rdf"
-    avro = "avro"
-    protobuf = "protobuf"
-    great_expectations = "great-expectations"
-    terraform = "terraform"
-    avro_idl = "avro-idl"
-    sql = "sql"
-    sql_query = "sql-query"
-    html = "html"
-    go = "go"
-    bigquery = "bigquery"
-    dbml = "dbml"
-
-
 @app.command()
 def export(
     format: Annotated[ExportFormat, typer.Option(help="The export format.")],
@@ -205,6 +185,7 @@ def export(
     result = DataContract(data_contract_file=location, server=server).export(
         export_format=format,
         model=model,
+        server=server,
         rdf_base=rdf_base,
         sql_server_type=sql_server_type,
     )
@@ -223,6 +204,8 @@ class ImportFormat(str, Enum):
     glue = "glue"
     bigquery = "bigquery"
     jsonschema = "jsonschema"
+    odcs="odcs"
+    unity = "unity"
 
 
 @app.command(name="import")
@@ -231,6 +214,12 @@ def import_(
     source: Annotated[
         Optional[str], typer.Option(help="The path to the file or Glue Database that should be imported.")
     ] = None,
+    glue_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table ids to import from the Glue Database (repeat for multiple table ids, leave empty for all tables in the dataset)."
+        ),
+    ] = None,
     bigquery_project: Annotated[Optional[str], typer.Option(help="The bigquery project id.")] = None,
     bigquery_dataset: Annotated[Optional[str], typer.Option(help="The bigquery dataset id.")] = None,
     bigquery_table: Annotated[
@@ -239,11 +228,12 @@ def import_(
             help="List of table ids to import from the bigquery API (repeat for multiple table ids, leave empty for all tables in the dataset)."
        ),
     ] = None,
+    unity_table_full_name: Annotated[Optional[str], typer.Option(help="Full name of a table in the unity catalog")] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
     """
-    result = DataContract().import_from_source(format, source, bigquery_table, bigquery_project, bigquery_dataset)
+    result = DataContract().import_from_source(format, source, glue_table, bigquery_table, bigquery_project, bigquery_dataset, unity_table_full_name)
     console.print(result.to_yaml())
 
 
@@ -339,6 +329,18 @@ def diff(
     console.print(result.changelog_str())
 
 
+@app.command()
+def serve(
+    port: Annotated[int, typer.Option(help="Bind socket to this port.")] = 4242,
+    host: Annotated[str, typer.Option(help="Bind socket to this host.")] = "127.0.0.1",
+):
+    """
+    Start the datacontract web server.
+    """
+
+    uvicorn.run(web.app, port=port, host=host)
+
+
 def _handle_result(run):
     _print_table(run)
     if run.result == "passed":
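Taken together, the cli.py changes add a serve command, repeatable --glue-table filters, and two new import formats (odcs and unity). A minimal sketch of driving the extended import command from a test, assuming Typer derives kebab-case option names from the parameters above and that the format option is still exposed as --format; the source file name is illustrative:

    from typer.testing import CliRunner

    from datacontract.cli import app

    runner = CliRunner()

    # Import an ODCS document and print the generated data contract YAML
    # (the file path is illustrative).
    result = runner.invoke(app, ["import", "--format", "odcs", "--source", "odcs.yaml"])
    print(result.output)

The new web server itself would be started from the shell with datacontract serve, which simply hands web.app to uvicorn on the given host and port.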
datacontract/data_contract.py CHANGED
@@ -12,27 +12,15 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.export.avro_idl_converter import to_avro_idl
-from datacontract.export.bigquery_converter import to_bigquery_json
-from datacontract.export.dbml_converter import to_dbml_diagram
-from datacontract.export.dbt_converter import to_dbt_models_yaml, to_dbt_sources_yaml, to_dbt_staging_sql
-from datacontract.export.go_converter import to_go_types
-from datacontract.export.great_expectations_converter import to_great_expectations
-from datacontract.export.html_export import to_html
-from datacontract.export.jsonschema_converter import to_jsonschema_json
-from datacontract.export.odcs_converter import to_odcs_yaml
-from datacontract.export.protobuf_converter import to_protobuf
-from datacontract.export.pydantic_converter import to_pydantic_model_str
-from datacontract.export.rdf_converter import to_rdf_n3
-from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.export.sql_converter import to_sql_ddl, to_sql_query
-from datacontract.export.terraform_converter import to_terraform
+from datacontract.export.exporter import ExportFormat
+from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.avro_importer import import_avro
 from datacontract.imports.bigquery_importer import import_bigquery_from_api, import_bigquery_from_json
 from datacontract.imports.glue_importer import import_glue
 from datacontract.imports.jsonschema_importer import import_jsonschema
+from datacontract.imports.odcs_importer import import_odcs
 from datacontract.imports.sql_importer import import_sql
+from datacontract.imports.unity_importer import import_unity_from_json, import_unity_from_api
 from datacontract.integration.publish_datamesh_manager import publish_datamesh_manager
 from datacontract.integration.publish_opentelemetry import publish_opentelemetry
 from datacontract.lint import resolve
@@ -184,6 +172,9 @@ class DataContract:
             if self._examples:
                 server_name = "examples"
                 server = self._get_examples_server(data_contract, run, tmp_dir)
+            elif self._server:
+                server_name = self._server
+                server = data_contract.servers.get(server_name)
             else:
                 server_name = list(data_contract.servers.keys())[0]
                 server = data_contract.servers.get(server_name)
@@ -195,10 +186,13 @@ class DataContract:
             run.outputPortId = server.outputPortId
             run.server = server_name
 
-            # 5. check server is supported type
-            # 6. check server credentials are complete
+            # TODO check server is supported type for nicer error messages
+
+            # TODO check server credentials are complete for nicer error messages
+
             if server.format == "json" and server.type != "kafka":
                 check_jsonschema(run, data_contract, server)
+
             check_soda_execute(run, data_contract, server, self._spark, tmp_dir)
 
         except DataContractException as e:
@@ -234,6 +228,38 @@ class DataContract:
 
         return run
 
+    def _get_examples_server(self, data_contract, run, tmp_dir):
+        run.log_info(f"Copying examples to files in temporary directory {tmp_dir}")
+        format = "json"
+        for example in data_contract.examples:
+            format = example.type
+            p = f"{tmp_dir}/{example.model}.{format}"
+            run.log_info(f"Creating example file {p}")
+            with open(p, "w") as f:
+                content = ""
+                if format == "json" and isinstance(example.data, list):
+                    content = json.dumps(example.data)
+                elif format == "json" and isinstance(example.data, str):
+                    content = example.data
+                elif format == "yaml" and isinstance(example.data, list):
+                    content = yaml.dump(example.data, allow_unicode=True)
+                elif format == "yaml" and isinstance(example.data, str):
+                    content = example.data
+                elif format == "csv":
+                    content = example.data
+                logging.debug(f"Content of example file {p}: {content}")
+                f.write(content)
+        path = f"{tmp_dir}" + "/{model}." + format
+        delimiter = "array"
+        server = Server(
+            type="local",
+            path=path,
+            format=format,
+            delimiter=delimiter,
+        )
+        run.log_info(f"Using {server} for testing the examples")
+        return server
+
     def breaking(self, other: "DataContract") -> BreakingChanges:
         return self.changelog(other, include_severities=[Severity.ERROR, Severity.WARNING])
 
@@ -275,7 +301,13 @@ class DataContract:
             inline_quality=self._inline_quality,
         )
 
-    def export(self, export_format, model: str = "all", rdf_base: str = None, sql_server_type: str = "auto") -> str:
+    def export(
+        self,
+        export_format: ExportFormat,
+        model: str = "all",
+        sql_server_type: str = "auto",
+        **kwargs,
+    ) -> str:
         data_contract = resolve.resolve_data_contract(
             self._data_contract_file,
             self._data_contract_str,
@@ -283,148 +315,24 @@ class DataContract:
             inline_definitions=True,
             inline_quality=True,
         )
-        if export_format == "jsonschema":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_jsonschema_json(model_name, model_value)
-        if export_format == "sodacl":
-            return to_sodacl_yaml(data_contract)
-        if export_format == "dbt":
-            return to_dbt_models_yaml(data_contract)
-        if export_format == "dbt-sources":
-            return to_dbt_sources_yaml(data_contract, self._server)
-        if export_format == "dbt-staging-sql":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_dbt_staging_sql(data_contract, model_name, model_value)
-        if export_format == "odcs":
-            return to_odcs_yaml(data_contract)
-        if export_format == "rdf":
-            return to_rdf_n3(data_contract, rdf_base)
-        if export_format == "protobuf":
-            return to_protobuf(data_contract)
-        if export_format == "avro":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_avro_schema_json(model_name, model_value)
-        if export_format == "avro-idl":
-            return to_avro_idl(data_contract)
-        if export_format == "terraform":
-            return to_terraform(data_contract)
-        if export_format == "sql":
-            server_type = self._determine_sql_server_type(data_contract, sql_server_type)
-            return to_sql_ddl(data_contract, server_type=server_type)
-        if export_format == "sql-query":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            server_type = self._determine_sql_server_type(data_contract, sql_server_type)
-            return to_sql_query(data_contract, model_name, model_value, server_type)
-        if export_format == "great-expectations":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            return to_great_expectations(data_contract, model_name)
-        if export_format == "pydantic-model":
-            return to_pydantic_model_str(data_contract)
-        if export_format == "html":
-            return to_html(data_contract)
-        if export_format == "go":
-            return to_go_types(data_contract)
-        if export_format == "bigquery":
-            model_name, model_value = self._check_models_for_export(data_contract, model, export_format)
-            found_server = data_contract.servers.get(self._server)
-            if found_server is None:
-                raise RuntimeError(
-                    f"Export to {export_format} requires selecting a bigquery server from the data contract."
-                )
-            if found_server.type != "bigquery":
-                raise RuntimeError(
-                    f"Export to {export_format} requires selecting a bigquery server from the data contract."
-                )
-            return to_bigquery_json(model_name, model_value, found_server)
-        if export_format == "dbml":
-            found_server = data_contract.servers.get(self._server)
-            return to_dbml_diagram(data_contract, found_server)
-        else:
-            print(f"Export format {export_format} not supported.")
-            return ""
-
-    def _determine_sql_server_type(self, data_contract: DataContractSpecification, sql_server_type: str):
-        if sql_server_type == "auto":
-            if data_contract.servers is None or len(data_contract.servers) == 0:
-                raise RuntimeError("Export with server_type='auto' requires servers in the data contract.")
-
-            server_types = set([server.type for server in data_contract.servers.values()])
-            if "snowflake" in server_types:
-                return "snowflake"
-            elif "postgres" in server_types:
-                return "postgres"
-            elif "databricks" in server_types:
-                return "databricks"
-            else:
-                # default to snowflake dialect
-                return "snowflake"
-        else:
-            return sql_server_type
 
-    def _get_examples_server(self, data_contract, run, tmp_dir):
-        run.log_info(f"Copying examples to files in temporary directory {tmp_dir}")
-        format = "json"
-        for example in data_contract.examples:
-            format = example.type
-            p = f"{tmp_dir}/{example.model}.{format}"
-            run.log_info(f"Creating example file {p}")
-            with open(p, "w") as f:
-                content = ""
-                if format == "json" and isinstance(example.data, list):
-                    content = json.dumps(example.data)
-                elif format == "json" and isinstance(example.data, str):
-                    content = example.data
-                elif format == "yaml" and isinstance(example.data, list):
-                    content = yaml.dump(example.data, allow_unicode=True)
-                elif format == "yaml" and isinstance(example.data, str):
-                    content = example.data
-                elif format == "csv":
-                    content = example.data
-                logging.debug(f"Content of example file {p}: {content}")
-                f.write(content)
-        path = f"{tmp_dir}" + "/{model}." + format
-        delimiter = "array"
-        server = Server(
-            type="local",
-            path=path,
-            format=format,
-            delimiter=delimiter,
+        return exporter_factory.create(export_format).export(
+            data_contract=data_contract,
+            model=model,
+            server=self._server,
+            sql_server_type=sql_server_type,
+            export_args=kwargs,
         )
-        run.log_info(f"Using {server} for testing the examples")
-        return server
-
-    def _check_models_for_export(
-        self, data_contract: DataContractSpecification, model: str, export_format: str
-    ) -> typing.Tuple[str, str]:
-        if data_contract.models is None:
-            raise RuntimeError(f"Export to {export_format} requires models in the data contract.")
-
-        model_names = list(data_contract.models.keys())
-
-        if model == "all":
-            if len(data_contract.models.items()) != 1:
-                raise RuntimeError(
-                    f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}"
-                )
-
-            model_name, model_value = next(iter(data_contract.models.items()))
-        else:
-            model_name = model
-            model_value = data_contract.models.get(model_name)
-            if model_value is None:
-                raise RuntimeError(
-                    f"Model {model_name} not found in the data contract. Available models: {model_names}"
-                )
-
-        return model_name, model_value
 
     def import_from_source(
         self,
         format: str,
         source: typing.Optional[str] = None,
+        glue_tables: typing.Optional[typing.List[str]] = None,
         bigquery_tables: typing.Optional[typing.List[str]] = None,
         bigquery_project: typing.Optional[str] = None,
        bigquery_dataset: typing.Optional[str] = None,
+        unity_table_full_name: typing.Optional[str] = None
     ) -> DataContractSpecification:
         data_contract_specification = DataContract.init()
 
@@ -433,7 +341,7 @@ class DataContract:
         elif format == "avro":
             data_contract_specification = import_avro(data_contract_specification, source)
         elif format == "glue":
-            data_contract_specification = import_glue(data_contract_specification, source)
+            data_contract_specification = import_glue(data_contract_specification, source, glue_tables)
         elif format == "jsonschema":
             data_contract_specification = import_jsonschema(data_contract_specification, source)
         elif format == "bigquery":
@@ -443,6 +351,15 @@ class DataContract:
             data_contract_specification = import_bigquery_from_api(
                 data_contract_specification, bigquery_tables, bigquery_project, bigquery_dataset
             )
+        elif format == "odcs":
+            data_contract_specification = import_odcs(data_contract_specification, source)
+        elif format == "unity":
+            if source is not None:
+                data_contract_specification = import_unity_from_json(data_contract_specification, source)
+            else:
+                data_contract_specification = import_unity_from_api(
+                    data_contract_specification, unity_table_full_name
+                )
         else:
             print(f"Import format {format} not supported.")
 
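The long if/elif chain in DataContract.export is gone: the ExportFormat enum now lives in datacontract.export.exporter, and the method delegates to exporter_factory, passing any extra keyword arguments (such as rdf_base) through as export_args. A minimal usage sketch of the refactored API, assuming a datacontract.yaml with a single model; file and model names are illustrative:

    from datacontract.data_contract import DataContract, ExportFormat

    dc = DataContract(data_contract_file="datacontract.yaml")

    # exporter_factory.create(export_format) picks the matching exporter;
    # keyword arguments beyond model/sql_server_type travel as export_args.
    avro_schema = dc.export(export_format=ExportFormat.avro, model="orders")
    print(avro_schema)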
datacontract/engines/fastjsonschema/s3/s3_read_files.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 import os
 
-import s3fs
+from datacontract.model.exceptions import DataContractException
 
 
 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -14,6 +14,18 @@ def yield_s3_files(s3_endpoint_url, s3_location):
 
 
 def s3_fs(s3_endpoint_url):
+    try:
+        import s3fs
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="s3 extra missing",
+            reason="Install the extra datacontract-cli\[s3] to use s3",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
     return s3fs.S3FileSystem(
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -64,6 +64,17 @@ def check_soda_execute(
         soda_configuration_str = to_databricks_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "dataframe":
+        if spark is None:
+            run.log_warn(
+                "Server type dataframe only works with the Python library and requires a Spark session, "
+                "please provide one with the DataContract class"
+            )
+            return
+        else:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name="datacontract-cli")
+            scan.set_data_source_name("datacontract-cli")
     elif server.type == "kafka":
         if spark is None:
             spark = create_spark_session(tmp_dir)
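The new dataframe server type skips connection configuration entirely and attaches an existing Spark session to the Soda scan. A minimal sketch, assuming the DataContract constructor accepts the session via a spark argument (the diff only shows self._spark being forwarded to check_soda_execute) and that the contract defines a server of type dataframe; the view name and file path are illustrative:

    from pyspark.sql import SparkSession

    from datacontract.data_contract import DataContract

    spark = SparkSession.builder.getOrCreate()
    # Register the data under the model name so the scan can query it.
    spark.createDataFrame([{"id": 1}]).createOrReplaceTempView("my_table")

    run = DataContract(data_contract_file="datacontract.yaml", spark=spark).test()
    print(run.result)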
datacontract/engines/soda/connections/bigquery.py CHANGED
@@ -6,10 +6,17 @@ import yaml
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
     # with service account key, using an external json file
+
+    # check for our own environment variable first
+    account_info = os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH")
+    if account_info is None:
+        # but as a fallback look for the default google one
+        account_info = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "bigquery",
-            "account_info_json_path": os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"),
+            "account_info_json_path": account_info,
             "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
             "project_id": server.project,
             "dataset": server.dataset,
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -1,3 +1,4 @@
+import logging
 import os
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, expr, from_json
@@ -44,6 +45,8 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
 
 def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
     """Read and process data from a Kafka topic based on the server configuration."""
+
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
     df = (
         spark.read.format("kafka")
         .options(**get_auth_options())
datacontract/export/__init__.py
File without changes
datacontract/export/avro_converter.py CHANGED
@@ -1,8 +1,15 @@
 import json
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.model.data_contract_specification import Field
 
 
+class AvroExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_avro_schema_json(model_name, model_value)
+
+
 def to_avro_schema(model_name, model) -> dict:
     return to_avro_record(model_name, model.fields, model.description, model.namespace)
 
@@ -34,13 +41,8 @@ def to_avro_field(field, field_name):
     if field.description is not None:
         avro_field["doc"] = field.description
     avro_field["type"] = to_avro_type(field, field_name)
-    # add logical type definitions for any of the date type fields
-    if field.type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
-        avro_field["logicalType"] = to_avro_logical_type(field.type)
 
     if field.config:
-        if "avroLogicalType" in field.config:
-            avro_field["logicalType"] = field.config["avroLogicalType"]
         if "avroDefault" in field.config:
             avro_field["default"] = field.config["avroDefault"]
 
@@ -48,6 +50,23 @@ def to_avro_field(field, field_name):
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in [
+                "timestamp-millis",
+                "timestamp-micros",
+                "local-timestamp-millis",
+                "local-timestamp-micros",
+                "time-micros",
+            ]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroLogicalType"]
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -64,11 +83,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["boolean"]:
         return "boolean"
     elif field.type in ["timestamp", "timestamp_tz"]:
-        return "long"
+        return {"type": "long", "logicalType": "timestamp-millis"}
     elif field.type in ["timestamp_ntz"]:
-        return "long"
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
     elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
     elif field.type in ["object", "record", "struct"]:
@@ -76,20 +95,8 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["binary"]:
         return "bytes"
     elif field.type in ["array"]:
-        # TODO support array structs
-        return "array"
+        return {"type": "array", "items": to_avro_type(field.items, field_name)}
     elif field.type in ["null"]:
         return "null"
     else:
         return "bytes"
-
-
-def to_avro_logical_type(type: str) -> str:
-    if type in ["timestamp", "timestamp_tz"]:
-        return "timestamp-millis"
-    elif type in ["timestamp_ntz"]:
-        return "local-timestamp-millis"
-    elif type in ["date"]:
-        return "date"
-    else:
-        return ""
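With this change the logical type is folded into the Avro type itself rather than attached to the field afterwards, and an avroLogicalType in the field config overrides the default mapping. A small sketch of the resulting behaviour, assuming Field can be constructed directly with keyword arguments; field names and config values are illustrative:

    from datacontract.model.data_contract_specification import Field
    from datacontract.export.avro_converter import to_avro_type

    # A plain timestamp now maps to a long with a timestamp-millis logical type.
    print(to_avro_type(Field(type="timestamp"), "updated_at"))
    # expected: {'type': 'long', 'logicalType': 'timestamp-millis'}

    # An explicit avroLogicalType in the config takes precedence.
    field = Field(type="timestamp", config={"avroLogicalType": "timestamp-micros"})
    print(to_avro_type(field, "updated_at"))
    # expected: {'type': 'long', 'logicalType': 'timestamp-micros'}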
datacontract/export/avro_idl_converter.py CHANGED
@@ -7,28 +7,7 @@ from datacontract.lint.resolve import inline_definitions_into_data_contract
 from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
-
-def to_avro_idl(contract: DataContractSpecification) -> str:
-    """Serialize the provided data contract specification into an Avro IDL string.
-
-    The data contract will be serialized as a protocol, with one record type
-    for each contained model. Model fields are mapped one-to-one to Avro IDL
-    record fields.
-    """
-    stream = StringIO()
-    to_avro_idl_stream(contract, stream)
-    return stream.getvalue()
-
-
-def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
-    """Serialize the provided data contract specification into Avro IDL."""
-    ir = _contract_to_avro_idl_ir(contract)
-    if ir.description:
-        stream.write(f"/** {contract.info.description} */\n")
-    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
-    for model_type in ir.model_types:
-        _write_model_type(model_type, stream)
-    stream.write("}\n")
+from datacontract.export.exporter import Exporter
 
 
 class AvroPrimitiveType(Enum):
@@ -107,6 +86,34 @@ avro_primitive_types = set(
 )
 
 
+class AvroIdlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_avro_idl(data_contract)
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
     result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
     match field.type:
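The same pattern repeats across the converter modules in this release: each module gains a small Exporter subclass that wraps its existing to_* function, and exporter_factory maps an ExportFormat value to the matching class. A condensed sketch of the shape these classes share, based on the export signatures shown in this diff; the subclass name is illustrative and the registration mechanism itself lives in exporter_factory.py, which is not part of this excerpt:

    from datacontract.export.exporter import Exporter


    class MyFormatExporter(Exporter):
        # Each exporter receives the resolved contract plus the selected model,
        # server, SQL dialect hint and pass-through export_args, and returns
        # the serialized output for its format.
        def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
            return "serialized output"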
datacontract/export/bigquery_converter.py CHANGED
@@ -5,6 +5,21 @@ from typing import Dict, List
 from datacontract.model.data_contract_specification import Model, Field, Server
 from datacontract.model.exceptions import DataContractException
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
+
+
+class BigQueryExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        self.dict_args = export_args
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        found_server = data_contract.servers.get(server)
+        if found_server is None:
+            raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.")
+        if found_server.type != "bigquery":
+            raise RuntimeError("Export to bigquery requires selecting a bigquery server from the data contract.")
+
+        return to_bigquery_json(model_name, model_value, found_server)
+
 
 def to_bigquery_json(model_name: str, model_value: Model, server: Server) -> str:
     bigquery_table = to_bigquery_schema(model_name, model_value, server)