data-validation-engine 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. data_validation_engine-0.6.2.dist-info/METADATA +104 -0
  2. data_validation_engine-0.6.2.dist-info/RECORD +105 -0
  3. data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
  4. data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
  5. dve/__init__.py +0 -0
  6. dve/common/__init__.py +0 -0
  7. dve/common/error_utils.py +189 -0
  8. dve/core_engine/__init__.py +0 -0
  9. dve/core_engine/backends/__init__.py +1 -0
  10. dve/core_engine/backends/base/__init__.py +1 -0
  11. dve/core_engine/backends/base/auditing.py +618 -0
  12. dve/core_engine/backends/base/backend.py +240 -0
  13. dve/core_engine/backends/base/contract.py +454 -0
  14. dve/core_engine/backends/base/core.py +124 -0
  15. dve/core_engine/backends/base/reader.py +176 -0
  16. dve/core_engine/backends/base/reference_data.py +217 -0
  17. dve/core_engine/backends/base/rules.py +685 -0
  18. dve/core_engine/backends/base/utilities.py +146 -0
  19. dve/core_engine/backends/exceptions.py +311 -0
  20. dve/core_engine/backends/implementations/__init__.py +1 -0
  21. dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
  22. dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
  23. dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
  24. dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
  25. dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
  26. dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
  27. dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
  28. dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
  29. dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
  30. dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
  31. dve/core_engine/backends/implementations/duckdb/types.py +47 -0
  32. dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
  33. dve/core_engine/backends/implementations/spark/__init__.py +22 -0
  34. dve/core_engine/backends/implementations/spark/auditing.py +230 -0
  35. dve/core_engine/backends/implementations/spark/backend.py +78 -0
  36. dve/core_engine/backends/implementations/spark/contract.py +241 -0
  37. dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
  38. dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
  39. dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
  40. dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
  41. dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
  42. dve/core_engine/backends/implementations/spark/rules.py +430 -0
  43. dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
  44. dve/core_engine/backends/implementations/spark/types.py +21 -0
  45. dve/core_engine/backends/implementations/spark/utilities.py +144 -0
  46. dve/core_engine/backends/metadata/__init__.py +47 -0
  47. dve/core_engine/backends/metadata/contract.py +80 -0
  48. dve/core_engine/backends/metadata/reporting.py +374 -0
  49. dve/core_engine/backends/metadata/rules.py +737 -0
  50. dve/core_engine/backends/readers/__init__.py +41 -0
  51. dve/core_engine/backends/readers/csv.py +232 -0
  52. dve/core_engine/backends/readers/utilities.py +21 -0
  53. dve/core_engine/backends/readers/xml.py +432 -0
  54. dve/core_engine/backends/readers/xml_linting.py +142 -0
  55. dve/core_engine/backends/types.py +26 -0
  56. dve/core_engine/backends/utilities.py +177 -0
  57. dve/core_engine/configuration/__init__.py +1 -0
  58. dve/core_engine/configuration/base.py +56 -0
  59. dve/core_engine/configuration/v1/__init__.py +351 -0
  60. dve/core_engine/configuration/v1/filters.py +60 -0
  61. dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
  62. dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
  63. dve/core_engine/configuration/v1/steps.py +365 -0
  64. dve/core_engine/constants.py +8 -0
  65. dve/core_engine/engine.py +265 -0
  66. dve/core_engine/exceptions.py +29 -0
  67. dve/core_engine/functions/__init__.py +6 -0
  68. dve/core_engine/functions/implementations.py +200 -0
  69. dve/core_engine/loggers.py +57 -0
  70. dve/core_engine/message.py +512 -0
  71. dve/core_engine/models.py +196 -0
  72. dve/core_engine/templating.py +114 -0
  73. dve/core_engine/type_hints.py +255 -0
  74. dve/core_engine/validation.py +160 -0
  75. dve/metadata_parser/__init__.py +2 -0
  76. dve/metadata_parser/domain_types.py +682 -0
  77. dve/metadata_parser/exc.py +44 -0
  78. dve/metadata_parser/function_library.py +64 -0
  79. dve/metadata_parser/function_wrapper.py +201 -0
  80. dve/metadata_parser/model_generator.py +119 -0
  81. dve/metadata_parser/models.py +410 -0
  82. dve/metadata_parser/utilities.py +54 -0
  83. dve/parser/__init__.py +1 -0
  84. dve/parser/exceptions.py +50 -0
  85. dve/parser/file_handling/__init__.py +31 -0
  86. dve/parser/file_handling/helpers.py +29 -0
  87. dve/parser/file_handling/implementations/__init__.py +7 -0
  88. dve/parser/file_handling/implementations/base.py +97 -0
  89. dve/parser/file_handling/implementations/dbfs.py +81 -0
  90. dve/parser/file_handling/implementations/file.py +203 -0
  91. dve/parser/file_handling/implementations/s3.py +371 -0
  92. dve/parser/file_handling/log_handler.py +215 -0
  93. dve/parser/file_handling/service.py +441 -0
  94. dve/parser/file_handling/utilities.py +53 -0
  95. dve/parser/type_hints.py +46 -0
  96. dve/parser/utilities.py +113 -0
  97. dve/pipeline/__init__.py +0 -0
  98. dve/pipeline/duckdb_pipeline.py +56 -0
  99. dve/pipeline/foundry_ddb_pipeline.py +171 -0
  100. dve/pipeline/pipeline.py +935 -0
  101. dve/pipeline/spark_pipeline.py +69 -0
  102. dve/pipeline/utils.py +96 -0
  103. dve/reporting/__init__.py +1 -0
  104. dve/reporting/error_report.py +153 -0
  105. dve/reporting/excel_report.py +319 -0
@@ -0,0 +1,234 @@
1
+ """Auditing definitions for duckdb backend"""
2
+
3
+ from collections.abc import Iterable
4
+ from typing import Any, Optional, Union
5
+
6
+ import polars as pl
7
+ from duckdb import ColumnExpression, DuckDBPyConnection, DuckDBPyRelation, StarExpression, connect
8
+ from polars.datatypes.classes import DataTypeClass as PolarsType
9
+
10
+ from dve.core_engine.backends.base.auditing import (
11
+ BaseAuditingManager,
12
+ BaseAuditor,
13
+ FilterCriteria,
14
+ OrderCriteria,
15
+ )
16
+ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
17
+ PYTHON_TYPE_TO_DUCKDB_TYPE,
18
+ table_exists,
19
+ )
20
+ from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
21
+ from dve.core_engine.models import (
22
+ AuditRecord,
23
+ ProcessingStatusRecord,
24
+ SubmissionInfo,
25
+ SubmissionStatisticsRecord,
26
+ TransferRecord,
27
+ )
28
+ from dve.core_engine.type_hints import URI, ExecutorType
29
+
30
+
31
class DDBAuditor(BaseAuditor[DuckDBPyRelation]):
    """An auditor implemented using the python duckdb package.

    Records are kept in a duckdb table named after the auditor. The table is
    created when the auditor is constructed if it does not already exist.
    """

    def __init__(
        self,
        record_type: type[AuditRecord],
        database_uri: URI,
        name: str,
        connection: Optional[DuckDBPyConnection] = None,
    ):
        """Set up the auditor and make sure its backing table exists.

        Args:
            record_type: the audit record class handled by this auditor.
            database_uri: location of the duckdb database. Only used to open a
                new connection when `connection` is not supplied.
            name: name of the auditor (forwarded to the base class).
            connection: an existing duckdb connection to reuse.
        """
        self._db = database_uri
        self._connection: DuckDBPyConnection = (
            connection
            if connection
            else connect(
                database=database_uri,
                config={
                    "access_mode": "READ_WRITE",
                    "default_null_order": "NULLS_LAST",
                    "threads": 1,
                },
            )
        )
        # NOTE(review): self._name / self.schema / self._record_type are presumably
        # populated by the base class initialiser - confirm against BaseAuditor.
        super().__init__(name=name, record_type=record_type)
        if not table_exists(self._connection, self._name):
            self._connection.sql(self.ddb_create_table_sql)

    @property
    def ddb_create_table_sql(self) -> str:
        """Generate create table sql script for auditor"""
        column_definitions = ", ".join(
            f"{fld} {PYTHON_TYPE_TO_DUCKDB_TYPE.get(dtype)}" for fld, dtype in self.schema.items()
        )
        return f"CREATE TABLE {self._name} ({column_definitions})"

    @property
    def polars_schema(self) -> dict[str, PolarsType]:
        """Get polars dataframe schema for auditor.

        Python types without an entry in PYTHON_TYPE_TO_POLARS_TYPE fall back
        to `pl.Utf8`.
        """
        return {
            fld: PYTHON_TYPE_TO_POLARS_TYPE.get(dtype, pl.Utf8)  # type: ignore
            for fld, dtype in self.schema.items()
        }

    def get_relation(self) -> DuckDBPyRelation:
        """Get a relation to interact with the auditor duckdb table"""
        return self._connection.table(self._name)

    def combine_filters(self, filter_criteria: list[FilterCriteria]) -> str:
        """Combine multiple filters into a single AND-ed sql predicate"""
        return " AND ".join(self.normalise_filter(filt) for filt in filter_criteria)

    @staticmethod
    def normalise_field(field: str) -> ColumnExpression:  # type: ignore
        """Convert field to duckdb expression"""
        return ColumnExpression(field)

    @staticmethod
    def normalise_order(order_condition: OrderCriteria) -> str:
        """Convert order criteria to duckdb expression"""
        return order_condition.to_sql()

    @staticmethod
    def normalise_filter(filter_condition: FilterCriteria) -> str:
        """Convert filter criteria to duckdb expression"""
        return filter_condition.to_sql()

    def conv_to_records(self, recs: DuckDBPyRelation) -> Iterable[AuditRecord]:
        """Convert the relation to an iterable of the related audit record"""
        return (self._record_type(**rec) for rec in recs.pl().iter_rows(named=True))

    def conv_to_entity(self, recs: list[AuditRecord]) -> DuckDBPyRelation:
        """Convert a list of audit records to a relation"""
        # `rec_df` looks unused but is referenced by name inside the sql string below.
        # pylint: disable=W0612
        rec_df = pl.DataFrame(  # type: ignore
            [rec.dict() for rec in recs],
            schema=self.polars_schema,
        )
        return self._connection.sql("select * from rec_df")

    def add_records(self, records: Iterable[dict[str, Any]]) -> None:
        """Add records to the underlying duckdb table"""
        schema = self.polars_schema
        # `data_pl_df` looks unused but is referenced by name inside the sql string below.
        # pylint: disable=W0612
        data_pl_df = pl.DataFrame(  # type: ignore
            records,
            schema=schema,
        )
        # Explicit column list so the insert does not depend on table column order.
        column_list = ", ".join(schema)
        self._connection.sql(
            f"""INSERT INTO {self._name} ({column_list})
            SELECT {column_list} from data_pl_df"""
        )

    def retrieve_records(
        self,
        filter_criteria: Optional[list[FilterCriteria]] = None,
        data: Optional[DuckDBPyRelation] = None,
    ) -> DuckDBPyRelation:
        """Get records from the underlying duckdb table.

        Args:
            filter_criteria: optional filters, AND-ed together.
            data: optional relation to filter instead of the auditor table.

        Returns:
            The (optionally filtered) relation.
        """
        # Compare against None rather than relying on truthiness: a relation's
        # truth value is its row count, so an *empty* relation would otherwise
        # be silently replaced by the whole audit table (and be executed early).
        rel = self.get_relation() if data is None else data
        if filter_criteria:
            rel = rel.filter(self.combine_filters(filter_criteria))
        return rel

    def get_most_recent_records(
        self,
        order_criteria: list[OrderCriteria],
        partition_fields: Optional[list[str]] = None,
        pre_filter_criteria: Optional[list[FilterCriteria]] = None,
    ) -> DuckDBPyRelation:
        """Get most recent records, based on the order and partitioning,
        from the underlying duckdb table.

        Args:
            order_criteria: ordering used to decide which record comes first.
            partition_fields: when given, the first record of every partition
                is returned; otherwise only the single first record overall.
            pre_filter_criteria: optional filters applied before ordering.
        """
        # ORDER BY terms are comma separated (joining them with AND is not a
        # valid multi-column ordering).
        ordering = ", ".join(self.normalise_order(crit) for crit in order_criteria)
        rel = self.get_relation()
        if pre_filter_criteria:
            rel = rel.filter(self.combine_filters(pre_filter_criteria))
        if not partition_fields:
            return rel.order(ordering).limit(1)

        partitioning = ",".join(partition_fields)
        ranked = rel.select(
            f"*, row_number() OVER (PARTITION BY {partitioning} ORDER BY {ordering}) as RN"
        )
        # Keep the top ranked row per partition and drop the helper column again.
        return ranked.filter("RN = 1").select(StarExpression(exclude=["RN"]))
164
+
165
+
166
class DDBAuditingManager(BaseAuditingManager[DDBAuditor, DuckDBPyRelation]):
    """Auditing manager for duckdb implementation"""

    def __init__(
        self,
        database_uri: URI,
        pool: Optional[ExecutorType] = None,
        connection: Optional[DuckDBPyConnection] = None,
    ):
        """Create the four duckdb backed auditors on one shared connection.

        Args:
            database_uri: location of the duckdb database. Used to open a new
                connection when `connection` is not supplied.
            pool: optional executor, forwarded to the base class.
            connection: an existing duckdb connection to reuse.
        """
        self._database_uri = database_uri
        self._connection: DuckDBPyConnection = (
            connection
            if connection
            else connect(
                database=database_uri,
                config={
                    "access_mode": "READ_WRITE",
                    "default_null_order": "NULLS_LAST",
                    "threads": 1,
                },
            )
        )
        self._pool = pool

        def _auditor(name: str, record_type: type[AuditRecord]) -> DDBAuditor:
            # Every auditor shares the manager's database and connection.
            return DDBAuditor(
                record_type=record_type,
                database_uri=self._database_uri,
                name=name,
                connection=self._connection,
            )

        super().__init__(
            processing_status=_auditor("processing_status", ProcessingStatusRecord),
            submission_info=_auditor("submission_info", SubmissionInfo),
            submission_statistics=_auditor("submission_statistics", SubmissionStatisticsRecord),
            transfers=_auditor("transfers", TransferRecord),
            pool=self._pool,
        )

    def combine_auditor_information(
        self, left: Union[DDBAuditor, DuckDBPyRelation], right: Union[DDBAuditor, DuckDBPyRelation]
    ) -> DuckDBPyRelation:
        """Inner join two auditors/relations on `submission_id`.

        All columns of `left` are kept; from `right` only the columns whose
        name does not already appear in `left` are added.
        """
        if isinstance(left, DDBAuditor):
            left = left.get_relation()
        left = left.set_alias("lhs")
        if isinstance(right, DDBAuditor):
            right = right.get_relation()
        right = right.set_alias("rhs")

        left_columns = left.columns
        already_selected = set(left_columns)
        return left.join(right, condition="submission_id", how="inner").select(
            *[f"lhs.{fld}" for fld in left_columns],
            *[f"rhs.{fld}" for fld in right.columns if fld not in already_selected],
        )

    @staticmethod
    def conv_to_iterable(recs: Union[DDBAuditor, DuckDBPyRelation]) -> Iterable[dict[str, Any]]:
        """Convert an auditor or relation into an iterable of row dictionaries"""
        recs_rel: DuckDBPyRelation = recs.get_relation() if isinstance(recs, DDBAuditor) else recs
        return recs_rel.pl().iter_rows(named=True)
@@ -0,0 +1,213 @@
1
+ """An implementation of the data contract in Duck DB."""
2
+
3
+ # pylint: disable=R0903
4
+ import logging
5
+ from collections.abc import Iterator
6
+ from functools import partial
7
+ from typing import Any, Optional
8
+ from uuid import uuid4
9
+
10
+ import pandas as pd
11
+ import polars as pl
12
+ import pyarrow.parquet as pq # type: ignore
13
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation
14
+ from duckdb.typing import DuckDBPyType
15
+ from polars.datatypes.classes import DataTypeClass as PolarsType
16
+ from pydantic import BaseModel
17
+ from pydantic.fields import ModelField
18
+
19
+ import dve.parser.file_handling as fh
20
+ from dve.common.error_utils import (
21
+ BackgroundMessageWriter,
22
+ dump_processing_errors,
23
+ get_feedback_errors_uri,
24
+ )
25
+ from dve.core_engine.backends.base.contract import BaseDataContract
26
+ from dve.core_engine.backends.base.utilities import (
27
+ check_if_parquet_file,
28
+ generate_error_casting_entity_message,
29
+ )
30
+ from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
31
+ duckdb_read_parquet,
32
+ duckdb_write_parquet,
33
+ get_duckdb_type_from_annotation,
34
+ relation_is_empty,
35
+ )
36
+ from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
37
+ from dve.core_engine.backends.metadata.contract import DataContractMetadata
38
+ from dve.core_engine.backends.types import StageSuccessful
39
+ from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
40
+ from dve.core_engine.message import FeedbackMessage
41
+ from dve.core_engine.type_hints import URI, EntityLocations
42
+ from dve.core_engine.validation import RowValidator, apply_row_validator_helper
43
+
44
+
45
class PandasApplyHelper:
    """Adapter allowing a RowValidator to be applied to pandas dataframe rows.

    Messages produced by the validator are accumulated on `errors`; rows are
    handed back untouched.
    """

    def __init__(self, row_validator: RowValidator):
        self.errors: list[FeedbackMessage] = []
        self.row_validator = row_validator

    def __call__(self, row: pd.Series):
        # The second item of the validator result holds the messages for the row.
        validation_result = self.row_validator(row.to_dict())
        self.errors.extend(validation_result[1])  # type: ignore
        # The row itself is passed through unchanged.
        return row
55
+
56
+
57
@duckdb_write_parquet
@duckdb_read_parquet
class DuckDBDataContract(BaseDataContract[DuckDBPyRelation]):
    """An implementation of a data contract in DuckDB.

    This utilises pyarrow to distribute parquet data across python processes and
    a background process to write error messages.

    """

    def __init__(
        self,
        connection: DuckDBPyConnection,
        logger: Optional[logging.Logger] = None,
        debug: bool = False,
        **kwargs: Any,
    ):
        """Create the data contract.

        Args:
            connection: the duckdb connection used to build relations.
            logger: optional logger, forwarded to the base class.
            debug: whether to enable debug logging (row counts are evaluated).
            **kwargs: forwarded to the base class.
        """
        # A bool indicating whether to enable debug logging.
        self.debug = debug
        self._connection = connection

        super().__init__(logger, **kwargs)

    @property
    def connection(self) -> DuckDBPyConnection:
        """The duckdb connection"""
        return self._connection

    def _cache_records(self, relation: DuckDBPyRelation, working_dir: URI) -> URI:
        """Write the relation to a uniquely named parquet file in `working_dir`
        and return the URI of that file."""
        chunk_uri = "/".join((working_dir.rstrip("/"), str(uuid4()))) + ".parquet"
        # NOTE(review): write_parquet is presumably supplied by the
        # duckdb_write_parquet class decorator - confirm.
        self.write_parquet(entity=relation, target_location=chunk_uri)
        return chunk_uri

    def create_entity_from_py_iterator(  # pylint: disable=unused-argument
        self, entity_name: URI, records: Iterator[dict[URI, Any]], schema: type[BaseModel]
    ) -> DuckDBPyRelation:
        """Create DuckDB Relation from iterator of records"""
        polars_schema: dict[str, PolarsType] = {
            fld.name: get_polars_type_from_annotation(fld.type_)
            for fld in stringify_model(schema).__fields__.values()
        }
        # `_lazy_df` looks unused but is referenced by name inside the sql string below.
        _lazy_df = pl.LazyFrame(records, polars_schema)  # type: ignore # pylint: disable=unused-variable
        return self._connection.sql("select * from _lazy_df")

    @staticmethod
    def generate_ddb_cast_statement(
        column_name: str, dtype: DuckDBPyType, null_flag: bool = False
    ) -> str:
        """Helper method to generate sql statements for casting datatypes (permissively).
        Current duckdb python API doesn't play well with this currently.

        Args:
            column_name: the column to cast (also used as the output alias).
            dtype: the duckdb type to cast to.
            null_flag: when True a typed NULL column is generated instead of
                casting an existing column.
        """
        # Double any embedded quote so the name stays a single quoted identifier.
        quoted_name = '"' + column_name.replace('"', '""') + '"'
        if null_flag:
            return f"cast(NULL AS {dtype}) AS {quoted_name}"
        return f"try_cast({quoted_name} AS {dtype}) AS {quoted_name}"

    # pylint: disable=R0914
    def apply_data_contract(
        self,
        working_dir: URI,
        entities: DuckDBEntities,
        entity_locations: EntityLocations,
        contract_metadata: DataContractMetadata,
        key_fields: Optional[dict[str, list[str]]] = None,
    ) -> tuple[DuckDBEntities, URI, StageSuccessful]:
        """Apply the data contract to the duckdb relations.

        Every non-empty entity is validated batch by batch from its parquet
        file (messages are handed to the background writer) and then projected
        onto the contract schema with permissive casts. Empty entities are
        replaced by an empty relation carrying the contract schema.

        Args:
            working_dir: location used for feedback and processing errors.
            entities: relations keyed by entity name; updated in place.
            entity_locations: entity name to file location; entries that are
                not parquet are rewritten as parquet and updated in place.
            contract_metadata: schemas and row validators per entity.
            key_fields: forwarded to the background message writer.

        Returns:
            The entities, the feedback errors URI and whether every entity
            could be cast to its contract schema.

        Raises:
            ValueError: in debug mode, when the row count of an entity changes.
        """
        self.logger.info("Applying data contracts")
        feedback_errors_uri: URI = get_feedback_errors_uri(working_dir, "data_contract")

        # check if entities are valid parquet - if not, convert
        # (iterate over a snapshot as the mapping is updated inside the loop)
        for entity, entity_loc in list(entity_locations.items()):
            if not check_if_parquet_file(entity_loc):
                parquet_uri = self.write_parquet(
                    entities[entity], fh.joinuri(fh.get_parent(entity_loc), f"{entity}.parquet")
                )
                entity_locations[entity] = parquet_uri

        successful = True

        with BackgroundMessageWriter(
            working_dir, "data_contract", key_fields=key_fields
        ) as msg_writer:
            for entity_name, relation in entities.items():
                # get dtypes for all fields -> python data types or use with relation
                entity_fields: dict[str, ModelField] = contract_metadata.schemas[
                    entity_name
                ].__fields__
                ddb_schema: dict[str, DuckDBPyType] = {
                    fld.name: get_duckdb_type_from_annotation(fld.annotation)
                    for fld in entity_fields.values()
                }
                polars_schema: dict[str, PolarsType] = {
                    fld.name: get_polars_type_from_annotation(fld.annotation)
                    for fld in entity_fields.values()
                }
                if relation_is_empty(relation):
                    self.logger.warning(f"+ Empty relation for {entity_name}")
                    # `empty_df` is referenced by name inside the sql string below.
                    empty_df = pl.DataFrame([], schema=polars_schema)  # type: ignore # pylint: disable=W0612
                    # Store the typed empty relation; previously it was built and
                    # then dropped by the `continue`, leaving the untyped entity.
                    entities[entity_name] = self._connection.sql("select * from empty_df")
                    continue

                self.logger.info(f"+ Applying contract to: {entity_name}")

                row_validator_helper = partial(
                    apply_row_validator_helper,
                    row_validator=contract_metadata.validators[entity_name],
                )

                # Validate in batches of 10000 rows so the file is never fully loaded.
                batches = pq.ParquetFile(entity_locations[entity_name]).iter_batches(10000)
                msg_count = 0
                for batch in batches:
                    if msgs := row_validator_helper(arrow_batch=batch):
                        msg_writer.write_queue.put(msgs)
                        msg_count += len(msgs)

                self.logger.info(f"Data contract found {msg_count} issues in {entity_name}")

                # Columns missing from the relation are added as typed NULLs.
                present_columns = set(relation.columns)
                casting_statements = [
                    self.generate_ddb_cast_statement(
                        column, dtype, null_flag=column not in present_columns
                    )
                    for column, dtype in ddb_schema.items()
                ]
                try:
                    relation = relation.project(", ".join(casting_statements))
                except Exception as err:  # pylint: disable=broad-except
                    # Record the failure and carry on with the remaining entities.
                    successful = False
                    self.logger.error(f"Error in casting relation: {err}")
                    dump_processing_errors(
                        working_dir,
                        "data_contract",
                        [generate_error_casting_entity_message(entity_name)],
                    )
                    continue

                if self.debug:
                    # count will force evaluation - only done in debug
                    pre_convert_row_count = relation.count("*").fetchone()[0]  # type: ignore
                    self.logger.info(f"+ Converting to parquet: ({pre_convert_row_count} rows)")
                else:
                    pre_convert_row_count = 0
                    self.logger.info("+ Converting to parquet")

                entities[entity_name] = relation
                if self.debug:
                    post_convert_row_count = entities[entity_name].count("*").fetchone()[0]  # type: ignore # pylint:disable=line-too-long
                    self.logger.info(f"+ Converted to parquet: ({post_convert_row_count} rows)")
                    if post_convert_row_count != pre_convert_row_count:
                        raise ValueError(
                            f"Row count mismatch for {entity_name}"
                            f" ({pre_convert_row_count} vs {post_convert_row_count})"
                        )
                else:
                    self.logger.info("+ Converted to parquet")

        return entities, feedback_errors_uri, successful