valediction-1.0.0-py3-none-any.whl → valediction-1.1.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- valediction/__init__.py +8 -8
- valediction/convenience.py +45 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/METADATA +1 -1
- valediction-1.1.0.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38

@@ -1,357 +1,357 @@

The hunk removes and re-adds all 357 lines of `valediction/dictionary/generation.py`; the `-` and `+` sides are line-for-line identical, so the textual change is most likely a rebuild artifact such as line endings (note that the `WHEEL` metadata also changed). The file content, shown once:

```python
# valediction/dictionary/generation.py
from __future__ import annotations

from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Iterable

import pandas as pd
from pandas import DataFrame

from valediction.data_types.data_types import DataType
from valediction.data_types.type_inference import (
    COLUMN_STEPS,
    ColumnState,
    TypeInferer,
)
from valediction.datasets.datasets_helpers import DatasetItemLike
from valediction.dictionary.model import Column, Dictionary, Table
from valediction.io.csv_readers import (
    CsvReadConfig,
    iter_csv_chunks,
    read_csv_headers,
    read_csv_sample,
)
from valediction.progress import Progress
from valediction.support import _normalise_name, calculate_runtime

IMPORTING_DATA = "Importing data"
CHUNK_STEPS = 1
COLUMN_STEPS = COLUMN_STEPS


@dataclass(slots=True)
class GeneratorConfig:
    chunk_size: int = 10_000_000
    sample_rows: int | None = None
    dayfirst: bool = True
    infer_types: bool = True
    infer_max_length: bool = True

    def set_variables(
        self,
        chunk_size: int | None = None,
        sample_rows: int | None = None,
    ) -> None:
        # Set user variables
        self.chunk_size = chunk_size
        self.sample_rows = sample_rows


class Generator:
    """
    Summary:
        Generator class for creating dictionaries from datasets.

    Arguments:
        feedback (bool): Provide user feedback on progress (default: True)
        debug (bool): Enable debug mode, providing full log of data type inference and
            reasoning (default: False)
        chunk_size (int | None): Size of chunks for reading data to optimise RAM usage,
            if reading from CSV (default: 10_000_000)
        sample_rows (int | None): Number of rows to sample for data type inference. Note:
            this overrides `chunk_size` and reads in a single chunk (default: None)

    Raises:
        DataDictionaryError: If there is an issue with the data dictionary
    """

    def __init__(
        self,
        feedback: bool = True,
        debug: bool = False,
        chunk_size: int | None = 10_000_000,
        sample_rows: int | None = None,
    ) -> None:
        # User Config
        self.config = GeneratorConfig()
        self.config.set_variables(sample_rows=sample_rows, chunk_size=chunk_size)
        self.feedback: bool = feedback
        self.debug: bool = debug
        self.csv_cfg: CsvReadConfig = CsvReadConfig()

        # Progress
        self.progress: Progress = None

        # Setup
        if sample_rows is not None:
            self.config.sample_rows = int(sample_rows)
        if chunk_size is not None:
            self.config.chunk_size = int(chunk_size)

    def __say(
        self,
        *values: object,
        sep: str | None = " ",
        end: str | None = "\n",
    ) -> None:
        if self.feedback:
            print(*values, sep=sep, end=end)

    def generate_dictionary(
        self,
        items: Iterable[DatasetItemLike],
        dictionary_name: str | None = None,
        primary_keys: dict[str, list[str]] | None = None,
    ) -> Dictionary:
        """
        Summary:
            Generate a dictionary from a Dataset.

        Arguments:
            items (Dataset): A list of DatasetItems to generate the dictionary from.
            dictionary_name (str | None): The name of the dictionary to generate.
                If None, will not be set.
            primary_keys (dict[str, list[str]] | None): A dictionary of primary keys
                to set on the generated dictionary. If None, will not be set.

        Returns:
            Dictionary: The generated dictionary.
        """
        dictionary = Dictionary(name=dictionary_name, imported=True)

        self.__say(f"Generating dictionary for {len(items)} tables")
        for item in items:
            self.__progress_init(item)
            table = Table(name=_normalise_name(item.name))
            dictionary.add_table(table)

            if item.is_path:
                self._infer_from_csv_into_table(item, table)
            else:
                self._infer_from_dataframe_into_table(item.data, table)

            item._dictionary_runtimes = self.__finish_generation_for_table()

        dictionary.set_primary_keys(primary_keys or {})
        self.__say("\n", end="")
        return dictionary

    # Generation Helpers
    def _infer_from_csv_into_table(self, item: DatasetItemLike, table: Table) -> None:
        self.__begin_step(step=IMPORTING_DATA)
        csv_path = item.data
        inferer = TypeInferer(
            debug=self.debug,
            dayfirst=self.config.dayfirst,
            progress=self.progress,
        )

        # Read single sample
        if self.config.sample_rows is not None:
            self.__begin_step(step=IMPORTING_DATA)
            df = read_csv_sample(
                csv_path,
                nrows=self.config.sample_rows,
                cfg=self.csv_cfg,
            ).df
            self.__complete_step()

            inferer.update_with_chunk(df)
            self._create_or_update_columns(table, inferer)
            return

        # Read in chunks
        first_chunk = True
        columns_by_name: dict[str, Column] = {}
        column_count = item.column_count
        iterator = iter_csv_chunks(
            path=Path(csv_path), chunk_size=self.config.chunk_size, cfg=self.csv_cfg
        )

        while True:
            # Import chunk
            try:
                chunk = next(iterator)
            except StopIteration:
                break

            est_chunk_count = chunk.estimate_chunk_count()
            self.__progress_retarget_total(
                est_chunk_count=est_chunk_count, column_count=column_count
            )
            self.__complete_step()

            inferer.update_with_chunk(chunk.df)

            self.__begin_step(step="Saving chunk data types")
            if first_chunk:
                ordered = list(inferer.states.keys())
                for idx, col_name in enumerate(ordered, start=1):
                    col_state = inferer.states[col_name]
                    data_type, length = col_state.final_data_type_and_length()
                    col = Column(
                        name=_normalise_name(col_name),
                        order=idx,
                        data_type=data_type,
                        length=length if data_type == DataType.TEXT else None,
                        vocabulary=None,
                        primary_key=None,
                        foreign_key=None,
                        description=None,
                        enumerations=None,
                    )

                    self._set_datetime_format(column_state=col_state, column=col)
                    table.add_column(col)
                    columns_by_name[col_name] = col
                first_chunk = False

            else:
                self._apply_state_to_existing_columns(table, inferer, columns_by_name)

        if first_chunk:
            empty = read_csv_headers(
                csv_path,
                cfg=self.csv_cfg,
            )
            inferer.update_with_chunk(empty)
            self._create_or_update_columns(table, inferer)

    def _infer_from_dataframe_into_table(self, df: pd.DataFrame, table: Table) -> None:
        self.__begin_step(step=IMPORTING_DATA)
        inferer = TypeInferer(
            debug=self.debug,
            dayfirst=self.config.dayfirst,
            progress=self.progress,
        )
        self.__complete_step()

        inferer.update_with_chunk(df)
        self._create_or_update_columns(table, inferer)

    # Emit/Update Helpers
    def _create_or_update_columns(self, table: Table, inferer: TypeInferer) -> None:
        if len(table):
            for existing in table:
                table.remove_column(existing.name)

        ordered = list(inferer.states.keys())
        for idx, col_name in enumerate(ordered, start=1):
            col_state = inferer.states[col_name]
            data_type, length = col_state.final_data_type_and_length()
            col = Column(
                name=_normalise_name(col_name),
                order=idx,
                data_type=data_type,
                length=length if data_type == DataType.TEXT else None,
                vocabulary=None,
                primary_key=None,
                foreign_key=None,
                description=None,
                enumerations=None,
            )
            self._set_datetime_format(column_state=col_state, column=col)

            table.add_column(col)

    def _set_datetime_format(self, column_state: ColumnState, column: Column) -> None:
        if column.data_type in (DataType.DATE, DataType.DATETIME):
            datetime_format = getattr(column_state, "cached_datetime_format", None)
            if datetime_format and hasattr(column, "datetime_format"):
                column.datetime_format = datetime_format

        else:
            if hasattr(column, "datetime_format"):
                column.datetime_format = None

    def _apply_state_to_existing_columns(
        self,
        table: Table,
        inferer: TypeInferer,
        columns_by_name: dict[str, Column],
    ) -> None:
        for col_name, col_state in inferer.states.items():
            if col_name not in columns_by_name:
                next_order = max((c.order or 0 for c in table), default=0) + 1
                data_type, length = col_state.final_data_type_and_length()
                new_col = Column(
                    name=_normalise_name(col_name),
                    order=next_order,
                    data_type=data_type,
                    length=length if data_type == DataType.TEXT else None,
                    vocabulary=None,
                    primary_key=None,
                    foreign_key=None,
                    description=None,
                    enumerations=None,
                )
                self._set_datetime_format(column_state=col_state, column=new_col)
                table.add_column(new_col)
                columns_by_name[col_name] = new_col
                continue

            col = columns_by_name[col_name]
            data_type, length = col_state.final_data_type_and_length()

            if col.data_type != data_type:
                col.data_type = data_type

            if data_type == DataType.TEXT:
                if length is not None and (col.length or 0) < length:
                    col.length = int(length)
            else:
                col.length = None

            self._set_datetime_format(column_state=col_state, column=col)

    # Progress
    def __progress_init(self, item: DatasetItemLike) -> None:
        # Switch to debug mode
        if self.debug:
            self.progress = Progress(enabled=False)
            return

        # Switch to silent mode
        if not self.feedback:
            self.progress = Progress(enabled=False)
            return

        # Progress bars on
        total_steps = (
            (CHUNK_STEPS + (COLUMN_STEPS * item.column_count))
            if (isinstance(item.data, DataFrame) or self.config.sample_rows)
            else None
        )
        pad = " " * item._padding if item._padding else ""

        self.progress = Progress(
            desc=f"Generating {item.name}: {pad}",
            starting_step=IMPORTING_DATA,
            est_total=total_steps,
            smoothing_steps=(COLUMN_STEPS * item.column_count),
        )

    def __progress_retarget_total(
        self, est_chunk_count: int, column_count: int
    ) -> None:
        new_total = (CHUNK_STEPS * est_chunk_count) + (
            COLUMN_STEPS * est_chunk_count * column_count
        )
        self.progress.retarget_total(new_total=new_total)

    def __begin_step(self, step: str | None = None) -> None:
        self.progress.begin_step(step=step)

    def __complete_step(self) -> None:
        self.progress.complete_step()

    def __finish_generation_for_table(self) -> dict[str, timedelta]:
        step = (
            f"Completed ({calculate_runtime(start=self.progress.full_start).message})"
        )
        save_as = "Total"
        self.progress.finish(postfix=step, save_as=save_as, good=True)
        self.progress.close()
        return self.progress.runtimes
```
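
For orientation, a minimal usage sketch of the class above. `CsvItem`, the file path, and the key names are hypothetical stand-ins; in practice `items` would come from the package's own `Dataset` machinery in `valediction/datasets/datasets.py`, which is not part of this file.

```python
# Hypothetical usage sketch -- CsvItem is a stand-in for DatasetItemLike,
# covering only the attributes generation.py actually touches.
from dataclasses import dataclass, field
from pathlib import Path

from valediction.dictionary.generation import Generator


@dataclass
class CsvItem:
    name: str          # table name; normalised via _normalise_name
    data: Path         # CSV path (is_path=True) or a pandas DataFrame
    column_count: int  # used to size the progress bar
    is_path: bool = True
    _padding: int = 0  # progress-bar alignment padding
    _dictionary_runtimes: dict = field(default_factory=dict)


items = [CsvItem(name="DEMOGRAPHICS", data=Path("DEMOGRAPHICS.csv"), column_count=5)]

generator = Generator(feedback=True, chunk_size=1_000_000)  # 1M-row chunks
dictionary = generator.generate_dictionary(
    items,
    dictionary_name="demo",
    primary_keys={"DEMOGRAPHICS": ["PATIENT_ID"]},
)
```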
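One detail worth calling out in `_apply_state_to_existing_columns`: across chunks, column metadata only ever widens. A TEXT column's length is the maximum observed so far and never shrinks, while non-TEXT columns carry no length. A self-contained restatement of that rule (plain Python, no valediction imports; `widen_length` is an illustrative name, not package API):

```python
def widen_length(existing: int | None, observed: int | None, is_text: bool) -> int | None:
    # Mirrors the update in _apply_state_to_existing_columns: TEXT lengths
    # are monotonically non-decreasing across chunks; other types carry none.
    if not is_text:
        return None
    if observed is not None and (existing or 0) < observed:
        return int(observed)
    return existing


assert widen_length(None, 12, is_text=True) == 12   # first chunk sets it
assert widen_length(12, 8, is_text=True) == 12      # shorter values never shrink it
assert widen_length(12, 30, is_text=True) == 30     # longer values widen it
assert widen_length(12, 30, is_text=False) is None  # non-TEXT drops length
```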