tabcaddy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabcaddy/__init__.py +3 -0
- tabcaddy/__main__.py +5 -0
- tabcaddy/application/compile_dataset.py +75 -0
- tabcaddy/application/diff_datasets.py +24 -0
- tabcaddy/application/generate_analysis.py +32 -0
- tabcaddy/application/scaffold_transform.py +45 -0
- tabcaddy/application/transform_dataset.py +101 -0
- tabcaddy/cli/app.py +116 -0
- tabcaddy/domain/__init__.py +25 -0
- tabcaddy/domain/models.py +89 -0
- tabcaddy/domain/serialization.py +132 -0
- tabcaddy/infrastructure/analysis_builder.py +282 -0
- tabcaddy/infrastructure/cache_manager.py +73 -0
- tabcaddy/infrastructure/compiled_dataset_differ.py +22 -0
- tabcaddy/infrastructure/csv_reader.py +13 -0
- tabcaddy/infrastructure/csv_writer.py +10 -0
- tabcaddy/infrastructure/diff_support.py +104 -0
- tabcaddy/infrastructure/feather_reader.py +13 -0
- tabcaddy/infrastructure/feather_writer.py +10 -0
- tabcaddy/infrastructure/file_differ.py +19 -0
- tabcaddy/infrastructure/folder_differ.py +46 -0
- tabcaddy/infrastructure/metadata_builder.py +28 -0
- tabcaddy/infrastructure/parquet_dataset_reader.py +28 -0
- tabcaddy/infrastructure/parquet_dataset_writer.py +19 -0
- tabcaddy/infrastructure/schema_analyzer.py +118 -0
- tabcaddy/infrastructure/source_resolver.py +46 -0
- tabcaddy/infrastructure/transform_loader.py +51 -0
- tabcaddy/rendering/charts/bar_chart.py +15 -0
- tabcaddy/rendering/charts/line_chart.py +11 -0
- tabcaddy/rendering/console.py +19 -0
- tabcaddy/rendering/views/diff.py +28 -0
- tabcaddy/rendering/views/schema.py +68 -0
- tabcaddy/rendering/views/summary.py +102 -0
- tabcaddy-0.1.0.dist-info/METADATA +65 -0
- tabcaddy-0.1.0.dist-info/RECORD +38 -0
- tabcaddy-0.1.0.dist-info/WHEEL +4 -0
- tabcaddy-0.1.0.dist-info/entry_points.txt +2 -0
- tabcaddy-0.1.0.dist-info/licenses/LICENSE +174 -0
tabcaddy/__init__.py
ADDED
tabcaddy/__main__.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tabcaddy.domain.models import DatasetSource, ProfileMode, SourceType
|
|
7
|
+
from tabcaddy.domain.serialization import analysis_to_dict
|
|
8
|
+
from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
|
|
9
|
+
from tabcaddy.infrastructure.csv_reader import read_csv
|
|
10
|
+
from tabcaddy.infrastructure.feather_reader import read_feather
|
|
11
|
+
from tabcaddy.infrastructure.parquet_dataset_writer import write_parquet_dataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _read_dataframe(path: Path):
    """Load one source file, picking the reader from the file extension.

    A ``.csv`` suffix (compared case-insensitively) is read with ``read_csv``;
    every other suffix is handed to ``read_feather``.
    """
    reader = read_csv if path.suffix.lower() == ".csv" else read_feather
    return reader(path)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CompileDataset:
    """Compile the files that share one schema in a folder into a Parquet dataset."""

    def __init__(self, analysis_builder: AnalysisBuilder | None = None) -> None:
        """Use the given analysis builder, or a default ``AnalysisBuilder``."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()

    @staticmethod
    def _resolve_schema_index(schema_index: int | None, schema_count: int) -> int:
        """Return the validated 1-based schema index.

        ``None`` selects schema 1. The explicit ``is None`` test matters:
        ``schema_index or 1`` would silently turn an explicit ``0`` into ``1``
        instead of rejecting it as out of range.

        Raises:
            ValueError: If the index is outside ``1..schema_count``.
        """
        chosen_index = 1 if schema_index is None else schema_index
        if not 1 <= chosen_index <= schema_count:
            raise ValueError(f"Schema index must be between 1 and {schema_count}")
        return chosen_index

    def run(
        self, source: DatasetSource, output_path: Path, schema_index: int | None = None
    ) -> tuple[Path, list[str]]:
        """Write the selected schema's files and a ``metadata.json`` to ``output_path``.

        Args:
            source: Dataset source; must be of type ``SourceType.FOLDER``.
            output_path: Directory to write into; created if missing.
            schema_index: 1-based index into the detected schemas. May be
                omitted only when exactly one schema was detected.

        Returns:
            ``output_path`` and the POSIX-style relative paths of the files
            whose schema hash differs from the selected schema.

        Raises:
            ValueError: If the source is not a folder, no schema was found,
                several schemas exist and no index was given, or the index is
                out of range.
        """
        if source.source_type != SourceType.FOLDER:
            raise ValueError("Compile expects a folder source.")
        build_result = self._analysis_builder.build(source, ProfileMode.STANDARD)
        schemas = build_result.analysis.schemas
        if not schemas:
            raise ValueError("No schemas found to compile.")
        if len(schemas) > 1 and schema_index is None:
            labels = [
                f"Schema {index} ({schema.occurrence_count} files)"
                for index, schema in enumerate(schemas, start=1)
            ]
            raise ValueError(
                "Multiple schemas detected. Re-run with --schema. Available: "
                + ", ".join(labels)
            )
        chosen_index = self._resolve_schema_index(schema_index, len(schemas))
        selected_schema = schemas[chosen_index - 1]
        selected_files = [
            record.path
            for record in build_result.files
            if record.schema_hash == selected_schema.hash
        ]
        output_path.mkdir(parents=True, exist_ok=True)
        # The frames are passed as a generator, so they are read one at a time
        # as the writer consumes them.
        written = write_parquet_dataset(
            (_read_dataframe(path) for path in selected_files), output_path
        )
        # Re-analyse only the selected files with the DEEP profile; that
        # analysis becomes the dataset's stored metadata.
        selected_analysis = self._analysis_builder.build_file_set(
            files=selected_files,
            base_path=source.path,
            source_type=SourceType.FOLDER,
            profile_mode=ProfileMode.DEEP,
        ).analysis
        payload = analysis_to_dict(selected_analysis)
        payload["compiled"] = {
            "source": str(source.path),
            "selected_schema_hash": selected_schema.hash,
            "written_parts": [str(path.relative_to(output_path)) for path in written],
        }
        (output_path / "metadata.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )
        skipped = [
            record.relative_path.as_posix()
            for record in build_result.files
            if record.schema_hash != selected_schema.hash
        ]
        return output_path, skipped
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from tabcaddy.domain.models import DatasetSource, DiffLevel, DiffReport, SourceType
|
|
4
|
+
from tabcaddy.infrastructure.compiled_dataset_differ import CompiledDatasetDiffer
|
|
5
|
+
from tabcaddy.infrastructure.file_differ import FileDiffer
|
|
6
|
+
from tabcaddy.infrastructure.folder_differ import FolderDiffer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DiffDatasets:
    """Pick a differ from the two source types and delegate the comparison to it."""

    def __init__(self, generate_analysis) -> None:
        """Create one differ per source kind, all sharing ``generate_analysis``."""
        self._file_differ = FileDiffer(generate_analysis)
        self._folder_differ = FolderDiffer(generate_analysis)
        self._compiled_differ = CompiledDatasetDiffer(generate_analysis)

    def run(
        self, left: DatasetSource, right: DatasetSource, level: DiffLevel
    ) -> DiffReport:
        """Diff ``left`` against ``right`` at the requested ``level``.

        Two folders use the folder differ and two compiled datasets use the
        compiled-dataset differ. Every other pairing, including two files and
        mixed source types, goes through the file differ.
        """
        left_kind = left.source_type
        right_kind = right.source_type
        if left_kind == right_kind == SourceType.FOLDER:
            differ = self._folder_differ
        elif left_kind == right_kind == SourceType.COMPILED_DATASET:
            differ = self._compiled_differ
        else:
            differ = self._file_differ
        return differ.diff(left, right, level)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from tabcaddy.domain.models import (
|
|
4
|
+
DatasetAnalysis,
|
|
5
|
+
DatasetSource,
|
|
6
|
+
ProfileMode,
|
|
7
|
+
SourceType,
|
|
8
|
+
)
|
|
9
|
+
from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
|
|
10
|
+
from tabcaddy.infrastructure.cache_manager import CacheManager
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class GenerateAnalysis:
    """Produce a ``DatasetAnalysis`` for a source, reusing stored results when available."""

    def __init__(
        self,
        analysis_builder: AnalysisBuilder | None = None,
        cache_manager: CacheManager | None = None,
    ) -> None:
        """Keep the given collaborators, falling back to default instances."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()
        self._cache_manager = cache_manager or CacheManager()

    def run(self, source: DatasetSource, profile_mode: ProfileMode) -> DatasetAnalysis:
        """Return the analysis for ``source`` under ``profile_mode``.

        Lookup order: the analysis stored with a compiled dataset (compiled
        sources only), then the cache, then a fresh build whose result is
        written back to the cache.
        """
        if source.source_type == SourceType.COMPILED_DATASET:
            stored = self._analysis_builder.load_compiled_analysis(source)
            if stored is not None:
                return stored
        result = self._cache_manager.get(source, profile_mode)
        if result is None:
            result = self._analysis_builder.build(source, profile_mode).analysis
            self._cache_manager.set(source, profile_mode, result)
        return result
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from tabcaddy.domain.models import DatasetSource, ProfileMode
|
|
6
|
+
from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ScaffoldTransform:
    """Write a starter transform module annotated with the schemas seen in a source."""

    def __init__(self, analysis_builder: AnalysisBuilder | None = None) -> None:
        """Use the given analysis builder, or a default ``AnalysisBuilder``."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()

    def run(self, source: DatasetSource, output_path: Path) -> Path:
        """Generate the scaffold file and return its path.

        The file lists every detected schema (file count, hash, and columns)
        as comments, followed by a ``transform`` function that returns its
        input unchanged.

        Args:
            source: Dataset source to analyse with the STANDARD profile.
            output_path: File to write; missing parent directories are created.

        Returns:
            ``output_path``.
        """
        analysis = self._analysis_builder.build(source, ProfileMode.STANDARD).analysis
        lines = [
            '"""TabCaddy transform scaffold."""',
            "",
            "import polars as pl",
            "",
            "",
            "# Observed schemas",
        ]
        for index, schema in enumerate(analysis.schemas, start=1):
            lines.append(
                f"# Schema {index}: {schema.occurrence_count} files, hash={schema.hash}"
            )
            lines.extend(
                f"# - {column.name}: {column.dtype}" for column in schema.columns
            )
        # The function body of the emitted template uses a four-space indent.
        lines.extend(
            [
                "",
                "def transform(df: pl.DataFrame, context=None) -> pl.DataFrame:",
                "    # Example: rename a column",
                "    # if 'old_name' in df.columns:",
                "    #     df = df.rename({'old_name': 'new_name'})",
                "",
                "    # Example: filter rows",
                "    # df = df.filter(pl.col('quantity') > 0)",
                "",
                "    return df",
                "",
            ]
        )
        # Create the parent directory so an --output path inside a new folder
        # does not fail with FileNotFoundError.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text("\n".join(lines), encoding="utf-8")
        return output_path
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from tabcaddy.domain.models import DatasetSource, SourceType
|
|
9
|
+
from tabcaddy.infrastructure.csv_reader import read_csv
|
|
10
|
+
from tabcaddy.infrastructure.csv_writer import write_csv
|
|
11
|
+
from tabcaddy.infrastructure.feather_reader import read_feather
|
|
12
|
+
from tabcaddy.infrastructure.feather_writer import write_feather
|
|
13
|
+
from tabcaddy.infrastructure.schema_analyzer import SchemaAnalyzer
|
|
14
|
+
from tabcaddy.infrastructure.source_resolver import iter_dataset_files
|
|
15
|
+
from tabcaddy.infrastructure.transform_loader import (
|
|
16
|
+
TransformContext,
|
|
17
|
+
TransformLoader,
|
|
18
|
+
TransformMetadata,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _read_dataframe(path: Path) -> pl.DataFrame:
    """Read ``path`` with the reader that matches its extension.

    A ``.csv`` suffix (compared case-insensitively) uses ``read_csv``; any
    other suffix uses ``read_feather``.
    """
    reader = read_csv if path.suffix.lower() == ".csv" else read_feather
    return reader(path)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _write_dataframe(df: pl.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` with the writer that matches its extension.

    A ``.csv`` suffix (compared case-insensitively) uses ``write_csv``; any
    other suffix uses ``write_feather``.
    """
    writer = write_csv if path.suffix.lower() == ".csv" else write_feather
    writer(df, path)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TransformDataset:
    """Apply a user-supplied transform to every file of a file or folder source."""

    def __init__(
        self,
        transform_loader: TransformLoader | None = None,
        schema_analyzer: SchemaAnalyzer | None = None,
    ) -> None:
        """Keep the given collaborators, falling back to default instances."""
        self._transform_loader = transform_loader or TransformLoader()
        self._schema_analyzer = schema_analyzer or SchemaAnalyzer()

    def run(
        self,
        source: DatasetSource,
        transform_path: Path,
        output_path: Path | None,
        workers: int,
    ) -> Path:
        """Transform each source file and write the results under one output root.

        Args:
            source: File or folder source; compiled datasets are rejected.
            transform_path: Location handed to the transform loader.
            output_path: Output directory, or ``None`` for a
                ``<name>_transformed`` directory next to the input.
            workers: Files are processed sequentially when this is ``<= 1``,
                otherwise on a thread pool with this many workers.

        Returns:
            The output directory.

        Raises:
            ValueError: If ``source`` is a compiled dataset.
            TypeError: If the transform returns something other than a Polars
                DataFrame.
        """
        if source.source_type == SourceType.COMPILED_DATASET:
            raise ValueError(
                "Transform currently supports files and folders, not compiled datasets."
            )
        output_root = output_path or self._default_output_path(source.path)
        output_root.mkdir(parents=True, exist_ok=True)
        transform, expects_context = self._transform_loader.load(transform_path)
        # Materialise once: the paths are consumed by the schema analysis and
        # again by the processing loop, so a one-shot iterator would leave the
        # second pass empty.
        files = list(iter_dataset_files(source))
        schema_result = self._schema_analyzer.analyze_files(
            files, base_path=source.path, source_type=source.source_type
        )
        record_map = {record.path: record for record in schema_result.files}

        def process(path: Path) -> None:
            """Read, transform, and write a single file."""
            record = record_map[path]
            df = _read_dataframe(path)
            context = TransformContext(
                file_name=path.name,
                file_path=str(path),
                schema=[
                    {"name": column.name, "dtype": column.dtype}
                    for column in record.columns
                ],
                metadata=TransformMetadata(
                    row_count=record.row_count, schema_hash=record.schema_hash
                ),
            )
            result = transform(df, context) if expects_context else transform(df)
            if not isinstance(result, pl.DataFrame):
                raise TypeError(
                    f"Transform must return a Polars DataFrame for {path.name}"
                )
            relative_path = (
                record.relative_path
                if source.source_type != SourceType.FILE
                else Path(path.name)
            )
            destination = output_root / relative_path
            # A relative path may contain sub-folders that do not exist under
            # the output root yet; exist_ok keeps concurrent workers from
            # failing when they create the same folder.
            destination.parent.mkdir(parents=True, exist_ok=True)
            _write_dataframe(result, destination)

        if workers <= 1:
            for path in files:
                process(path)
        else:
            with ThreadPoolExecutor(max_workers=workers) as executor:
                # list() drains the lazy map so worker exceptions surface here.
                list(executor.map(process, files))
        return output_root

    def _default_output_path(self, input_path: Path) -> Path:
        """Return a sibling ``<name>_transformed`` path for the input.

        A directory keeps its full name; anything else uses the file stem,
        i.e. the name without its final suffix.
        """
        if input_path.is_dir():
            return input_path.parent / f"{input_path.name}_transformed"
        return input_path.parent / f"{input_path.stem}_transformed"
|
tabcaddy/cli/app.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from tabcaddy.application.compile_dataset import CompileDataset
|
|
8
|
+
from tabcaddy.application.diff_datasets import DiffDatasets
|
|
9
|
+
from tabcaddy.application.generate_analysis import GenerateAnalysis
|
|
10
|
+
from tabcaddy.application.scaffold_transform import ScaffoldTransform
|
|
11
|
+
from tabcaddy.application.transform_dataset import TransformDataset
|
|
12
|
+
from tabcaddy.domain.models import DiffLevel
|
|
13
|
+
from tabcaddy.domain.models import ProfileMode
|
|
14
|
+
from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
|
|
15
|
+
from tabcaddy.infrastructure.source_resolver import resolve_source
|
|
16
|
+
from tabcaddy.rendering.console import create_console
|
|
17
|
+
from tabcaddy.rendering.views.diff import build_diff_view
|
|
18
|
+
from tabcaddy.rendering.views.schema import build_schema_view
|
|
19
|
+
from tabcaddy.rendering.views.summary import build_summary_view
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Typer application object; the command and callback decorators below use it.
app = typer.Typer(
    help="Explore, compile, transform, and compare datasets.",
    no_args_is_help=True,
    add_completion=False,
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@app.callback()
def root() -> None:
    """TabCaddy command line interface."""
    # No body: the callback does no work of its own.
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@app.command()
def summary(
    source: Path,
    profile: ProfileMode = typer.Option(ProfileMode.STANDARD, "--profile"),
) -> None:
    # Analyse the source under the requested profile and print the summary view.
    console = create_console()
    generator = GenerateAnalysis()
    analysis = generator.run(resolve_source(source), profile)
    view = build_summary_view(analysis)
    console.print(view)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@app.command()
def schema(
    source: Path,
    profile: ProfileMode = typer.Option(ProfileMode.STANDARD, "--profile"),
) -> None:
    # Build the analysis directly, because the schema view also needs the
    # per-file records that come with the build result.
    console = create_console()
    builder = AnalysisBuilder()
    scan = builder.build(resolve_source(source), profile)
    view = build_schema_view(scan.analysis, scan.files)
    console.print(view)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@app.command()
def compile(
    folder: Path,
    output: Path = typer.Option(Path("compiled_dataset"), "--output"),
    schema: int | None = typer.Option(None, "--schema"),
    interactive: bool = typer.Option(False, "--interactive"),
) -> None:
    # Compile one schema of a folder into a dataset directory. With
    # --interactive and no --schema, a QUICK preview decides whether the user
    # must be asked which schema to use.
    console = create_console()
    source = resolve_source(folder)
    schema_index = schema
    if schema_index is None and interactive:
        preview = AnalysisBuilder().build(source, ProfileMode.QUICK)
        schema_count = len(preview.analysis.schemas)
        if schema_count > 1:
            schema_index = typer.prompt(
                "Multiple schemas detected. Choose schema number", type=int
            )
    output_path, skipped = CompileDataset().run(source, output, schema_index)
    console.print(f"Compiled dataset written to [green]{output_path}[/green]")
    if not skipped:
        return
    console.print(
        f"Skipped {len(skipped)} files from non-selected schemas.", style="yellow"
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@app.command()
def transform(
    input_path: Path,
    transform_path: Path,
    output_path: Path | None = typer.Argument(None),
    workers: int = typer.Option(1, "--workers", min=1),
) -> None:
    # Run the transform use case and report where the results were written.
    console = create_console()
    source = resolve_source(input_path)
    use_case = TransformDataset()
    destination = use_case.run(
        source=source,
        transform_path=transform_path,
        output_path=output_path,
        workers=workers,
    )
    message = f"Transformed files written to [green]{destination}[/green]"
    console.print(message)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@app.command("scaffold-transform")
def scaffold_transform(
    source: Path,
    output: Path = typer.Option(Path("transform_template.py"), "--output"),
) -> None:
    # Generate a transform template for the source and report its location.
    console = create_console()
    use_case = ScaffoldTransform()
    destination = use_case.run(resolve_source(source), output)
    message = f"Transform scaffold written to [green]{destination}[/green]"
    console.print(message)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@app.command()
def diff(
    left: Path,
    right: Path,
    level: DiffLevel = typer.Option(DiffLevel.FULL, "--level"),
) -> None:
    # Compare two sources and print the diff view; the differs all share a
    # single GenerateAnalysis instance.
    console = create_console()
    generator = GenerateAnalysis()
    differ = DiffDatasets(generator)
    left_source = resolve_source(left)
    right_source = resolve_source(right)
    report = differ.run(left_source, right_source, level)
    console.print(build_diff_view(report))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def main() -> None:
    """Invoke the Typer application."""
    app()
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from tabcaddy.domain.models import ColumnDefinition
|
|
2
|
+
from tabcaddy.domain.models import ColumnStatistics
|
|
3
|
+
from tabcaddy.domain.models import DatasetAnalysis
|
|
4
|
+
from tabcaddy.domain.models import DatasetMetadata
|
|
5
|
+
from tabcaddy.domain.models import DatasetSource
|
|
6
|
+
from tabcaddy.domain.models import DatasetStatistics
|
|
7
|
+
from tabcaddy.domain.models import DiffLevel
|
|
8
|
+
from tabcaddy.domain.models import DiffReport
|
|
9
|
+
from tabcaddy.domain.models import ProfileMode
|
|
10
|
+
from tabcaddy.domain.models import SchemaSignature
|
|
11
|
+
from tabcaddy.domain.models import SourceType
|
|
12
|
+
|
|
13
|
+
# Public names of the package: exactly the model classes imported above.
__all__ = [
    "ColumnDefinition", "ColumnStatistics",
    "DatasetAnalysis", "DatasetMetadata", "DatasetSource", "DatasetStatistics",
    "DiffLevel", "DiffReport",
    "ProfileMode",
    "SchemaSignature", "SourceType",
]
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SourceType(str, Enum):
    """Kinds of dataset source; members are also plain ``str`` values."""

    FILE = "file"
    FOLDER = "folder"
    COMPILED_DATASET = "compiled_dataset"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ProfileMode(str, Enum):
    """Profiling modes; members are also plain ``str`` values."""

    QUICK = "quick"
    STANDARD = "standard"
    DEEP = "deep"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DiffLevel(str, Enum):
    """Levels selectable for a diff; members are also plain ``str`` values."""

    METADATA = "metadata"
    STATISTICS = "statistics"
    FULL = "full"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class DatasetSource:
    """Immutable pairing of a filesystem path with its source type."""

    path: Path  # location of the source
    source_type: SourceType  # which kind of source ``path`` is
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
class ColumnDefinition:
    """Immutable column descriptor: a name and a dtype string."""

    name: str  # column name
    dtype: str  # dtype, stored as text
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class SchemaSignature:
    """A schema: its columns, a hash string, and an occurrence count."""

    columns: list[ColumnDefinition]  # column definitions of the schema
    hash: str  # hash string identifying the schema
    occurrence_count: int  # reported as a number of files in user-facing messages
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class DatasetMetadata:
    """Summary fields describing an analysed dataset."""

    version: int
    created_at: datetime
    row_count: int
    column_count: int
    source_file_count: int
    schema_hash: str | None  # may be absent
    column_hashes: dict[str, str] | None  # may be absent
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class ColumnStatistics:
    """Statistics recorded for one column; only ``histogram`` has a default."""

    dtype: str
    null_rate: float
    unique_estimate: int | None
    min_value: Any | None
    max_value: Any | None
    mean: float | None
    median: float | None
    stddev: float | None
    histogram: list[tuple[str, int]] | None = None  # (label, count) pairs
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class DatasetStatistics:
    """Container for per-column statistics."""

    columns: dict[str, ColumnStatistics]  # keyed by column name
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class DatasetAnalysis:
    """Result of analysing a dataset: metadata, schemas, optional statistics, warnings."""

    metadata: DatasetMetadata
    schemas: list[SchemaSignature]
    statistics: DatasetStatistics | None  # may be absent
    warnings: list[str] = field(default_factory=list)  # fresh list per instance
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class DiffReport:
    """Change messages of a diff, grouped by category, plus warnings."""

    metadata_changes: list[str]
    schema_changes: list[str]
    statistics_changes: list[str]
    warnings: list[str] = field(default_factory=list)  # fresh list per instance
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import date, datetime, time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from tabcaddy.domain.models import (
|
|
7
|
+
ColumnDefinition,
|
|
8
|
+
ColumnStatistics,
|
|
9
|
+
DatasetAnalysis,
|
|
10
|
+
DatasetMetadata,
|
|
11
|
+
DatasetStatistics,
|
|
12
|
+
DiffReport,
|
|
13
|
+
SchemaSignature,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _serialize_value(value: Any) -> Any:
    """Recursively replace temporal values with their ISO-8601 strings.

    ``datetime``, ``date`` and ``time`` objects become ``isoformat()`` strings.
    Lists and tuples are converted element by element into a new list, and
    dict values are converted under their original keys. Tuples are included
    so that temporal values nested in a tuple are not passed through
    unconverted. Any other value is returned unchanged.
    """
    if isinstance(value, (datetime, date, time)):
        return value.isoformat()
    if isinstance(value, (list, tuple)):
        return [_serialize_value(item) for item in value]
    if isinstance(value, dict):
        return {key: _serialize_value(item) for key, item in value.items()}
    return value
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def analysis_to_dict(analysis: DatasetAnalysis) -> dict[str, Any]:
    """Convert ``analysis`` into a nested mapping of plain values.

    The result has the keys ``metadata``, ``schemas``, ``statistics`` (``None``
    when the analysis has no statistics) and ``warnings``. ``created_at`` is
    stored as an ISO string, column min/max values go through
    ``_serialize_value``, and histogram pairs become ``label``/``count`` dicts.
    """
    meta = analysis.metadata
    metadata_section = {
        "version": meta.version,
        "created_at": meta.created_at.isoformat(),
        "row_count": meta.row_count,
        "column_count": meta.column_count,
        "source_file_count": meta.source_file_count,
        "schema_hash": meta.schema_hash,
        "column_hashes": meta.column_hashes,
    }

    schema_section = []
    for signature in analysis.schemas:
        column_entries = [
            {"name": column.name, "dtype": column.dtype}
            for column in signature.columns
        ]
        schema_section.append(
            {
                "columns": column_entries,
                "hash": signature.hash,
                "occurrence_count": signature.occurrence_count,
            }
        )

    if analysis.statistics is None:
        statistics_section = None
    else:
        per_column = {}
        for name, stats in analysis.statistics.columns.items():
            if stats.histogram is None:
                buckets = None
            else:
                buckets = [
                    {"label": label, "count": count}
                    for label, count in stats.histogram
                ]
            per_column[name] = {
                "dtype": stats.dtype,
                "null_rate": stats.null_rate,
                "unique_estimate": stats.unique_estimate,
                "min_value": _serialize_value(stats.min_value),
                "max_value": _serialize_value(stats.max_value),
                "mean": stats.mean,
                "median": stats.median,
                "stddev": stats.stddev,
                "histogram": buckets,
            }
        statistics_section = {"columns": per_column}

    return {
        "metadata": metadata_section,
        "schemas": schema_section,
        "statistics": statistics_section,
        "warnings": list(analysis.warnings),
    }
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def analysis_from_dict(payload: dict[str, Any]) -> DatasetAnalysis:
    """Rebuild a ``DatasetAnalysis`` from the mapping layout of ``analysis_to_dict``.

    ``metadata`` and its ``version``, ``created_at``, ``row_count``,
    ``column_count`` and ``source_file_count`` entries are required. Missing
    ``schemas`` and ``warnings`` become empty lists, and a missing or ``None``
    ``statistics`` entry yields ``statistics=None``. Column min/max values are
    taken as stored, with no conversion back from strings.

    Raises:
        KeyError: If a required key is missing.
    """
    raw_metadata = payload["metadata"]
    raw_statistics = payload.get("statistics")

    metadata = DatasetMetadata(
        version=raw_metadata["version"],
        created_at=datetime.fromisoformat(raw_metadata["created_at"]),
        row_count=raw_metadata["row_count"],
        column_count=raw_metadata["column_count"],
        source_file_count=raw_metadata["source_file_count"],
        schema_hash=raw_metadata.get("schema_hash"),
        column_hashes=raw_metadata.get("column_hashes"),
    )

    signatures = []
    for raw_schema in payload.get("schemas", []):
        definitions = [
            ColumnDefinition(name=raw_column["name"], dtype=raw_column["dtype"])
            for raw_column in raw_schema["columns"]
        ]
        signatures.append(
            SchemaSignature(
                columns=definitions,
                hash=raw_schema["hash"],
                occurrence_count=raw_schema["occurrence_count"],
            )
        )

    if raw_statistics is None:
        statistics = None
    else:
        per_column = {}
        for name, raw in raw_statistics.get("columns", {}).items():
            raw_histogram = raw.get("histogram")
            if raw_histogram is None:
                histogram = None
            else:
                histogram = [
                    (entry["label"], entry["count"]) for entry in raw_histogram
                ]
            per_column[name] = ColumnStatistics(
                dtype=raw["dtype"],
                null_rate=raw["null_rate"],
                unique_estimate=raw.get("unique_estimate"),
                min_value=raw.get("min_value"),
                max_value=raw.get("max_value"),
                mean=raw.get("mean"),
                median=raw.get("median"),
                stddev=raw.get("stddev"),
                histogram=histogram,
            )
        statistics = DatasetStatistics(columns=per_column)

    return DatasetAnalysis(
        metadata=metadata,
        schemas=signatures,
        statistics=statistics,
        warnings=list(payload.get("warnings", [])),
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def diff_report_to_dict(report: DiffReport) -> dict[str, Any]:
    """Return the report's four change lists as a dict of fresh list copies."""
    field_names = (
        "metadata_changes",
        "schema_changes",
        "statistics_changes",
        "warnings",
    )
    return {name: list(getattr(report, name)) for name in field_names}
|