tabcaddy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. tabcaddy/__init__.py +3 -0
  2. tabcaddy/__main__.py +5 -0
  3. tabcaddy/application/compile_dataset.py +75 -0
  4. tabcaddy/application/diff_datasets.py +24 -0
  5. tabcaddy/application/generate_analysis.py +32 -0
  6. tabcaddy/application/scaffold_transform.py +45 -0
  7. tabcaddy/application/transform_dataset.py +101 -0
  8. tabcaddy/cli/app.py +116 -0
  9. tabcaddy/domain/__init__.py +25 -0
  10. tabcaddy/domain/models.py +89 -0
  11. tabcaddy/domain/serialization.py +132 -0
  12. tabcaddy/infrastructure/analysis_builder.py +282 -0
  13. tabcaddy/infrastructure/cache_manager.py +73 -0
  14. tabcaddy/infrastructure/compiled_dataset_differ.py +22 -0
  15. tabcaddy/infrastructure/csv_reader.py +13 -0
  16. tabcaddy/infrastructure/csv_writer.py +10 -0
  17. tabcaddy/infrastructure/diff_support.py +104 -0
  18. tabcaddy/infrastructure/feather_reader.py +13 -0
  19. tabcaddy/infrastructure/feather_writer.py +10 -0
  20. tabcaddy/infrastructure/file_differ.py +19 -0
  21. tabcaddy/infrastructure/folder_differ.py +46 -0
  22. tabcaddy/infrastructure/metadata_builder.py +28 -0
  23. tabcaddy/infrastructure/parquet_dataset_reader.py +28 -0
  24. tabcaddy/infrastructure/parquet_dataset_writer.py +19 -0
  25. tabcaddy/infrastructure/schema_analyzer.py +118 -0
  26. tabcaddy/infrastructure/source_resolver.py +46 -0
  27. tabcaddy/infrastructure/transform_loader.py +51 -0
  28. tabcaddy/rendering/charts/bar_chart.py +15 -0
  29. tabcaddy/rendering/charts/line_chart.py +11 -0
  30. tabcaddy/rendering/console.py +19 -0
  31. tabcaddy/rendering/views/diff.py +28 -0
  32. tabcaddy/rendering/views/schema.py +68 -0
  33. tabcaddy/rendering/views/summary.py +102 -0
  34. tabcaddy-0.1.0.dist-info/METADATA +65 -0
  35. tabcaddy-0.1.0.dist-info/RECORD +38 -0
  36. tabcaddy-0.1.0.dist-info/WHEEL +4 -0
  37. tabcaddy-0.1.0.dist-info/entry_points.txt +2 -0
  38. tabcaddy-0.1.0.dist-info/licenses/LICENSE +174 -0
tabcaddy/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from tabcaddy.cli.app import app
2
+
3
+ __all__ = ["app"]
tabcaddy/__main__.py ADDED
@@ -0,0 +1,5 @@
1
+ from tabcaddy.cli.app import main
2
+
3
+
4
# Run the CLI only when the package is executed as a script (``python -m tabcaddy``),
# not when this module is merely imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from tabcaddy.domain.models import DatasetSource, ProfileMode, SourceType
7
+ from tabcaddy.domain.serialization import analysis_to_dict
8
+ from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
9
+ from tabcaddy.infrastructure.csv_reader import read_csv
10
+ from tabcaddy.infrastructure.feather_reader import read_feather
11
+ from tabcaddy.infrastructure.parquet_dataset_writer import write_parquet_dataset
12
+
13
+
14
def _read_dataframe(path: Path):
    """Load *path* with the reader that matches its file suffix.

    A ``.csv`` suffix (compared case-insensitively) selects ``read_csv``;
    every other suffix is handed to ``read_feather``.
    """
    loader = read_csv if path.suffix.lower() == ".csv" else read_feather
    return loader(path)
18
+
19
+
20
class CompileDataset:
    """Compile the files of one schema from a folder source into a Parquet dataset.

    The folder is analysed, one schema is selected, the files carrying that
    schema are written out through ``write_parquet_dataset``, and a
    ``metadata.json`` describing the selection is stored next to the output.
    """

    def __init__(self, analysis_builder: AnalysisBuilder | None = None) -> None:
        """Use *analysis_builder* when given, otherwise a default ``AnalysisBuilder``."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()

    @staticmethod
    def _resolve_schema_index(schema_index: int | None, schema_count: int) -> int:
        """Return the validated 1-based schema index, defaulting to 1 when omitted.

        Raises:
            ValueError: if the index lies outside ``1..schema_count``.
        """
        # Compare against None explicitly: ``schema_index or 1`` would silently
        # turn an explicit (and invalid) 0 into schema 1.
        chosen_index = 1 if schema_index is None else schema_index
        if chosen_index < 1 or chosen_index > schema_count:
            raise ValueError(f"Schema index must be between 1 and {schema_count}")
        return chosen_index

    def run(
        self, source: DatasetSource, output_path: Path, schema_index: int | None = None
    ) -> tuple[Path, list[str]]:
        """Compile *source* into *output_path*.

        Args:
            source: Dataset source; must be of type ``SourceType.FOLDER``.
            output_path: Directory receiving the Parquet parts and ``metadata.json``;
                created if missing.
            schema_index: 1-based index of the schema to compile. May be omitted
                only when the folder contains a single schema.

        Returns:
            ``(output_path, skipped)`` where *skipped* lists the POSIX-style
            relative paths of files whose schema was not selected.

        Raises:
            ValueError: if the source is not a folder, no schema is found,
                several schemas exist without an explicit *schema_index*, or
                the index is out of range.
        """
        if source.source_type != SourceType.FOLDER:
            raise ValueError("Compile expects a folder source.")
        build_result = self._analysis_builder.build(source, ProfileMode.STANDARD)
        schemas = build_result.analysis.schemas
        if not schemas:
            raise ValueError("No schemas found to compile.")
        if len(schemas) > 1 and schema_index is None:
            labels = [
                f"Schema {index} ({schema.occurrence_count} files)"
                for index, schema in enumerate(schemas, start=1)
            ]
            raise ValueError(
                "Multiple schemas detected. Re-run with --schema. Available: "
                + ", ".join(labels)
            )
        chosen_index = self._resolve_schema_index(schema_index, len(schemas))
        selected_schema = schemas[chosen_index - 1]
        selected_files = [
            record.path
            for record in build_result.files
            if record.schema_hash == selected_schema.hash
        ]
        output_path.mkdir(parents=True, exist_ok=True)
        # A generator is passed so frames are read one at a time as the writer
        # consumes them.
        written = write_parquet_dataset(
            (_read_dataframe(path) for path in selected_files), output_path
        )
        # Re-analyse only the selected files with the DEEP profile for the
        # metadata stored alongside the compiled output.
        selected_analysis = self._analysis_builder.build_file_set(
            files=selected_files,
            base_path=source.path,
            source_type=SourceType.FOLDER,
            profile_mode=ProfileMode.DEEP,
        ).analysis
        payload = analysis_to_dict(selected_analysis)
        payload["compiled"] = {
            "source": str(source.path),
            "selected_schema_hash": selected_schema.hash,
            "written_parts": [str(path.relative_to(output_path)) for path in written],
        }
        (output_path / "metadata.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )
        skipped = [
            record.relative_path.as_posix()
            for record in build_result.files
            if record.schema_hash != selected_schema.hash
        ]
        return output_path, skipped
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from tabcaddy.domain.models import DatasetSource, DiffLevel, DiffReport, SourceType
4
+ from tabcaddy.infrastructure.compiled_dataset_differ import CompiledDatasetDiffer
5
+ from tabcaddy.infrastructure.file_differ import FileDiffer
6
+ from tabcaddy.infrastructure.folder_differ import FolderDiffer
7
+
8
+
9
class DiffDatasets:
    """Route a comparison of two dataset sources to the matching differ."""

    def __init__(self, generate_analysis) -> None:
        """Create the file, folder and compiled-dataset differs around *generate_analysis*."""
        self._file_differ = FileDiffer(generate_analysis)
        self._folder_differ = FolderDiffer(generate_analysis)
        self._compiled_differ = CompiledDatasetDiffer(generate_analysis)

    def run(
        self, left: DatasetSource, right: DatasetSource, level: DiffLevel
    ) -> DiffReport:
        """Diff *left* against *right* at *level* and return the differ's report.

        Two folders use the folder differ and two compiled datasets use the
        compiled differ. Everything else, including sources of differing
        types, goes through the file differ.
        """
        kind = left.source_type
        if kind != right.source_type:
            # Mixed source types fall back to the file differ.
            differ = self._file_differ
        elif kind == SourceType.FOLDER:
            differ = self._folder_differ
        elif kind == SourceType.COMPILED_DATASET:
            differ = self._compiled_differ
        else:
            differ = self._file_differ
        return differ.diff(left, right, level)
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from tabcaddy.domain.models import (
4
+ DatasetAnalysis,
5
+ DatasetSource,
6
+ ProfileMode,
7
+ SourceType,
8
+ )
9
+ from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
10
+ from tabcaddy.infrastructure.cache_manager import CacheManager
11
+
12
+
13
class GenerateAnalysis:
    """Produce a ``DatasetAnalysis`` for a source, reusing stored results when possible."""

    def __init__(
        self,
        analysis_builder: AnalysisBuilder | None = None,
        cache_manager: CacheManager | None = None,
    ) -> None:
        """Fall back to a default builder and cache manager when none are supplied."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()
        self._cache_manager = cache_manager or CacheManager()

    def run(self, source: DatasetSource, profile_mode: ProfileMode) -> DatasetAnalysis:
        """Return the analysis of *source* for *profile_mode*.

        Lookup order: the analysis stored with a compiled dataset (compiled
        sources only), then the cache, and finally a fresh build whose result
        is written back to the cache.
        """
        is_compiled = source.source_type == SourceType.COMPILED_DATASET
        if is_compiled:
            stored = self._analysis_builder.load_compiled_analysis(source)
            if stored is not None:
                return stored

        hit = self._cache_manager.get(source, profile_mode)
        if hit is None:
            fresh = self._analysis_builder.build(source, profile_mode).analysis
            self._cache_manager.set(source, profile_mode, fresh)
            return fresh
        return hit
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from tabcaddy.domain.models import DatasetSource, ProfileMode
6
+ from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
7
+
8
+
9
class ScaffoldTransform:
    """Write a starter transform module annotated with the schemas observed in a source."""

    def __init__(self, analysis_builder: AnalysisBuilder | None = None) -> None:
        """Use *analysis_builder* when given, otherwise a default ``AnalysisBuilder``."""
        self._analysis_builder = analysis_builder or AnalysisBuilder()

    @staticmethod
    def _single_line(value: object) -> str:
        """Return ``str(value)`` with every line break replaced by a space.

        Schema details come from the analysed data files. A column name that
        contains a line break would otherwise end the ``#`` comment early and
        put arbitrary text into the generated Python module as code.
        """
        return " ".join(str(value).splitlines())

    def run(self, source: DatasetSource, output_path: Path) -> Path:
        """Generate the scaffold for *source* and write it to *output_path*.

        Args:
            source: Dataset source analysed with the STANDARD profile.
            output_path: File to write; missing parent directories are created.

        Returns:
            *output_path*, unchanged.
        """
        analysis = self._analysis_builder.build(source, ProfileMode.STANDARD).analysis
        lines = [
            '"""TabCaddy transform scaffold."""',
            "",
            "import polars as pl",
            "",
            "",
            "# Observed schemas",
        ]
        for index, schema in enumerate(analysis.schemas, start=1):
            lines.append(
                f"# Schema {index}: {self._single_line(schema.occurrence_count)} files, "
                f"hash={self._single_line(schema.hash)}"
            )
            lines.extend(
                f"# - {self._single_line(column.name)}: {self._single_line(column.dtype)}"
                for column in schema.columns
            )
        lines.extend(
            [
                "",
                "def transform(df: pl.DataFrame, context=None) -> pl.DataFrame:",
                " # Example: rename a column",
                " # if 'old_name' in df.columns:",
                " # df = df.rename({'old_name': 'new_name'})",
                "",
                " # Example: filter rows",
                " # df = df.filter(pl.col('quantity') > 0)",
                "",
                " return df",
                "",
            ]
        )
        # The destination may sit in a directory that does not exist yet.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text("\n".join(lines), encoding="utf-8")
        return output_path
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+
8
+ from tabcaddy.domain.models import DatasetSource, SourceType
9
+ from tabcaddy.infrastructure.csv_reader import read_csv
10
+ from tabcaddy.infrastructure.csv_writer import write_csv
11
+ from tabcaddy.infrastructure.feather_reader import read_feather
12
+ from tabcaddy.infrastructure.feather_writer import write_feather
13
+ from tabcaddy.infrastructure.schema_analyzer import SchemaAnalyzer
14
+ from tabcaddy.infrastructure.source_resolver import iter_dataset_files
15
+ from tabcaddy.infrastructure.transform_loader import (
16
+ TransformContext,
17
+ TransformLoader,
18
+ TransformMetadata,
19
+ )
20
+
21
+
22
def _read_dataframe(path: Path) -> pl.DataFrame:
    """Read *path* into a DataFrame, choosing the reader from the file suffix.

    Only a ``.csv`` suffix (any letter case) goes to ``read_csv``; all other
    files are read with ``read_feather``.
    """
    if path.suffix.lower() != ".csv":
        return read_feather(path)
    return read_csv(path)
26
+
27
+
28
def _write_dataframe(df: pl.DataFrame, path: Path) -> None:
    """Write *df* to *path* as CSV for a ``.csv`` suffix, otherwise as Feather.

    The suffix comparison ignores letter case.
    """
    writer = write_csv if path.suffix.lower() == ".csv" else write_feather
    writer(df, path)
33
+
34
+
35
class TransformDataset:
    """Apply a user-supplied transform to every file of a file or folder source."""

    def __init__(
        self,
        transform_loader: TransformLoader | None = None,
        schema_analyzer: SchemaAnalyzer | None = None,
    ) -> None:
        """Fall back to a default loader and schema analyzer when none are supplied."""
        self._transform_loader = transform_loader or TransformLoader()
        self._schema_analyzer = schema_analyzer or SchemaAnalyzer()

    def run(
        self,
        source: DatasetSource,
        transform_path: Path,
        output_path: Path | None,
        workers: int,
    ) -> Path:
        """Transform each file of *source* and write the results.

        Args:
            source: File or folder source. Compiled datasets are rejected.
            transform_path: Passed to the transform loader to obtain the callable.
            output_path: Output directory; when ``None`` a sibling
                ``<name>_transformed`` directory is used.
            workers: Number of worker threads; values ``<= 1`` run sequentially.

        Returns:
            The directory the transformed files were written to.

        Raises:
            ValueError: if *source* is a compiled dataset.
            TypeError: if the transform returns anything but a Polars DataFrame.
        """
        if source.source_type == SourceType.COMPILED_DATASET:
            raise ValueError(
                "Transform currently supports files and folders, not compiled datasets."
            )
        output_root = output_path or self._default_output_path(source.path)
        output_root.mkdir(parents=True, exist_ok=True)
        transform, expects_context = self._transform_loader.load(transform_path)
        # Materialise the listing once: it is consumed by the schema analysis
        # and iterated again below, which a one-shot iterator would not survive.
        files = list(iter_dataset_files(source))
        schema_result = self._schema_analyzer.analyze_files(
            files, base_path=source.path, source_type=source.source_type
        )
        record_map = {record.path: record for record in schema_result.files}

        def process(path: Path) -> None:
            record = record_map[path]
            df = _read_dataframe(path)
            context = TransformContext(
                file_name=path.name,
                file_path=str(path),
                schema=[
                    {"name": column.name, "dtype": column.dtype}
                    for column in record.columns
                ],
                metadata=TransformMetadata(
                    row_count=record.row_count, schema_hash=record.schema_hash
                ),
            )
            result = transform(df, context) if expects_context else transform(df)
            if not isinstance(result, pl.DataFrame):
                raise TypeError(
                    f"Transform must return a Polars DataFrame for {path.name}"
                )
            relative_path = (
                record.relative_path
                if source.source_type != SourceType.FILE
                else Path(path.name)
            )
            _write_dataframe(
                result, self._prepare_destination(output_root, relative_path)
            )

        if workers <= 1:
            for path in files:
                process(path)
        else:
            with ThreadPoolExecutor(max_workers=workers) as executor:
                # Draining the iterator re-raises any exception from a worker.
                list(executor.map(process, files))
        return output_root

    @staticmethod
    def _prepare_destination(output_root: Path, relative_path: Path) -> Path:
        """Return ``output_root / relative_path`` after creating its parent directory.

        Files nested in sub-folders of the source keep their relative layout,
        so the matching sub-folders must exist before writing.
        ``exist_ok=True`` keeps concurrent workers from colliding.
        """
        destination = output_root / relative_path
        destination.parent.mkdir(parents=True, exist_ok=True)
        return destination

    def _default_output_path(self, input_path: Path) -> Path:
        """Return the sibling ``<name>_transformed`` path for *input_path*.

        Directories use their full name, other paths their stem (no suffix).
        """
        if input_path.is_dir():
            return input_path.parent / f"{input_path.name}_transformed"
        return input_path.parent / f"{input_path.stem}_transformed"
tabcaddy/cli/app.py ADDED
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+
7
+ from tabcaddy.application.compile_dataset import CompileDataset
8
+ from tabcaddy.application.diff_datasets import DiffDatasets
9
+ from tabcaddy.application.generate_analysis import GenerateAnalysis
10
+ from tabcaddy.application.scaffold_transform import ScaffoldTransform
11
+ from tabcaddy.application.transform_dataset import TransformDataset
12
+ from tabcaddy.domain.models import DiffLevel
13
+ from tabcaddy.domain.models import ProfileMode
14
+ from tabcaddy.infrastructure.analysis_builder import AnalysisBuilder
15
+ from tabcaddy.infrastructure.source_resolver import resolve_source
16
+ from tabcaddy.rendering.console import create_console
17
+ from tabcaddy.rendering.views.diff import build_diff_view
18
+ from tabcaddy.rendering.views.schema import build_schema_view
19
+ from tabcaddy.rendering.views.summary import build_summary_view
20
+
21
+
22
# Root Typer application that the commands below register on.
app = typer.Typer(
    help="Explore, compile, transform, and compare datasets.",
    no_args_is_help=True,
    add_completion=False,
)
27
+
28
+
29
# Registered as the application callback; the body is intentionally empty.
# NOTE(review): Typer may surface this docstring as CLI help text, so its
# wording is left untouched.
@app.callback()
def root() -> None:
    """TabCaddy command line interface."""
32
+
33
+
34
@app.command()
def summary(
    source: Path,
    profile: ProfileMode = typer.Option(ProfileMode.STANDARD, "--profile"),
) -> None:
    # Print the summary view of SOURCE, analysed with the chosen profile.
    # Documented with comments rather than a docstring so the generated
    # --help output stays exactly as it is.
    console = create_console()
    generator = GenerateAnalysis()
    dataset = resolve_source(source)
    analysis = generator.run(dataset, profile)
    view = build_summary_view(analysis)
    console.print(view)
42
+
43
+
44
@app.command()
def schema(
    source: Path,
    profile: ProfileMode = typer.Option(ProfileMode.STANDARD, "--profile"),
) -> None:
    # Print the schema view of SOURCE. The builder is called directly because
    # the view needs the per-file records as well as the analysis.
    # Documented with comments rather than a docstring so the generated
    # --help output stays exactly as it is.
    console = create_console()
    builder = AnalysisBuilder()
    dataset = resolve_source(source)
    scan = builder.build(dataset, profile)
    view = build_schema_view(scan.analysis, scan.files)
    console.print(view)
52
+
53
+
54
@app.command()
def compile(
    folder: Path,
    output: Path = typer.Option(Path("compiled_dataset"), "--output"),
    # Schema numbers are 1-based, so reject 0 and negatives at the CLI boundary
    # (same ``min=1`` convention as ``--workers``).
    schema: int | None = typer.Option(None, "--schema", min=1),
    interactive: bool = typer.Option(False, "--interactive"),
) -> None:
    """Compile the files of one schema in FOLDER into a single dataset."""
    console = create_console()
    source = resolve_source(folder)
    schema_index = schema
    if interactive and schema_index is None:
        # Quick pre-scan only to decide whether the user must pick a schema.
        preview = AnalysisBuilder().build(source, ProfileMode.QUICK)
        if len(preview.analysis.schemas) > 1:
            schema_index = typer.prompt(
                "Multiple schemas detected. Choose schema number", type=int
            )
    try:
        output_path, skipped = CompileDataset().run(source, output, schema_index)
    except ValueError as error:
        # These are expected, user-facing validation messages (wrong source
        # type, ambiguous or out-of-range schema): report them cleanly instead
        # of letting a traceback reach the terminal.
        typer.echo(f"Error: {error}", err=True)
        raise typer.Exit(code=1) from error
    console.print(f"Compiled dataset written to [green]{output_path}[/green]")
    if skipped:
        console.print(
            f"Skipped {len(skipped)} files from non-selected schemas.", style="yellow"
        )
76
+
77
+
78
@app.command()
def transform(
    input_path: Path,
    transform_path: Path,
    output_path: Path | None = typer.Argument(None),
    workers: int = typer.Option(1, "--workers", min=1),
) -> None:
    # Run the transform at TRANSFORM_PATH over INPUT_PATH and report where
    # the results were written.
    # Documented with comments rather than a docstring so the generated
    # --help output stays exactly as it is.
    console = create_console()
    dataset = resolve_source(input_path)
    runner = TransformDataset()
    written_to = runner.run(dataset, transform_path, output_path, workers)
    message = f"Transformed files written to [green]{written_to}[/green]"
    console.print(message)
89
+
90
+
91
@app.command("scaffold-transform")
def scaffold_transform(
    source: Path,
    output: Path = typer.Option(Path("transform_template.py"), "--output"),
) -> None:
    # Write a transform template for SOURCE to OUTPUT and report its location.
    # Documented with comments rather than a docstring so the generated
    # --help output stays exactly as it is.
    console = create_console()
    scaffolder = ScaffoldTransform()
    dataset = resolve_source(source)
    written_to = scaffolder.run(dataset, output)
    message = f"Transform scaffold written to [green]{written_to}[/green]"
    console.print(message)
99
+
100
+
101
@app.command()
def diff(
    left: Path,
    right: Path,
    level: DiffLevel = typer.Option(DiffLevel.FULL, "--level"),
) -> None:
    # Compare LEFT with RIGHT at the requested level and print the diff view.
    # Documented with comments rather than a docstring so the generated
    # --help output stays exactly as it is.
    console = create_console()
    differ = DiffDatasets(GenerateAnalysis())
    left_source = resolve_source(left)
    right_source = resolve_source(right)
    report = differ.run(left_source, right_source, level)
    view = build_diff_view(report)
    console.print(view)
113
+
114
+
115
def main() -> None:
    """Invoke the Typer application; this is what ``tabcaddy/__main__.py`` calls."""
    app()
@@ -0,0 +1,25 @@
1
+ from tabcaddy.domain.models import ColumnDefinition
2
+ from tabcaddy.domain.models import ColumnStatistics
3
+ from tabcaddy.domain.models import DatasetAnalysis
4
+ from tabcaddy.domain.models import DatasetMetadata
5
+ from tabcaddy.domain.models import DatasetSource
6
+ from tabcaddy.domain.models import DatasetStatistics
7
+ from tabcaddy.domain.models import DiffLevel
8
+ from tabcaddy.domain.models import DiffReport
9
+ from tabcaddy.domain.models import ProfileMode
10
+ from tabcaddy.domain.models import SchemaSignature
11
+ from tabcaddy.domain.models import SourceType
12
+
13
+ __all__ = [
14
+ "ColumnDefinition",
15
+ "ColumnStatistics",
16
+ "DatasetAnalysis",
17
+ "DatasetMetadata",
18
+ "DatasetSource",
19
+ "DatasetStatistics",
20
+ "DiffLevel",
21
+ "DiffReport",
22
+ "ProfileMode",
23
+ "SchemaSignature",
24
+ "SourceType",
25
+ ]
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
class SourceType(str, Enum):
    """Kind of location a dataset is read from.

    The ``str`` mixin makes each member compare equal to its plain string value.
    """

    FILE = "file"
    FOLDER = "folder"  # the only source type the compile step accepts
    # Handled specially elsewhere in the package: analysis loading tries the
    # stored analysis first, and the transform step rejects this type.
    COMPILED_DATASET = "compiled_dataset"
14
+
15
+
16
class ProfileMode(str, Enum):
    """Profiling depth requested from the analysis builder.

    The ``str`` mixin makes each member compare equal to its plain string value.
    NOTE(review): what each mode actually computes is decided by the analysis
    builder, which is not visible here — confirm before documenting further.
    """

    QUICK = "quick"  # used by the CLI for the interactive compile pre-scan
    STANDARD = "standard"  # default of the CLI ``--profile`` option
    DEEP = "deep"  # used for the metadata stored with a compiled dataset
20
+
21
+
22
class DiffLevel(str, Enum):
    """Level of detail requested for a dataset comparison.

    The ``str`` mixin makes each member compare equal to its plain string value.
    NOTE(review): how each level restricts the comparison is decided by the
    differs, which are not visible here — confirm before documenting further.
    """

    METADATA = "metadata"
    STATISTICS = "statistics"
    FULL = "full"  # default of the CLI ``--level`` option
26
+
27
+
28
@dataclass(frozen=True)
class DatasetSource:
    """Immutable pairing of a filesystem path with the kind of source it holds."""

    path: Path
    source_type: SourceType
32
+
33
+
34
@dataclass(frozen=True)
class ColumnDefinition:
    """Immutable name/dtype pair describing one column of a schema."""

    name: str
    dtype: str  # dtype kept as text
38
+
39
+
40
@dataclass
class SchemaSignature:
    """One distinct schema found in a dataset."""

    columns: list[ColumnDefinition]
    hash: str  # matched against per-file ``schema_hash`` values when selecting files
    occurrence_count: int  # reported to users as the number of files with this schema
45
+
46
+
47
@dataclass
class DatasetMetadata:
    """Top-level facts recorded about an analysed dataset.

    NOTE(review): the exact meaning of ``version`` and how the hashes are
    derived is defined by the metadata builder, which is not visible here.
    """

    version: int
    created_at: datetime  # serialised with ``isoformat()`` / parsed with ``fromisoformat()``
    row_count: int
    column_count: int
    source_file_count: int
    schema_hash: str | None  # optional; read with ``.get`` when deserialising
    column_hashes: dict[str, str] | None  # optional; read with ``.get`` when deserialising
56
+
57
+
58
@dataclass
class ColumnStatistics:
    """Profile of a single column.

    Only ``dtype`` and ``null_rate`` are required when loading from a
    serialised payload; every other field may be ``None``.
    """

    dtype: str
    null_rate: float
    unique_estimate: int | None
    min_value: Any | None  # type depends on the column; dates/times are serialised as ISO strings
    max_value: Any | None  # same conventions as ``min_value``
    mean: float | None
    median: float | None
    stddev: float | None
    # ``(label, count)`` pairs; serialised as ``{"label": ..., "count": ...}`` objects.
    histogram: list[tuple[str, int]] | None = None
69
+
70
+
71
@dataclass
class DatasetStatistics:
    """Per-column statistics of a dataset."""

    columns: dict[str, ColumnStatistics]  # serialised under the same keys; presumably column names
74
+
75
+
76
@dataclass
class DatasetAnalysis:
    """Complete result of analysing a dataset source."""

    metadata: DatasetMetadata
    schemas: list[SchemaSignature]  # may hold several entries when files disagree on schema
    statistics: DatasetStatistics | None  # ``None`` when no statistics are attached
    # ``default_factory`` gives every instance its own list.
    warnings: list[str] = field(default_factory=list)
82
+
83
+
84
@dataclass
class DiffReport:
    """Outcome of comparing two datasets, as lists of change descriptions."""

    metadata_changes: list[str]
    schema_changes: list[str]
    statistics_changes: list[str]
    # ``default_factory`` gives every instance its own list.
    warnings: list[str] = field(default_factory=list)
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, datetime, time
4
+ from typing import Any
5
+
6
+ from tabcaddy.domain.models import (
7
+ ColumnDefinition,
8
+ ColumnStatistics,
9
+ DatasetAnalysis,
10
+ DatasetMetadata,
11
+ DatasetStatistics,
12
+ DiffReport,
13
+ SchemaSignature,
14
+ )
15
+
16
+
17
+ def _serialize_value(value: Any) -> Any:
18
+ if isinstance(value, (datetime, date, time)):
19
+ return value.isoformat()
20
+ if isinstance(value, list):
21
+ return [_serialize_value(item) for item in value]
22
+ if isinstance(value, dict):
23
+ return {key: _serialize_value(item) for key, item in value.items()}
24
+ return value
25
+
26
+
27
def analysis_to_dict(analysis: DatasetAnalysis) -> dict[str, Any]:
    """Flatten *analysis* into plain dicts and lists.

    The result has the keys ``metadata``, ``schemas``, ``statistics``
    (``None`` when the analysis carries no statistics) and ``warnings``.
    ``created_at`` is written as an ISO string, column minima and maxima go
    through ``_serialize_value``, and histograms become lists of
    ``{"label", "count"}`` objects.
    """
    meta = analysis.metadata
    metadata_payload = {
        "version": meta.version,
        "created_at": meta.created_at.isoformat(),
        "row_count": meta.row_count,
        "column_count": meta.column_count,
        "source_file_count": meta.source_file_count,
        "schema_hash": meta.schema_hash,
        "column_hashes": meta.column_hashes,
    }

    schema_payloads = []
    for signature in analysis.schemas:
        column_entries = [
            {"name": definition.name, "dtype": definition.dtype}
            for definition in signature.columns
        ]
        schema_payloads.append(
            {
                "columns": column_entries,
                "hash": signature.hash,
                "occurrence_count": signature.occurrence_count,
            }
        )

    if analysis.statistics is None:
        statistics_payload = None
    else:
        column_payloads = {}
        for column_name, column_stats in analysis.statistics.columns.items():
            entry = {
                "dtype": column_stats.dtype,
                "null_rate": column_stats.null_rate,
                "unique_estimate": column_stats.unique_estimate,
                "min_value": _serialize_value(column_stats.min_value),
                "max_value": _serialize_value(column_stats.max_value),
                "mean": column_stats.mean,
                "median": column_stats.median,
                "stddev": column_stats.stddev,
            }
            if column_stats.histogram is None:
                entry["histogram"] = None
            else:
                entry["histogram"] = [
                    {"label": label, "count": count}
                    for label, count in column_stats.histogram
                ]
            column_payloads[column_name] = entry
        statistics_payload = {"columns": column_payloads}

    return {
        "metadata": metadata_payload,
        "schemas": schema_payloads,
        "statistics": statistics_payload,
        "warnings": list(analysis.warnings),
    }
74
+
75
+
76
def analysis_from_dict(payload: dict[str, Any]) -> DatasetAnalysis:
    """Rebuild a ``DatasetAnalysis`` from the dict layout produced by ``analysis_to_dict``.

    ``metadata`` is required. ``schemas`` and ``warnings`` default to empty,
    and a missing or ``None`` ``statistics`` entry yields ``statistics=None``.
    Minimum and maximum values are taken as stored, with no conversion back
    from their serialised form.
    """
    metadata_payload = payload["metadata"]
    statistics_payload = payload.get("statistics")

    def _histogram(stats: dict[str, Any]) -> list[tuple[str, int]] | None:
        # Turn the stored {"label", "count"} objects back into tuples.
        entries = stats.get("histogram")
        if entries is None:
            return None
        return [(entry["label"], entry["count"]) for entry in entries]

    metadata = DatasetMetadata(
        version=metadata_payload["version"],
        created_at=datetime.fromisoformat(metadata_payload["created_at"]),
        row_count=metadata_payload["row_count"],
        column_count=metadata_payload["column_count"],
        source_file_count=metadata_payload["source_file_count"],
        schema_hash=metadata_payload.get("schema_hash"),
        column_hashes=metadata_payload.get("column_hashes"),
    )

    schemas = []
    for schema_payload in payload.get("schemas", []):
        definitions = [
            ColumnDefinition(name=entry["name"], dtype=entry["dtype"])
            for entry in schema_payload["columns"]
        ]
        schemas.append(
            SchemaSignature(
                columns=definitions,
                hash=schema_payload["hash"],
                occurrence_count=schema_payload["occurrence_count"],
            )
        )

    if statistics_payload is None:
        statistics = None
    else:
        per_column = {}
        for column_name, stats in statistics_payload.get("columns", {}).items():
            per_column[column_name] = ColumnStatistics(
                dtype=stats["dtype"],
                null_rate=stats["null_rate"],
                unique_estimate=stats.get("unique_estimate"),
                min_value=stats.get("min_value"),
                max_value=stats.get("max_value"),
                mean=stats.get("mean"),
                median=stats.get("median"),
                stddev=stats.get("stddev"),
                histogram=_histogram(stats),
            )
        statistics = DatasetStatistics(columns=per_column)

    return DatasetAnalysis(
        metadata=metadata,
        schemas=schemas,
        statistics=statistics,
        warnings=list(payload.get("warnings", [])),
    )
124
+
125
+
126
def diff_report_to_dict(report: DiffReport) -> dict[str, Any]:
    """Return the four change lists of *report* as a dict of fresh lists.

    Each value is a copy, so mutating the result leaves *report* untouched.
    """
    field_names = (
        "metadata_changes",
        "schema_changes",
        "statistics_changes",
        "warnings",
    )
    return {name: list(getattr(report, name)) for name in field_names}