pirn-data 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pirn_data/AGENTIC_USE.md +315 -0
- pirn_data/__init__.py +23 -0
- pirn_data/data_batch.py +99 -0
- pirn_data/data_profile.py +46 -0
- pirn_data/data_schema.py +85 -0
- pirn_data/frames/AGENTIC_USE.md +123 -0
- pirn_data/frames/__init__.py +0 -0
- pirn_data/frames/datafusion/__init__.py +0 -0
- pirn_data/frames/datafusion/bridges/__init__.py +0 -0
- pirn_data/frames/datafusion/bridges/data_batch_to_datafusion.py +85 -0
- pirn_data/frames/datafusion/bridges/datafusion_to_data_batch.py +66 -0
- pirn_data/frames/datafusion/datafusion_aggregate.py +97 -0
- pirn_data/frames/datafusion/datafusion_data_batch.py +85 -0
- pirn_data/frames/datafusion/datafusion_filter.py +122 -0
- pirn_data/frames/datafusion/datafusion_join.py +147 -0
- pirn_data/frames/datafusion/datafusion_session_context.py +35 -0
- pirn_data/frames/datafusion/datafusion_session_context_knot.py +54 -0
- pirn_data/frames/duckdb/__init__.py +0 -0
- pirn_data/frames/duckdb/bridges/__init__.py +0 -0
- pirn_data/frames/duckdb/bridges/data_batch_to_duckdb.py +122 -0
- pirn_data/frames/duckdb/bridges/duckdb_to_data_batch.py +51 -0
- pirn_data/frames/duckdb/duckdb_aggregate.py +122 -0
- pirn_data/frames/duckdb/duckdb_cast.py +128 -0
- pirn_data/frames/duckdb/duckdb_connection.py +36 -0
- pirn_data/frames/duckdb/duckdb_connection_knot.py +50 -0
- pirn_data/frames/duckdb/duckdb_data_batch.py +86 -0
- pirn_data/frames/duckdb/duckdb_deduplicate.py +112 -0
- pirn_data/frames/duckdb/duckdb_filter.py +97 -0
- pirn_data/frames/duckdb/duckdb_join.py +191 -0
- pirn_data/frames/duckdb/duckdb_rename.py +112 -0
- pirn_data/frames/pandas/__init__.py +0 -0
- pirn_data/frames/pandas/bridges/__init__.py +0 -0
- pirn_data/frames/pandas/bridges/data_batch_to_pandas.py +63 -0
- pirn_data/frames/pandas/bridges/pandas_to_data_batch.py +62 -0
- pirn_data/frames/pandas/pandas_aggregate.py +144 -0
- pirn_data/frames/pandas/pandas_cast.py +103 -0
- pirn_data/frames/pandas/pandas_data_batch.py +75 -0
- pirn_data/frames/pandas/pandas_deduplicate.py +74 -0
- pirn_data/frames/pandas/pandas_filter.py +79 -0
- pirn_data/frames/pandas/pandas_join.py +153 -0
- pirn_data/frames/pandas/pandas_rename.py +70 -0
- pirn_data/frames/polars/__init__.py +0 -0
- pirn_data/frames/polars/bridges/__init__.py +0 -0
- pirn_data/frames/polars/bridges/data_batch_to_polars.py +63 -0
- pirn_data/frames/polars/bridges/polars_to_data_batch.py +62 -0
- pirn_data/frames/polars/polars_aggregate.py +86 -0
- pirn_data/frames/polars/polars_cast.py +101 -0
- pirn_data/frames/polars/polars_data_batch.py +75 -0
- pirn_data/frames/polars/polars_deduplicate.py +69 -0
- pirn_data/frames/polars/polars_filter.py +77 -0
- pirn_data/frames/polars/polars_join.py +156 -0
- pirn_data/frames/polars/polars_pivot.py +127 -0
- pirn_data/frames/polars/polars_rename.py +70 -0
- pirn_data/frames/polars/polars_unpivot.py +113 -0
- pirn_data/frames/polars/polars_window_calc.py +81 -0
- pirn_data/frames/pyarrow/__init__.py +0 -0
- pirn_data/frames/pyarrow/bridges/__init__.py +0 -0
- pirn_data/frames/pyarrow/bridges/data_batch_to_pyarrow.py +53 -0
- pirn_data/frames/pyarrow/bridges/pyarrow_to_data_batch.py +47 -0
- pirn_data/frames/pyarrow/pyarrow_aggregate.py +136 -0
- pirn_data/frames/pyarrow/pyarrow_cast.py +112 -0
- pirn_data/frames/pyarrow/pyarrow_data_batch.py +76 -0
- pirn_data/frames/pyarrow/pyarrow_deduplicate.py +131 -0
- pirn_data/frames/pyarrow/pyarrow_filter.py +101 -0
- pirn_data/frames/pyarrow/pyarrow_join.py +160 -0
- pirn_data/frames/pyarrow/pyarrow_rename.py +76 -0
- pirn_data/identifier_validator.py +53 -0
- pirn_data/lakehouse/AGENTIC_USE.md +106 -0
- pirn_data/lakehouse/__init__.py +0 -0
- pirn_data/lakehouse/delta/__init__.py +0 -0
- pirn_data/lakehouse/delta/delta_table.py +258 -0
- pirn_data/lakehouse/delta/delta_table_config.py +43 -0
- pirn_data/lakehouse/hudi/__init__.py +0 -0
- pirn_data/lakehouse/hudi/hudi_table.py +213 -0
- pirn_data/lakehouse/hudi/hudi_table_config.py +45 -0
- pirn_data/lakehouse/iceberg/__init__.py +0 -0
- pirn_data/lakehouse/iceberg/iceberg_table.py +257 -0
- pirn_data/lakehouse/iceberg/iceberg_table_config.py +47 -0
- pirn_data/lakehouse/lakehouse_table.py +101 -0
- pirn_data/lakehouse/lakehouse_table_sink.py +95 -0
- pirn_data/lakehouse/lakehouse_table_source.py +106 -0
- pirn_data/lazy/AGENTIC_USE.md +101 -0
- pirn_data/lazy/__init__.py +0 -0
- pirn_data/lazy/dask/__init__.py +0 -0
- pirn_data/lazy/dask/dask_aggregate.py +141 -0
- pirn_data/lazy/dask/dask_compute.py +150 -0
- pirn_data/lazy/dask/dask_dataframe.py +67 -0
- pirn_data/lazy/dask/dask_execution_receipt.py +37 -0
- pirn_data/lazy/dask/dask_filter.py +72 -0
- pirn_data/lazy/dask/dask_join.py +125 -0
- pirn_data/lazy/dask/dask_source.py +118 -0
- pirn_data/lazy/ibis/__init__.py +0 -0
- pirn_data/lazy/ibis/ibis_connection.py +34 -0
- pirn_data/lazy/ibis/ibis_connection_knot.py +55 -0
- pirn_data/lazy/ibis/ibis_execution_receipt.py +43 -0
- pirn_data/lazy/ibis/ibis_filter.py +77 -0
- pirn_data/lazy/ibis/ibis_group_by_aggregate.py +120 -0
- pirn_data/lazy/ibis/ibis_join.py +144 -0
- pirn_data/lazy/ibis/ibis_source.py +115 -0
- pirn_data/lazy/ibis/ibis_table.py +81 -0
- pirn_data/lazy/ibis/ibis_to_table.py +146 -0
- pirn_data/lazy/ibis/ibis_window.py +93 -0
- pirn_data/lazy/ray/__init__.py +0 -0
- pirn_data/lazy/ray/ray_aggregate.py +128 -0
- pirn_data/lazy/ray/ray_compute.py +159 -0
- pirn_data/lazy/ray/ray_dataset.py +57 -0
- pirn_data/lazy/ray/ray_execution_receipt.py +38 -0
- pirn_data/lazy/ray/ray_filter.py +71 -0
- pirn_data/lazy/ray/ray_map.py +103 -0
- pirn_data/lazy/ray/ray_source.py +120 -0
- pirn_data/lazy/spark/__init__.py +0 -0
- pirn_data/lazy/spark/spark_aggregate.py +134 -0
- pirn_data/lazy/spark/spark_collect_sink.py +95 -0
- pirn_data/lazy/spark/spark_dataframe.py +71 -0
- pirn_data/lazy/spark/spark_execution_receipt.py +50 -0
- pirn_data/lazy/spark/spark_filter.py +67 -0
- pirn_data/lazy/spark/spark_join.py +160 -0
- pirn_data/lazy/spark/spark_source.py +136 -0
- pirn_data/lazy/spark/spark_write_sink.py +100 -0
- pirn_data/quality/__init__.py +0 -0
- pirn_data/quality/freshness_check.py +140 -0
- pirn_data/quality/null_rate_check.py +126 -0
- pirn_data/quality/profiler.py +144 -0
- pirn_data/quality/row_count_check.py +104 -0
- pirn_data/quality/schema_validator.py +160 -0
- pirn_data/quality_check.py +20 -0
- pirn_data/quality_report.py +31 -0
- pirn_data/sinks/__init__.py +0 -0
- pirn_data/sinks/file_sink.py +92 -0
- pirn_data/sources/__init__.py +0 -0
- pirn_data/sources/directory_source.py +126 -0
- pirn_data/sources/file_source.py +115 -0
- pirn_data/sources/sql_source.py +108 -0
- pirn_data/specializations/AGENTIC_USE.md +116 -0
- pirn_data/specializations/__init__.py +0 -0
- pirn_data/specializations/analytics_engineering/__init__.py +1 -0
- pirn_data/specializations/analytics_engineering/exposure_lineage_tag.py +100 -0
- pirn_data/specializations/analytics_engineering/intermediate_model_knot.py +141 -0
- pirn_data/specializations/analytics_engineering/mart_model_knot.py +124 -0
- pirn_data/specializations/analytics_engineering/metric_layer_aggregator.py +163 -0
- pirn_data/specializations/analytics_engineering/refresh_materialized_view.py +94 -0
- pirn_data/specializations/analytics_engineering/staging_model_knot.py +121 -0
- pirn_data/specializations/data_vault/__init__.py +0 -0
- pirn_data/specializations/data_vault/data_vault_bridge_table_builder.py +174 -0
- pirn_data/specializations/data_vault/data_vault_hub_loader.py +164 -0
- pirn_data/specializations/data_vault/data_vault_link_loader.py +167 -0
- pirn_data/specializations/data_vault/data_vault_pit_table_builder.py +182 -0
- pirn_data/specializations/data_vault/data_vault_satellite_loader.py +229 -0
- pirn_data/specializations/deduplication/__init__.py +1 -0
- pirn_data/specializations/deduplication/exact_deduplicator.py +83 -0
- pirn_data/specializations/deduplication/fuzzy_deduplicator.py +192 -0
- pirn_data/specializations/deduplication/probabilistic_linker.py +136 -0
- pirn_data/specializations/deduplication/windowed_deduplicator.py +90 -0
- pirn_data/specializations/dimensional/__init__.py +0 -0
- pirn_data/specializations/dimensional/bridge_table_builder.py +170 -0
- pirn_data/specializations/dimensional/date_dim_generator.py +148 -0
- pirn_data/specializations/dimensional/dim_table_load.py +277 -0
- pirn_data/specializations/dimensional/fact_table_load.py +186 -0
- pirn_data/specializations/feature_engineering/__init__.py +1 -0
- pirn_data/specializations/feature_engineering/binning_knot.py +129 -0
- pirn_data/specializations/feature_engineering/column_hasher.py +76 -0
- pirn_data/specializations/feature_engineering/date_part_extractor.py +105 -0
- pirn_data/specializations/feature_engineering/derived_column_calculator.py +159 -0
- pirn_data/specializations/feature_engineering/geo_enricher.py +141 -0
- pirn_data/specializations/feature_engineering/lookup_enricher.py +83 -0
- pirn_data/specializations/feature_engineering/string_normalizer.py +109 -0
- pirn_data/specializations/feature_engineering/text_token_counter.py +108 -0
- pirn_data/specializations/incremental/__init__.py +0 -0
- pirn_data/specializations/incremental/dbt_style_snapshot.py +228 -0
- pirn_data/specializations/incremental/delete_safe_sync.py +193 -0
- pirn_data/specializations/incremental/merge_upsert.py +143 -0
- pirn_data/specializations/incremental/partitioned_overwrite.py +115 -0
- pirn_data/specializations/incremental/snapshot_table_appender.py +112 -0
- pirn_data/specializations/ingestion/__init__.py +0 -0
- pirn_data/specializations/ingestion/append_only_ingest.py +92 -0
- pirn_data/specializations/ingestion/full_refresh_extract.py +111 -0
- pirn_data/specializations/ingestion/query_new_rows_knot.py +115 -0
- pirn_data/specializations/ingestion/read_high_water_mark_knot.py +84 -0
- pirn_data/specializations/ingestion/rows_behind_truncate_check_knot.py +40 -0
- pirn_data/specializations/ingestion/truncate_table_knot.py +64 -0
- pirn_data/specializations/ingestion/watermark_incremental_extract.py +133 -0
- pirn_data/specializations/medallion/__init__.py +0 -0
- pirn_data/specializations/medallion/bronze_raw_ingest.py +120 -0
- pirn_data/specializations/medallion/data_batch_to_tuples_knot.py +64 -0
- pirn_data/specializations/medallion/gold_aggregation.py +166 -0
- pirn_data/specializations/medallion/silver_clean_transform.py +170 -0
- pirn_data/specializations/medallion/stamp_bronze_metadata_knot.py +65 -0
- pirn_data/specializations/medallion/tuples_to_data_batch_knot.py +66 -0
- pirn_data/specializations/quality/__init__.py +0 -0
- pirn_data/specializations/quality/database_table_freshness_check.py +130 -0
- pirn_data/specializations/quality/null_rate_monitor.py +117 -0
- pirn_data/specializations/quality/reconciliation_diff.py +155 -0
- pirn_data/specializations/quality/referential_integrity_check.py +109 -0
- pirn_data/specializations/quality/row_count_anomaly_detector.py +140 -0
- pirn_data/specializations/quality/schema_evolution_detector.py +109 -0
- pirn_data/specializations/quality/statistical_profiler.py +183 -0
- pirn_data/specializations/scd/__init__.py +0 -0
- pirn_data/specializations/scd/cdc/__init__.py +0 -0
- pirn_data/specializations/scd/cdc/cdc_message_broker_knot.py +58 -0
- pirn_data/specializations/scd/cdc/debezium_source.py +176 -0
- pirn_data/specializations/scd/cdc/message_broker_connection.py +36 -0
- pirn_data/specializations/scd/cdc_debezium.py +238 -0
- pirn_data/specializations/scd/scd_type_1.py +147 -0
- pirn_data/specializations/scd/scd_type_1_merge_knot.py +146 -0
- pirn_data/specializations/scd/scd_type_1_overwrite.py +156 -0
- pirn_data/specializations/scd/scd_type_2.py +185 -0
- pirn_data/specializations/scd/scd_type_2_history.py +218 -0
- pirn_data/specializations/scd/scd_type_2_merge_knot.py +197 -0
- pirn_data/specializations/scd/scd_type_3_previous_value.py +196 -0
- pirn_data/specializations/scd/scd_type_4_mini_dimension.py +184 -0
- pirn_data/specializations/scd/scd_type_5_mini_dim_with_current.py +200 -0
- pirn_data/specializations/scd/scd_type_6_hybrid.py +270 -0
- pirn_data/specializations/scd/scd_type_7.py +200 -0
- pirn_data/specializations/scd/scd_type_7_hybrid.py +250 -0
- pirn_data/specializations/scd/scd_type_7_merge_knot.py +225 -0
- pirn_data/specializations/schema_migration/__init__.py +1 -0
- pirn_data/specializations/schema_migration/backfill_runner.py +133 -0
- pirn_data/specializations/schema_migration/column_lineage_tracker.py +127 -0
- pirn_data/specializations/schema_migration/schema_version_migrator.py +130 -0
- pirn_data/specializations/timeseries/__init__.py +1 -0
- pirn_data/specializations/timeseries/cohort_aggregator.py +153 -0
- pirn_data/specializations/timeseries/funnel_analysis_knot.py +123 -0
- pirn_data/specializations/timeseries/late_arriving_event_handler.py +161 -0
- pirn_data/specializations/timeseries/rolling_window_aggregator.py +125 -0
- pirn_data/specializations/timeseries/sessionization_knot.py +114 -0
- pirn_data/specializations/timeseries/time_series_resampler.py +131 -0
- pirn_data/specialized/__init__.py +0 -0
- pirn_data/specialized/eland/__init__.py +0 -0
- pirn_data/specialized/eland/eland_dataframe.py +56 -0
- pirn_data/specialized/eland/eland_filter.py +91 -0
- pirn_data/specialized/eland/eland_source.py +81 -0
- pirn_data/specialized/eland/eland_to_pandas.py +51 -0
- pirn_data/specialized/eland/elasticsearch_connection.py +35 -0
- pirn_data/specialized/eland/elasticsearch_connection_knot.py +59 -0
- pirn_data/specialized/lance/__init__.py +0 -0
- pirn_data/specialized/lance/arrow_to_lance_sink.py +77 -0
- pirn_data/specialized/lance/lance_dataset.py +53 -0
- pirn_data/specialized/lance/lance_source.py +63 -0
- pirn_data/specialized/lance/lance_to_arrow.py +46 -0
- pirn_data/transforms/__init__.py +0 -0
- pirn_data/transforms/aggregate.py +197 -0
- pirn_data/transforms/aggregate_spec.py +43 -0
- pirn_data/transforms/cast.py +120 -0
- pirn_data/transforms/deduplicate.py +111 -0
- pirn_data/transforms/filter.py +69 -0
- pirn_data/transforms/normalize.py +132 -0
- pirn_data/transforms/normalize_column_rule.py +36 -0
- pirn_data/transforms/rename.py +105 -0
- pirn_data/validation/__init__.py +0 -0
- pirn_data/validation/great_expectations/__init__.py +0 -0
- pirn_data/validation/great_expectations/great_expectations_pandas_validator.py +229 -0
- pirn_data/validation/pandera/__init__.py +0 -0
- pirn_data/validation/pandera/pandera_pandas_validator.py +162 -0
- pirn_data/validation/pandera/pandera_polars_validator.py +164 -0
- pirn_data-0.4.0.dist-info/METADATA +55 -0
- pirn_data-0.4.0.dist-info/RECORD +257 -0
- pirn_data-0.4.0.dist-info/WHEEL +4 -0
pirn_data/AGENTIC_USE.md
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# AGENTIC_USE — pirn_data
|
|
2
|
+
|
|
3
|
+
Provides a tiered, engine-agnostic layer for reading, transforming, and writing structured data — it does NOT include API connectors, message-queue consumers, or ML training loops (those are separate domains).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Mental model
|
|
8
|
+
|
|
9
|
+
pirn is the **orchestrator**; the engines do the work. The data domain is stratified into tiers, each independently opt-in:
|
|
10
|
+
|
|
11
|
+
| Tier | Label | Engines | When to use |
|
|
12
|
+
|------|-------|---------|-------------|
|
|
13
|
+
| **1** | Dict / DataBatch | Pure Python | Always available. Use for < ~100 k rows, glue logic, or environments where no heavy deps are allowed. Every record is a `dict`; the exchange currency is `DataBatch`. |
|
|
14
|
+
| **2** | Native frames (CPU) | **Polars** (preferred), DuckDB, pandas+PyArrow, DataFusion | Fits comfortably in RAM. Need fast vectorised ops, joins, window functions, pivots. Polars is the default; reach for DuckDB when SQL is more natural or when you need concurrent read-only queries. |
|
|
15
|
+
| **2-GPU** | Native frames (GPU) | cuDF | CUDA cluster only. Drop-in Polars-compatible when `cudf-cu12` is installed. |
|
|
16
|
+
| **2.5** | Out-of-core | Modin | Data exceeds RAM, single machine. Pandas-compatible API chunked on disk. |
|
|
17
|
+
| **3** | Lazy / push-down | **Ibis** (preferred), Spark, Dask, Ray Data | Data doesn't fit on one machine, or lives in a warehouse already. Ibis first — it targets many backends without a Spark cluster. Spark/Dask/Ray when distributed compute is required. |
|
|
18
|
+
| **3-stream** | Streaming dataflow | Pathway, Bytewax | Continuous / low-latency event streams. Requires Python < 3.14 until upstream catches up. |
|
|
19
|
+
| **4** | Specialised | Lance (vector), Eland (Elasticsearch) | Domain-specific columnar layouts; not general-purpose. |
|
|
20
|
+
|
|
21
|
+
**Tier-1 (`DataBatch`) is always installed with `pirn[data]`.** Tiers 2–4 are independent extras that do not pull each other in.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install pirn[data] # Tier 1 only — DataBatch, sources, sinks, transforms (no heavy deps)
|
|
29
|
+
pip install pirn[all-frames] # Tier 2 single-machine engines (Polars, DuckDB, pandas, PyArrow, DataFusion)
|
|
30
|
+
pip install pirn[all-lazy] # Tier 3 push-down engines (Ibis, Spark, Dask, Ray Data)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Specialised extras (one at a time as needed):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install pirn[polars] # Polars only
|
|
37
|
+
pip install pirn[delta] # Delta Lake lakehouse adapter
|
|
38
|
+
pip install pirn[iceberg] # Apache Iceberg adapter
|
|
39
|
+
pip install pirn[health] # DICOM, HL7, FHIR, NIfTI, EDF, BIDS, etc.
|
|
40
|
+
pip install pirn[genomics] # FASTA, FASTQ, VCF, BAM, CRAM, SAM
|
|
41
|
+
pip install pirn[oilgas] # SEG-Y, DLIS, LAS, WITSML
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Source map
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
pirn_data/
|
|
50
|
+
├── data_batch.py # Tier-1 exchange type: immutable tuple of dicts
|
|
51
|
+
├── data_schema.py # Optional schema metadata attached to DataBatch
|
|
52
|
+
├── data_profile.py # Statistical profile of a DataBatch
|
|
53
|
+
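├── quality/ # Quality check knots: freshness_check, null_rate_check, row_count_check, schema_validator, profiler
|
|
54
|
+
├── quality_check.py # Pure-Python quality-check contract (package root)
|
|
55
|
+
├── quality_report.py # Pure-Python quality-report contract (package root)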
|
|
56
|
+
├── sources/
|
|
57
|
+
│ ├── file_source.py # Single-file source (ObjectStore + FileFormat)
|
|
58
|
+
│ └── directory_source.py # Prefix glob → one or many DataBatches
|
|
59
|
+
├── sinks/
|
|
60
|
+
│ └── file_sink.py # Encode + write DataBatch to ObjectStore
|
|
61
|
+
├── transforms/ # Tier-1 transforms (pure Python, no extras)
|
|
62
|
+
│ ├── filter.py # Row predicate (Python callable)
|
|
63
|
+
│ ├── rename.py # Column rename map
|
|
64
|
+
│ ├── cast.py # Type coercion
|
|
65
|
+
│ ├── normalize.py # String cleanup rules
|
|
66
|
+
│ ├── aggregate.py # Group-by + aggregations
|
|
67
|
+
│ └── deduplicate.py # Row deduplication
|
|
68
|
+
├── frames/ # Tier-2 engine-specific knots
|
|
69
|
+
│ ├── polars/ # PolarsFilter, PolarsJoin, PolarsAggregate, etc.
|
|
70
|
+
│ │ └── bridges/ # DataBatch ↔ PolarsDataBatch conversions
|
|
71
|
+
│ ├── duckdb/ # DuckDB equivalents
|
|
72
|
+
│ ├── pandas/
|
|
73
|
+
│ ├── pyarrow/
|
|
74
|
+
│ └── datafusion/
|
|
75
|
+
├── lazy/ # Tier-3 lazy/push-down knots
|
|
76
|
+
│ ├── ibis/
|
|
77
|
+
│ ├── spark/
|
|
78
|
+
│ ├── dask/
|
|
79
|
+
│ └── ray/
|
|
80
|
+
├── lakehouse/ # Lakehouse table adapters
|
|
81
|
+
│ ├── delta/ # DeltaTable (full CRUD + merge + time-travel)
|
|
82
|
+
│ ├── iceberg/ # IcebergTable (read/write/time-travel; merge not yet available in pyiceberg)
|
|
83
|
+
│ └── hudi/ # HudiTable (read-only; writes require Spark writer)
|
|
84
|
+
├── specializations/ # Pre-wired high-level patterns ← specializations
|
|
85
|
+
│ ├── ingestion/ # AppendOnlyIngest, FullRefreshExtract, WatermarkIncrementalExtract ← specializations
|
|
86
|
+
│ ├── medallion/ # BronzeRawIngest, SilverCleanTransform, GoldAggregation ← specializations
|
|
87
|
+
│ ├── scd/ # ScdType1/2/3/4/5/6/7, CdcDebezium, DebeziumSource ← specializations
|
|
88
|
+
│ ├── dimensional/ # DateDimGenerator, DimTableLoad, FactTableLoad, BridgeTableBuilder ← specializations
|
|
89
|
+
│ ├── data_vault/ # DataVaultHubLoader, DataVaultLinkLoader, DataVaultSatelliteLoader, DataVaultPITTableBuilder, DataVaultBridgeTableBuilder ← specializations
|
|
90
|
+
│ ├── incremental/ # SnapshotTableAppender, DbtStyleSnapshot, MergeUpsert, DeleteSafeSync, PartitionedOverwrite ← specializations
|
|
91
|
+
│ ├── quality/ # RowCountAnomalyDetector, NullRateMonitor, SchemaEvolutionDetector, FreshnessCheck, ReferentialIntegrityCheck, ReconciliationDiff, StatisticalProfiler ← specializations
|
|
92
|
+
│ ├── deduplication/ # ExactDeduplicator, WindowedDeduplicator, FuzzyDeduplicator, ProbabilisticLinker ← specializations
|
|
93
|
+
│ ├── timeseries/ # TimeSeriesResampler, RollingWindowAggregator, SessionizationKnot, FunnelAnalysisKnot, CohortAggregator, LateArrivingEventHandler ← specializations
|
|
94
|
+
│ ├── feature_engineering/ # DerivedColumnCalculator, ColumnHasher, BinningKnot, StringNormalizer, DatePartExtractor, LookupEnricher, GeoEnricher, TextTokenCounter ← specializations
|
|
95
|
+
│ ├── analytics_engineering/ # StagingModelKnot, IntermediateModelKnot, MartModelKnot, RefreshMaterializedView, MetricLayerAggregator, ExposureLineageTag ← specializations
|
|
96
|
+
│ └── schema_migration/ # BackfillRunner, SchemaVersionMigrator, ColumnLineageTracker ← specializations
|
|
97
|
+
└── specialized/ # Tier-4 specialised adapters (Lance, Eland)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
File formats and object stores live under `pirn/domains/connectors/` (imported as `pirn.connectors.file_formats.*` and `pirn.connectors.object_stores.*`), not here.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Tier selection guide
|
|
105
|
+
|
|
106
|
+
| Data size | Latency need | Infrastructure | Recommended tier |
|
|
107
|
+
|-----------|-------------|----------------|-----------------|
|
|
108
|
+
| < 100 k rows | Any | Any | **Tier 1** — zero deps, fast enough |
|
|
109
|
+
| Fits in RAM (up to ~10 GB) | Batch | Single machine | **Tier 2 Polars** |
|
|
110
|
+
| Fits in RAM, SQL-heavy | Batch | Single machine | **Tier 2 DuckDB** |
|
|
111
|
+
| Exceeds RAM, single machine | Batch | Single machine | **Tier 2.5 Modin** |
|
|
112
|
+
| Warehouse-scale, existing backend | Batch | Any (warehouse/cluster) | **Tier 3 Ibis** |
|
|
113
|
+
| Distributed compute required | Batch | Spark / Dask / Ray cluster | **Tier 3 Spark/Dask/Ray** |
|
|
114
|
+
| Continuous / event-driven | Sub-second | Stream processor | **Tier 3-stream** |
|
|
115
|
+
| Vector similarity search | Any | Any | **Tier 4 Lance** |
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Canonical patterns
|
|
120
|
+
|
|
121
|
+
### Tier-1 dict batch pipeline
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from pirn.core.knot_config import KnotConfig
|
|
125
|
+
from pirn.core.knot_factory import knot
|
|
126
|
+
from pirn.core.parameter import Parameter
|
|
127
|
+
from pirn.core.run_request import RunRequest
|
|
128
|
+
from pirn.tapestry import Tapestry
|
|
129
|
+
from pirn_data.sources.file_source import FileSource
|
|
130
|
+
from pirn_data.transforms.filter import Filter
|
|
131
|
+
from pirn_data.transforms.aggregate import Aggregate
|
|
132
|
+
from pirn_data.transforms.aggregate_spec import AggregateSpec
|
|
133
|
+
from pirn_data.sinks.file_sink import FileSink
|
|
134
|
+
from pirn.connectors.file_formats.csv_format import CsvFormat
|
|
135
|
+
from pirn.connectors.file_formats.parquet_format import ParquetFormat
|
|
136
|
+
from pirn.connectors.object_stores.local_object_store import LocalObjectStore
|
|
137
|
+
|
|
138
|
+
with Tapestry() as t:
|
|
139
|
+
store = LocalObjectStore(root="/data")
|
|
140
|
+
source = FileSource(
|
|
141
|
+
store=store,
|
|
142
|
+
format=CsvFormat(),
|
|
143
|
+
key="input/sales.csv",
|
|
144
|
+
_config=KnotConfig(id="source"),
|
|
145
|
+
)
|
|
146
|
+
active = Filter(
|
|
147
|
+
batch=source,
|
|
148
|
+
predicate=lambda row: row["status"] == "active",
|
|
149
|
+
_config=KnotConfig(id="active_only"),
|
|
150
|
+
)
|
|
151
|
+
summary = Aggregate(
|
|
152
|
+
batch=active,
|
|
153
|
+
by=["region"],
|
|
154
|
+
aggs={"revenue": AggregateSpec(op="sum", column="revenue")},
|
|
155
|
+
_config=KnotConfig(id="by_region"),
|
|
156
|
+
)
|
|
157
|
+
FileSink(
|
|
158
|
+
batch=summary,
|
|
159
|
+
store=store,
|
|
160
|
+
format=ParquetFormat(compression="zstd"),
|
|
161
|
+
key="output/region_revenue.parquet",
|
|
162
|
+
_config=KnotConfig(id="sink"),
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
result = await t.run(RunRequest(parameters={}))
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Tier-2 Polars pipeline
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import polars as pl
|
|
172
|
+
from pirn_data.frames.polars.polars_data_batch import PolarsDataBatch
|
|
173
|
+
from pirn_data.frames.polars.polars_filter import PolarsFilter
|
|
174
|
+
from pirn_data.frames.polars.polars_join import PolarsJoin
|
|
175
|
+
from pirn_data.frames.polars.polars_aggregate import PolarsAggregate
|
|
176
|
+
from pirn_data.frames.polars.bridges.data_batch_to_polars import DataBatchToPolars
|
|
177
|
+
from pirn_data.frames.polars.bridges.polars_to_data_batch import PolarsToDataBatch
|
|
178
|
+
|
|
179
|
+
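# Promote a Tier-1 source to a Polars frame (`source` and `KnotConfig` as in the Tier-1 example above; `ref_batch` below is assumed to be a second Polars batch knot defined elsewhere)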
|
|
180
|
+
polars_batch = DataBatchToPolars(
|
|
181
|
+
batch=source,
|
|
182
|
+
_config=KnotConfig(id="to_polars"),
|
|
183
|
+
)
|
|
184
|
+
filtered = PolarsFilter(
|
|
185
|
+
batch=polars_batch,
|
|
186
|
+
expression=pl.col("region") == "EU",
|
|
187
|
+
_config=KnotConfig(id="eu_only"),
|
|
188
|
+
)
|
|
189
|
+
joined = PolarsJoin(
|
|
190
|
+
left=filtered,
|
|
191
|
+
right=ref_batch,
|
|
192
|
+
on="region",
|
|
193
|
+
how="left",
|
|
194
|
+
_config=KnotConfig(id="join_ref"),
|
|
195
|
+
)
|
|
196
|
+
# Demote back to Tier-1 DataBatch if downstream knots expect it
|
|
197
|
+
result_batch = PolarsToDataBatch(
|
|
198
|
+
batch=joined,
|
|
199
|
+
_config=KnotConfig(id="to_data_batch"),
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Anti-patterns
|
|
206
|
+
|
|
207
|
+
### Reaching for Tier 3 when data fits in memory
|
|
208
|
+
|
|
209
|
+
Ibis/Spark add serialisation overhead and cluster setup cost. If the dataset fits in RAM, use Tier-2 Polars. Reserve Tier-3 for warehouse-resident data or genuinely distributed workloads.
|
|
210
|
+
|
|
211
|
+
### Mixing tier knots without conversion bridges
|
|
212
|
+
|
|
213
|
+
Passing a `PolarsDataBatch` directly into a Tier-1 `Filter` (or vice versa) will fail at runtime because the knots expect different input types. Always interpose the bridge knots:
|
|
214
|
+
|
|
215
|
+
- `DataBatchToPolars` — `DataBatch` → `PolarsDataBatch`
|
|
216
|
+
- `PolarsToDataBatch` — `PolarsDataBatch` → `DataBatch`
|
|
217
|
+
|
|
218
|
+
Equivalent bridges exist for the other Tier-2 engines under `frames/<engine>/bridges/` (`duckdb`, `pandas`, `pyarrow`, `datafusion`).
|
|
219
|
+
|
|
220
|
+
### Using Tier-1 `Aggregate` for joins
|
|
221
|
+
|
|
222
|
+
Tier-1 has no `Join` knot — a naive Python-level nested-loop join of two `tuple[dict]` sequences would be O(n·m). If you need joins, promote to Tier 2 first with a bridge knot, then use `PolarsJoin` or `DuckDbJoin`.
|
|
223
|
+
|
|
224
|
+
### Calling `IcebergTable.merge()` in production
|
|
225
|
+
|
|
226
|
+
As of mid-2026, `pyiceberg`'s Python writer does not implement merge. The method raises `NotImplementedError`. Use the Java/Scala Iceberg writer for production upserts.
|
|
227
|
+
|
|
228
|
+
### Writing to a Hudi table from Python
|
|
229
|
+
|
|
230
|
+
`HudiTable` is read-only. All write methods raise `NotImplementedError`. Writes require the `hudi-spark-bundle` Spark writer.
|
|
231
|
+
|
|
232
|
+
### Instantiating format objects inside `process()`
|
|
233
|
+
|
|
234
|
+
`FileFormat` objects are stateless but they import their vendor library at construction time. Build them once at pipeline-wiring time, not inside knot `process()` methods.
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Constraints and gotchas
|
|
239
|
+
|
|
240
|
+
- **`DataBatch.rows` is a `tuple`** (immutable). Never mutate it. Produce a new batch with `batch.with_rows(new_rows)` — this preserves `schema` and `source_uri`.
|
|
241
|
+
- **`DirectorySource` with `concatenate=True` loses per-file lineage.** The `source_uri` collapses to `{store}://{prefix}*`. Use `concatenate=False` when provenance matters.
|
|
242
|
+
- **`Aggregate` skips `None` values.** Empty groups yield `None` for mean/min/max/first/last and `0` for count/count_distinct.
|
|
243
|
+
- **Tier-3-stream requires Python < 3.14** until Pathway and Bytewax catch up to the new Python release.
|
|
244
|
+
- **`pirn[hudi]` is a no-op marker extra** — there is no stable vendor SDK on PyPI. The read path relies only on `pyarrow` (included with `pirn[data]`).
|
|
245
|
+
- **`CompressedFileFormat` codec availability varies.** `gzip` and `bzip2` use stdlib and are always available. `zstd`, `snappy`, and `lz4` each require their own extra.
|
|
246
|
+
- **`ArchiveFileFormat` is always non-streaming** — the full archive must be buffered. Do not use it for large files where incremental streaming matters.
|
|
247
|
+
- **`FileSink.process()` returns the destination `key` string**, not a `DataBatch`. Downstream knots that expect a `DataBatch` must not follow a `FileSink` directly.
|
|
248
|
+
- **Lakehouse vendor SDKs load lazily.** Import errors for missing extras are raised on first use, not at module import time.
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## Quick reference
|
|
253
|
+
|
|
254
|
+
| Task | Tier | Knot / Class |
|
|
255
|
+
|------|------|-------------|
|
|
256
|
+
| Read a single file | 1 | `FileSource` |
|
|
257
|
+
| Read all files under a prefix | 1 | `DirectorySource` |
|
|
258
|
+
| Write a file | 1 | `FileSink` |
|
|
259
|
+
| Filter rows (Python callable) | 1 | `Filter` |
|
|
260
|
+
| Rename columns | 1 | `Rename` |
|
|
261
|
+
| Type coercion | 1 | `Cast` |
|
|
262
|
+
| String normalisation | 1 | `Normalize` |
|
|
263
|
+
| Group-by aggregation | 1 | `Aggregate` |
|
|
264
|
+
| Deduplication | 1 | `Deduplicate` |
|
|
265
|
+
| Vectorised filter (Polars expr) | 2 | `PolarsFilter` |
|
|
266
|
+
| Join two frames | 2 | `PolarsJoin` / `DuckDbJoin` |
|
|
267
|
+
| Window functions | 2 | `PolarsWindowCalc` |
|
|
268
|
+
| Pivot / unpivot | 2 | `PolarsPivot` / `PolarsUnpivot` |
|
|
269
|
+
| DataBatch → Polars | bridge | `DataBatchToPolars` |
|
|
270
|
+
| Polars → DataBatch | bridge | `PolarsToDataBatch` |
|
|
271
|
+
| Delta Lake read/write/merge | lakehouse | `DeltaTable` + `LakehouseTableSource` |
|
|
272
|
+
| Iceberg read/write | lakehouse | `IcebergTable` |
|
|
273
|
+
| Hudi read (only) | lakehouse | `HudiTable` |
|
|
274
|
+
| Bronze raw ingest | specialisation | `BronzeRawIngest` |
|
|
275
|
+
| Silver clean transform | specialisation | `SilverCleanTransform` |
|
|
276
|
+
| Gold aggregation | specialisation | `GoldAggregation` |
|
|
277
|
+
| Incremental watermark ingest | specialisation | `WatermarkIncrementalExtract` |
|
|
278
|
+
| SCD Type 2 history | specialisation | `ScdType2` |
|
|
279
|
+
| SCD Type 3 previous value | specialisation | `ScdType3PreviousValue` |
|
|
280
|
+
| SCD Type 4 mini-dimension | specialisation | `ScdType4MiniDimension` |
|
|
281
|
+
| SCD Type 6 hybrid (1+2+3) | specialisation | `ScdType6Hybrid` |
|
|
282
|
+
| Debezium CDC apply | specialisation | `CdcDebezium` |
|
|
283
|
+
| Date dimension generate | specialisation | `DateDimGenerator` |
|
|
284
|
+
| Dimension table load | specialisation | `DimTableLoad` |
|
|
285
|
+
| Fact table load | specialisation | `FactTableLoad` |
|
|
286
|
+
| Data Vault hub load | specialisation | `DataVaultHubLoader` |
|
|
287
|
+
| Data Vault satellite load | specialisation | `DataVaultSatelliteLoader` |
|
|
288
|
+
| Data Vault PIT table | specialisation | `DataVaultPITTableBuilder` |
|
|
289
|
+
| Snapshot append | specialisation | `SnapshotTableAppender` |
|
|
290
|
+
| dbt-style snapshot | specialisation | `DbtStyleSnapshot` |
|
|
291
|
+
| Merge upsert | specialisation | `MergeUpsert` |
|
|
292
|
+
| Row count anomaly | specialisation | `RowCountAnomalyDetector` |
|
|
293
|
+
| Null rate monitor | specialisation | `NullRateMonitor` |
|
|
294
|
+
| Referential integrity check | specialisation | `ReferentialIntegrityCheck` |
|
|
295
|
+
| Reconciliation diff | specialisation | `ReconciliationDiff` |
|
|
296
|
+
| Exact deduplication | specialisation | `ExactDeduplicator` |
|
|
297
|
+
| Fuzzy deduplication | specialisation | `FuzzyDeduplicator` |
|
|
298
|
+
| Time series resample | specialisation | `TimeSeriesResampler` |
|
|
299
|
+
| Rolling window aggregation | specialisation | `RollingWindowAggregator` |
|
|
300
|
+
| Sessionization | specialisation | `SessionizationKnot` |
|
|
301
|
+
| Derived column calculation | specialisation | `DerivedColumnCalculator` |
|
|
302
|
+
| Binning / bucketing | specialisation | `BinningKnot` |
|
|
303
|
+
| Lookup enrichment | specialisation | `LookupEnricher` |
|
|
304
|
+
| dbt staging model | specialisation | `StagingModelKnot` |
|
|
305
|
+
| dbt mart model | specialisation | `MartModelKnot` |
|
|
306
|
+
| Metric layer aggregation | specialisation | `MetricLayerAggregator` |
|
|
307
|
+
| Schema version migration | specialisation | `SchemaVersionMigrator` |
|
|
308
|
+
| Backfill runner | specialisation | `BackfillRunner` |
|
|
309
|
+
| Column lineage tracking | specialisation | `ColumnLineageTracker` |
|
|
310
|
+
| Compressed format | wrapper | `CompressedFileFormat(inner, codec=...)` |
|
|
311
|
+
| Multi-file archive | wrapper | `ArchiveFileFormat(inner, archive_type=...)` |
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
*See also: [pirn AGENTIC_USE.md](../../AGENTIC_USE.md)*
|
pirn_data/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Data Engineering / Analytics Engineering knot library.
|
|
2
|
+
|
|
3
|
+
Install with::
|
|
4
|
+
|
|
5
|
+
pip install 'pirn-data[data]'
|
|
6
|
+
|
|
7
|
+
Note: ``data_schema``, ``data_batch``, ``quality_check``, and
|
|
8
|
+
``quality_report`` are pure-Python contracts and remain importable in
|
|
9
|
+
minimal environments. Modules that touch pandas / pyarrow (sources,
|
|
10
|
+
transforms, sinks) import those dependencies lazily, so the
|
|
11
|
+
missing-dependency error fires only when those modules are imported.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
from sweet_tea.registry import Registry
|
|
17
|
+
from sweet_tea.sweet_tea_warning import SweetTeaWarning
|
|
18
|
+
|
|
19
|
+
# Fill the registry inside a ``catch_warnings`` scope so that the "ignore"
# filter for SweetTeaWarning is dropped again as soon as the block exits and
# the process-wide warning configuration is left as the caller set it.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore", category=SweetTeaWarning)
    Registry.fill_registry(module=__name__, library="pirn")

# Nothing is re-exported from the package root.
__all__: list[str] = []
|
pirn_data/data_batch.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""A batch of rows flowing through the data pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pirn.core.pirn_opaque_value import PirnOpaqueValue
|
|
11
|
+
|
|
12
|
+
from pirn_data.data_schema import DataSchema
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class DataBatch(PirnOpaqueValue):
    """Tabular batch of rows.

    Each row is a ``dict[str, Any]`` keyed by column name.  Heavy frames
    (Pandas / Arrow / Polars) wrap into ``DataBatch`` via thin adapters in
    the sources / transforms modules.

    Attributes
    ----------
    rows:
        Sequence of row dicts.
    schema:
        Schema the rows conform to.
    source_uri:
        Where the rows came from (DSN, file path, API endpoint).  DSN-style
        values must be passed through
        :class:`pirn.connectors.dsn_scrubber.DsnScrubber` before
        assignment to avoid leaking credentials into lineage records.
    fetched_at:
        UTC instant the data was materialised.  Defaults to the moment the
        batch object is constructed.
    """

    rows: tuple[Mapping[str, Any], ...] = ()
    schema: DataSchema = field(default_factory=DataSchema)
    source_uri: str = ""
    fetched_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    @property
    def row_count(self) -> int:
        """Number of rows held by this batch."""
        return len(self.rows)

    def with_rows(self, rows: tuple[Mapping[str, Any], ...]) -> DataBatch:
        """Copy with ``rows`` replaced; schema/uri/fetched_at preserved."""
        return DataBatch(
            rows=rows,
            schema=self.schema,
            source_uri=self.source_uri,
            fetched_at=self.fetched_at,
        )

    def with_schema(self, schema: DataSchema) -> DataBatch:
        """Copy with ``schema`` replaced; rows/uri/fetched_at preserved."""
        return DataBatch(
            rows=self.rows,
            schema=schema,
            source_uri=self.source_uri,
            fetched_at=self.fetched_at,
        )

    def _pirn_audit_dict(self) -> dict[str, Any]:
        """Flatten to a primitive dict for pydantic serialisation.

        Pirn IO validation only needs ``isinstance(value, DataBatch)``;
        content-addressing flattens the rows into this stable summary.
        The contained :class:`DataSchema` (with its
        ``Mapping[str, type]`` columns) is intentionally omitted —
        ``DataSchema`` is opaque-serialised separately when needed.

        Returns
        -------
        dict
            ``row_count``, ``source_uri``, ISO-8601 ``fetched_at`` and a
            list of plain-``dict`` copies of the rows.
        """
        return {
            "row_count": self.row_count,
            "source_uri": self.source_uri,
            "fetched_at": self.fetched_at.isoformat(),
            "rows": [dict(r) for r in self.rows],
        }

    def __pirn_canonical__(self) -> dict[str, Any]:
        """Sanctioned canonical form for :func:`pirn.core.hashing.content_hash`.

        ``schema.columns`` is flattened to ``{name: type-name}`` so the
        otherwise-opaque :class:`type` values do not blow up the hasher;
        the result is JSON-serialisable provided the row values themselves
        are.  The audit dict deliberately omits the schema (pydantic IO
        already validates the boundary); content-addressing keeps it.

        Note that ``source_uri`` and ``fetched_at`` are part of the
        canonical form, so two batches hash equally only when those match
        as well as the rows and column types.
        """
        return {
            "row_count": self.row_count,
            "source_uri": self.source_uri,
            "fetched_at": self.fetched_at.isoformat(),
            "rows": [dict(r) for r in self.rows],
            "schema_columns": {
                # Column entries are not validated to be real classes; things
                # like ``int | None`` carry no ``__name__`` on every supported
                # interpreter, so fall back to ``repr`` instead of raising
                # AttributeError in the middle of hashing.
                name: getattr(column_type, "__name__", repr(column_type))
                for name, column_type in self.schema.columns.items()
            },
        }
|
pirn_data/data_profile.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""``DataProfile`` and ``ColumnProfile`` — descriptive statistics for a
|
|
2
|
+
:class:`DataBatch`.
|
|
3
|
+
|
|
4
|
+
Emitted by :class:`pirn_data.quality.profiler.Profiler`. A profile
|
|
5
|
+
is observation, not policy: every field describes the input batch, no
|
|
6
|
+
field carries a pass/fail verdict. Compose with a downstream knot if you
|
|
7
|
+
want thresholds enforced (or use :class:`NullRateCheck` /
|
|
8
|
+
:class:`RowCountCheck` directly).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import UTC, datetime
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class ColumnProfile:
    """Immutable record of the statistics gathered for one column.

    The first four fields are required; the value-level statistics are
    optional and default to "not recorded" (``None`` / ``0``).  How each
    number is computed is decided by the producer of the profile, which is
    not part of this module.
    """

    # Identity of the column; ``DataProfile.column`` looks profiles up by it.
    name: str

    # Required counters.
    observed_count: int
    null_count: int
    distinct_count: int

    # Optional value statistics; ``None`` when nothing was recorded.
    min_value: Any | None = None
    max_value: Any | None = None

    # Optional "top value" and its count (presumably the most frequent
    # value and its number of occurrences; confirm against the profiler).
    top_value: Any | None = None
    top_value_count: int = 0
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
|
|
33
|
+
class DataProfile:
|
|
34
|
+
"""Aggregate profile of a :class:`DataBatch`."""
|
|
35
|
+
|
|
36
|
+
row_count: int
|
|
37
|
+
column_count: int
|
|
38
|
+
columns: tuple[ColumnProfile, ...] = ()
|
|
39
|
+
sampled_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
40
|
+
|
|
41
|
+
def column(self, name: str) -> ColumnProfile | None:
|
|
42
|
+
"""Return the per-column profile for ``name`` or ``None``."""
|
|
43
|
+
for c in self.columns:
|
|
44
|
+
if c.name == name:
|
|
45
|
+
return c
|
|
46
|
+
return None
|
pirn_data/data_schema.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Tabular schema declaration used across ``pirn_data``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pirn.core.pirn_opaque_value import PirnOpaqueValue
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class DataSchema(PirnOpaqueValue):
    """Declarative schema for a tabular :class:`DataBatch`.

    Attributes
    ----------
    columns:
        Mapping of column name → expected Python type.  Insertion order is
        the canonical column order.
    primary_keys:
        Subset of ``columns`` keys; non-empty for any sink that performs
        upsert or dedup operations.
    nullable:
        Subset of ``columns`` keys whose values may be ``None``.

    Raises
    ------
    ValueError
        On construction, if ``primary_keys`` or ``nullable`` names a column
        that is not a key of ``columns``.
    """

    columns: Mapping[str, type] = field(default_factory=dict)
    primary_keys: tuple[str, ...] = ()
    nullable: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        """Reject key / nullable declarations that name unknown columns."""
        unknown_pks = [k for k in self.primary_keys if k not in self.columns]
        if unknown_pks:
            raise ValueError(f"primary_keys reference unknown columns: {unknown_pks}")
        unknown_nullable = [k for k in self.nullable if k not in self.columns]
        if unknown_nullable:
            raise ValueError(f"nullable references unknown columns: {unknown_nullable}")

    @property
    def column_names(self) -> tuple[str, ...]:
        """Column names in canonical (insertion) order."""
        return tuple(self.columns)

    def is_nullable(self, column: str) -> bool:
        """Return True if ``column`` is permitted to hold ``None``.

        Names that are not listed in ``nullable`` (including names that are
        not columns at all) yield ``False``.
        """
        return column in self.nullable

    def with_columns(self, columns: Mapping[str, type]) -> DataSchema:
        """Return a new schema with the given columns merged in.

        Existing names are overwritten with the new type; new names are
        appended after the existing ones.  ``primary_keys`` and ``nullable``
        are carried over unchanged.
        """
        merged = dict(self.columns)
        merged.update(columns)
        return DataSchema(
            columns=merged,
            primary_keys=self.primary_keys,
            nullable=self.nullable,
        )

    def _column_type_names(self) -> dict[str, str]:
        """Map each column name to a printable name for its declared type.

        Column entries are not validated to be real classes, and objects
        such as ``int | None`` do not expose ``__name__`` on every supported
        interpreter.  Falling back to ``repr`` keeps serialisation and
        hashing from raising ``AttributeError`` while leaving the output for
        ordinary classes unchanged.
        """
        return {
            name: getattr(column_type, "__name__", repr(column_type))
            for name, column_type in self.columns.items()
        }

    def _pirn_audit_dict(self) -> dict[str, Any]:
        """Flatten to a primitive dict for pydantic serialisation.

        ``columns`` holds Python ``type`` objects which pydantic's
        default JSON serialiser can't dump; here we convert each type
        to its name.  Pirn IO validation just checks
        ``isinstance(value, DataSchema)``; content-addressing serialises
        via this stable summary.
        """
        return {
            "columns": self._column_type_names(),
            "primary_keys": list(self.primary_keys),
            "nullable": list(self.nullable),
        }

    def __pirn_canonical__(self) -> dict[str, Any]:
        """Sanctioned canonical form for :func:`pirn.core.hashing.content_hash`.

        Mirrors :meth:`_pirn_audit_dict` but is the explicit hook the
        hasher prefers.  Keeping both methods avoids forcing every
        existing pydantic-serialisation call site through the canonical
        path (and vice versa).
        """
        return {
            "columns": self._column_type_names(),
            "primary_keys": list(self.primary_keys),
            "nullable": list(self.nullable),
        }
|