pirn-data 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (257) hide show
  1. pirn_data/AGENTIC_USE.md +315 -0
  2. pirn_data/__init__.py +23 -0
  3. pirn_data/data_batch.py +99 -0
  4. pirn_data/data_profile.py +46 -0
  5. pirn_data/data_schema.py +85 -0
  6. pirn_data/frames/AGENTIC_USE.md +123 -0
  7. pirn_data/frames/__init__.py +0 -0
  8. pirn_data/frames/datafusion/__init__.py +0 -0
  9. pirn_data/frames/datafusion/bridges/__init__.py +0 -0
  10. pirn_data/frames/datafusion/bridges/data_batch_to_datafusion.py +85 -0
  11. pirn_data/frames/datafusion/bridges/datafusion_to_data_batch.py +66 -0
  12. pirn_data/frames/datafusion/datafusion_aggregate.py +97 -0
  13. pirn_data/frames/datafusion/datafusion_data_batch.py +85 -0
  14. pirn_data/frames/datafusion/datafusion_filter.py +122 -0
  15. pirn_data/frames/datafusion/datafusion_join.py +147 -0
  16. pirn_data/frames/datafusion/datafusion_session_context.py +35 -0
  17. pirn_data/frames/datafusion/datafusion_session_context_knot.py +54 -0
  18. pirn_data/frames/duckdb/__init__.py +0 -0
  19. pirn_data/frames/duckdb/bridges/__init__.py +0 -0
  20. pirn_data/frames/duckdb/bridges/data_batch_to_duckdb.py +122 -0
  21. pirn_data/frames/duckdb/bridges/duckdb_to_data_batch.py +51 -0
  22. pirn_data/frames/duckdb/duckdb_aggregate.py +122 -0
  23. pirn_data/frames/duckdb/duckdb_cast.py +128 -0
  24. pirn_data/frames/duckdb/duckdb_connection.py +36 -0
  25. pirn_data/frames/duckdb/duckdb_connection_knot.py +50 -0
  26. pirn_data/frames/duckdb/duckdb_data_batch.py +86 -0
  27. pirn_data/frames/duckdb/duckdb_deduplicate.py +112 -0
  28. pirn_data/frames/duckdb/duckdb_filter.py +97 -0
  29. pirn_data/frames/duckdb/duckdb_join.py +191 -0
  30. pirn_data/frames/duckdb/duckdb_rename.py +112 -0
  31. pirn_data/frames/pandas/__init__.py +0 -0
  32. pirn_data/frames/pandas/bridges/__init__.py +0 -0
  33. pirn_data/frames/pandas/bridges/data_batch_to_pandas.py +63 -0
  34. pirn_data/frames/pandas/bridges/pandas_to_data_batch.py +62 -0
  35. pirn_data/frames/pandas/pandas_aggregate.py +144 -0
  36. pirn_data/frames/pandas/pandas_cast.py +103 -0
  37. pirn_data/frames/pandas/pandas_data_batch.py +75 -0
  38. pirn_data/frames/pandas/pandas_deduplicate.py +74 -0
  39. pirn_data/frames/pandas/pandas_filter.py +79 -0
  40. pirn_data/frames/pandas/pandas_join.py +153 -0
  41. pirn_data/frames/pandas/pandas_rename.py +70 -0
  42. pirn_data/frames/polars/__init__.py +0 -0
  43. pirn_data/frames/polars/bridges/__init__.py +0 -0
  44. pirn_data/frames/polars/bridges/data_batch_to_polars.py +63 -0
  45. pirn_data/frames/polars/bridges/polars_to_data_batch.py +62 -0
  46. pirn_data/frames/polars/polars_aggregate.py +86 -0
  47. pirn_data/frames/polars/polars_cast.py +101 -0
  48. pirn_data/frames/polars/polars_data_batch.py +75 -0
  49. pirn_data/frames/polars/polars_deduplicate.py +69 -0
  50. pirn_data/frames/polars/polars_filter.py +77 -0
  51. pirn_data/frames/polars/polars_join.py +156 -0
  52. pirn_data/frames/polars/polars_pivot.py +127 -0
  53. pirn_data/frames/polars/polars_rename.py +70 -0
  54. pirn_data/frames/polars/polars_unpivot.py +113 -0
  55. pirn_data/frames/polars/polars_window_calc.py +81 -0
  56. pirn_data/frames/pyarrow/__init__.py +0 -0
  57. pirn_data/frames/pyarrow/bridges/__init__.py +0 -0
  58. pirn_data/frames/pyarrow/bridges/data_batch_to_pyarrow.py +53 -0
  59. pirn_data/frames/pyarrow/bridges/pyarrow_to_data_batch.py +47 -0
  60. pirn_data/frames/pyarrow/pyarrow_aggregate.py +136 -0
  61. pirn_data/frames/pyarrow/pyarrow_cast.py +112 -0
  62. pirn_data/frames/pyarrow/pyarrow_data_batch.py +76 -0
  63. pirn_data/frames/pyarrow/pyarrow_deduplicate.py +131 -0
  64. pirn_data/frames/pyarrow/pyarrow_filter.py +101 -0
  65. pirn_data/frames/pyarrow/pyarrow_join.py +160 -0
  66. pirn_data/frames/pyarrow/pyarrow_rename.py +76 -0
  67. pirn_data/identifier_validator.py +53 -0
  68. pirn_data/lakehouse/AGENTIC_USE.md +106 -0
  69. pirn_data/lakehouse/__init__.py +0 -0
  70. pirn_data/lakehouse/delta/__init__.py +0 -0
  71. pirn_data/lakehouse/delta/delta_table.py +258 -0
  72. pirn_data/lakehouse/delta/delta_table_config.py +43 -0
  73. pirn_data/lakehouse/hudi/__init__.py +0 -0
  74. pirn_data/lakehouse/hudi/hudi_table.py +213 -0
  75. pirn_data/lakehouse/hudi/hudi_table_config.py +45 -0
  76. pirn_data/lakehouse/iceberg/__init__.py +0 -0
  77. pirn_data/lakehouse/iceberg/iceberg_table.py +257 -0
  78. pirn_data/lakehouse/iceberg/iceberg_table_config.py +47 -0
  79. pirn_data/lakehouse/lakehouse_table.py +101 -0
  80. pirn_data/lakehouse/lakehouse_table_sink.py +95 -0
  81. pirn_data/lakehouse/lakehouse_table_source.py +106 -0
  82. pirn_data/lazy/AGENTIC_USE.md +101 -0
  83. pirn_data/lazy/__init__.py +0 -0
  84. pirn_data/lazy/dask/__init__.py +0 -0
  85. pirn_data/lazy/dask/dask_aggregate.py +141 -0
  86. pirn_data/lazy/dask/dask_compute.py +150 -0
  87. pirn_data/lazy/dask/dask_dataframe.py +67 -0
  88. pirn_data/lazy/dask/dask_execution_receipt.py +37 -0
  89. pirn_data/lazy/dask/dask_filter.py +72 -0
  90. pirn_data/lazy/dask/dask_join.py +125 -0
  91. pirn_data/lazy/dask/dask_source.py +118 -0
  92. pirn_data/lazy/ibis/__init__.py +0 -0
  93. pirn_data/lazy/ibis/ibis_connection.py +34 -0
  94. pirn_data/lazy/ibis/ibis_connection_knot.py +55 -0
  95. pirn_data/lazy/ibis/ibis_execution_receipt.py +43 -0
  96. pirn_data/lazy/ibis/ibis_filter.py +77 -0
  97. pirn_data/lazy/ibis/ibis_group_by_aggregate.py +120 -0
  98. pirn_data/lazy/ibis/ibis_join.py +144 -0
  99. pirn_data/lazy/ibis/ibis_source.py +115 -0
  100. pirn_data/lazy/ibis/ibis_table.py +81 -0
  101. pirn_data/lazy/ibis/ibis_to_table.py +146 -0
  102. pirn_data/lazy/ibis/ibis_window.py +93 -0
  103. pirn_data/lazy/ray/__init__.py +0 -0
  104. pirn_data/lazy/ray/ray_aggregate.py +128 -0
  105. pirn_data/lazy/ray/ray_compute.py +159 -0
  106. pirn_data/lazy/ray/ray_dataset.py +57 -0
  107. pirn_data/lazy/ray/ray_execution_receipt.py +38 -0
  108. pirn_data/lazy/ray/ray_filter.py +71 -0
  109. pirn_data/lazy/ray/ray_map.py +103 -0
  110. pirn_data/lazy/ray/ray_source.py +120 -0
  111. pirn_data/lazy/spark/__init__.py +0 -0
  112. pirn_data/lazy/spark/spark_aggregate.py +134 -0
  113. pirn_data/lazy/spark/spark_collect_sink.py +95 -0
  114. pirn_data/lazy/spark/spark_dataframe.py +71 -0
  115. pirn_data/lazy/spark/spark_execution_receipt.py +50 -0
  116. pirn_data/lazy/spark/spark_filter.py +67 -0
  117. pirn_data/lazy/spark/spark_join.py +160 -0
  118. pirn_data/lazy/spark/spark_source.py +136 -0
  119. pirn_data/lazy/spark/spark_write_sink.py +100 -0
  120. pirn_data/quality/__init__.py +0 -0
  121. pirn_data/quality/freshness_check.py +140 -0
  122. pirn_data/quality/null_rate_check.py +126 -0
  123. pirn_data/quality/profiler.py +144 -0
  124. pirn_data/quality/row_count_check.py +104 -0
  125. pirn_data/quality/schema_validator.py +160 -0
  126. pirn_data/quality_check.py +20 -0
  127. pirn_data/quality_report.py +31 -0
  128. pirn_data/sinks/__init__.py +0 -0
  129. pirn_data/sinks/file_sink.py +92 -0
  130. pirn_data/sources/__init__.py +0 -0
  131. pirn_data/sources/directory_source.py +126 -0
  132. pirn_data/sources/file_source.py +115 -0
  133. pirn_data/sources/sql_source.py +108 -0
  134. pirn_data/specializations/AGENTIC_USE.md +116 -0
  135. pirn_data/specializations/__init__.py +0 -0
  136. pirn_data/specializations/analytics_engineering/__init__.py +1 -0
  137. pirn_data/specializations/analytics_engineering/exposure_lineage_tag.py +100 -0
  138. pirn_data/specializations/analytics_engineering/intermediate_model_knot.py +141 -0
  139. pirn_data/specializations/analytics_engineering/mart_model_knot.py +124 -0
  140. pirn_data/specializations/analytics_engineering/metric_layer_aggregator.py +163 -0
  141. pirn_data/specializations/analytics_engineering/refresh_materialized_view.py +94 -0
  142. pirn_data/specializations/analytics_engineering/staging_model_knot.py +121 -0
  143. pirn_data/specializations/data_vault/__init__.py +0 -0
  144. pirn_data/specializations/data_vault/data_vault_bridge_table_builder.py +174 -0
  145. pirn_data/specializations/data_vault/data_vault_hub_loader.py +164 -0
  146. pirn_data/specializations/data_vault/data_vault_link_loader.py +167 -0
  147. pirn_data/specializations/data_vault/data_vault_pit_table_builder.py +182 -0
  148. pirn_data/specializations/data_vault/data_vault_satellite_loader.py +229 -0
  149. pirn_data/specializations/deduplication/__init__.py +1 -0
  150. pirn_data/specializations/deduplication/exact_deduplicator.py +83 -0
  151. pirn_data/specializations/deduplication/fuzzy_deduplicator.py +192 -0
  152. pirn_data/specializations/deduplication/probabilistic_linker.py +136 -0
  153. pirn_data/specializations/deduplication/windowed_deduplicator.py +90 -0
  154. pirn_data/specializations/dimensional/__init__.py +0 -0
  155. pirn_data/specializations/dimensional/bridge_table_builder.py +170 -0
  156. pirn_data/specializations/dimensional/date_dim_generator.py +148 -0
  157. pirn_data/specializations/dimensional/dim_table_load.py +277 -0
  158. pirn_data/specializations/dimensional/fact_table_load.py +186 -0
  159. pirn_data/specializations/feature_engineering/__init__.py +1 -0
  160. pirn_data/specializations/feature_engineering/binning_knot.py +129 -0
  161. pirn_data/specializations/feature_engineering/column_hasher.py +76 -0
  162. pirn_data/specializations/feature_engineering/date_part_extractor.py +105 -0
  163. pirn_data/specializations/feature_engineering/derived_column_calculator.py +159 -0
  164. pirn_data/specializations/feature_engineering/geo_enricher.py +141 -0
  165. pirn_data/specializations/feature_engineering/lookup_enricher.py +83 -0
  166. pirn_data/specializations/feature_engineering/string_normalizer.py +109 -0
  167. pirn_data/specializations/feature_engineering/text_token_counter.py +108 -0
  168. pirn_data/specializations/incremental/__init__.py +0 -0
  169. pirn_data/specializations/incremental/dbt_style_snapshot.py +228 -0
  170. pirn_data/specializations/incremental/delete_safe_sync.py +193 -0
  171. pirn_data/specializations/incremental/merge_upsert.py +143 -0
  172. pirn_data/specializations/incremental/partitioned_overwrite.py +115 -0
  173. pirn_data/specializations/incremental/snapshot_table_appender.py +112 -0
  174. pirn_data/specializations/ingestion/__init__.py +0 -0
  175. pirn_data/specializations/ingestion/append_only_ingest.py +92 -0
  176. pirn_data/specializations/ingestion/full_refresh_extract.py +111 -0
  177. pirn_data/specializations/ingestion/query_new_rows_knot.py +115 -0
  178. pirn_data/specializations/ingestion/read_high_water_mark_knot.py +84 -0
  179. pirn_data/specializations/ingestion/rows_behind_truncate_check_knot.py +40 -0
  180. pirn_data/specializations/ingestion/truncate_table_knot.py +64 -0
  181. pirn_data/specializations/ingestion/watermark_incremental_extract.py +133 -0
  182. pirn_data/specializations/medallion/__init__.py +0 -0
  183. pirn_data/specializations/medallion/bronze_raw_ingest.py +120 -0
  184. pirn_data/specializations/medallion/data_batch_to_tuples_knot.py +64 -0
  185. pirn_data/specializations/medallion/gold_aggregation.py +166 -0
  186. pirn_data/specializations/medallion/silver_clean_transform.py +170 -0
  187. pirn_data/specializations/medallion/stamp_bronze_metadata_knot.py +65 -0
  188. pirn_data/specializations/medallion/tuples_to_data_batch_knot.py +66 -0
  189. pirn_data/specializations/quality/__init__.py +0 -0
  190. pirn_data/specializations/quality/database_table_freshness_check.py +130 -0
  191. pirn_data/specializations/quality/null_rate_monitor.py +117 -0
  192. pirn_data/specializations/quality/reconciliation_diff.py +155 -0
  193. pirn_data/specializations/quality/referential_integrity_check.py +109 -0
  194. pirn_data/specializations/quality/row_count_anomaly_detector.py +140 -0
  195. pirn_data/specializations/quality/schema_evolution_detector.py +109 -0
  196. pirn_data/specializations/quality/statistical_profiler.py +183 -0
  197. pirn_data/specializations/scd/__init__.py +0 -0
  198. pirn_data/specializations/scd/cdc/__init__.py +0 -0
  199. pirn_data/specializations/scd/cdc/cdc_message_broker_knot.py +58 -0
  200. pirn_data/specializations/scd/cdc/debezium_source.py +176 -0
  201. pirn_data/specializations/scd/cdc/message_broker_connection.py +36 -0
  202. pirn_data/specializations/scd/cdc_debezium.py +238 -0
  203. pirn_data/specializations/scd/scd_type_1.py +147 -0
  204. pirn_data/specializations/scd/scd_type_1_merge_knot.py +146 -0
  205. pirn_data/specializations/scd/scd_type_1_overwrite.py +156 -0
  206. pirn_data/specializations/scd/scd_type_2.py +185 -0
  207. pirn_data/specializations/scd/scd_type_2_history.py +218 -0
  208. pirn_data/specializations/scd/scd_type_2_merge_knot.py +197 -0
  209. pirn_data/specializations/scd/scd_type_3_previous_value.py +196 -0
  210. pirn_data/specializations/scd/scd_type_4_mini_dimension.py +184 -0
  211. pirn_data/specializations/scd/scd_type_5_mini_dim_with_current.py +200 -0
  212. pirn_data/specializations/scd/scd_type_6_hybrid.py +270 -0
  213. pirn_data/specializations/scd/scd_type_7.py +200 -0
  214. pirn_data/specializations/scd/scd_type_7_hybrid.py +250 -0
  215. pirn_data/specializations/scd/scd_type_7_merge_knot.py +225 -0
  216. pirn_data/specializations/schema_migration/__init__.py +1 -0
  217. pirn_data/specializations/schema_migration/backfill_runner.py +133 -0
  218. pirn_data/specializations/schema_migration/column_lineage_tracker.py +127 -0
  219. pirn_data/specializations/schema_migration/schema_version_migrator.py +130 -0
  220. pirn_data/specializations/timeseries/__init__.py +1 -0
  221. pirn_data/specializations/timeseries/cohort_aggregator.py +153 -0
  222. pirn_data/specializations/timeseries/funnel_analysis_knot.py +123 -0
  223. pirn_data/specializations/timeseries/late_arriving_event_handler.py +161 -0
  224. pirn_data/specializations/timeseries/rolling_window_aggregator.py +125 -0
  225. pirn_data/specializations/timeseries/sessionization_knot.py +114 -0
  226. pirn_data/specializations/timeseries/time_series_resampler.py +131 -0
  227. pirn_data/specialized/__init__.py +0 -0
  228. pirn_data/specialized/eland/__init__.py +0 -0
  229. pirn_data/specialized/eland/eland_dataframe.py +56 -0
  230. pirn_data/specialized/eland/eland_filter.py +91 -0
  231. pirn_data/specialized/eland/eland_source.py +81 -0
  232. pirn_data/specialized/eland/eland_to_pandas.py +51 -0
  233. pirn_data/specialized/eland/elasticsearch_connection.py +35 -0
  234. pirn_data/specialized/eland/elasticsearch_connection_knot.py +59 -0
  235. pirn_data/specialized/lance/__init__.py +0 -0
  236. pirn_data/specialized/lance/arrow_to_lance_sink.py +77 -0
  237. pirn_data/specialized/lance/lance_dataset.py +53 -0
  238. pirn_data/specialized/lance/lance_source.py +63 -0
  239. pirn_data/specialized/lance/lance_to_arrow.py +46 -0
  240. pirn_data/transforms/__init__.py +0 -0
  241. pirn_data/transforms/aggregate.py +197 -0
  242. pirn_data/transforms/aggregate_spec.py +43 -0
  243. pirn_data/transforms/cast.py +120 -0
  244. pirn_data/transforms/deduplicate.py +111 -0
  245. pirn_data/transforms/filter.py +69 -0
  246. pirn_data/transforms/normalize.py +132 -0
  247. pirn_data/transforms/normalize_column_rule.py +36 -0
  248. pirn_data/transforms/rename.py +105 -0
  249. pirn_data/validation/__init__.py +0 -0
  250. pirn_data/validation/great_expectations/__init__.py +0 -0
  251. pirn_data/validation/great_expectations/great_expectations_pandas_validator.py +229 -0
  252. pirn_data/validation/pandera/__init__.py +0 -0
  253. pirn_data/validation/pandera/pandera_pandas_validator.py +162 -0
  254. pirn_data/validation/pandera/pandera_polars_validator.py +164 -0
  255. pirn_data-0.4.0.dist-info/METADATA +55 -0
  256. pirn_data-0.4.0.dist-info/RECORD +257 -0
  257. pirn_data-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,315 @@
1
+ # AGENTIC_USE — pirn_data
2
+
3
+ Provides a tiered, engine-agnostic layer for reading, transforming, and writing structured data — it does NOT include API connectors, message-queue consumers, or ML training loops (those are separate domains).
4
+
5
+ ---
6
+
7
+ ## Mental model
8
+
9
+ pirn is the **orchestrator**; the engines do the work. The data domain is stratified into tiers, each independently opt-in:
10
+
11
+ | Tier | Label | Engines | When to use |
12
+ |------|-------|---------|-------------|
13
+ | **1** | Dict / DataBatch | Pure Python | Always available. Use for < ~100 k rows, glue logic, or environments where no heavy deps are allowed. Every record is a `dict`; the exchange currency is `DataBatch`. |
14
+ | **2** | Native frames (CPU) | **Polars** (preferred), DuckDB, pandas+PyArrow, DataFusion | Fits comfortably in RAM. Need fast vectorised ops, joins, window functions, pivots. Polars is the default; reach for DuckDB when SQL is more natural or when you need concurrent read-only queries. |
15
+ | **2-GPU** | Native frames (GPU) | cuDF | CUDA cluster only. Drop-in Polars-compatible when `cudf-cu12` is installed. |
16
+ | **2.5** | Out-of-core | Modin | Data exceeds RAM, single machine. Pandas-compatible API chunked on disk. |
17
+ | **3** | Lazy / push-down | **Ibis** (preferred), Spark, Dask, Ray Data | Data doesn't fit on one machine, or lives in a warehouse already. Ibis first — it targets many backends without a Spark cluster. Spark/Dask/Ray when distributed compute is required. |
18
+ | **3-stream** | Streaming dataflow | Pathway, Bytewax | Continuous / low-latency event streams. Requires Python < 3.14 until upstream catches up. |
19
+ | **4** | Specialised | Lance (vector), Eland (Elasticsearch) | Domain-specific columnar layouts; not general-purpose. |
20
+
21
+ **Tier-1 (`DataBatch`) is always installed with `pirn[data]`.** Tiers 2–4 are independent extras that do not pull each other in.
22
+
23
+ ---
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install pirn[data] # Tier 1 only — DataBatch, sources, sinks, transforms (no heavy deps)
29
+ pip install pirn[all-frames] # Tier 2 single-machine engines (Polars, DuckDB, pandas, PyArrow, DataFusion)
30
+ pip install pirn[all-lazy] # Tier 3 push-down engines (Ibis, Spark, Dask, Ray Data)
31
+ ```
32
+
33
+ Specialised extras (one at a time as needed):
34
+
35
+ ```bash
36
+ pip install pirn[polars] # Polars only
37
+ pip install pirn[delta] # Delta Lake lakehouse adapter
38
+ pip install pirn[iceberg] # Apache Iceberg adapter
39
+ pip install pirn[health] # DICOM, HL7, FHIR, NIfTI, EDF, BIDS, etc.
40
+ pip install pirn[genomics] # FASTA, FASTQ, VCF, BAM, CRAM, SAM
41
+ pip install pirn[oilgas] # SEG-Y, DLIS, LAS, WITSML
42
+ ```
43
+
44
+ ---
45
+
46
+ ## Source map
47
+
48
+ ```
49
+ pirn_data/
50
+ ├── data_batch.py # Tier-1 exchange type: immutable tuple of dicts
51
+ ├── data_schema.py # Optional schema metadata attached to DataBatch
52
+ ├── data_profile.py # Statistical profile of a DataBatch
53
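+ ├── quality_check.py # Pure-Python quality-check contract (top-level module)
54
+ ├── quality_report.py # Pure-Python quality-report contract (top-level module)
55
+ ├── quality/ # Tier-1 quality knots: freshness_check.py, null_rate_check.py, row_count_check.py, schema_validator.py, profiler.py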
56
+ ├── sources/
57
+ │ ├── file_source.py # Single-file source (ObjectStore + FileFormat)
58
+ │ └── directory_source.py # Prefix glob → one or many DataBatches
59
+ ├── sinks/
60
+ │ └── file_sink.py # Encode + write DataBatch to ObjectStore
61
+ ├── transforms/ # Tier-1 transforms (pure Python, no extras)
62
+ │ ├── filter.py # Row predicate (Python callable)
63
+ │ ├── rename.py # Column rename map
64
+ │ ├── cast.py # Type coercion
65
+ │ ├── normalize.py # String cleanup rules
66
+ │ ├── aggregate.py # Group-by + aggregations
67
+ │ └── deduplicate.py # Row deduplication
68
+ ├── frames/ # Tier-2 engine-specific knots
69
+ │ ├── polars/ # PolarsFilter, PolarsJoin, PolarsAggregate, etc.
70
+ │ │ └── bridges/ # DataBatch ↔ PolarsDataBatch conversions
71
+ │ ├── duckdb/ # DuckDB equivalents
72
+ │ ├── pandas/
73
+ │ ├── pyarrow/
74
+ │ └── datafusion/
75
+ ├── lazy/ # Tier-3 lazy/push-down knots
76
+ │ ├── ibis/
77
+ │ ├── spark/
78
+ │ ├── dask/
79
+ │ └── ray/
80
+ ├── lakehouse/ # Lakehouse table adapters
81
+ │ ├── delta/ # DeltaTable (full CRUD + merge + time-travel)
82
+ │ ├── iceberg/ # IcebergTable (read/write/time-travel; merge not yet available in pyiceberg)
83
+ │ └── hudi/ # HudiTable (read-only; writes require Spark writer)
84
+ ├── specializations/ # Pre-wired high-level patterns ← specializations
85
+ │ ├── ingestion/ # AppendOnlyIngest, FullRefreshExtract, WatermarkIncrementalExtract ← specializations
86
+ │ ├── medallion/ # BronzeRawIngest, SilverCleanTransform, GoldAggregation ← specializations
87
+ │ ├── scd/ # ScdType1/2/3/4/5/6/7, CdcDebezium, DebeziumSource ← specializations
88
+ │ ├── dimensional/ # DateDimGenerator, DimTableLoad, FactTableLoad, BridgeTableBuilder ← specializations
89
+ │ ├── data_vault/ # DataVaultHubLoader, DataVaultLinkLoader, DataVaultSatelliteLoader, DataVaultPITTableBuilder, DataVaultBridgeTableBuilder ← specializations
90
+ │ ├── incremental/ # SnapshotTableAppender, DbtStyleSnapshot, MergeUpsert, DeleteSafeSync, PartitionedOverwrite ← specializations
91
+ │ ├── quality/ # RowCountAnomalyDetector, NullRateMonitor, SchemaEvolutionDetector, FreshnessCheck, ReferentialIntegrityCheck, ReconciliationDiff, StatisticalProfiler ← specializations
92
+ │ ├── deduplication/ # ExactDeduplicator, WindowedDeduplicator, FuzzyDeduplicator, ProbabilisticLinker ← specializations
93
+ │ ├── timeseries/ # TimeSeriesResampler, RollingWindowAggregator, SessionizationKnot, FunnelAnalysisKnot, CohortAggregator, LateArrivingEventHandler ← specializations
94
+ │ ├── feature_engineering/ # DerivedColumnCalculator, ColumnHasher, BinningKnot, StringNormalizer, DatePartExtractor, LookupEnricher, GeoEnricher, TextTokenCounter ← specializations
95
+ │ ├── analytics_engineering/ # StagingModelKnot, IntermediateModelKnot, MartModelKnot, RefreshMaterializedView, MetricLayerAggregator, ExposureLineageTag ← specializations
96
+ │ └── schema_migration/ # BackfillRunner, SchemaVersionMigrator, ColumnLineageTracker ← specializations
97
+ └── specialized/ # Tier-4 specialised adapters (Lance, Eland)
98
+ ```
99
+
100
+ File formats and object stores live under `pirn/domains/connectors/` (imported as `pirn.connectors.file_formats` and `pirn.connectors.object_stores`), not here.
101
+
102
+ ---
103
+
104
+ ## Tier selection guide
105
+
106
+ | Data size | Latency need | Infrastructure | Recommended tier |
107
+ |-----------|-------------|----------------|-----------------|
108
+ | < 100 k rows | Any | Any | **Tier 1** — zero deps, fast enough |
109
+ | Fits in RAM (up to ~10 GB) | Batch | Single machine | **Tier 2 Polars** |
110
+ | Fits in RAM, SQL-heavy | Batch | Single machine | **Tier 2 DuckDB** |
111
+ | Exceeds RAM, single machine | Batch | Single machine | **Tier 2.5 Modin** |
112
+ | Warehouse-scale, existing backend | Batch | Any (warehouse/cluster) | **Tier 3 Ibis** |
113
+ | Distributed compute required | Batch | Spark / Dask / Ray cluster | **Tier 3 Spark/Dask/Ray** |
114
+ | Continuous / event-driven | Sub-second | Stream processor | **Tier 3-stream** |
115
+ | Vector similarity search | Any | Any | **Tier 4 Lance** |
116
+
117
+ ---
118
+
119
+ ## Canonical patterns
120
+
121
+ ### Tier-1 dict batch pipeline
122
+
123
+ ```python
124
+ from pirn.core.knot_config import KnotConfig
125
+ from pirn.core.knot_factory import knot
126
+ from pirn.core.parameter import Parameter
127
+ from pirn.core.run_request import RunRequest
128
+ from pirn.tapestry import Tapestry
129
+ from pirn_data.sources.file_source import FileSource
130
+ from pirn_data.transforms.filter import Filter
131
+ from pirn_data.transforms.aggregate import Aggregate
132
+ from pirn_data.transforms.aggregate_spec import AggregateSpec
133
+ from pirn_data.sinks.file_sink import FileSink
134
+ from pirn.connectors.file_formats.csv_format import CsvFormat
135
+ from pirn.connectors.file_formats.parquet_format import ParquetFormat
136
+ from pirn.connectors.object_stores.local_object_store import LocalObjectStore
137
+
138
+ with Tapestry() as t:
139
+ store = LocalObjectStore(root="/data")
140
+ source = FileSource(
141
+ store=store,
142
+ format=CsvFormat(),
143
+ key="input/sales.csv",
144
+ _config=KnotConfig(id="source"),
145
+ )
146
+ active = Filter(
147
+ batch=source,
148
+ predicate=lambda row: row["status"] == "active",
149
+ _config=KnotConfig(id="active_only"),
150
+ )
151
+ summary = Aggregate(
152
+ batch=active,
153
+ by=["region"],
154
+ aggs={"revenue": AggregateSpec(op="sum", column="revenue")},
155
+ _config=KnotConfig(id="by_region"),
156
+ )
157
+ FileSink(
158
+ batch=summary,
159
+ store=store,
160
+ format=ParquetFormat(compression="zstd"),
161
+ key="output/region_revenue.parquet",
162
+ _config=KnotConfig(id="sink"),
163
+ )
164
+
165
+ result = await t.run(RunRequest(parameters={}))
166
+ ```
167
+
168
+ ### Tier-2 Polars pipeline
169
+
170
+ ```python
171
+ import polars as pl
172
+ from pirn_data.frames.polars.polars_data_batch import PolarsDataBatch
173
+ from pirn_data.frames.polars.polars_filter import PolarsFilter
174
+ from pirn_data.frames.polars.polars_join import PolarsJoin
175
+ from pirn_data.frames.polars.polars_aggregate import PolarsAggregate
176
+ from pirn_data.frames.polars.bridges.data_batch_to_polars import DataBatchToPolars
177
+ from pirn_data.frames.polars.bridges.polars_to_data_batch import PolarsToDataBatch
178
+
179
+ # Promote a Tier-1 source to a Polars frame
180
+ polars_batch = DataBatchToPolars(
181
+ batch=source,
182
+ _config=KnotConfig(id="to_polars"),
183
+ )
184
+ filtered = PolarsFilter(
185
+ batch=polars_batch,
186
+ expression=pl.col("region") == "EU",
187
+ _config=KnotConfig(id="eu_only"),
188
+ )
189
+ joined = PolarsJoin(
190
+ left=filtered,
191
+ right=ref_batch,
192
+ on="region",
193
+ how="left",
194
+ _config=KnotConfig(id="join_ref"),
195
+ )
196
+ # Demote back to Tier-1 DataBatch if downstream knots expect it
197
+ result_batch = PolarsToDataBatch(
198
+ batch=joined,
199
+ _config=KnotConfig(id="to_data_batch"),
200
+ )
201
+ ```
202
+
203
+ ---
204
+
205
+ ## Anti-patterns
206
+
207
+ ### Reaching for Tier 3 when data fits in memory
208
+
209
+ Ibis/Spark add serialisation overhead and cluster setup cost. If the dataset fits in RAM, use Tier-2 Polars. Reserve Tier-3 for warehouse-resident data or genuinely distributed workloads.
210
+
211
+ ### Mixing tier knots without conversion bridges
212
+
213
+ Passing a `PolarsDataBatch` directly into a Tier-1 `Filter` (or vice versa) will fail at runtime because the knots expect different input types. Always interpose the bridge knots:
214
+
215
+ - `DataBatchToPolars` — `DataBatch` → `PolarsDataBatch`
216
+ - `PolarsToDataBatch` — `PolarsDataBatch` → `DataBatch`
217
+
218
+ Equivalent bridges exist for the other Tier-2 engines under `frames/<engine>/bridges/` (DuckDB, pandas, PyArrow, DataFusion).
219
+
220
+ ### Using Tier-1 `Aggregate` for joins
221
+
222
+ Tier-1 has no `Join` knot — a naive Python-level nested-loop join over two tuples of row dicts would be O(n·m) without indexes. If you need joins, promote to Tier 2 first with a bridge knot, then use `PolarsJoin` or `DuckDbJoin`.
223
+
224
+ ### Calling `IcebergTable.merge()` in production
225
+
226
+ As of mid-2026, `pyiceberg`'s Python writer does not implement merge. The method raises `NotImplementedError`. Use the Java/Scala Iceberg writer for production upserts.
227
+
228
+ ### Writing to a Hudi table from Python
229
+
230
+ `HudiTable` is read-only. All write methods raise `NotImplementedError`. Writes require the `hudi-spark-bundle` Spark writer.
231
+
232
+ ### Instantiating format objects inside `process()`
233
+
234
+ `FileFormat` objects are stateless but they import their vendor library at construction time. Build them once at pipeline-wiring time, not inside knot `process()` methods.
235
+
236
+ ---
237
+
238
+ ## Constraints and gotchas
239
+
240
+ - **`DataBatch.rows` is a `tuple`** (immutable). Never mutate it. Produce a new batch with `batch.with_rows(new_rows)` — this preserves `schema` and `source_uri`.
241
+ - **`DirectorySource` with `concatenate=True` loses per-file lineage.** The `source_uri` collapses to `{store}://{prefix}*`. Use `concatenate=False` when provenance matters.
242
+ - **`Aggregate` skips `None` values.** Empty groups yield `None` for mean/min/max/first/last and `0` for count/count_distinct.
243
+ - **Tier-3-stream requires Python < 3.14** until Pathway and Bytewax catch up to the new Python release.
244
+ - **`pirn[hudi]` is a no-op marker extra** — there is no stable vendor SDK on PyPI. The read path relies only on `pyarrow` (included with `pirn[data]`).
245
+ - **`CompressedFileFormat` codec availability varies.** `gzip` and `bzip2` use stdlib and are always available. `zstd`, `snappy`, and `lz4` each require their own extra.
246
+ - **`ArchiveFileFormat` is always non-streaming** — the full archive must be buffered. Do not use it for large files where incremental streaming matters.
247
+ - **`FileSink.process()` returns the destination `key` string**, not a `DataBatch`. Downstream knots that expect a `DataBatch` must not follow a `FileSink` directly.
248
+ - **Lakehouse vendor SDKs load lazily.** Import errors for missing extras are raised on first use, not at module import time.
249
+
250
+ ---
251
+
252
+ ## Quick reference
253
+
254
+ | Task | Tier | Knot / Class |
255
+ |------|------|-------------|
256
+ | Read a single file | 1 | `FileSource` |
257
+ | Read all files under a prefix | 1 | `DirectorySource` |
258
+ | Write a file | 1 | `FileSink` |
259
+ | Filter rows (Python callable) | 1 | `Filter` |
260
+ | Rename columns | 1 | `Rename` |
261
+ | Type coercion | 1 | `Cast` |
262
+ | String normalisation | 1 | `Normalize` |
263
+ | Group-by aggregation | 1 | `Aggregate` |
264
+ | Deduplication | 1 | `Deduplicate` |
265
+ | Vectorised filter (Polars expr) | 2 | `PolarsFilter` |
266
+ | Join two frames | 2 | `PolarsJoin` / `DuckDbJoin` |
267
+ | Window functions | 2 | `PolarsWindowCalc` |
268
+ | Pivot / unpivot | 2 | `PolarsPivot` / `PolarsUnpivot` |
269
+ | DataBatch → Polars | bridge | `DataBatchToPolars` |
270
+ | Polars → DataBatch | bridge | `PolarsToDataBatch` |
271
+ | Delta Lake read/write/merge | lakehouse | `DeltaTable` + `LakehouseTableSource` (read) / `LakehouseTableSink` (write) |
272
+ | Iceberg read/write | lakehouse | `IcebergTable` |
273
+ | Hudi read (only) | lakehouse | `HudiTable` |
274
+ | Bronze raw ingest | specialisation | `BronzeRawIngest` |
275
+ | Silver clean transform | specialisation | `SilverCleanTransform` |
276
+ | Gold aggregation | specialisation | `GoldAggregation` |
277
+ | Incremental watermark ingest | specialisation | `WatermarkIncrementalExtract` |
278
+ | SCD Type 2 history | specialisation | `ScdType2` |
279
+ | SCD Type 3 previous value | specialisation | `ScdType3PreviousValue` |
280
+ | SCD Type 4 mini-dimension | specialisation | `ScdType4MiniDimension` |
281
+ | SCD Type 6 hybrid (1+2+3) | specialisation | `ScdType6Hybrid` |
282
+ | Debezium CDC apply | specialisation | `CdcDebezium` |
283
+ | Date dimension generate | specialisation | `DateDimGenerator` |
284
+ | Dimension table load | specialisation | `DimTableLoad` |
285
+ | Fact table load | specialisation | `FactTableLoad` |
286
+ | Data Vault hub load | specialisation | `DataVaultHubLoader` |
287
+ | Data Vault satellite load | specialisation | `DataVaultSatelliteLoader` |
288
+ | Data Vault PIT table | specialisation | `DataVaultPITTableBuilder` |
289
+ | Snapshot append | specialisation | `SnapshotTableAppender` |
290
+ | dbt-style snapshot | specialisation | `DbtStyleSnapshot` |
291
+ | Merge upsert | specialisation | `MergeUpsert` |
292
+ | Row count anomaly | specialisation | `RowCountAnomalyDetector` |
293
+ | Null rate monitor | specialisation | `NullRateMonitor` |
294
+ | Referential integrity check | specialisation | `ReferentialIntegrityCheck` |
295
+ | Reconciliation diff | specialisation | `ReconciliationDiff` |
296
+ | Exact deduplication | specialisation | `ExactDeduplicator` |
297
+ | Fuzzy deduplication | specialisation | `FuzzyDeduplicator` |
298
+ | Time series resample | specialisation | `TimeSeriesResampler` |
299
+ | Rolling window aggregation | specialisation | `RollingWindowAggregator` |
300
+ | Sessionization | specialisation | `SessionizationKnot` |
301
+ | Derived column calculation | specialisation | `DerivedColumnCalculator` |
302
+ | Binning / bucketing | specialisation | `BinningKnot` |
303
+ | Lookup enrichment | specialisation | `LookupEnricher` |
304
+ | dbt staging model | specialisation | `StagingModelKnot` |
305
+ | dbt mart model | specialisation | `MartModelKnot` |
306
+ | Metric layer aggregation | specialisation | `MetricLayerAggregator` |
307
+ | Schema version migration | specialisation | `SchemaVersionMigrator` |
308
+ | Backfill runner | specialisation | `BackfillRunner` |
309
+ | Column lineage tracking | specialisation | `ColumnLineageTracker` |
310
+ | Compressed format | wrapper | `CompressedFileFormat(inner, codec=...)` |
311
+ | Multi-file archive | wrapper | `ArchiveFileFormat(inner, archive_type=...)` |
312
+
313
+ ---
314
+
315
+ *See also: [pirn AGENTIC_USE.md](../../AGENTIC_USE.md)*
pirn_data/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """Data Engineering / Analytics Engineering knot library.
2
+
3
+ Install with::
4
+
5
+ pip install 'pirn-data[data]'
6
+
7
+ Note: ``data_schema``, ``data_batch``, ``quality_check``, and
8
+ ``quality_report`` are pure-Python contracts and remain importable in
9
+ minimal environments. Modules that touch pandas / pyarrow (sources,
10
+ transforms, sinks) import those dependencies lazily, so the
11
+ missing-dependency error fires only when those modules are imported.
12
+ """
13
+
14
+ import warnings
15
+
16
+ from sweet_tea.registry import Registry
17
+ from sweet_tea.sweet_tea_warning import SweetTeaWarning
18
+
19
# Fill the sweet_tea registry for this package under the "pirn" library.
# ``SweetTeaWarning`` is ignored only inside this ``with`` block:
# ``warnings.catch_warnings()`` restores the previous filter state on exit.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", SweetTeaWarning)
    # NOTE(review): presumably this discovers and registers the knots defined
    # under this package — confirm against the sweet_tea documentation.
    Registry.fill_registry(module=__name__, library="pirn")

# Empty ``__all__``: ``from pirn_data import *`` exports no names.
__all__: list[str] = []
@@ -0,0 +1,99 @@
1
+ """A batch of rows flowing through the data pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping
6
+ from dataclasses import dataclass, field
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from pirn.core.pirn_opaque_value import PirnOpaqueValue
11
+
12
+ from pirn_data.data_schema import DataSchema
13
+
14
+
15
@dataclass(frozen=True)
class DataBatch(PirnOpaqueValue):
    """Immutable tabular payload: an ordered tuple of rows plus provenance.

    Every row maps column name to value. Heavy frame types (Pandas /
    Arrow / Polars) are wrapped into ``DataBatch`` by thin adapters in the
    sources / transforms modules.

    Attributes
    ----------
    rows:
        The row mappings, in order. Defaults to an empty tuple.
    schema:
        Schema the rows conform to. Defaults to an empty
        :class:`DataSchema`. Nothing in this class validates the rows
        against it.
    source_uri:
        Origin of the rows (DSN, file path, API endpoint). DSN-style
        values must be passed through
        :class:`pirn.connectors.dsn_scrubber.DsnScrubber` before
        assignment so credentials do not leak into lineage records.
    fetched_at:
        UTC instant the data was materialised. Defaults to the moment the
        batch object is constructed.
    """

    rows: tuple[Mapping[str, Any], ...] = ()
    schema: DataSchema = field(default_factory=DataSchema)
    source_uri: str = ""
    fetched_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    @property
    def row_count(self) -> int:
        """Number of rows held by this batch."""
        return len(self.rows)

    def with_rows(self, rows: tuple[Mapping[str, Any], ...]) -> DataBatch:
        """Return a new batch holding ``rows``.

        ``schema``, ``source_uri`` and ``fetched_at`` are carried over
        unchanged from this batch.
        """
        carried_over = {
            "schema": self.schema,
            "source_uri": self.source_uri,
            "fetched_at": self.fetched_at,
        }
        return DataBatch(rows=rows, **carried_over)

    def with_schema(self, schema: DataSchema) -> DataBatch:
        """Return a new batch described by ``schema``.

        ``rows``, ``source_uri`` and ``fetched_at`` are carried over
        unchanged from this batch.
        """
        carried_over = {
            "rows": self.rows,
            "source_uri": self.source_uri,
            "fetched_at": self.fetched_at,
        }
        return DataBatch(schema=schema, **carried_over)

    def _pirn_audit_dict(self) -> dict[str, Any]:
        """Summarise the batch as a dict of primitives for pydantic serialisation.

        Pirn IO validation only needs ``isinstance(value, DataBatch)``, so
        this summary carries the row count, the source URI, the ISO-8601
        fetch timestamp and a plain-``dict`` copy of every row. The
        :class:`DataSchema` (whose ``columns`` map names to ``type``
        objects) is intentionally left out; it is opaque-serialised
        separately when needed.
        """
        summary: dict[str, Any] = {}
        summary["row_count"] = self.row_count
        summary["source_uri"] = self.source_uri
        summary["fetched_at"] = self.fetched_at.isoformat()
        # Copy each row into a plain dict so any Mapping flavour serialises alike.
        summary["rows"] = list(map(dict, self.rows))
        return summary

    def __pirn_canonical__(self) -> dict[str, Any]:
        """Canonical form consumed by :func:`pirn.core.hashing.content_hash`.

        Carries the same entries as the audit dict plus
        ``schema_columns``, a ``{column name: type name}`` mapping, so the
        ``type`` objects in ``schema.columns`` never reach the hasher. The
        audit dict leaves the schema out (pydantic IO already validates
        that boundary); the canonical form keeps the column types so they
        take part in the content hash.
        """
        canonical: dict[str, Any] = {}
        canonical["row_count"] = self.row_count
        canonical["source_uri"] = self.source_uri
        canonical["fetched_at"] = self.fetched_at.isoformat()
        canonical["rows"] = list(map(dict, self.rows))
        type_names: dict[str, str] = {}
        for name, column_type in self.schema.columns.items():
            type_names[name] = column_type.__name__
        canonical["schema_columns"] = type_names
        return canonical
@@ -0,0 +1,46 @@
1
+ """``DataProfile`` and ``ColumnProfile`` — descriptive statistics for a
2
+ :class:`DataBatch`.
3
+
4
+ Emitted by :class:`pirn_data.quality.profiler.Profiler`. A profile
5
+ is observation, not policy: every field describes the input batch, no
6
+ field carries a pass/fail verdict. Compose with a downstream knot if you
7
+ want thresholds enforced (or use :class:`NullRateCheck` /
8
+ :class:`RowCountCheck` directly).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass, field
14
+ from datetime import UTC, datetime
15
+ from typing import Any
16
+
17
+
18
@dataclass(frozen=True)
class ColumnProfile:
    """Per-column statistics.

    Immutable record describing a single column. The first four fields
    are required; the value-based fields default to ``None`` and
    ``top_value_count`` defaults to ``0``.
    """

    # Column name; ``DataProfile.column`` looks profiles up by this value.
    name: str
    # NOTE(review): presumably the number of values seen for the column —
    # whether nulls are included is not visible here; confirm against the profiler.
    observed_count: int
    # NOTE(review): presumably the number of ``None`` values — confirm.
    null_count: int
    # NOTE(review): presumably the number of distinct values — confirm
    # whether ``None`` counts as a value.
    distinct_count: int
    # Smallest / largest value; ``None`` when not supplied.
    min_value: Any | None = None
    max_value: Any | None = None
    # NOTE(review): presumably the most frequent value and its frequency —
    # tie-breaking is decided by the producer, not here.
    top_value: Any | None = None
    top_value_count: int = 0
30
+
31
+
32
+ @dataclass(frozen=True)
33
+ class DataProfile:
34
+ """Aggregate profile of a :class:`DataBatch`."""
35
+
36
+ row_count: int
37
+ column_count: int
38
+ columns: tuple[ColumnProfile, ...] = ()
39
+ sampled_at: datetime = field(default_factory=lambda: datetime.now(UTC))
40
+
41
+ def column(self, name: str) -> ColumnProfile | None:
42
+ """Return the per-column profile for ``name`` or ``None``."""
43
+ for c in self.columns:
44
+ if c.name == name:
45
+ return c
46
+ return None
@@ -0,0 +1,85 @@
1
+ """Tabular schema declaration used across ``pirn_data``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from pirn.core.pirn_opaque_value import PirnOpaqueValue
10
+
11
+
12
@dataclass(frozen=True)
class DataSchema(PirnOpaqueValue):
    """Declarative description of the columns of a tabular :class:`DataBatch`.

    Attributes
    ----------
    columns:
        Mapping of column name → expected Python type. Insertion order is
        the canonical column order.
    primary_keys:
        Subset of ``columns`` keys; non-empty for any sink that performs
        upsert or dedup operations.
    nullable:
        Subset of ``columns`` keys whose values may be ``None``.

    Raises
    ------
    ValueError
        On construction, when ``primary_keys`` or ``nullable`` names a
        column that is not a key of ``columns``.
    """

    columns: Mapping[str, type] = field(default_factory=dict)
    primary_keys: tuple[str, ...] = ()
    nullable: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        """Reject primary-key / nullable entries that are not declared columns."""
        # Primary keys are checked first, so their error wins when both are bad.
        missing_pks = self._not_in_columns(self.primary_keys)
        if missing_pks:
            raise ValueError(f"primary_keys reference unknown columns: {missing_pks}")
        missing_nullable = self._not_in_columns(self.nullable)
        if missing_nullable:
            raise ValueError(f"nullable references unknown columns: {missing_nullable}")

    def _not_in_columns(self, names: tuple[str, ...]) -> list[str]:
        """Return the entries of ``names`` that are not keys of ``columns``, in order."""
        return [candidate for candidate in names if candidate not in self.columns]

    @property
    def column_names(self) -> tuple[str, ...]:
        """Column names in canonical (insertion) order."""
        return tuple(self.columns)

    def is_nullable(self, column: str) -> bool:
        """Tell whether ``column`` is listed as allowed to hold ``None``."""
        permitted = self.nullable
        return column in permitted

    def with_columns(self, columns: Mapping[str, type]) -> DataSchema:
        """Return a new schema whose columns are this schema's plus ``columns``.

        Entries in ``columns`` override same-named existing columns;
        ``primary_keys`` and ``nullable`` are carried over unchanged.
        """
        combined = dict(self.columns)
        combined |= columns
        return DataSchema(
            columns=combined,
            primary_keys=self.primary_keys,
            nullable=self.nullable,
        )

    def _pirn_audit_dict(self) -> dict[str, Any]:
        """Summarise the schema as a dict of primitives for pydantic serialisation.

        ``columns`` holds Python ``type`` objects, which pydantic's
        default JSON serialiser cannot dump, so each type is replaced by
        its ``__name__``. The key tuples are emitted as lists. Pirn IO
        validation itself only checks ``isinstance(value, DataSchema)``.
        """
        type_names: dict[str, str] = {}
        for column, declared_type in self.columns.items():
            type_names[column] = declared_type.__name__
        return dict(
            columns=type_names,
            primary_keys=[*self.primary_keys],
            nullable=[*self.nullable],
        )

    def __pirn_canonical__(self) -> dict[str, Any]:
        """Canonical form consumed by :func:`pirn.core.hashing.content_hash`.

        Produces the same content as :meth:`_pirn_audit_dict` but is the
        explicit hook the hasher prefers. The two methods are kept
        independent so pydantic-serialisation call sites and the
        canonical path do not have to route through each other.
        """
        type_names: dict[str, str] = {}
        for name, column_type in self.columns.items():
            type_names[name] = column_type.__name__
        return dict(
            columns=type_names,
            primary_keys=[*self.primary_keys],
            nullable=[*self.nullable],
        )