pointblank 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/_constants.py CHANGED
@@ -39,6 +39,7 @@ ASSERTION_TYPE_METHOD_MAP = {
    "col_vals_expr": "expr",
    "col_exists": "col_exists",
    "rows_distinct": "rows_distinct",
+   "rows_complete": "rows_complete",
    "col_schema_match": "col_schema_match",
    "row_count_match": "row_count_match",
    "col_count_match": "col_count_match",
@@ -63,6 +64,7 @@ METHOD_CATEGORY_MAP = {
    "col_exists": "COL_EXISTS_HAS_TYPE",
    "expr": "COMPARE_EXPR",
    "rows_distinct": "ROWS_DISTINCT",
+   "rows_complete": "ROWS_COMPLETE",
    "col_schema_match": "COL_SCHEMA_MATCH",
    "row_count_match": "ROW_COUNT_MATCH",
    "col_count_match": "COL_COUNT_MATCH",
@@ -375,6 +377,19 @@ SVG_ICONS_FOR_ASSERTION_TYPES = {
    </g>
    </g>
    </g>
+   </svg>""",
+   "rows_complete": """<?xml version="1.0" encoding="UTF-8"?>
+   <svg width="67px" height="67px" viewBox="0 0 67 67" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+   <title>rows_complete</title>
+   <g id="All-Icons" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+   <g id="rows_complete" transform="translate(0.000000, 0.965517)">
+   <path d="M56.712234,1 C59.1975153,1 61.4475153,2.00735931 63.076195,3.63603897 C64.7048747,5.26471863 65.712234,7.51471863 65.712234,10 L65.712234,10 L65.712234,65 L10.712234,65 C8.22695259,65 5.97695259,63.9926407 4.34827294,62.363961 C2.71959328,60.7352814 1.71223397,58.4852814 1.71223397,56 L1.71223397,56 L1.71223397,10 C1.71223397,7.51471863 2.71959328,5.26471863 4.34827294,3.63603897 C5.97695259,2.00735931 8.22695259,1 10.712234,1 L10.712234,1 Z" id="rectangle" stroke="#000000" stroke-width="2" fill="#FFFFFF"></path>
+   <g id="complete_me" transform="translate(12.500000, 9.500000)" fill="#000000">
+   <path d="M8,0 L8,10 L16,10 L16,18 L26,18 L26,10 L34,10 L34,0 L8,0 Z M10,2 L16,2 L16,8 L10,8 L10,2 Z M18,2 L24,2 L24,8 L18,8 L18,2 Z M26,2 L32,2 L32,8 L26,8 L26,2 Z M18,10 L24,10 L24,16 L18,16 L18,10 Z M0,21 L0,47 L42,47 L42,21 L32,21 L32,29 L24,29 L24,37 L18,37 L18,29 L10,29 L10,21 L0,21 Z M2,23 L8,23 L8,29 L2,29 L2,23 Z M34,23 L40,23 L40,29 L34,29 L34,23 Z M2,31 L8,31 L8,37 L2,37 L2,31 Z M10,31 L16,31 L16,37 L10,37 L10,31 Z M26,31 L32,31 L32,37 L26,37 L26,31 Z M34,31 L40,31 L40,37 L34,37 L34,31 Z M2,39 L8,39 L8,45 L2,45 L2,39 Z M10,39 L16,39 L16,45 L10,45 L10,39 Z M18,39 L24,39 L24,45 L18,45 L18,39 Z M26,39 L32,39 L32,45 L26,45 L26,39 Z M34,39 L40,39 L40,45 L34,45 L34,39 Z" id="Shape" fill-rule="nonzero"></path>
+   <path d="M22.4566476,18.35817 C22.9253976,18.29567 23.3746166,18.569108 23.5308666,19.01442 C23.6910226,19.459733 23.5152416,19.955826 23.1128976,20.20192 L23.1128976,20.20192 L20.2066476,22.38942 L25.7989286,22.3893123 L25.7989286,24.3893123 L20.2066476,24.38942 L23.1128976,26.57692 C23.5621166,26.912858 23.6519606,27.549576 23.3160226,27.998795 C22.9800856,28.448014 22.3433666,28.537858 21.8941476,28.20192 L21.8941476,28.20192 L16.6128976,24.20192 C16.3511786,24.01442 16.1949286,23.709733 16.1949286,23.38942 C16.1949286,23.069108 16.3511786,22.76442 16.6128976,22.57692 L16.6128976,22.57692 L21.8941476,18.57692 C22.0230536,18.479264 22.1714916,18.416764 22.3316476,18.38942 C22.3707106,18.377701 22.4136786,18.365983 22.4566476,18.35817 Z" id="arrow_right" transform="translate(20.997393, 23.377149) rotate(-90.000000) translate(-20.997393, -23.377149) "></path>
+   </g>
+   </g>
+   </g>
    </svg>""",
    "col_schema_match": """<?xml version="1.0" encoding="UTF-8"?>
    <svg width="67px" height="67px" viewBox="0 0 67 67" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
@@ -728,6 +728,114 @@ EXPECT_FAIL_TEXT = {
        "hi": "चयनित स्तंभों में पंक्तियां पूरी तरह से अलग नहीं थीं, ऐसे असफल परीक्षण इकाइयों की अधिकता।",
        "el": "Υπέρβαση αποτυχημένων μονάδων δοκιμής όπου δεν υπήρχαν διακριτές γραμμές στις επιλεγμένες στήλες.",
    },
+   "all_row_complete_expectation_text": {
+       "en": "Expect entirely complete rows across all columns.",
+       "fr": "On s'attend à des lignes entièrement complètes dans toutes les colonnes.",
+       "de": "Erwarten Sie vollständig komplette Zeilen über alle Spalten hinweg.",
+       "it": "Aspettati righe completamente complete su tutte le colonne.",
+       "es": "Se espera que las filas estén completamente completas en todas las columnas.",
+       "pt": "Espera-se linhas completamente preenchidas em todas as colunas.",
+       "ro": "Se așteaptă ca rândurile să fie complet complete în toate coloanele.",
+       "tr": "Tüm sütunlarda tamamen eksiksiz satırlar bekleyin.",
+       "zh-Hans": "预期所有列中的行都是完整的。",
+       "zh-Hant": "預期所有列中的行都是完整的。",
+       "ja": "すべての列で完全に完全な行を期待します。",
+       "ko": "모든 열에서 완전히 완성된 행을 기대합니다.",
+       "vi": "Kỳ vọng các hàng hoàn toàn đầy đủ trên tất cả các cột.",
+       "ru": "Ожидайте полностью заполненные строки по всем столбцам.",
+       "cs": "Očekávejte zcela kompletní řádky ve všech sloupcích.",
+       "pl": "Spodziewaj się w pełni kompletnych wierszy we wszystkich kolumnach.",
+       "da": "Forvent helt komplette rækker på tværs af alle kolonner.",
+       "sv": "Förvänta dig helt kompletta rader över alla kolumner.",
+       "nb": "Forvent helt komplette rader på tvers av alle kolonner.",
+       "nl": "Verwacht volledig complete rijen in alle kolommen.",
+       "fi": "Odota täysin täydellisiä rivejä kaikissa sarakkeissa.",
+       "is": "Væntir þess að allar raðir séu heildstæðar yfir alla dálka.",
+       "ar": "توقع صفوف مكتملة تمامًا عبر جميع الأعمدة.",
+       "hi": "सभी स्तंभों में पूरी तरह से पूर्ण पंक्तियों की अपेक्षा करें।",
+       "el": "Αναμένεται πλήρως ολοκληρωμένες γραμμές σε όλες τις στήλες.",
+   },
+   "all_row_complete_failure_text": {
+       "en": "Exceedance of failed test units where there weren't complete rows across all columns.",
+       "fr": "Dépassement des unités de test ayant échoué là où il n'y avait pas de lignes complètes dans toutes les colonnes.",
+       "de": "Überschreitung fehlgeschlagener Testeinheiten, bei denen nicht vollständige Zeilen über alle Spalten hinweg vorhanden waren.",
+       "it": "Superamento delle unità di test fallite in cui non c'erano righe complete su tutte le colonne.",
+       "es": "Se superó el número de unidades de prueba fallidas donde no había filas completas en todas las columnas.",
+       "pt": "Excedeu o número de unidades de teste com falha onde não havia linhas completas em todas as colunas.",
+       "ro": "Depășirea unităților de test eșuate unde nu au existat rânduri complete în toate coloanele.",
+       "tr": "Tüm sütunlarda eksiksiz satırların olmadığı başarısız test birimlerinin aşılması.",
+       "zh-Hans": "错误过多,其中在所有列中行不完整。",
+       "zh-Hant": "錯誤過多,在所有列中沒有完整的行。",
+       "ja": "すべての列で完全な行がないテスト単位の失敗の超過。",
+       "ko": "모든 열에 걸쳐 완전한 행이 아니었던 실패한 테스트 단위 초과.",
+       "vi": "Vượt quá số đơn vị kiểm tra thất bại trong đó không có các hàng đầy đủ trên tất cả các cột.",
+       "ru": "Превышение неудачных тестовых единиц, где не было полных строк по всем столбцам.",
+       "cs": "Překročení počtu neúspěšných testů, kde nebyly úplné řádky ve všech sloupcích.",
+       "pl": "Przekroczenie liczby niepomyślnych jednostek testowych, w których nie było kompletnych wierszy we wszystkich kolumnach.",
+       "da": "Overskridelse af antal fejlslagne enhedstests, hvor der ikke var komplette rækker på tværs af alle kolonner.",
+       "sv": "Överstiger antalet misslyckade enhetstest där det inte fanns kompletta rader över alla kolumner.",
+       "nb": "Overskridelse av mislykkede testenheter hvor det ikke var komplette rader på tvers av alle kolonner.",
+       "nl": "Overschrijding van mislukte testeenheden waarbij er geen complete rijen waren in alle kolommen.",
+       "fi": "Epäonnistuneiden testiyksikköjen ylitys, joissa ei ollut täydellisiä rivejä kaikissa sarakkeissa.",
+       "is": "Of mörg misheppnuð próf þar sem raðir voru ekki heildstæðar yfir alla dálka.",
+       "ar": "تجاوز وحدات الاختبار الفاشلة حيث لم تكن هناك صفوف مكتملة عبر جميع الأعمدة.",
+       "hi": "सभी स्तंभों में पूर्ण पंक्तियां नहीं थीं, ऐसे असफल परीक्षण इकाइयों की अधिकता।",
+       "el": "Υπέρβαση αποτυχημένων μονάδων δοκιμής όπου δεν υπήρχαν πλήρεις γραμμές σε όλες τις στήλες.",
+   },
+   "across_row_complete_expectation_text": {
+       "en": "Expect entirely complete rows across {column_text}.",
+       "fr": "On s'attend à des lignes entièrement complètes dans {column_text}.",
+       "de": "Erwarten Sie vollständig komplette Zeilen über {column_text} hinweg.",
+       "it": "Aspettati righe completamente complete su {column_text}.",
+       "es": "Se espera que las filas estén completamente completas en {column_text}.",
+       "pt": "Espera-se linhas completamente preenchidas em {column_text}.",
+       "ro": "Se așteaptă ca rândurile să fie complet complete în {column_text}.",
+       "tr": "{column_text} boyunca tamamen eksiksiz satırlar bekleyin.",
+       "zh-Hans": "预期在{column_text}中的行是完整的。",
+       "zh-Hant": "預期在{column_text}中的行是完整的。",
+       "ja": "{column_text}において完全に完全な行を期待します。",
+       "ko": "{column_text}에서 완전히 완성된 행을 기대합니다.",
+       "vi": "Kỳ vọng các hàng hoàn toàn đầy đủ trên {column_text}.",
+       "ru": "Ожидайте полностью заполненные строки в {column_text}.",
+       "cs": "Očekávejte zcela kompletní řádky v {column_text}.",
+       "pl": "Spodziewaj się w pełni kompletnych wierszy w {column_text}.",
+       "da": "Forvent helt komplette rækker på tværs af {column_text}.",
+       "sv": "Förvänta dig helt kompletta rader över {column_text}.",
+       "nb": "Forvent helt komplette rader på tvers av {column_text}.",
+       "nl": "Verwacht volledig complete rijen in {column_text}.",
+       "fi": "Odota täysin täydellisiä rivejä sarakkeissa {column_text}.",
+       "is": "Væntir þess að allar raðir séu heildstæðar yfir {column_text}.",
+       "ar": "توقع صفوف مكتملة تمامًا عبر {column_text}.",
+       "hi": "{column_text} में पूरी तरह से पूर्ण पंक्तियों की अपेक्षा करें।",
+       "el": "Αναμένεται πλήρως ολοκληρωμένες γραμμές στις στήλες {column_text}.",
+   },
+   "across_row_complete_failure_text": {
+       "en": "Exceedance of failed test units where there weren't complete rows across selected columns.",
+       "fr": "Dépassement des unités de test ayant échoué là où il n'y avait pas de lignes complètes dans les colonnes sélectionnées.",
+       "de": "Überschreitung fehlgeschlagener Testeinheiten, bei denen nicht vollständige Zeilen über die ausgewählten Spalten hinweg vorhanden waren.",
+       "it": "Superamento delle unità di test fallite in cui non c'erano righe complete nelle colonne selezionate.",
+       "es": "Se superó el número de unidades de prueba fallidas donde no había filas completas en las columnas seleccionadas.",
+       "pt": "Excedeu o número de unidades de teste com falha onde não havia linhas completas nas colunas selecionadas.",
+       "ro": "Depășirea unităților de test eșuate unde nu au existat rânduri complete în coloanele selectate.",
+       "tr": "Seçili sütunlarda eksiksiz satırların olmadığı başarısız test birimlerinin aşılması.",
+       "zh-Hans": "错误过多,其中在所选列中行不完整。",
+       "zh-Hant": "錯誤過多,在所選列中沒有完整的行。",
+       "ja": "選択された列で完全な行がないテスト単位の失敗の超過。",
+       "ko": "선택된 열에서 완전한 행이 아니었던 실패한 테스트 단위 초과.",
+       "vi": "Vượt quá số đơn vị kiểm tra thất bại trong đó không có các hàng đầy đủ trên các cột đã chọn.",
+       "ru": "Превышение неудачных тестовых единиц, где не было полных строк в выбранных столбцах.",
+       "cs": "Překročení počtu neúspěšných testů, kde nebyly úplné řádky ve vybraných sloupcích.",
+       "pl": "Przekroczenie liczby niepomyślnych jednostek testowych, w których nie było kompletnych wierszy w wybranych kolumnach.",
+       "da": "Overskridelse af antal fejlslagne enhedstests, hvor der ikke var komplette rækker på tværs af valgte kolonner.",
+       "sv": "Överstiger antalet misslyckade enhetstest där det inte fanns kompletta rader över valda kolumner.",
+       "nb": "Overskridelse av mislykkede testenheter hvor det ikke var komplette rader på tvers av valgte kolonner.",
+       "nl": "Overschrijding van mislukte testeenheden waarbij er geen complete rijen waren in geselecteerde kolommen.",
+       "fi": "Epäonnistuneiden testiyksikköjen ylitys, joissa ei ollut täydellisiä rivejä valituissa sarakkeissa.",
+       "is": "Of mörg misheppnuð próf þar sem raðir voru ekki heildstæðar yfir valda dálka.",
+       "ar": "تجاوز وحدات الاختبار الفاشلة حيث لم تكن هناك صفوف مكتملة عبر الأعمدة المحددة.",
+       "hi": "चयनित स्तंभों में पूर्ण पंक्तियां नहीं थीं, ऐसे असफल परीक्षण इकाइयों की अधिकता।",
+       "el": "Υπέρβαση αποτυχημένων μονάδων δοκιμής όπου δεν υπήρχαν πλήρεις γραμμές στις επιλεγμένες στήλες.",
+   },
    "col_schema_match_expectation_text": {
        "en": "Expect that column schemas match.",
        "fr": "On s'attend à ce que les schémas de colonnes correspondent.",
@@ -1735,6 +1843,60 @@ STEP_REPORT_TEXT = {
        "hi": "पंक्तियां स्तंभों के एक उपसमूह में अलग-अलग हैं",
        "el": "Οι γραμμές είναι διακριτές σε ένα υποσύνολο στηλών",
    },
+   "rows_complete_all": {
+       "en": "All rows are complete",
+       "fr": "Toutes les lignes sont complètes",
+       "de": "Alle Zeilen sind vollständig",
+       "it": "Tutte le righe sono complete",
+       "es": "Todas las filas están completas",
+       "pt": "Todas as linhas estão completas",
+       "ro": "Toate rândurile sunt complete",
+       "tr": "Tüm satırlar eksiksizdir",
+       "zh-Hans": "所有行都是完整的",
+       "zh-Hant": "所有行都是完整的",
+       "ja": "すべての行が完全です",
+       "ko": "모든 행이 완전합니다",
+       "vi": "Tất cả các hàng đều đầy đủ",
+       "ru": "Все строки заполнены полностью",
+       "cs": "Všechny řádky jsou úplné",
+       "pl": "Wszystkie wiersze są kompletne",
+       "da": "Alle rækker er komplette",
+       "sv": "Alla rader är kompletta",
+       "nb": "Alle rader er komplette",
+       "nl": "Alle rijen zijn compleet",
+       "fi": "Kaikki rivit ovat täydellisiä",
+       "is": "Allar raðir eru heildstæðar",
+       "ar": "جميع الصفوف مكتملة",
+       "hi": "सभी पंक्तियां पूर्ण हैं",
+       "el": "Όλες οι γραμμές είναι πλήρεις",
+   },
+   "rows_complete_subset": {
+       "en": "Rows are complete across a subset of columns",
+       "fr": "Les lignes sont complètes sur un sous-ensemble de colonnes",
+       "de": "Zeilen sind in einer Teilmenge von Spalten vollständig",
+       "it": "Le righe sono complete in un sottoinsieme di colonne",
+       "es": "Las filas están completas en un subconjunto de columnas",
+       "pt": "As linhas estão completas em um subconjunto de colunas",
+       "ro": "Rândurile sunt complete într-un subset de coloane",
+       "tr": "Satırlar, sütunların bir alt kümesinde eksiksizdir",
+       "zh-Hans": "行在列的子集中是完整的",
+       "zh-Hant": "行在列的子集中是完整的",
+       "ja": "行は列のサブセット間で完全です",
+       "ko": "행이 열의 하위 집합에서 완전합니다",
+       "vi": "Các hàng đầy đủ trong một tập con của các cột",
+       "ru": "Строки полностью заполнены в подмножестве столбцов",
+       "cs": "Řádky jsou úplné napříč podmnožinou sloupců",
+       "pl": "Wiersze są kompletne w podzbiorze kolumn",
+       "da": "Rækker er komplette på tværs af en delmængde af kolonner",
+       "sv": "Rader är kompletta över en delmängd av kolumner",
+       "nb": "Rader er komplette på tvers av en delmengde av kolonner",
+       "nl": "Rijen zijn compleet over een subset van kolommen",
+       "fi": "Rivit ovat täydellisiä sarakkeiden osajoukossa",
+       "is": "Raðir eru heildstæðar í undirsafni dálka",
+       "ar": "الصفوف مكتملة عبر مجموعة فرعية من الأعمدة",
+       "hi": "पंक्तियां स्तंभों के एक उपसमूह में पूर्ण हैं",
+       "el": "Οι γραμμές είναι πλήρεις σε ένα υποσύνολο στηλών",
+   },
    "report_for_step_i": {
        "en": "Report for Validation Step {i}",
        "fr": "Rapport pour l'étape de validation {i}",
pointblank/_interrogation.py CHANGED
@@ -1,5 +1,6 @@
from __future__ import annotations

+import functools
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

@@ -1219,6 +1220,36 @@ class Interrogator:

        return tbl.to_native()

+    def rows_complete(self) -> FrameT | Any:
+        # Ibis backends ---------------------------------------------
+
+        if self.tbl_type in IBIS_BACKENDS:
+            tbl = self.x
+
+            # Determine the number of null values in each row (column subsets are handled in
+            # the `_check_nulls_across_columns_ibis()` function)
+            tbl = _check_nulls_across_columns_ibis(table=tbl, columns_subset=self.columns_subset)
+
+            # Failing rows will have the value `True` in the generated column, so we need to negate
+            # the result to get the passing rows
+            return tbl.mutate(pb_is_good_=~tbl["_any_is_null_"]).drop("_any_is_null_")
+
+        # Local backends (Narwhals) ---------------------------------
+
+        tbl = self.x
+
+        # Determine the number of null values in each row (column subsets are handled in
+        # the `_check_nulls_across_columns_nw()` function)
+        tbl = _check_nulls_across_columns_nw(table=tbl, columns_subset=self.columns_subset)
+
+        # Failing rows will have the value `True` in the generated column, so we need to negate
+        # the result to get the passing rows
+        tbl = tbl.with_columns(pb_is_good_=~nw.col("_any_is_null_"))
+        tbl = tbl.drop("_any_is_null_")
+
+        # Convert the table to a native format
+        return tbl.to_native()
+

@dataclass
class ColValsCompareOne:
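The new interrogation method reduces to one idea: flag every row that holds at least one null, then negate that flag into the pass/fail column. A minimal standalone sketch of the same transformation in plain Polars (an illustrative equivalent; the package itself routes this through Narwhals or Ibis):

```python
import polars as pl

tbl = pl.DataFrame(
    {
        "col_1": ["a", None, "c", "d"],
        "col_2": ["a", "a", "c", None],
    }
)

# Mirror of the `pb_is_good_` column: True where the row has no nulls at all
out = tbl.with_columns(pb_is_good_=~pl.any_horizontal(pl.all().is_null()))
print(out)
```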
@@ -1794,6 +1825,58 @@ class RowsDistinct:
        return self.test_unit_res


+@dataclass
+class RowsComplete:
+    """
+    Check if rows in a DataFrame are complete.
+
+    Parameters
+    ----------
+    data_tbl
+        A data table.
+    columns_subset
+        A list of columns to check for completeness.
+    threshold
+        The maximum number of failing test units to allow.
+    tbl_type
+        The type of table to use for the assertion.
+
+    Returns
+    -------
+    bool
+        `True` when test units pass below the threshold level for failing test units, `False`
+        otherwise.
+    """
+
+    data_tbl: FrameT
+    columns_subset: list[str] | None
+    threshold: int
+    tbl_type: str = "local"
+
+    def __post_init__(self):
+        if self.tbl_type == "local":
+            # Convert the DataFrame to a format that narwhals can work with, and:
+            # - check if the `column=` exists
+            # - check if the `column=` type is compatible with the test
+            tbl = _column_subset_test_prep(df=self.data_tbl, columns_subset=self.columns_subset)
+
+        # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
+        # for now, just pass the table as is
+        if self.tbl_type in IBIS_BACKENDS:
+            tbl = self.data_tbl
+
+        # Collect results for the test units; the results are a list of booleans where
+        # `True` indicates a passing test unit
+        self.test_unit_res = Interrogator(
+            x=tbl,
+            columns_subset=self.columns_subset,
+            tbl_type=self.tbl_type,
+        ).rows_complete()
+
+    def get_test_results(self):
+        return self.test_unit_res
+
+
@dataclass
class ColSchemaMatch:
    """
@@ -2207,6 +2290,40 @@ def _column_has_null_values(table: FrameT, column: str) -> bool:
    return True


+def _check_nulls_across_columns_ibis(table, columns_subset):
+    # Get all column names from the table
+    column_names = columns_subset if columns_subset else table.columns
+
+    # Build the expression by combining each column's isnull() with OR operations
+    null_expr = functools.reduce(
+        lambda acc, col: acc | table[col].isnull() if acc is not None else table[col].isnull(),
+        column_names,
+        None,
+    )
+
+    # Add the expression as a new column to the table
+    result = table.mutate(_any_is_null_=null_expr)
+
+    return result
+
+
+def _check_nulls_across_columns_nw(table, columns_subset):
+    # Get all column names from the table
+    column_names = columns_subset if columns_subset else table.columns
+
+    # Build the expression by combining each column's `is_null()` with OR operations
+    null_expr = functools.reduce(
+        lambda acc, col: acc | table[col].is_null() if acc is not None else table[col].is_null(),
+        column_names,
+        None,
+    )
+
+    # Add the expression as a new column to the table
+    result = table.with_columns(_any_is_null_=null_expr)
+
+    return result
+
+
def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
    tgt_col_dtype_str = str(tgt_column.dtype).lower()

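Both helpers rely on the same `functools.reduce` fold, seeded with `None` so no identity element is needed for the first `|`; note that the conditional inside the lambda binds over the whole `acc | ...` expression, which is what makes the seed work. The same fold over plain Python values, as a simplified sketch (not package code):

```python
import functools

row = {"col_1": "a", "col_2": None, "col_3": "c"}

# acc starts as None; the first column's null check replaces it, and every
# later column is OR-ed in, just like the column expressions above.
any_is_null = functools.reduce(
    lambda acc, col: acc or (row[col] is None) if acc is not None else (row[col] is None),
    row.keys(),
    None,
)
print(any_is_null)  # True: this row is incomplete
```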
pointblank/_utils.py CHANGED
@@ -485,6 +485,7 @@ def _get_api_text() -> str:
        "Validate.col_vals_expr",
        "Validate.col_exists",
        "Validate.rows_distinct",
+       "Validate.rows_complete",
        "Validate.col_schema_match",
        "Validate.row_count_match",
        "Validate.col_count_match",
@@ -4367,6 +4367,192 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
    others.


+rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+
+    Validate whether row data are complete by having no missing values.
+
+    The `rows_complete()` method checks whether rows in the table are complete. Completeness
+    of a row means that there are no missing values within the row. This validation will operate
+    over the number of test units that is equal to the number of rows in the table (determined
+    after any `pre=` mutation has been applied). A subset of columns can be specified for the
+    completeness check. If no subset is provided, all columns in the table will be used.
+
+    Parameters
+    ----------
+    columns_subset
+        A single column or a list of columns to use as a subset for the completeness check. If
+        `None` (the default), then all columns in the table will be used.
+    pre
+        An optional preprocessing function or lambda to apply to the data table during
+        interrogation. This function should take a table as input and return a modified table.
+        Have a look at the *Preprocessing* section for more information on how to use this
+        argument.
+    segments
+        An optional directive on segmentation, which serves to split a validation step into
+        multiple (one step per segment). Can be a single column name, a tuple that specifies a
+        column name and its corresponding values to segment on, or a combination of both
+        (provided as a list). Read the *Segmentation* section for usage information.
+    thresholds
+        Set threshold failure levels for reporting and reacting to exceedances of the levels.
+        The thresholds are set at the step level and will override any global thresholds set in
+        `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+        be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+        section for information on how to set threshold levels.
+    actions
+        Optional actions to take when the validation step meets or exceeds any set threshold
+        levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+        define the actions.
+    brief
+        An optional brief description of the validation step that will be displayed in the
+        reporting table. You can use templating elements like `"{step}"` to insert
+        the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
+        the entire brief will be automatically generated. If `None` (the default) then there
+        won't be a brief.
+    active
+        A boolean value indicating whether the validation step should be active. Using `False`
+        will make the validation step inactive (still reporting its presence and keeping indexes
+        for the steps unchanged).
+
+    Returns
+    -------
+    Validate
+        The `Validate` object with the added validation step.
+
+    Preprocessing
+    -------------
+    The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+    table during interrogation. This function should take a table as input and return a modified
+    table. This is useful for performing any necessary transformations or filtering on the data
+    before the validation step is applied.
+
+    The preprocessing function can be any callable that takes a table as input and returns a
+    modified table. For example, you could use a lambda function to filter the table based on
+    certain criteria or to apply a transformation to the data. Note that you can refer to
+    columns via `columns_subset=` that are expected to be present in the transformed table, but
+    may not exist in the table before preprocessing. Regarding the lifetime of the transformed
+    table, it only exists during the validation step and is not stored in the `Validate` object
+    or used in subsequent validation steps.
+
+    Segmentation
+    ------------
+    The `segments=` argument allows for the segmentation of a validation step into multiple
+    segments. This is useful for applying the same validation step to different subsets of the
+    data. The segmentation can be done based on a single column or specific fields within a
+    column.
+
+    Providing a single column name will result in a separate validation step for each unique
+    value in that column. For example, if you have a column called `"region"` with values
+    `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+    region.
+
+    Alternatively, you can provide a tuple that specifies a column name and its corresponding
+    values to segment on. For example, if you have a column called `"date"` and you want to
+    segment on only specific dates, you can provide a tuple like
+    `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+    (i.e., no validation steps will be created for them).
+
+    A list with a combination of column names and tuples can be provided as well. This allows
+    for more complex segmentation scenarios. The following inputs are all valid:
+
+    - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+      in the `"region"` column and specific dates in the `"date"` column
+    - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+      columns
+
+    The segmentation is performed during interrogation, and the resulting validation steps will
+    be numbered sequentially. Each segment will have its own validation step, and the results
+    will be reported separately. This allows for a more granular analysis of the data and helps
+    identify issues within specific segments.
+
+    Importantly, the segmentation process will be performed after any preprocessing of the data
+    table. Because of this, one can conceivably use the `pre=` argument to generate a column
+    that can be used for segmentation. For example, you could create a new column called
+    `"segment"` through use of `pre=` and then use that column for segmentation.
+
+    Thresholds
+    ----------
+    The `thresholds=` parameter is used to set the failure-condition levels for the validation
+    step. If they are set here at the step level, these thresholds will override any thresholds
+    set at the global level in `Validate(thresholds=...)`.
+
+    There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+    can either be set as a proportion failing of all test units (a value between `0` and `1`),
+    or as the absolute number of failing test units (an integer that's `1` or greater).
+
+    Thresholds can be defined using one of these input schemes:
+
+    1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+       thresholds)
+    2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+       the 'error' level, and position `2` is the 'critical' level
+    3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
+       'critical'
+    4. a single integer/float value denoting absolute number or fraction of failing test units
+       for the 'warning' level only
+
+    If the number of failing test units exceeds set thresholds, the validation step will be
+    marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+    set; you're free to set any combination of them.
+
+    Aside from reporting failure conditions, thresholds can be used to determine the actions to
+    take for each level of failure (using the `actions=` parameter).
+
+    Examples
+    --------
+    For the examples here, we'll use a simple Polars DataFrame with three string columns
+    (`col_1`, `col_2`, and `col_3`). The table is shown below:
+
+    ```python
+    import pointblank as pb
+    import polars as pl
+
+    tbl = pl.DataFrame(
+        {
+            "col_1": ["a", None, "c", "d"],
+            "col_2": ["a", "a", "c", None],
+            "col_3": ["a", "a", "d", None],
+        }
+    )
+
+    pb.preview(tbl)
+    ```
+
+    Let's validate that the rows in the table are complete with `rows_complete()`. We'll
+    determine if this validation had any failing test units (there are four test units, one for
+    each row). A failing test unit means that a given row is not complete (i.e., has at least
+    one missing value).
+
+    ```python
+    validation = (
+        pb.Validate(data=tbl)
+        .rows_complete()
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    From this validation table we see that there are two failing test units. This is because
+    two rows in the table have at least one missing value (the second row and the last row).
+
+    We can also use a subset of columns to determine completeness. Let's specify the subset
+    using columns `col_2` and `col_3` for the next validation.
+
+    ```python
+    validation = (
+        pb.Validate(data=tbl)
+        .rows_complete(columns_subset=["col_2", "col_3"])
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    The validation table reports a single failing test unit. The last row contains missing
+    values in both the `col_2` and `col_3` columns.
+
+

col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

    Do columns in the table (and their types) match a predefined schema?
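As a quick cross-check of the "two failing test units" claim in the example above, incomplete rows can be counted directly; a sketch in plain Polars using the same data:

```python
import polars as pl

tbl = pl.DataFrame(
    {
        "col_1": ["a", None, "c", "d"],
        "col_2": ["a", "a", "c", None],
        "col_3": ["a", "a", "d", None],
    }
)

# Failing test units are exactly the rows holding at least one null
n_failed = tbl.filter(pl.any_horizontal(pl.all().is_null())).height
print(n_failed)  # 2 -> the second row and the last row
```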
@@ -6614,6 +6800,7 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
    - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
    - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
    - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
+   - [`rows_complete()`](`pointblank.Validate.rows_complete`)
    - [`conjointly()`](`pointblank.Validate.conjointly`)

    The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -6698,17 +6885,133 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li

    Get a report of the validation results as a JSON-formatted string.

+    The `get_json_report()` method provides a machine-readable report of validation results in
+    JSON format. This is particularly useful for programmatic processing, storing validation
+    results, or integrating with other systems. The report includes detailed information about
+    each validation step, such as assertion type, columns validated, threshold values, test
+    results, and more.
+
+    By default, all available validation information fields are included in the report. However,
+    you can customize the fields to include or exclude using the `use_fields=` and
+    `exclude_fields=` parameters.
+
    Parameters
    ----------
    use_fields
-       A list of fields to include in the report. If `None`, all fields are included.
+       An optional list of specific fields to include in the report. If provided, only these
+       fields will be included in the JSON output. If `None` (the default), all standard
+       validation report fields are included. Have a look at the *Available Report Fields*
+       section below for a list of fields that can be included in the report.
    exclude_fields
-       A list of fields to exclude from the report. If `None`, no fields are excluded.
+       An optional list of fields to exclude from the report. If provided, these fields will
+       be omitted from the JSON output. If `None` (the default), no fields are excluded.
+       This parameter cannot be used together with `use_fields=`. The *Available Report Fields*
+       section provides a listing of fields that can be excluded from the report.

    Returns
    -------
    str
-       A JSON-formatted string representing the validation report.
+       A JSON-formatted string representing the validation report, with each validation step
+       as an object in the report array.
+
+    Available Report Fields
+    -----------------------
+    The JSON report can include any of the standard validation report fields, including:
+
+    - `i`: the step number (1-indexed)
+    - `i_o`: the original step index from the validation plan (pre-expansion)
+    - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
+    - `column`: the column being validated (or columns used in certain validations)
+    - `values`: the comparison values or parameters used in the validation
+    - `inclusive`: whether the comparison is inclusive (for range-based validations)
+    - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
+    - `pre`: preprocessing function applied before validation
+    - `segments`: data segments to which the validation was applied
+    - `thresholds`: threshold level statement that was used for the validation step
+    - `label`: custom label for the validation step
+    - `brief`: a brief description of the validation step
+    - `active`: whether the validation step is active
+    - `all_passed`: whether all test units passed in the step
+    - `n`: total number of test units
+    - `n_passed`, `n_failed`: number of test units that passed and failed
+    - `f_passed`, `f_failed`: fraction of test units that passed and failed
+    - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
+      `null` if threshold not set)
+    - `time_processed`: when the validation step was processed (ISO 8601 format)
+    - `proc_duration_s`: the processing duration in seconds
+
+    Examples
+    --------
+    Let's create a validation plan with a few validation steps and generate a JSON report of the
+    results:
+
+    ```python
+    import pointblank as pb
+    import polars as pl
+
+    # Create a sample DataFrame
+    tbl = pl.DataFrame({
+        "a": [5, 7, 8, 9],
+        "b": [3, 4, 2, 1]
+    })
+
+    # Create and execute a validation plan
+    validation = (
+        pb.Validate(data=tbl)
+        .col_vals_gt(columns="a", value=6)
+        .col_vals_lt(columns="b", value=4)
+        .interrogate()
+    )
+
+    # Get the full JSON report
+    json_report = validation.get_json_report()
+
+    print(json_report)
+    ```
+
+    You can also customize which fields to include:
+
+    ```python
+    json_report = validation.get_json_report(
+        use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
+    )
+
+    print(json_report)
+    ```
+
+    Or which fields to exclude:
+
+    ```python
+    json_report = validation.get_json_report(
+        exclude_fields=[
+            "i_o", "thresholds", "pre", "segments", "values",
+            "na_pass", "inclusive", "label", "brief", "active",
+            "time_processed", "proc_duration_s"
+        ]
+    )
+
+    print(json_report)
+    ```
+
+    The JSON output can be further processed or analyzed programmatically:
+
+    ```python
+    import json
+
+    # Parse the JSON report
+    report_data = json.loads(validation.get_json_report())
+
+    # Extract and analyze validation results
+    failing_steps = [step for step in report_data if step["n_failed"] > 0]
+    print(f"Number of failing validation steps: {len(failing_steps)}")
+    ```
+
+    See Also
+    --------
+    - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
+      report as a GT table
+    - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
+      failed validation


get_sundered_data(self, type='pass') -> 'FrameT'
pointblank/validate.py CHANGED
@@ -56,6 +56,7 @@ from pointblank._interrogation import (
    ConjointlyValidation,
    NumberOfTestUnits,
    RowCountMatch,
+   RowsComplete,
    RowsDistinct,
)
from pointblank._typing import SegmentSpec
@@ -6546,6 +6547,243 @@ class Validate:

        return self

+    def rows_complete(
+        self,
+        columns_subset: str | list[str] | None = None,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether row data are complete by having no missing values.
+
+        The `rows_complete()` method checks whether rows in the table are complete. Completeness
+        of a row means that there are no missing values within the row. This validation will operate
+        over the number of test units that is equal to the number of rows in the table (determined
+        after any `pre=` mutation has been applied). A subset of columns can be specified for the
+        completeness check. If no subset is provided, all columns in the table will be used.
+
+        Parameters
+        ----------
+        columns_subset
+            A single column or a list of columns to use as a subset for the completeness check. If
+            `None` (the default), then all columns in the table will be used.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        columns via `columns_subset=` that are expected to be present in the transformed table, but
+        may not exist in the table before preprocessing. Regarding the lifetime of the transformed
+        table, it only exists during the validation step and is not stored in the `Validate` object
+        or used in subsequent validation steps.
+
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+          in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+          columns
+
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` and `1`),
+        or as the absolute number of failing test units (an integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+           thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+           the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
+           'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+           for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set; you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with three string columns
+        (`col_1`, `col_2`, and `col_3`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "col_1": ["a", None, "c", "d"],
+                "col_2": ["a", "a", "c", None],
+                "col_3": ["a", "a", "d", None],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that the rows in the table are complete with `rows_complete()`. We'll
+        determine if this validation had any failing test units (there are four test units, one for
+        each row). A failing test unit means that a given row is not complete (i.e., has at least
+        one missing value).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .rows_complete()
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        From this validation table we see that there are two failing test units. This is because
+        two rows in the table have at least one missing value (the second row and the last row).
+
+        We can also use a subset of columns to determine completeness. Let's specify the subset
+        using columns `col_2` and `col_3` for the next validation.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .rows_complete(columns_subset=["col_2", "col_3"])
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table reports a single failing test unit. The last row contains missing
+        values in both the `col_2` and `col_3` columns.
+        """
+
+        assertion_type = _get_fn_name()
+
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        if columns_subset is not None and isinstance(columns_subset, str):
+            columns_subset = [columns_subset]
+
+        # TODO: incorporate Column object
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=columns_subset,
+            pre=pre,
+            segments=segments,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+

    def col_schema_match(
        self,
        schema: Schema,
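The Thresholds section above is easiest to read end to end; a short sketch pairing `rows_complete()` with step-level thresholds, using the tuple form documented above (position `0` is 'warning', position `1` is 'error'):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "col_1": ["a", None, "c", "d"],
        "col_2": ["a", "a", "c", None],
    }
)

# One failing row reaches 'warning'; half of all rows failing reaches 'error'
validation = (
    pb.Validate(data=tbl)
    .rows_complete(thresholds=(1, 0.5))
    .interrogate()
)

validation
```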
@@ -7724,6 +7962,14 @@ class Validate:
                    tbl_type=tbl_type,
                ).get_test_results()

+            if assertion_category == "ROWS_COMPLETE":
+                results_tbl = RowsComplete(
+                    data_tbl=data_tbl_step,
+                    columns_subset=column,
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+

            if assertion_category == "COL_EXISTS_HAS_TYPE":
                result_bool = ColExistsHasType(
                    data_tbl=data_tbl_step,
@@ -7994,7 +8240,8 @@ class Validate:
            # TODO: Add support for extraction of rows for Ibis backends
            if (
                collect_extracts
-               and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
+               and assertion_type
+               in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"]
                and tbl_type not in IBIS_BACKENDS
            ):
                # Add row numbers to the results table
@@ -9076,19 +9323,134 @@ class Validate:
9076
9323
  """
9077
9324
  Get a report of the validation results as a JSON-formatted string.
9078
9325
 
9326
+ The `get_json_report()` method provides a machine-readable report of validation results in
9327
+ JSON format. This is particularly useful for programmatic processing, storing validation
9328
+ results, or integrating with other systems. The report includes detailed information about
9329
+ each validation step, such as assertion type, columns validated, threshold values, test
9330
+ results, and more.
9331
+
9332
+ By default, all available validation information fields are included in the report. However,
9333
+ you can customize the fields to include or exclude using the `use_fields=` and
9334
+ `exclude_fields=` parameters.
9335
+
9079
9336
  Parameters
9080
9337
  ----------
9081
9338
  use_fields
9082
- A list of fields to include in the report. If `None`, all fields are included.
9339
+ An optional list of specific fields to include in the report. If provided, only these
9340
+ fields will be included in the JSON output. If `None` (the default), all standard
9341
+ validation report fields are included. Have a look at the *Available Report Fields*
9342
+ section below for a list of fields that can be included in the report.
9083
9343
  exclude_fields
9084
- A list of fields to exclude from the report. If `None`, no fields are excluded.
9344
+ An optional list of fields to exclude from the report. If provided, these fields will
9345
+ be omitted from the JSON output. If `None` (the default), no fields are excluded.
9346
+ This parameter cannot be used together with `use_fields=`. The *Available Report Fields*
9347
+ provides a listing of fields that can be excluded from the report.
9085
9348
 
9086
9349
  Returns
9087
9350
  -------
9088
9351
  str
9089
- A JSON-formatted string representing the validation report.
9090
- """
9352
+ A JSON-formatted string representing the validation report, with each validation step
9353
+ as an object in the report array.
9354
+
9355
+ Available Report Fields
9356
+ -----------------------
9357
+ The JSON report can include any of the standard validation report fields, including:
9358
+
9359
+ - `i`: the step number (1-indexed)
9360
+ - `i_o`: the original step index from the validation plan (pre-expansion)
9361
+ - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
9362
+ - `column`: the column being validated (or columns used in certain validations)
9363
+ - `values`: the comparison values or parameters used in the validation
9364
+ - `inclusive`: whether the comparison is inclusive (for range-based validations)
9365
+ - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
9366
+ - `pre`: preprocessing function applied before validation
9367
+ - `segments`: data segments to which the validation was applied
9368
+ - `thresholds`: threshold level statement that was used for the validation step
9369
+ - `label`: custom label for the validation step
9370
+ - `brief`: a brief description of the validation step
9371
+ - `active`: whether the validation step is active
9372
+ - `all_passed`: whether all test units passed in the step
9373
+ - `n`: total number of test units
9374
+ - `n_passed`, `n_failed`: number of test units that passed and failed
9375
+ - `f_passed`, `f_failed`: Fraction of test units that passed and failed
9376
+ - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
9377
+ `null` if threshold not set)
9378
+ - `time_processed`: when the validation step was processed (ISO 8601 format)
9379
+ - `proc_duration_s`: the processing duration in seconds
9380
+
9381
+ Examples
9382
+ --------
9383
+ Let's create a validation plan with a few validation steps and generate a JSON report of the
9384
+ results:
9385
+
9386
+ ```{python}
9387
+ import pointblank as pb
9388
+ import polars as pl
9389
+
9390
+ # Create a sample DataFrame
9391
+ tbl = pl.DataFrame({
9392
+ "a": [5, 7, 8, 9],
9393
+ "b": [3, 4, 2, 1]
9394
+ })
9395
+
9396
+ # Create and execute a validation plan
9397
+ validation = (
9398
+ pb.Validate(data=tbl)
9399
+ .col_vals_gt(columns="a", value=6)
9400
+ .col_vals_lt(columns="b", value=4)
9401
+ .interrogate()
9402
+ )
9403
+
9404
+ # Get the full JSON report
9405
+ json_report = validation.get_json_report()
9406
+
9407
+ print(json_report)
9408
+ ```
9409
+
9410
+ You can also customize which fields to include:
9411
+
9412
+ ```{python}
9413
+ json_report = validation.get_json_report(
9414
+ use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
9415
+ )
9416
+
9417
+ print(json_report)
9418
+ ```
9419
+
9420
+ Or which fields to exclude:
9421
+
9422
+ ```{python}
9423
+ json_report = validation.get_json_report(
9424
+ exclude_fields=[
9425
+ "i_o", "thresholds", "pre", "segments", "values",
9426
+ "na_pass", "inclusive", "label", "brief", "active",
9427
+ "time_processed", "proc_duration_s"
9428
+ ]
9429
+ )
9430
+
9431
+ print(json_report)
9432
+ ```
9433
+
9434
+ The JSON output can be further processed or analyzed programmatically:
9435
+
9436
+ ```{python}
9437
+ import json
9438
+
9439
+ # Parse the JSON report
9440
+ report_data = json.loads(validation.get_json_report())
9091
9441
 
9442
+ # Extract and analyze validation results
9443
+ failing_steps = [step for step in report_data if step["n_failed"] > 0]
9444
+ print(f"Number of failing validation steps: {len(failing_steps)}")
9445
+ ```
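Going one step further, the same parsed list supports quick per-step summaries; a minimal sketch reusing `report_data` from the block above:

```python
# Print a one-line summary per validation step, using only fields
# documented in the field list above; assumes `report_data` was
# produced by json.loads() as in the previous block.
for step in report_data:
    print(
        f"Step {step['i']} ({step['assertion_type']}): "
        f"{step['n_passed']}/{step['n']} test units passed"
    )
```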
9446
+
9447
+ See Also
9448
+ --------
9449
+ - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
9450
+ report as a GT table
9451
+ - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
9452
+ failed validation
9453
+ """
9092
9454
  if use_fields is not None and exclude_fields is not None:
9093
9455
  raise ValueError("Cannot specify both `use_fields=` and `exclude_fields=`.")
9094
9456
 
@@ -9597,7 +9959,7 @@ class Validate:
9597
9959
  "col_vals_expr",
9598
9960
  ]:
9599
9961
  columns_upd.append("&mdash;")
9600
- elif assertion_type[i] in ["rows_distinct"]:
9962
+ elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
9601
9963
  if not column:
9602
9964
  # If there is no column subset, then all columns are used
9603
9965
  columns_upd.append("ALL COLUMNS")
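This branch determines the COLUMNS entry shown in the tabular report for these two step types; a minimal sketch, assuming `rows_complete()` accepts the same optional `columns_subset=` argument as `rows_distinct()`:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, None, 3], "b": [4, 5, None]})

validation = (
    pb.Validate(data=tbl)
    # No column subset: the report's COLUMNS entry reads "ALL COLUMNS"
    .rows_complete()
    # With a subset: only the named column(s) are listed
    .rows_complete(columns_subset=["a"])
    .interrogate()
)
```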
@@ -9660,6 +10022,7 @@ class Validate:
9660
10022
  "col_vals_not_null",
9661
10023
  "col_exists",
9662
10024
  "rows_distinct",
10025
+ "rows_complete",
9663
10026
  ]:
9664
10027
  values_upd.append("&mdash;")
9665
10028
 
@@ -10213,6 +10576,7 @@ class Validate:
10213
10576
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10214
10577
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
10215
10578
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
10579
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
10216
10580
  - [`conjointly()`](`pointblank.Validate.conjointly`)
10217
10581
 
10218
10582
  The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
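For the newly cross-referenced method, a minimal sketch contrasting the two row-level checks (the completeness semantics are inferred from the method name and its R analogue, not stated in this diff):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({
    "x": [1, 1, None],
    "y": ["a", "a", "b"],
})

validation = (
    pb.Validate(data=tbl)
    .rows_distinct()   # row 2 duplicates row 1
    .rows_complete()   # row 3 is incomplete (Null in `x`)
    .interrogate()
)
```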
@@ -10372,7 +10736,7 @@ class Validate:
10372
10736
  # if get_row_count(extract) == 0:
10373
10737
  # return "No rows were extracted."
10374
10738
 
10375
- if assertion_type in ROW_BASED_VALIDATION_TYPES:
10739
+ if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]:
10376
10740
  # Get the extracted data for the step
10377
10741
  extract = self.get_data_extracts(i=i, frame=True)
10378
10742
 
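Since `"rows_complete"` is now treated as row-based here, rows failing such a step can be pulled out as a data extract; a minimal sketch, assuming an interrogated validation with a `rows_complete()` step at index 1:

```python
# Rows failing a rows_complete() step (i.e., rows containing missing
# values) become available as an extract; the step index `i=1` is
# assumed for illustration.
incomplete_rows = validation.get_data_extracts(i=1, frame=True)
print(incomplete_rows)
```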
@@ -11082,6 +11446,13 @@ def _create_autobrief_or_failure_text(
11082
11446
  for_failure=for_failure,
11083
11447
  )
11084
11448
 
11449
+ if assertion_type == "rows_complete":
11450
+ return _create_text_rows_complete(
11451
+ lang=lang,
11452
+ columns_subset=column,
11453
+ for_failure=for_failure,
11454
+ )
11455
+
11085
11456
  if assertion_type == "row_count_match":
11086
11457
  return _create_text_row_count_match(
11087
11458
  lang=lang,
@@ -11257,6 +11628,24 @@ def _create_text_rows_distinct(
11257
11628
  return text
11258
11629
 
11259
11630
 
11631
+ def _create_text_rows_complete(
11632
+ lang: str, columns_subset: list[str] | None, for_failure: bool = False
11633
+ ) -> str:
11634
+ type_ = _expect_failure_type(for_failure=for_failure)
11635
+
11636
+ if columns_subset is None:
11637
+ text = EXPECT_FAIL_TEXT[f"all_row_complete_{type_}_text"][lang]
11638
+
11639
+ else:
11640
+ column_text = _prep_values_text(values=columns_subset, lang=lang, limit=3)
11641
+
11642
+ text = EXPECT_FAIL_TEXT[f"across_row_complete_{type_}_text"][lang].format(
11643
+ column_text=column_text
11644
+ )
11645
+
11646
+ return text
11647
+
11648
+
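The helper mirrors `_create_text_rows_distinct()` above, dispatching on whether a column subset was given. A toy illustration of the lookup pattern follows, with placeholder strings; the real translations live in `_constants_translations.py`, and the `{type_}` suffix is assumed to be `"expectation"` in the non-failure case:

```python
# Toy stand-in for the EXPECT_FAIL_TEXT lookup used above; the keys
# follow the same naming scheme, but the strings are placeholders
# rather than the package's actual translations.
TOY_TEXT = {
    "all_row_complete_expectation_text": {"en": "Expect that rows are complete."},
    "across_row_complete_expectation_text": {
        "en": "Expect complete rows across {column_text}."
    },
}

def toy_rows_complete_text(columns_subset=None, lang="en"):
    if columns_subset is None:
        return TOY_TEXT["all_row_complete_expectation_text"][lang]
    column_text = ", ".join(columns_subset)  # stand-in for _prep_values_text()
    return TOY_TEXT["across_row_complete_expectation_text"][lang].format(
        column_text=column_text
    )

print(toy_rows_complete_text())            # all-columns variant
print(toy_rows_complete_text(["a", "b"]))  # column-subset variant
```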
11260
11649
  def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
11261
11650
  type_ = _expect_failure_type(for_failure=for_failure)
11262
11651
 
@@ -12057,6 +12446,11 @@ def _step_report_row_based(
12057
12446
  text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
12058
12447
  elif assertion_type == "col_vals_not_null":
12059
12448
  text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
12449
+ elif assertion_type == "rows_complete":
12450
+ if column is None:
12451
+ text = STEP_REPORT_TEXT["rows_complete_all"][lang]
12452
+ else:
12453
+ text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
12060
12454
 
12061
12455
  # Wrap assertion text in a <code> tag
12062
12456
  text = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pointblank
3
- Version: 0.9.0
3
+ Version: 0.9.1
4
4
  Summary: Find out if your data is what you think it is.
5
5
  Author-email: Richard Iannone <riannone@me.com>
6
6
  License: MIT License
@@ -1,10 +1,10 @@
1
1
  pointblank/__init__.py,sha256=uHrX-ARZOhvWogXXqKV65RO2DXdYLZNCD1oNcm8hE6o,1585
2
- pointblank/_constants.py,sha256=l7jNb-UqrKER30FnrRKlIK5onc0Id37CVV6l7i4Pn00,75777
2
+ pointblank/_constants.py,sha256=tlelmeuftW4BpVeEILbsbuuCaMQ7yA_FYqM6PJPXH58,78561
3
3
  pointblank/_constants_docs.py,sha256=JBmtt16zTYQ-zaM4ElLExtKs-dKlnN553Ys2ML1Y1C8,2099
4
- pointblank/_constants_translations.py,sha256=QfOmVESwWFokWXpgLkEFHGik8o1EUBhIXYtaEqtGGNg,166575
5
- pointblank/_interrogation.py,sha256=SkW0DUoCafQbpPToVseUPLzaYXXTTzN9y6mbzjbRmNw,81082
4
+ pointblank/_constants_translations.py,sha256=FHkY2Bh1VBmBwbiGRIRSMU1tNGxgQAkjoPoYlwOHSKU,180685
5
+ pointblank/_interrogation.py,sha256=BjN60ed7BH4ZnoPtkmVSvVEqJgf8k9mce4Zb63_jv_s,85155
6
6
  pointblank/_typing.py,sha256=ConITAbsFxU8CkNXY7l0Lua9hGofeDDJAWw-lGAIVgI,764
7
- pointblank/_utils.py,sha256=0V-LxUjSjGfcZV2_IH-5KPikYiVWdt4QSMQDioyZoZc,24681
7
+ pointblank/_utils.py,sha256=g7vbvV33tKNvznUoYsHcZW90bYm1LPb76njQeDJDPyQ,24715
8
8
  pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeTLs,5489
9
9
  pointblank/_utils_html.py,sha256=sTcmnBljkPjRZF1hbpoHl4HmnXOazsA91gC9iWVIrRk,2848
10
10
  pointblank/actions.py,sha256=oazJk4pe3lIA14hjyCDtPOr4r_sp4vGGo2eyU_LX5_0,18268
@@ -15,8 +15,8 @@ pointblank/draft.py,sha256=cusr4fBiNncCKIOU8UwvJcvkBeBuUnqH_UfYp9dtNss,15777
15
15
  pointblank/schema.py,sha256=gzUCmtccO2v15MH2bo9uHUYjkKEEne1okQucxcH39pc,44291
16
16
  pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
17
17
  pointblank/thresholds.py,sha256=aAPfdo3VMCw_G_OAh4nEsCYfIynDfNRJOMrG8yDM6U8,25717
18
- pointblank/validate.py,sha256=pr1Riar-axz17aUwiinZdJE67tH8x24eBPUB1Dw5aYk,570820
19
- pointblank/data/api-docs.txt,sha256=Jf_akggFaJPh0chntpq2cRTa1Enuupk723zty_x0k-s,452511
18
+ pointblank/validate.py,sha256=EPqtxw5sQG4Xh7WSaViVEUtm4FmpFOsyh4KM9EzuqkU,588834
19
+ pointblank/data/api-docs.txt,sha256=JkV9SdXyB3ftBMXVFdFqGZNpyfBdWUpyisn4QHco56w,467666
20
20
  pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
21
21
  pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749
22
22
  pointblank/data/nycflights-duckdb.zip,sha256=GQrHO9tp7d9cNGFNSbA9EKF19MLf6t2wZE0U9-hIKow,5293077
@@ -24,8 +24,8 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
24
24
  pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
25
25
  pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
26
26
  pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
27
- pointblank-0.9.0.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
28
- pointblank-0.9.0.dist-info/METADATA,sha256=091J9RejW9b-vT63C6w_5shh-82G0t451895oNm479M,14732
29
- pointblank-0.9.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
30
- pointblank-0.9.0.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
31
- pointblank-0.9.0.dist-info/RECORD,,
27
+ pointblank-0.9.1.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
28
+ pointblank-0.9.1.dist-info/METADATA,sha256=1o11OgPSmpB4qBDEG1HyHDfVj5emxcT_yxHeFsVPVUc,14732
29
+ pointblank-0.9.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
30
+ pointblank-0.9.1.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
31
+ pointblank-0.9.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5