pointblank 0.11.6__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/schema.py CHANGED
@@ -3,6 +3,8 @@ from __future__ import annotations
3
3
  import copy
4
4
  from dataclasses import dataclass
5
5
 
6
+ import narwhals as nw
7
+
6
8
  from pointblank._constants import IBIS_BACKENDS
7
9
  from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
8
10
 
@@ -59,10 +61,15 @@ class Schema:
59
61
 
60
62
  - Polars DataFrame (`"polars"`)
61
63
  - Pandas DataFrame (`"pandas"`)
64
+ - PySpark table (`"pyspark"`)
62
65
  - DuckDB table (`"duckdb"`)*
63
66
  - MySQL table (`"mysql"`)*
64
67
  - PostgreSQL table (`"postgresql"`)*
65
68
  - SQLite table (`"sqlite"`)*
69
+ - Microsoft SQL Server table (`"mssql"`)*
70
+ - Snowflake table (`"snowflake"`)*
71
+ - Databricks table (`"databricks"`)*
72
+ - BigQuery table (`"bigquery"`)*
66
73
  - Parquet table (`"parquet"`)*
67
74
 
68
75
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -336,6 +343,16 @@ class Schema:
336
343
  schema_dict = {k: str(v) for k, v in schema_dict.items()}
337
344
  self.columns = list(schema_dict.items())
338
345
 
346
+ elif table_type == "pyspark":
347
+ # Convert PySpark DataFrame to Narwhals to get schema
348
+ nw_df = nw.from_native(self.tbl)
349
+ if _is_lazy_frame(data=nw_df):
350
+ schema_dict = dict(nw_df.collect_schema())
351
+ else:
352
+ schema_dict = dict(nw_df.schema.items())
353
+ schema_dict = {k: str(v) for k, v in schema_dict.items()}
354
+ self.columns = list(schema_dict.items())
355
+
339
356
  elif table_type in IBIS_BACKENDS:
340
357
  schema_dict = dict(self.tbl.schema().items())
341
358
  schema_dict = {k: str(v) for k, v in schema_dict.items()}
pointblank/segments.py ADDED
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ __all__ = [
7
+ "seg_group",
8
+ ]
9
+
10
+
11
@dataclass
class Segment:
    """
    Container for one or more groups of values, where each group forms a single segment.

    The `segments` attribute is a list of lists: every inner list is one group of values. The
    structure is validated immediately after initialization (see `__post_init__()`).
    """

    # Each inner list is one group of values that together make up a single segment
    segments: list[list[Any]]

    def __post_init__(self) -> None:
        """
        Validate the structure of `segments`.

        Raises
        ------
        TypeError
            If `segments` is not a list, if any of its elements is not a list, or if the values
            found across all of the inner lists are of more than one type.
        """
        groups = self.segments

        # The outer container has to be a list
        if not isinstance(groups, list):
            raise TypeError(f"Segments must be lists. Got {type(groups).__name__} instead.")

        # Every group held by the outer container has to be a list as well
        for group in groups:
            if not isinstance(group, list):
                raise TypeError("Sub-segments must be lists.")

        # Gather the distinct types of all values across all groups; an empty collection (no
        # groups, or only empty groups) yields no types and is accepted
        seg_types = set()
        for group in groups:
            for value in group:
                seg_types.add(type(value))

        if len(seg_types) > 1:
            raise TypeError(f"All segment values must have the same type. Got {seg_types} instead.")
31
+
32
+
33
def seg_group(values: list[Any]) -> Segment:
    """
    Combine several values into a single segment.

    Many validation methods (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
    [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.) accept a `segments=`
    argument, which takes one or more columns, or particular values within a column, and creates
    segments for validation. When given a column, or a tuple of a column and certain values, one
    segment is created per individual value. The `seg_group()` selector lets you place several
    values together in the same segment instead. For instance, a segment specification for a
    "region" column that only looks at the "North" and "South" regions would typically be:

    `segments=("region", ["North", "South"])`

    That produces two validation steps, one per region. To treat both regions as a single
    segment, wrap the values in `seg_group()`:

    `segments=("region", pb.seg_group(["North", "South"]))`

    A second segment, covering the "East" and "West" regions, can be added by passing a list of
    lists:

    `segments=("region", pb.seg_group([["North", "South"], ["East", "West"]]))`

    One validation step is created for each segment. Note that if no segments result from
    `seg_group()` (or from any other segment expression), the validation step will fail to be
    evaluated during the interrogation process. Such a failure to evaluate is reported in the
    validation results but it doesn't affect the interrogation process overall (i.e., the process
    won't be halted).

    Parameters
    ----------
    values
        A list of values to be grouped into a segment. This can be a single list (one segment) or
        a list of lists (one segment per inner list).

    Returns
    -------
    Segment
        A `Segment` object, which can be used to combine values into a segment.

    Raises
    ------
    ValueError
        If `values` is not a list.
    TypeError
        Raised when the `Segment` is constructed if the grouped values are not all of the same
        type (this includes a list that mixes plain values with inner lists).

    Examples
    --------
    ```{python}
    #| echo: false
    #| output: false
    import pointblank as pb
    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
    ```

    Let's say we're analyzing sales from our local bookstore, and want to check that the number of
    books sold for the month exceeds a certain threshold. We could pass in the argument
    `segments="genre"`, which would return a segment for each unique genre in the dataset. We could
    also pass in `segments=("genre", ["Fantasy", "Science Fiction"])`, to only create segments for
    those two genres. However, if we wanted to group these two genres into a single segment, we
    could use the `seg_group()` function.

    ```{python}
    import pointblank as pb
    import polars as pl

    tbl = pl.DataFrame(
        {
            "title": [
                "The Hobbit",
                "Harry Potter and the Sorcerer's Stone",
                "The Lord of the Rings",
                "A Game of Thrones",
                "The Name of the Wind",
                "The Girl with the Dragon Tattoo",
                "The Da Vinci Code",
                "The Hitchhiker's Guide to the Galaxy",
                "The Martian",
                "Brave New World"
            ],
            "genre": [
                "Fantasy",
                "Fantasy",
                "Fantasy",
                "Fantasy",
                "Fantasy",
                "Mystery",
                "Mystery",
                "Science Fiction",
                "Science Fiction",
                "Science Fiction",
            ],
            "units_sold": [875, 932, 756, 623, 445, 389, 678, 534, 712, 598],
        }
    )

    validation = (
        pb.Validate(data=tbl)
        .col_vals_gt(
            columns="units_sold",
            value=500,
            segments=("genre", pb.seg_group(["Fantasy", "Science Fiction"]))
        )
        .interrogate()
    )

    validation
    ```

    What's more, we can create multiple segments, combining the genres in different ways.

    ```{python}
    validation = (
        pb.Validate(data=tbl)
        .col_vals_gt(
            columns="units_sold",
            value=500,
            segments=("genre", pb.seg_group([
                ["Fantasy", "Science Fiction"],
                ["Fantasy", "Mystery"],
                ["Mystery", "Science Fiction"]
            ]))
        )
        .interrogate()
    )

    validation
    ```

    """
    # Anything other than a list is rejected up front
    if not isinstance(values, list):
        raise ValueError("Must input a list of values for a segment.")

    # A list made up solely of lists already has the group-per-inner-list shape; any other list
    # is treated as one group and gets wrapped so that `Segment` always receives a list of lists
    already_grouped = all(isinstance(item, list) for item in values)
    return Segment(values if already_grouped else [values])