pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +0 -1
- pointblank/_interrogation.py +244 -606
- pointblank/_utils.py +65 -3
- pointblank/assistant.py +9 -0
- pointblank/cli.py +39 -24
- pointblank/data/api-docs.txt +658 -29
- pointblank/schema.py +17 -0
- pointblank/segments.py +163 -0
- pointblank/validate.py +344 -92
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/METADATA +59 -6
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/RECORD +16 -15
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/top_level.txt +0 -0
pointblank/schema.py
CHANGED
@@ -3,6 +3,8 @@ from __future__ import annotations
 import copy
 from dataclasses import dataclass
 
+import narwhals as nw
+
 from pointblank._constants import IBIS_BACKENDS
 from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
 
@@ -59,10 +61,15 @@ class Schema:
 
     - Polars DataFrame (`"polars"`)
     - Pandas DataFrame (`"pandas"`)
+    - PySpark table (`"pyspark"`)
     - DuckDB table (`"duckdb"`)*
     - MySQL table (`"mysql"`)*
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
+    - Microsoft SQL Server table (`"mssql"`)*
+    - Snowflake table (`"snowflake"`)*
+    - Databricks table (`"databricks"`)*
+    - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -336,6 +343,16 @@ class Schema:
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
             self.columns = list(schema_dict.items())
 
+        elif table_type == "pyspark":
+            # Convert PySpark DataFrame to Narwhals to get schema
+            nw_df = nw.from_native(self.tbl)
+            if _is_lazy_frame(data=nw_df):
+                schema_dict = dict(nw_df.collect_schema())
+            else:
+                schema_dict = dict(nw_df.schema.items())
+            schema_dict = {k: str(v) for k, v in schema_dict.items()}
+            self.columns = list(schema_dict.items())
+
         elif table_type in IBIS_BACKENDS:
             schema_dict = dict(self.tbl.schema().items())
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
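The new `pyspark` branch above collects the schema by converting the PySpark DataFrame into a Narwhals frame, collecting the schema lazily when needed. A minimal sketch of how that surfaces to users of `Schema(tbl=)`; the Spark session setup, column names, and printed dtype strings below are illustrative assumptions, not taken from the diff:

```python
# Sketch only: assumes pointblank >= 0.12.0 and pyspark are installed.
import pointblank as pb
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("schema-demo").getOrCreate()

# A small PySpark DataFrame with an explicit DDL schema (illustrative columns)
spark_df = spark.createDataFrame(
    [("hammer", 9.99), ("wrench", 14.50)],
    schema="item string, price double",
)

# Schema() now routes PySpark tables through Narwhals, per the hunk above;
# .columns holds (name, dtype-string) pairs, e.g. [('item', 'String'), ('price', 'Float64')]
schema = pb.Schema(tbl=spark_df)
print(schema.columns)
```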
pointblank/segments.py
ADDED
@@ -0,0 +1,163 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+__all__ = [
+    "seg_group",
+]
+
+
+@dataclass
+class Segment:
+    """
+    A class to represent a segment.
+    """
+
+    segments: list[list[Any]]
+
+    def __post_init__(self) -> None:
+        # Check that segments is a list of lists
+        if not isinstance(self.segments, list):
+            raise TypeError(f"Segments must be lists. Got {type(self.segments).__name__} instead.")
+
+        if not all(isinstance(seg, list) for seg in self.segments):
+            raise TypeError("Sub-segments must be lists.")
+
+        # Check segment groups have the same type
+        seg_types = {type(seg) for segment in self.segments for seg in segment}
+        if len(seg_types) > 1:
+            raise TypeError(f"All segment values must have the same type. Got {seg_types} instead.")
+
+
+def seg_group(values: list[Any]) -> Segment:
+    """
+    Group together values for segmentation.
+
+    Many validation methods have a `segments=` argument that can be used to specify one or more
+    columns, or certain values within a column, to create segments for validation (e.g.,
+    [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
+    [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). When passing in a column, or
+    a tuple with a column and certain values, a segment will be created for each individual value
+    within the column or given values. The `seg_group()` selector enables values to be grouped
+    together into a segment. For example, if you were to create a segment for a column "region",
+    investigating just "North" and "South" regions, a typical segment would look like:
+
+    `segments=("region", ["North", "South"])`
+
+    This would create two validation steps, one for each of the regions. If you wanted to group
+    these two regions into a single segment, you could use the `seg_group()` function like this:
+
+    `segments=("region", pb.seg_group(["North", "South"]))`
+
+    You could create a second segment for "East" and "West" regions like this:
+
+    `segments=("region", pb.seg_group([["North", "South"], ["East", "West"]]))`
+
+    There will be a validation step created for every segment. Note that if there aren't any
+    segments created using `seg_group()` (or any other segment expression), the validation step will
+    fail to be evaluated during the interrogation process. Such a failure to evaluate will be
+    reported in the validation results but it won't affect the interrogation process overall
+    (i.e., the process won't be halted).
+
+    Parameters
+    ----------
+    values
+        A list of values to be grouped into a segment. This can be a single list or a list of lists.
+
+    Returns
+    -------
+    Segment
+        A `Segment` object, which can be used to combine values into a segment.
+
+    Examples
+    --------
+    ```{python}
+    #| echo: false
+    #| output: false
+    import pointblank as pb
+    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+    ```
+
+    Let's say we're analyzing sales from our local bookstore, and want to check the number of books
+    sold for the month exceeds a certain threshold. We could pass in the argument
+    `segments="genre"`, which would return a segment for each unique genre in the datasets. We could
+    also pass in `segments=("genre", ["Fantasy", "Science Fiction"])`, to only create segments for
+    those two genres. However, if we wanted to group these two genres into a single segment, we
+    could use the `seg_group()` function.
+
+    ```{python}
+    import pointblank as pb
+    import polars as pl
+
+    tbl = pl.DataFrame(
+        {
+            "title": [
+                "The Hobbit",
+                "Harry Potter and the Sorcerer's Stone",
+                "The Lord of the Rings",
+                "A Game of Thrones",
+                "The Name of the Wind",
+                "The Girl with the Dragon Tattoo",
+                "The Da Vinci Code",
+                "The Hitchhiker's Guide to the Galaxy",
+                "The Martian",
+                "Brave New World"
+            ],
+            "genre": [
+                "Fantasy",
+                "Fantasy",
+                "Fantasy",
+                "Fantasy",
+                "Fantasy",
+                "Mystery",
+                "Mystery",
+                "Science Fiction",
+                "Science Fiction",
+                "Science Fiction",
+            ],
+            "units_sold": [875, 932, 756, 623, 445, 389, 678, 534, 712, 598],
+        }
+    )
+
+    validation = (
+        pb.Validate(data=tbl)
+        .col_vals_gt(
+            columns="units_sold",
+            value=500,
+            segments=("genre", pb.seg_group(["Fantasy", "Science Fiction"]))
+        )
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    What's more, we can create multiple segments, combining the genres in different ways.
+
+    ```{python}
+    validation = (
+        pb.Validate(data=tbl)
+        .col_vals_gt(
+            columns="units_sold",
+            value=500,
+            segments=("genre", pb.seg_group([
+                ["Fantasy", "Science Fiction"],
+                ["Fantasy", "Mystery"],
+                ["Mystery", "Science Fiction"]
+            ]))
+        )
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    """
+    if isinstance(values, list):
+        if all(isinstance(v, list) for v in values):
+            return Segment(values)
+        else:
+            return Segment([values])
+    else:
+        raise ValueError("Must input a list of values for a segment.")
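Beyond the docstring examples reproduced above, the closing branch of `seg_group()` simply normalizes its input: a flat list becomes a single sub-segment, while a list of lists becomes one sub-segment per inner list. A small sketch of that behavior (the printed output assumes the dataclass-generated `__repr__` of `Segment`):

```python
import pointblank as pb

# A flat list is wrapped into a single sub-segment...
print(pb.seg_group(["North", "South"]))
#> Segment(segments=[['North', 'South']])

# ...while a list of lists yields one sub-segment per inner list,
# i.e. one validation segment per inner list during interrogation.
print(pb.seg_group([["North", "South"], ["East", "West"]]))
#> Segment(segments=[['North', 'South'], ['East', 'West']])
```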