dr-frames 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dr_frames/__init__.py +161 -0
- dr_frames/aggregation.py +138 -0
- dr_frames/cells.py +262 -0
- dr_frames/columns.py +108 -0
- dr_frames/filtering.py +96 -0
- dr_frames/formatting.py +257 -0
- dr_frames/parsing.py +69 -0
- dr_frames/profiling.py +246 -0
- dr_frames/py.typed +0 -0
- dr_frames/schema.py +244 -0
- dr_frames/types.py +61 -0
- dr_frames-0.1.0.dist-info/METADATA +207 -0
- dr_frames-0.1.0.dist-info/RECORD +15 -0
- dr_frames-0.1.0.dist-info/WHEEL +4 -0
- dr_frames-0.1.0.dist-info/licenses/LICENSE +21 -0
dr_frames/__init__.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .aggregation import (
|
|
4
|
+
aggregate_over_seeds,
|
|
5
|
+
apply_aggregations,
|
|
6
|
+
fillna_with_defaults,
|
|
7
|
+
get_constant_cols,
|
|
8
|
+
maybe_pipe,
|
|
9
|
+
unique_by_col,
|
|
10
|
+
unique_by_cols,
|
|
11
|
+
unique_non_null,
|
|
12
|
+
)
|
|
13
|
+
from .cells import (
|
|
14
|
+
apply_column_converters,
|
|
15
|
+
apply_if_column,
|
|
16
|
+
ensure_column,
|
|
17
|
+
fill_missing_values,
|
|
18
|
+
force_set_cell,
|
|
19
|
+
group_col_by_prefix,
|
|
20
|
+
map_column_with_fallback,
|
|
21
|
+
masked_getter,
|
|
22
|
+
masked_setter,
|
|
23
|
+
maybe_update_cell,
|
|
24
|
+
rename_columns,
|
|
25
|
+
require_row_index,
|
|
26
|
+
)
|
|
27
|
+
from .columns import (
|
|
28
|
+
apply_skip,
|
|
29
|
+
contained_cols,
|
|
30
|
+
drop_all_null_cols,
|
|
31
|
+
get_cols_by_contains,
|
|
32
|
+
get_cols_by_prefix,
|
|
33
|
+
move_cols_to_beginning,
|
|
34
|
+
move_cols_with_prefix_to_end,
|
|
35
|
+
move_numeric_cols_to_end,
|
|
36
|
+
remaining_cols,
|
|
37
|
+
strip_col_prefixes,
|
|
38
|
+
strip_col_prefixes_batch,
|
|
39
|
+
)
|
|
40
|
+
from .filtering import (
|
|
41
|
+
apply_filters_to_df,
|
|
42
|
+
filter_to_best_metric,
|
|
43
|
+
filter_to_range,
|
|
44
|
+
filter_to_value,
|
|
45
|
+
filter_to_values,
|
|
46
|
+
make_filter_fxn,
|
|
47
|
+
select_subset,
|
|
48
|
+
)
|
|
49
|
+
from .parsing import (
|
|
50
|
+
is_homogeneous,
|
|
51
|
+
parse_first_element,
|
|
52
|
+
parse_list_string,
|
|
53
|
+
sum_list_elements,
|
|
54
|
+
)
|
|
55
|
+
from .profiling import (
|
|
56
|
+
ColInfo,
|
|
57
|
+
DFColInfo,
|
|
58
|
+
infer_col_name_contains_tags,
|
|
59
|
+
infer_col_name_prefix_tags,
|
|
60
|
+
infer_col_name_suffix_tags,
|
|
61
|
+
infer_series_base_tag_type,
|
|
62
|
+
infer_tags_from_series_sample,
|
|
63
|
+
looks_like_json,
|
|
64
|
+
looks_like_path,
|
|
65
|
+
)
|
|
66
|
+
from .schema import (
|
|
67
|
+
ComputedField,
|
|
68
|
+
DataField,
|
|
69
|
+
DataFormat,
|
|
70
|
+
MetricDataField,
|
|
71
|
+
)
|
|
72
|
+
from .types import (
|
|
73
|
+
coerce_numeric_cols,
|
|
74
|
+
coerce_string_cols,
|
|
75
|
+
is_string_series,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Formatting helpers pull in optional display dependencies; import them
# best-effort so the core package still works when those extras are absent.
try:
    from .formatting import (  # noqa: F401
        FORMATTER_TYPES,
        OUTPUT_FORMATS,
        format_coverage_table,
        format_table,
    )

    _HAS_FORMATTING = True
except ImportError:
    _HAS_FORMATTING = False

# Public API re-exported at the package root (kept sorted case-insensitively).
__all__ = [
    "aggregate_over_seeds",
    "apply_aggregations",
    "apply_column_converters",
    "apply_filters_to_df",
    "apply_if_column",
    "apply_skip",
    "coerce_numeric_cols",
    "coerce_string_cols",
    "ColInfo",
    "ComputedField",
    "contained_cols",
    "DataField",
    "DataFormat",
    "DFColInfo",
    "drop_all_null_cols",
    "ensure_column",
    "fill_missing_values",
    "fillna_with_defaults",
    "filter_to_best_metric",
    "filter_to_range",
    "filter_to_value",
    "filter_to_values",
    "force_set_cell",
    "get_cols_by_contains",
    "get_cols_by_prefix",
    "get_constant_cols",
    "group_col_by_prefix",
    "infer_col_name_contains_tags",
    "infer_col_name_prefix_tags",
    "infer_col_name_suffix_tags",
    "infer_series_base_tag_type",
    "infer_tags_from_series_sample",
    "is_homogeneous",
    "is_string_series",
    "looks_like_json",
    "looks_like_path",
    "make_filter_fxn",
    "map_column_with_fallback",
    "masked_getter",
    "masked_setter",
    "maybe_pipe",
    "maybe_update_cell",
    "MetricDataField",
    "move_cols_to_beginning",
    "move_cols_with_prefix_to_end",
    "move_numeric_cols_to_end",
    "parse_first_element",
    "parse_list_string",
    "remaining_cols",
    "rename_columns",
    "require_row_index",
    "select_subset",
    "strip_col_prefixes",
    "strip_col_prefixes_batch",
    "sum_list_elements",
    "unique_by_col",
    "unique_by_cols",
    "unique_non_null",
]

# Only advertise the formatting names when their import actually succeeded.
if _HAS_FORMATTING:
    __all__.extend(
        [
            "format_table",
            "format_coverage_table",
            "FORMATTER_TYPES",
            "OUTPUT_FORMATS",
        ]
    )

__version__ = "0.1.0"
|
dr_frames/aggregation.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterable, Mapping, Sequence
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from .columns import (
|
|
9
|
+
apply_skip,
|
|
10
|
+
contained_cols,
|
|
11
|
+
move_cols_to_beginning,
|
|
12
|
+
move_cols_with_prefix_to_end,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Public API of the aggregation module.
__all__ = [
    "apply_aggregations",
    "aggregate_over_seeds",
    "unique_non_null",
    "unique_by_col",
    "unique_by_cols",
    "get_constant_cols",
    "fillna_with_defaults",
    "maybe_pipe",
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def unique_non_null(values: pd.Series | Iterable[Any]) -> list[Any]:
    """Return the distinct non-null values of *values*, in first-seen order."""
    if isinstance(values, pd.Series):
        series = values
    else:
        series = pd.Series(list(values))
    return series.dropna().unique().tolist()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def unique_by_col(df: pd.DataFrame, col: str) -> list[Any]:
    """Distinct values of one column (nulls included), in first-seen order."""
    uniques = df[col].unique()
    return uniques.tolist()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def unique_by_cols(df: pd.DataFrame, cols: Sequence[str]) -> dict[str, Any]:
    """Map each requested column that exists in *df* to its unique values."""
    result: dict[str, Any] = {}
    for col in contained_cols(df, cols):
        result[col] = unique_by_col(df, col)
    return result
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_constant_cols(df: pd.DataFrame, skip: Iterable[str] = ()) -> dict[str, Any]:
    """Columns whose value never varies across rows, mapped to that value.

    Frames with zero or one row are treated as having no constant columns.
    """
    if df.empty or len(df) <= 1:
        return {}
    constants: dict[str, Any] = {}
    for name in apply_skip(df.columns, skip):
        # nunique(dropna=False) counts NaN as a value, so an all-NaN column
        # also counts as constant.
        if df[name].nunique(dropna=False) <= 1:
            constants[name] = df[name].iloc[0]
    return constants
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def fillna_with_defaults(
    df: pd.DataFrame,
    defaults: Mapping[str, object] | Iterable[tuple[str, object]],
) -> pd.DataFrame:
    """Fill NaNs column-by-column from *defaults*; unknown columns are ignored.

    Returns *df* itself when there is nothing applicable to fill.
    """
    fill_values = dict(defaults)
    if not fill_values:
        return df
    applicable = {
        name: default for name, default in fill_values.items() if name in df.columns
    }
    if not applicable:
        return df
    return df.fillna(value=applicable)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def maybe_pipe(
    df: pd.DataFrame,
    condition: bool | Callable[[pd.DataFrame], bool] | Iterable | Mapping,
    func: Callable[..., pd.DataFrame],
    *args: Any,
    **kwargs: Any,
) -> pd.DataFrame:
    """Pipe *df* through *func* when *condition* holds; otherwise return *df*.

    A callable condition is evaluated against *df*; anything else is coerced
    with ``bool`` (so empty containers mean "skip").
    """
    if callable(condition):
        should_apply = condition(df)
    else:
        should_apply = bool(condition)
    if not should_apply:
        return df
    return df.pipe(func, *args, **kwargs)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def apply_aggregations(
    df: pd.DataFrame,
    group_col: str,
    agg_over_cols: Sequence[str],
    drop_cols: Sequence[str] | None = None,
    start_cols: Sequence[str] | None = None,
    sort_cols: Sequence[str] | None = None,
    end_prefix: str = "metrics_",
) -> pd.DataFrame:
    """Collapse rows sharing *group_col*: numeric columns are averaged and all
    other columns keep their first value.

    Columns in *agg_over_cols* and *drop_cols* are removed before grouping.
    Afterwards, columns starting with *end_prefix* are moved to the end,
    *group_col* and *start_cols* are moved to the front, and rows are
    optionally sorted by *sort_cols*.

    Raises:
        ValueError: if *group_col* is missing from *df* or listed in drop_cols.
    """
    if df.empty:
        return df
    if group_col not in df.columns:
        raise ValueError(f"Group column '{group_col}' not found in dataframe.")
    if group_col in (drop_cols or []):
        raise ValueError(f"Group column '{group_col}' cannot be in drop_cols.")

    cols_to_drop = {*(drop_cols or []), *agg_over_cols}
    numeric_cols = set(df.select_dtypes(include=["number"]).columns)
    # Build the agg spec in the frame's own column order so the output column
    # order is deterministic (iterating sets of strings is not, under hash
    # randomization — the original built the spec from set differences).
    agg_spec = {
        c: ("mean" if c in numeric_cols else "first")
        for c in df.columns
        if c != group_col and c not in cols_to_drop
    }

    # drop() already returns a new frame, so no extra copy() is needed.
    df = df.drop(columns=list(cols_to_drop))
    df = df.groupby(group_col).agg(agg_spec).reset_index()
    df = move_cols_with_prefix_to_end(df, prefix=end_prefix)
    df = move_cols_to_beginning(df, [group_col, *(start_cols or [])])
    if sort_cols:
        df = df.sort_values(list(sort_cols))
    return df
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def aggregate_over_seeds(
    df: pd.DataFrame,
    config_cols: list[str],
    metric_cols: list[str] | None = None,
    agg_funcs: list[str] | None = None,
) -> pd.DataFrame:
    """Aggregate metric columns over repeated runs (seeds) of each config.

    Rows are grouped by the *config_cols* present in *df*; each metric column
    produces one output column per aggregation function, named
    ``"{metric}_{func}"``. Metrics default to columns starting with
    ``"eval/"``; functions default to mean/std/count. When no metric column
    is present, the first row of each group is returned unchanged.

    Raises:
        ValueError: if *config_cols* is empty or none of them exist in *df*.
    """
    # Raise instead of assert: asserts are stripped under ``python -O``.
    if not config_cols:
        raise ValueError("config_cols must be provided")

    if agg_funcs is None:
        agg_funcs = ["mean", "std", "count"]

    if metric_cols is None:
        metric_cols = [col for col in df.columns if col.startswith("eval/")]

    valid_config_cols = [c for c in config_cols if c in df.columns]
    if not valid_config_cols:
        # Fail clearly here rather than with an opaque pandas groupby error.
        raise ValueError(
            f"None of the config columns {config_cols!r} exist in the dataframe."
        )

    agg_dict = {metric: agg_funcs for metric in metric_cols if metric in df.columns}

    if not agg_dict:
        return df.groupby(valid_config_cols, dropna=False).first().reset_index()

    aggregated = df.groupby(valid_config_cols, dropna=False).agg(agg_dict)
    # Flatten the (metric, func) MultiIndex into "<metric>_<func>" names.
    aggregated.columns = [f"{col}_{agg}" for col, agg in aggregated.columns]

    return aggregated.reset_index()
|
dr_frames/cells.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterable, Mapping
|
|
4
|
+
from typing import Any, cast
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
# Public API of the cells module.
__all__ = [
    "ensure_column",
    "fill_missing_values",
    "rename_columns",
    "map_column_with_fallback",
    "apply_column_converters",
    "maybe_update_cell",
    "force_set_cell",
    "apply_if_column",
    "masked_getter",
    "masked_setter",
    "require_row_index",
    "group_col_by_prefix",
]

# Alias for the sentinel values maybe_update_cell treats as "missing".
MissingMarkers = Iterable[Any]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def ensure_column(
    df: pd.DataFrame,
    column: str,
    default: Any,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Guarantee *column* exists; create it with *default* or fill its NaNs.

    A ``default`` of None only creates a missing column and never fills an
    existing one.
    """
    target = df if inplace else df.copy()
    if column not in target.columns:
        target[column] = default
        return target
    if default is not None:
        target[column] = target[column].fillna(default)
    return target
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def fill_missing_values(
    df: pd.DataFrame,
    defaults: Mapping[str, Any],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Fill NaNs in each listed column with its default; absent columns are skipped."""
    target = df if inplace else df.copy()
    for name in (n for n in defaults if n in target.columns):
        target[name] = target[name].fillna(defaults[name])
    return target
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def rename_columns(
    df: pd.DataFrame,
    mapping: Mapping[str, str],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Rename columns per *mapping*, silently ignoring names not in *df*."""
    target = df if inplace else df.copy()
    applicable = {old: new for old, new in mapping.items() if old in target.columns}
    if not applicable:
        return target
    return target.rename(columns=applicable)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def map_column_with_fallback(
    df: pd.DataFrame,
    column: str,
    mapping: Mapping[str, Any],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Remap values of *column* via *mapping*; unmapped and NaN values pass through."""
    target = df if inplace else df.copy()
    if column not in target.columns:
        return target

    def remap(value: Any) -> Any:
        # NaN stays NaN; anything not in the mapping falls back to itself.
        if pd.isna(value):
            return value
        return mapping.get(value, value)

    target[column] = target[column].map(remap)
    return target
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def apply_column_converters(
    df: pd.DataFrame,
    converters: Mapping[str, Callable[[Any], Any]],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Apply each converter element-wise to its column; absent columns are skipped."""
    target = df if inplace else df.copy()
    for name, convert in converters.items():
        if name not in target.columns:
            continue
        target[name] = target[name].apply(convert)
    return target
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _matches_missing_marker(current: Any, marker: Any) -> bool:
|
|
104
|
+
"""Safely check if current value matches a missing marker.
|
|
105
|
+
|
|
106
|
+
Handles NaN values via pd.isna comparisons and catches TypeError
|
|
107
|
+
for unhashable types to ensure comparisons never raise.
|
|
108
|
+
"""
|
|
109
|
+
try:
|
|
110
|
+
# Handle NaN comparisons first
|
|
111
|
+
if pd.isna(current) and pd.isna(marker):
|
|
112
|
+
return True
|
|
113
|
+
if pd.isna(current) or pd.isna(marker):
|
|
114
|
+
return False
|
|
115
|
+
# Safe equality check for hashable and unhashable types
|
|
116
|
+
return current == marker
|
|
117
|
+
except TypeError:
|
|
118
|
+
# Unhashable types (e.g., lists, dicts) can't be compared with ==
|
|
119
|
+
# in some contexts, so return False to be safe
|
|
120
|
+
return False
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def maybe_update_cell(
    df: pd.DataFrame,
    row_index: int,
    column: str,
    value: Any,
    *,
    missing_markers: MissingMarkers = (None, "N/A"),
    inplace: bool = False,
) -> pd.DataFrame:
    """Set a cell only when its current content is NaN or a *missing_markers* value.

    Unknown columns or row labels are a silent no-op.
    """
    target = df if inplace else df.copy()
    if column not in target.columns or row_index not in target.index:
        return target

    current = target.loc[row_index, column]
    if not pd.isna(current):
        matched = any(_matches_missing_marker(current, m) for m in missing_markers)
        if not matched:
            return target
    target.loc[row_index, column] = value
    return target
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def force_set_cell(
    df: pd.DataFrame,
    row_index: int,
    column: str,
    value: Any,
    *,
    default: Any = None,
    inplace: bool = False,
) -> pd.DataFrame:
    """Unconditionally write a cell, creating the column (filled with *default*) first."""
    target = df if inplace else df.copy()
    # Create/fill the column in place on our working copy before writing.
    ensure_column(target, column, default, inplace=True)
    target.loc[row_index, column] = value
    return target
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def apply_if_column(
    df: pd.DataFrame,
    column: str,
    func: Callable[[pd.Series], pd.Series],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Transform *column* with *func* (Series -> Series) when it exists."""
    target = df if inplace else df.copy()
    if column in target.columns:
        target[column] = func(target[column])
    return target
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def require_row_index(
    df: pd.DataFrame,
    column: str,
    value: Any,
) -> int:
    """Return the index label of the single row where ``df[column] == value``.

    Raises:
        ValueError: when zero or more than one row matches.
    """
    hits = df.index[df[column] == value]
    n_hits = len(hits)
    if n_hits == 0:
        raise ValueError(f"No rows found where {column} == {value!r}")
    if n_hits > 1:
        raise ValueError(f"Multiple rows found where {column} == {value!r}")
    return int(hits[0])
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def masked_getter(df: pd.DataFrame, mask: pd.Series, column: str) -> Any:
    """First value of *column* among rows selected by *mask*, or None."""
    if column not in df.columns:
        return None
    matched = df.loc[mask, column]
    return None if matched.empty else matched.iloc[0]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def masked_setter(
    df: pd.DataFrame,
    mask: pd.Series,
    column: str,
    value: Any,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Assign *value* to *column* on every row selected by *mask*."""
    if inplace:
        target = df
    else:
        target = df.copy()
    target.loc[mask, column] = value
    return target
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _normalize_prefix_items(
|
|
212
|
+
prefix_map: Mapping[str, str] | Iterable[tuple[str, str]] | None,
|
|
213
|
+
) -> list[tuple[str, str]]:
|
|
214
|
+
if prefix_map is None:
|
|
215
|
+
return []
|
|
216
|
+
items = list(prefix_map.items() if isinstance(prefix_map, Mapping) else prefix_map)
|
|
217
|
+
assert all(isinstance(item, tuple) and len(item) == 2 for item in items), (
|
|
218
|
+
f"Prefix map must be a mapping or iterable of tuples, received {type(prefix_map)!r}."
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
normalized: list[tuple[str, str]] = []
|
|
222
|
+
for prefix, group in items:
|
|
223
|
+
assert isinstance(prefix, str), (
|
|
224
|
+
f"Prefix keys must be strings, received {type(prefix)!r}."
|
|
225
|
+
)
|
|
226
|
+
assert isinstance(group, str), (
|
|
227
|
+
f"Group names must be strings, received {type(group)!r}."
|
|
228
|
+
)
|
|
229
|
+
lowered = prefix.strip().lower()
|
|
230
|
+
assert lowered, "Prefix keys must be non-empty after stripping whitespace."
|
|
231
|
+
normalized.append((lowered, group))
|
|
232
|
+
normalized.sort(key=lambda item: (-len(item[0]), item[0]))
|
|
233
|
+
return normalized
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def group_col_by_prefix(
    df: pd.DataFrame,
    column: str,
    prefix_map: Mapping[str, str] | Iterable[tuple[str, str]] | None,
    *,
    output_col: str,
) -> pd.Series:
    """Bucket string values of *column* into groups by case-insensitive prefix.

    Non-string cells, NaN cells, and strings matching no prefix pass through
    unchanged. The returned Series is renamed to *output_col*.
    """
    assert column in df.columns, f"Column '{column}' not present in DataFrame."

    source = cast(pd.Series, df[column])
    prefixes = _normalize_prefix_items(prefix_map)

    if not prefixes:
        return source.copy().rename(output_col)

    def to_group(cell: object) -> object:
        if bool(pd.isna(cell)):
            return cell
        if not isinstance(cell, str):
            return cell
        lowered = cell.lower()
        for prefix, group in prefixes:
            if lowered.startswith(prefix):
                return group
        return cell

    return source.map(to_group).rename(output_col)
|
dr_frames/columns.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from dr_frames.types import is_string_series
|
|
8
|
+
|
|
9
|
+
# Public API of the columns module.
__all__ = [
    "apply_skip",
    "contained_cols",
    "remaining_cols",
    "get_cols_by_prefix",
    "get_cols_by_contains",
    "strip_col_prefixes",
    "strip_col_prefixes_batch",
    "move_cols_to_beginning",
    "move_numeric_cols_to_end",
    "move_cols_with_prefix_to_end",
    "drop_all_null_cols",
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _strip_prefix(text: str, prefix: str) -> str:
|
|
25
|
+
return text[len(prefix) :] if text.startswith(prefix) else text
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def apply_skip(
    columns: Sequence[str] | pd.Index, skip: Iterable[str] = ()
) -> list[str]:
    """Return *columns* in order, minus any name listed in *skip*."""
    excluded = set(skip)
    return [name for name in columns if name not in excluded]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def contained_cols(df: pd.DataFrame, columns: Sequence[str]) -> list[str]:
    """Subset of *columns* (in their given order) that actually exist in *df*."""
    existing = set(df.columns)
    return [name for name in columns if name in existing]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def remaining_cols(df: pd.DataFrame, cols: Iterable[str]) -> list[str]:
    """Columns of *df* (in frame order) that are not listed in *cols*."""
    excluded = set(cols)
    return [name for name in df.columns if name not in excluded]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_cols_by_prefix(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> list[str]:
    """Column names starting with *prefix*, excluding any listed in *skip*."""
    candidates = apply_skip(df.columns, skip)
    return [name for name in candidates if name.startswith(prefix)]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_cols_by_contains(
    df: pd.DataFrame, substr: str, skip: Iterable[str] = ()
) -> list[str]:
    """Column names containing *substr*, excluding any listed in *skip*."""
    candidates = apply_skip(df.columns, skip)
    return [name for name in candidates if substr in name]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def strip_col_prefixes(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> pd.DataFrame:
    """Drop a leading *prefix* from every matching column name (except *skip*)."""
    renames: dict[str, str] = {}
    for name in get_cols_by_prefix(df, prefix, skip):
        renames[name] = _strip_prefix(name, prefix)
    return df.rename(columns=renames)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def strip_col_prefixes_batch(
    df: pd.DataFrame,
    prefix_map: Mapping[str, Iterable[str]] | None = None,
) -> pd.DataFrame:
    """Strip several prefixes, longest first so more specific prefixes win.

    *prefix_map* maps each prefix to the column names it should leave alone.
    Returns *df* unchanged when the map is empty or None.
    """
    if not prefix_map:
        return df

    by_length = sorted(prefix_map.items(), key=lambda item: len(item[0]), reverse=True)
    result = df
    for prefix, skip in by_length:
        result = strip_col_prefixes(result, prefix, skip)
    return result
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def move_cols_to_beginning(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Reorder so that the listed (and present) columns come first."""
    leading = contained_cols(df, cols)
    trailing = remaining_cols(df, cols)
    return df.loc[:, [*leading, *trailing]]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def move_numeric_cols_to_end(df: pd.DataFrame) -> pd.DataFrame:
    """Reorder so that every numeric-dtype column comes last."""
    numerics = list(df.select_dtypes(include=["number"]).columns)
    return df.loc[:, [*remaining_cols(df, numerics), *numerics]]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def move_cols_with_prefix_to_end(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> pd.DataFrame:
    """Reorder so that columns whose name starts with *prefix* come last."""
    suffix_cols = get_cols_by_prefix(df, prefix, skip)
    return df.loc[:, [*remaining_cols(df, suffix_cols), *suffix_cols]]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def drop_all_null_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns holding no information: all-NaN, or all blank strings.

    Blank/whitespace-only strings are first converted to ``pd.NA`` so that a
    column of empty strings also counts as "all null". Returns a new frame;
    *df* is not modified.
    """
    working = df.copy()
    # Candidate columns that may hold strings (object or pandas string dtype).
    object_cols = working.select_dtypes(include=["object", "string"])
    # Start with an all-False mask; only blank string cells get flagged.
    blank_mask = pd.DataFrame(False, index=working.index, columns=working.columns)
    if not object_cols.empty:
        # Keep only columns whose values actually look like strings
        # (is_string_series is the project helper imported from dr_frames.types).
        string_cols = [c for c, col in object_cols.items() if is_string_series(col)]
        if string_cols:
            # Whitespace-only values count as blank. NaN cells compare False
            # here (str.strip propagates NaN), so dropna handles them below.
            blank_mask[string_cols] = object_cols[string_cols].apply(
                lambda col: col.str.strip() == ""
            )
    working = working.mask(blank_mask, other=pd.NA)
    return working.dropna(axis=1, how="all")
|