dr-frames 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dr_frames/__init__.py ADDED
@@ -0,0 +1,161 @@
1
+ from __future__ import annotations
2
+
3
+ from .aggregation import (
4
+ aggregate_over_seeds,
5
+ apply_aggregations,
6
+ fillna_with_defaults,
7
+ get_constant_cols,
8
+ maybe_pipe,
9
+ unique_by_col,
10
+ unique_by_cols,
11
+ unique_non_null,
12
+ )
13
+ from .cells import (
14
+ apply_column_converters,
15
+ apply_if_column,
16
+ ensure_column,
17
+ fill_missing_values,
18
+ force_set_cell,
19
+ group_col_by_prefix,
20
+ map_column_with_fallback,
21
+ masked_getter,
22
+ masked_setter,
23
+ maybe_update_cell,
24
+ rename_columns,
25
+ require_row_index,
26
+ )
27
+ from .columns import (
28
+ apply_skip,
29
+ contained_cols,
30
+ drop_all_null_cols,
31
+ get_cols_by_contains,
32
+ get_cols_by_prefix,
33
+ move_cols_to_beginning,
34
+ move_cols_with_prefix_to_end,
35
+ move_numeric_cols_to_end,
36
+ remaining_cols,
37
+ strip_col_prefixes,
38
+ strip_col_prefixes_batch,
39
+ )
40
+ from .filtering import (
41
+ apply_filters_to_df,
42
+ filter_to_best_metric,
43
+ filter_to_range,
44
+ filter_to_value,
45
+ filter_to_values,
46
+ make_filter_fxn,
47
+ select_subset,
48
+ )
49
+ from .parsing import (
50
+ is_homogeneous,
51
+ parse_first_element,
52
+ parse_list_string,
53
+ sum_list_elements,
54
+ )
55
+ from .profiling import (
56
+ ColInfo,
57
+ DFColInfo,
58
+ infer_col_name_contains_tags,
59
+ infer_col_name_prefix_tags,
60
+ infer_col_name_suffix_tags,
61
+ infer_series_base_tag_type,
62
+ infer_tags_from_series_sample,
63
+ looks_like_json,
64
+ looks_like_path,
65
+ )
66
+ from .schema import (
67
+ ComputedField,
68
+ DataField,
69
+ DataFormat,
70
+ MetricDataField,
71
+ )
72
+ from .types import (
73
+ coerce_numeric_cols,
74
+ coerce_string_cols,
75
+ is_string_series,
76
+ )
77
+
78
+ try:
79
+ from .formatting import ( # noqa: F401
80
+ FORMATTER_TYPES,
81
+ OUTPUT_FORMATS,
82
+ format_coverage_table,
83
+ format_table,
84
+ )
85
+
86
+ _HAS_FORMATTING = True
87
+ except ImportError:
88
+ _HAS_FORMATTING = False
89
+
90
+ __all__ = [
91
+ "aggregate_over_seeds",
92
+ "apply_aggregations",
93
+ "apply_column_converters",
94
+ "apply_filters_to_df",
95
+ "apply_if_column",
96
+ "apply_skip",
97
+ "coerce_numeric_cols",
98
+ "coerce_string_cols",
99
+ "ColInfo",
100
+ "ComputedField",
101
+ "contained_cols",
102
+ "DataField",
103
+ "DataFormat",
104
+ "DFColInfo",
105
+ "drop_all_null_cols",
106
+ "ensure_column",
107
+ "fill_missing_values",
108
+ "fillna_with_defaults",
109
+ "filter_to_best_metric",
110
+ "filter_to_range",
111
+ "filter_to_value",
112
+ "filter_to_values",
113
+ "force_set_cell",
114
+ "get_cols_by_contains",
115
+ "get_cols_by_prefix",
116
+ "get_constant_cols",
117
+ "group_col_by_prefix",
118
+ "infer_col_name_contains_tags",
119
+ "infer_col_name_prefix_tags",
120
+ "infer_col_name_suffix_tags",
121
+ "infer_series_base_tag_type",
122
+ "infer_tags_from_series_sample",
123
+ "is_homogeneous",
124
+ "is_string_series",
125
+ "looks_like_json",
126
+ "looks_like_path",
127
+ "make_filter_fxn",
128
+ "map_column_with_fallback",
129
+ "masked_getter",
130
+ "masked_setter",
131
+ "maybe_pipe",
132
+ "maybe_update_cell",
133
+ "MetricDataField",
134
+ "move_cols_to_beginning",
135
+ "move_cols_with_prefix_to_end",
136
+ "move_numeric_cols_to_end",
137
+ "parse_first_element",
138
+ "parse_list_string",
139
+ "remaining_cols",
140
+ "rename_columns",
141
+ "require_row_index",
142
+ "select_subset",
143
+ "strip_col_prefixes",
144
+ "strip_col_prefixes_batch",
145
+ "sum_list_elements",
146
+ "unique_by_col",
147
+ "unique_by_cols",
148
+ "unique_non_null",
149
+ ]
150
+
151
+ if _HAS_FORMATTING:
152
+ __all__.extend(
153
+ [
154
+ "format_table",
155
+ "format_coverage_table",
156
+ "FORMATTER_TYPES",
157
+ "OUTPUT_FORMATS",
158
+ ]
159
+ )
160
+
161
+ __version__ = "0.1.0"
dr_frames/aggregation.py ADDED
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterable, Mapping, Sequence
4
+ from typing import Any
5
+
6
+ import pandas as pd
7
+
8
+ from .columns import (
9
+ apply_skip,
10
+ contained_cols,
11
+ move_cols_to_beginning,
12
+ move_cols_with_prefix_to_end,
13
+ )
14
+
15
+ __all__ = [
16
+ "apply_aggregations",
17
+ "aggregate_over_seeds",
18
+ "unique_non_null",
19
+ "unique_by_col",
20
+ "unique_by_cols",
21
+ "get_constant_cols",
22
+ "fillna_with_defaults",
23
+ "maybe_pipe",
24
+ ]
25
+
26
+
27
def unique_non_null(values: pd.Series | Iterable[Any]) -> list[Any]:
    """Return the distinct non-null values of *values* in first-seen order."""
    if isinstance(values, pd.Series):
        series = values
    else:
        series = pd.Series(list(values))
    return series.dropna().unique().tolist()
30
+
31
+
32
def unique_by_col(df: pd.DataFrame, col: str) -> list[Any]:
    """Distinct values of column *col* (nulls included), in first-seen order."""
    distinct = df[col].unique()
    return distinct.tolist()
34
+
35
+
36
def unique_by_cols(df: pd.DataFrame, cols: Sequence[str]) -> dict[str, Any]:
    """Map each requested column that exists in *df* to its unique values."""
    result: dict[str, Any] = {}
    for column in contained_cols(df, cols):
        result[column] = unique_by_col(df, column)
    return result
39
+
40
+
41
def get_constant_cols(df: pd.DataFrame, skip: Iterable[str] = ()) -> dict[str, Any]:
    """Return ``{column: value}`` for columns that hold one value on every row.

    Frames with fewer than two rows yield ``{}`` — "constant" is only
    meaningful when there is something to compare against.
    """
    if df.empty or len(df) <= 1:
        return {}
    constants: dict[str, Any] = {}
    for column in apply_skip(df.columns, skip):
        # dropna=False so a column of all-NaN (or value-plus-nothing) counts too.
        if df[column].nunique(dropna=False) <= 1:
            constants[column] = df[column].iloc[0]
    return constants
49
+
50
+
51
def fillna_with_defaults(
    df: pd.DataFrame,
    defaults: Mapping[str, object] | Iterable[tuple[str, object]],
) -> pd.DataFrame:
    """Fill NA cells using per-column defaults; unknown columns are ignored.

    Returns *df* itself (no copy) when no default applies.
    """
    fill_values = {
        column: default
        for column, default in dict(defaults).items()
        if column in df.columns
    }
    if not fill_values:
        return df
    return df.fillna(value=fill_values)
60
+
61
+
62
def maybe_pipe(
    df: pd.DataFrame,
    condition: bool | Callable[[pd.DataFrame], bool] | Iterable | Mapping,
    func: Callable[..., pd.DataFrame],
    *args: Any,
    **kwargs: Any,
) -> pd.DataFrame:
    """Apply *func* via ``df.pipe`` when *condition* holds, else return *df*.

    A callable condition is evaluated against *df*; any other value is
    tested for truthiness (so empty containers mean "skip").
    """
    if callable(condition):
        should_apply = condition(df)
    else:
        should_apply = bool(condition)
    if not should_apply:
        return df
    return df.pipe(func, *args, **kwargs)
71
+
72
+
73
def apply_aggregations(
    df: pd.DataFrame,
    group_col: str,
    agg_over_cols: Sequence[str],
    drop_cols: Sequence[str] | None = None,
    start_cols: Sequence[str] | None = None,
    sort_cols: Sequence[str] | None = None,
    end_prefix: str = "metrics_",
) -> pd.DataFrame:
    """Group *df* by *group_col*, averaging numeric columns and keeping the
    first value of every other column, then reorder columns for display.

    Args:
        df: Input frame; returned untouched when empty.
        group_col: Column to group on. Must exist and must not be dropped.
        agg_over_cols: Columns aggregated *over* (e.g. seed); they are
            removed before grouping.
        drop_cols: Additional columns to remove before grouping.
        start_cols: Columns moved to the front (after *group_col*).
        sort_cols: Columns to sort the result by, when given.
        end_prefix: Columns whose name starts with this are moved to the end.

    Returns:
        One row per *group_col* value, columns reordered as described.

    Raises:
        ValueError: If *group_col* is absent or listed in *drop_cols*.
    """
    if df.empty:
        return df
    if group_col not in df.columns:
        raise ValueError(f"Group column '{group_col}' not found in dataframe.")
    if group_col in (drop_cols or []):
        raise ValueError(f"Group column '{group_col}' cannot be in drop_cols.")

    cols_to_drop = {*(drop_cols or []), *agg_over_cols}
    numeric_cols = set(df.select_dtypes(include=["number"]).columns.tolist())
    cols_to_use = set(df.columns) - cols_to_drop - {group_col}
    # Numeric surviving columns get "mean"; everything else gets "first".
    mean_agg_cols = numeric_cols & cols_to_use
    first_agg_cols = cols_to_use - mean_agg_cols

    df = df.copy()
    # NOTE(review): raises KeyError when a column in drop_cols/agg_over_cols
    # is absent from df — confirm callers always pass existing columns.
    df = df.drop(columns=list(cols_to_drop))
    df = (
        df.groupby(group_col)
        .agg(
            {
                **dict.fromkeys(first_agg_cols, "first"),
                **dict.fromkeys(mean_agg_cols, "mean"),
            }
        )
        .reset_index()
    )
    # Presentation ordering: metrics-prefixed columns last, key columns first.
    df = move_cols_with_prefix_to_end(df, prefix=end_prefix)
    df = move_cols_to_beginning(df, [group_col, *(start_cols or [])])
    if sort_cols:
        df = df.sort_values(list(sort_cols))
    return df
112
+
113
+
114
def aggregate_over_seeds(
    df: pd.DataFrame,
    config_cols: list[str],
    metric_cols: list[str] | None = None,
    agg_funcs: list[str] | None = None,
) -> pd.DataFrame:
    """Aggregate metric columns over seeds, grouping rows by configuration.

    Args:
        df: Frame with one row per (configuration, seed) run.
        config_cols: Columns identifying a configuration; at least one must
            be non-empty and present in *df*.
        metric_cols: Metric columns to aggregate. Defaults to every column
            whose name starts with ``"eval/"``.
        agg_funcs: Aggregation names passed to ``DataFrame.agg``; defaults
            to ``["mean", "std", "count"]``.

    Returns:
        One row per configuration with flattened ``{metric}_{agg}`` columns,
        or a ``first()`` collapse when no metric column is present in *df*.

    Raises:
        ValueError: If *config_cols* is empty or none of them exist in *df*.
    """
    # Raise instead of assert: asserts are stripped under `python -O`.
    if not config_cols:
        raise ValueError("config_cols must be provided")

    if agg_funcs is None:
        agg_funcs = ["mean", "std", "count"]

    if metric_cols is None:
        metric_cols = [col for col in df.columns if col.startswith("eval/")]

    valid_config_cols = [c for c in config_cols if c in df.columns]
    if not valid_config_cols:
        # groupby([]) raises an opaque pandas error; fail with a clear message.
        raise ValueError(
            f"None of the config columns {config_cols!r} exist in the dataframe."
        )

    agg_dict = {metric: agg_funcs for metric in metric_cols if metric in df.columns}

    if not agg_dict:
        # Nothing to aggregate: collapse duplicate configs, keeping first row.
        return df.groupby(valid_config_cols, dropna=False).first().reset_index()

    aggregated = df.groupby(valid_config_cols, dropna=False).agg(agg_dict)
    # Flatten the (metric, agg) MultiIndex into "metric_agg" column names.
    aggregated.columns = [f"{col}_{agg}" for col, agg in aggregated.columns]

    return aggregated.reset_index()
dr_frames/cells.py ADDED
@@ -0,0 +1,262 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterable, Mapping
4
+ from typing import Any, cast
5
+
6
+ import pandas as pd
7
+
8
+ __all__ = [
9
+ "ensure_column",
10
+ "fill_missing_values",
11
+ "rename_columns",
12
+ "map_column_with_fallback",
13
+ "apply_column_converters",
14
+ "maybe_update_cell",
15
+ "force_set_cell",
16
+ "apply_if_column",
17
+ "masked_getter",
18
+ "masked_setter",
19
+ "require_row_index",
20
+ "group_col_by_prefix",
21
+ ]
22
+
23
+ MissingMarkers = Iterable[Any]
24
+
25
+
26
def ensure_column(
    df: pd.DataFrame,
    column: str,
    default: Any,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Guarantee *column* exists; fill its NA cells with *default* when given.

    A missing column is created holding *default*. An existing column keeps
    its values, with NAs replaced by *default* unless *default* is None.
    """
    target = df.copy() if not inplace else df
    if column in target.columns:
        if default is not None:
            target[column] = target[column].fillna(default)
    else:
        target[column] = default
    return target
41
+
42
+
43
def fill_missing_values(
    df: pd.DataFrame,
    defaults: Mapping[str, Any],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Fill NA cells per-column from *defaults*; absent columns are skipped."""
    target = df if inplace else df.copy()
    for col in (c for c in defaults if c in target.columns):
        target[col] = target[col].fillna(defaults[col])
    return target
54
+
55
+
56
def rename_columns(
    df: pd.DataFrame,
    mapping: Mapping[str, str],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Rename columns per *mapping*, ignoring keys not present in *df*.

    ``inplace=True`` only skips the defensive copy of the input; an actual
    rename still produces a new frame object (pandas ``rename`` semantics).
    """
    target = df if inplace else df.copy()
    applicable = {old: new for old, new in mapping.items() if old in target.columns}
    return target.rename(columns=applicable) if applicable else target
67
+
68
+
69
def map_column_with_fallback(
    df: pd.DataFrame,
    column: str,
    mapping: Mapping[str, Any],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Translate values of *column* through *mapping*.

    Unmapped values and NA cells pass through unchanged; a missing column is
    a no-op (a copy unless ``inplace``).
    """
    target = df if inplace else df.copy()
    if column not in target.columns:
        return target

    def translate(item: Any) -> Any:
        return item if pd.isna(item) else mapping.get(item, item)

    target[column] = target[column].map(translate)
    return target
88
+
89
+
90
def apply_column_converters(
    df: pd.DataFrame,
    converters: Mapping[str, Callable[[Any], Any]],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Run each converter element-wise over its column; absent columns skip."""
    target = df if inplace else df.copy()
    applicable = [(c, fn) for c, fn in converters.items() if c in target.columns]
    for column, converter in applicable:
        target[column] = target[column].apply(converter)
    return target
101
+
102
+
103
def _matches_missing_marker(current: Any, marker: Any) -> bool:
    """Safely check whether *current* matches a missing-value marker.

    NaN-vs-NaN compares as a match (via ``pd.isna`` on both sides). Any
    comparison that raises is treated as "no match", so this helper never
    propagates an exception to callers.
    """
    try:
        current_na = pd.isna(current)
        marker_na = pd.isna(marker)
        # Handle NaN comparisons first.
        if current_na and marker_na:
            return True
        if current_na or marker_na:
            return False
        return current == marker
    except (TypeError, ValueError):
        # pd.isna on a list-like returns an ndarray whose truth value is
        # ambiguous (ValueError) — the original `except TypeError` let that
        # escape, breaking the "never raise" contract. Exotic __eq__
        # implementations can raise TypeError. Either way: not a match.
        return False
121
+
122
+
123
def maybe_update_cell(
    df: pd.DataFrame,
    row_index: int,
    column: str,
    value: Any,
    *,
    missing_markers: MissingMarkers = (None, "N/A"),
    inplace: bool = False,
) -> pd.DataFrame:
    """Write *value* at (row_index, column) only when the cell is missing.

    "Missing" means NA, or equal to one of *missing_markers*. An unknown
    column or row label is a silent no-op.
    """
    target = df if inplace else df.copy()
    if column not in target.columns or row_index not in target.index:
        return target

    current = target.loc[row_index, column]
    # Short-circuit: NA wins before marker comparison, as in the original.
    if pd.isna(current):
        target.loc[row_index, column] = value
        return target
    if any(_matches_missing_marker(current, marker) for marker in missing_markers):
        target.loc[row_index, column] = value
    return target
143
+
144
+
145
def force_set_cell(
    df: pd.DataFrame,
    row_index: int,
    column: str,
    value: Any,
    *,
    default: Any = None,
    inplace: bool = False,
) -> pd.DataFrame:
    """Unconditionally set (row_index, column) to *value*.

    The column is created first (filled with *default*) when it is absent.
    """
    target = df if inplace else df.copy()
    # ensure_column with inplace=True mutates and returns `target` itself.
    ensure_column(target, column, default, inplace=True)
    target.loc[row_index, column] = value
    return target
158
+
159
+
160
def apply_if_column(
    df: pd.DataFrame,
    column: str,
    func: Callable[[pd.Series], pd.Series],
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Replace *column* with ``func(column)`` when it exists.

    A missing column returns *df* untouched (a copy unless ``inplace``).
    """
    target = df if inplace else df.copy()
    if column in target.columns:
        target[column] = func(target[column])
    return target
173
+
174
+
175
def require_row_index(
    df: pd.DataFrame,
    column: str,
    value: Any,
) -> int:
    """Return the single row label where ``df[column] == value``.

    Raises:
        ValueError: When zero rows or more than one row match.
    """
    matches = df.index[df[column] == value]
    count = len(matches)
    if count == 0:
        raise ValueError(f"No rows found where {column} == {value!r}")
    if count > 1:
        raise ValueError(f"Multiple rows found where {column} == {value!r}")
    return int(matches[0])
186
+
187
+
188
def masked_getter(df: pd.DataFrame, mask: pd.Series, column: str) -> Any:
    """First value of *column* among rows selected by *mask*.

    Returns None when the column is absent or the selection is empty.
    """
    if column not in df.columns:
        return None
    selected = df.loc[mask, column]
    return None if selected.empty else selected.iloc[0]
196
+
197
+
198
def masked_setter(
    df: pd.DataFrame,
    mask: pd.Series,
    column: str,
    value: Any,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Assign *value* to *column* on every row selected by *mask*.

    Pandas ``.loc`` semantics apply: a missing column is created, with
    unselected rows left as NA.
    """
    target = df if inplace else df.copy()
    target.loc[mask, column] = value
    return target
209
+
210
+
211
def _normalize_prefix_items(
    prefix_map: Mapping[str, str] | Iterable[tuple[str, str]] | None,
) -> list[tuple[str, str]]:
    """Validate and canonicalize a prefix -> group mapping.

    Prefixes are stripped and lower-cased, then sorted longest-first (ties
    broken alphabetically) so the most specific prefix wins when matching.
    ``None`` yields an empty list.

    Raises:
        TypeError: If *prefix_map* is not a mapping/iterable of 2-tuples, or
            contains non-string prefixes or group names.
        ValueError: If a prefix is empty after stripping whitespace.
    """
    if prefix_map is None:
        return []
    items = list(prefix_map.items() if isinstance(prefix_map, Mapping) else prefix_map)
    # Explicit raises instead of `assert`: validation must survive `python -O`.
    if not all(isinstance(item, tuple) and len(item) == 2 for item in items):
        raise TypeError(
            f"Prefix map must be a mapping or iterable of tuples, received {type(prefix_map)!r}."
        )

    normalized: list[tuple[str, str]] = []
    for prefix, group in items:
        if not isinstance(prefix, str):
            raise TypeError(f"Prefix keys must be strings, received {type(prefix)!r}.")
        if not isinstance(group, str):
            raise TypeError(f"Group names must be strings, received {type(group)!r}.")
        lowered = prefix.strip().lower()
        if not lowered:
            raise ValueError("Prefix keys must be non-empty after stripping whitespace.")
        normalized.append((lowered, group))
    # Longest-first so "abc" is tried before "ab" during prefix matching.
    normalized.sort(key=lambda item: (-len(item[0]), item[0]))
    return normalized
234
+
235
+
236
def group_col_by_prefix(
    df: pd.DataFrame,
    column: str,
    prefix_map: Mapping[str, str] | Iterable[tuple[str, str]] | None,
    *,
    output_col: str,
) -> pd.Series:
    """Bucket string values of *column* into groups by case-insensitive prefix.

    Longer prefixes take precedence; non-string and NA values pass through
    unchanged. Returns a new Series named *output_col*; *df* is untouched.
    """
    assert column in df.columns, f"Column '{column}' not present in DataFrame."

    source = cast(pd.Series, df[column])
    prefixes = _normalize_prefix_items(prefix_map)
    if not prefixes:
        return source.copy().rename(output_col)

    def bucket(value: object) -> object:
        # Guard clauses: NA and non-strings are returned as-is.
        if bool(pd.isna(value)):
            return value
        if not isinstance(value, str):
            return value
        lowered = value.lower()
        for prefix, group in prefixes:
            if lowered.startswith(prefix):
                return group
        return value

    return source.map(bucket).rename(output_col)
dr_frames/columns.py ADDED
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable, Mapping, Sequence
4
+
5
+ import pandas as pd
6
+
7
+ from dr_frames.types import is_string_series
8
+
9
+ __all__ = [
10
+ "apply_skip",
11
+ "contained_cols",
12
+ "remaining_cols",
13
+ "get_cols_by_prefix",
14
+ "get_cols_by_contains",
15
+ "strip_col_prefixes",
16
+ "strip_col_prefixes_batch",
17
+ "move_cols_to_beginning",
18
+ "move_numeric_cols_to_end",
19
+ "move_cols_with_prefix_to_end",
20
+ "drop_all_null_cols",
21
+ ]
22
+
23
+
24
def _strip_prefix(text: str, prefix: str) -> str:
    """Drop *prefix* from the front of *text* when present."""
    if not text.startswith(prefix):
        return text
    return text[len(prefix):]
26
+
27
+
28
def apply_skip(
    columns: Sequence[str] | pd.Index, skip: Iterable[str] = ()
) -> list[str]:
    """Return *columns* in order, minus any names listed in *skip*."""
    excluded = set(skip)
    return [name for name in columns if name not in excluded]
33
+
34
+
35
def contained_cols(df: pd.DataFrame, columns: Sequence[str]) -> list[str]:
    """Subset of *columns*, in the given order, that exist in *df*."""
    present = df.columns
    return [name for name in columns if name in present]
37
+
38
+
39
def remaining_cols(df: pd.DataFrame, cols: Iterable[str]) -> list[str]:
    """Columns of *df*, in frame order, that are not listed in *cols*."""
    excluded = set(cols)
    return [name for name in df.columns if name not in excluded]
42
+
43
+
44
def get_cols_by_prefix(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> list[str]:
    """Column names starting with *prefix*, excluding any in *skip*."""
    candidates = apply_skip(df.columns, skip)
    return [name for name in candidates if name.startswith(prefix)]
48
+
49
+
50
def get_cols_by_contains(
    df: pd.DataFrame, substr: str, skip: Iterable[str] = ()
) -> list[str]:
    """Column names containing *substr*, excluding any in *skip*."""
    candidates = apply_skip(df.columns, skip)
    return [name for name in candidates if substr in name]
54
+
55
+
56
def strip_col_prefixes(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> pd.DataFrame:
    """Rename columns that start with *prefix* to their un-prefixed form."""
    renames: dict[str, str] = {}
    for name in get_cols_by_prefix(df, prefix, skip):
        renames[name] = _strip_prefix(name, prefix)
    return df.rename(columns=renames)
64
+
65
+
66
def strip_col_prefixes_batch(
    df: pd.DataFrame,
    prefix_map: Mapping[str, Iterable[str]] | None = None,
) -> pd.DataFrame:
    """Strip several column prefixes, longest first.

    Each entry of *prefix_map* is ``prefix -> columns to skip`` for that
    prefix. Longest-first ordering ensures a more specific prefix is removed
    before a shorter prefix it contains.
    """
    if not prefix_map:
        return df

    result = df
    for prefix, skip in sorted(
        prefix_map.items(), key=lambda item: len(item[0]), reverse=True
    ):
        result = strip_col_prefixes(result, prefix, skip)
    return result
79
+
80
+
81
def move_cols_to_beginning(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Reorder so the listed columns (those present) come first."""
    leading = contained_cols(df, cols)
    trailing = remaining_cols(df, cols)
    return df.loc[:, leading + trailing]
83
+
84
+
85
def move_numeric_cols_to_end(df: pd.DataFrame) -> pd.DataFrame:
    """Reorder so all numeric-dtype columns come last, order otherwise kept."""
    numeric = df.select_dtypes(include=["number"]).columns.tolist()
    return df.loc[:, remaining_cols(df, numeric) + numeric]
88
+
89
+
90
def move_cols_with_prefix_to_end(
    df: pd.DataFrame, prefix: str, skip: Iterable[str] = ()
) -> pd.DataFrame:
    """Reorder so columns starting with *prefix* (minus *skip*) come last."""
    tail = get_cols_by_prefix(df, prefix, skip)
    head = remaining_cols(df, tail)
    return df.loc[:, head + tail]
95
+
96
+
97
def drop_all_null_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns that carry no information: all-NA, or all NA/blank strings.

    Whitespace-only strings in string-typed object columns are first masked
    to NA so that a column of "" / "   " values is also dropped. Returns a
    new frame; *df* is not modified.
    """
    working = df.copy()
    # Only object/string-dtype columns can contain blank strings worth masking.
    object_cols = working.select_dtypes(include=["object", "string"])
    blank_mask = pd.DataFrame(False, index=working.index, columns=working.columns)
    if not object_cols.empty:
        # NOTE(review): is_string_series comes from dr_frames.types; assumed to
        # select columns whose values are actually strings (so .str.strip()
        # is safe) — confirm against that module.
        string_cols = [c for c, col in object_cols.items() if is_string_series(col)]
        if string_cols:
            blank_mask[string_cols] = object_cols[string_cols].apply(
                lambda col: col.str.strip() == ""
            )
    # Blank cells become NA, then any column that is now entirely NA is dropped.
    working = working.mask(blank_mask, other=pd.NA)
    return working.dropna(axis=1, how="all")