dr-frames 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dr_frames/schema.py ADDED
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable
4
+ from typing import Literal
5
+
6
+ import pandas as pd
7
+ from pydantic import BaseModel, Field, computed_field, model_validator
8
+
9
+ __all__ = [
10
+ "DataField",
11
+ "ComputedField",
12
+ "MetricDataField",
13
+ "DataFormat",
14
+ ]
15
+
16
+
17
class DataField(BaseModel):
    """Metadata for one logical field and the DataFrame column that backs it.

    ``id_string`` is the field's identifier. ``column_name`` is the explicit
    DataFrame column for the field; when it is ``None`` the field is treated
    as unresolved and ``id_string`` is tried as the column label instead.
    """

    id_string: str
    description: str | None = None
    column_name: str | None = None
    display_name: str | None = None
    altair_type: Literal["Q", "N", "O", "T"] | None = None
    scale_hint: Literal["linear", "log"] | None = None
    is_config: bool = True

    @model_validator(mode="after")
    def derive_display_name(self) -> DataField:
        """Fill in ``display_name`` from ``id_string`` when none was given.

        Underscores and dots become spaces and the result is title-cased.
        """
        if self.display_name is None:
            # Write through object.__setattr__ so the assignment bypasses the
            # model's own attribute-setting machinery inside the validator.
            object.__setattr__(
                self,
                "display_name",
                self.id_string.replace("_", " ").replace(".", " ").title(),
            )
        return self

    @computed_field
    @property
    def is_resolved(self) -> bool:
        """Whether an explicit ``column_name`` has been set."""
        return self.column_name is not None

    def resolve_column(self, df: pd.DataFrame) -> str:
        """Return the column label to use for this field.

        An explicit ``column_name`` is returned as-is (it is not checked
        against ``df``); otherwise ``id_string`` is used if it is a column
        of ``df``.

        Raises:
            ValueError: if no ``column_name`` is set and ``id_string`` is not
                a column of ``df``.
        """
        if self.column_name:
            return self.column_name
        if self.id_string in df.columns:
            return self.id_string
        raise ValueError(
            f"Cannot resolve column for field with id_string='{self.id_string}'. "
            f"Neither column_name nor id_string '{self.id_string}' found in DataFrame columns. "
            f"Available columns: {list(df.columns)}"
        )

    def infer_altair_type(self, df: pd.DataFrame) -> str:
        """Return the Altair type code for this field.

        An explicit ``altair_type`` wins. Otherwise the column's dtype decides:
        numeric -> ``"Q"``, datetime -> ``"T"``, anything else -> ``"N"``.
        Fields whose column cannot be found in ``df`` fall back to ``"N"``.
        """
        if self.altair_type:
            return self.altair_type
        try:
            col = self.resolve_column(df)
        except ValueError:
            return "N"
        # resolve_column hands back an explicit column_name without checking
        # the frame, so guard the lookup here to keep the "N" fallback
        # consistent instead of leaking a KeyError.
        if col not in df.columns:
            return "N"
        dtype = df[col].dtype
        if pd.api.types.is_numeric_dtype(dtype):
            return "Q"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "T"
        return "N"
65
+
66
+
67
class ComputedField(DataField):
    """A field whose values are derived from a DataFrame by a callable.

    ``compute`` receives the whole frame and returns the derived series; it is
    excluded from serialization. ``source_columns`` lists the columns the
    computation is declared to depend on.
    """

    # pandas objects appear in the ``compute`` signature, so arbitrary
    # (non-pydantic) types must be permitted.
    model_config = {"arbitrary_types_allowed": True}

    source_columns: list[str] = Field(default_factory=list)
    compute: Callable[[pd.DataFrame], pd.Series] = Field(exclude=True)

    def apply(self, df: pd.DataFrame) -> pd.Series:
        """Run ``compute`` on ``df`` and return its result."""
        derived = self.compute(df)
        return derived
75
+
76
+
77
class MetricDataField(DataField):
    """A quantitative metric field carrying a group and metric-type label."""

    group: str = ""
    metric_type: str = ""

    @classmethod
    def from_column_name(cls, col: str) -> MetricDataField:
        """Build a metric field from a ``/``-separated column label.

        With four or more segments, the second segment is the group, the last
        is the metric type, and the display name is the segments from the
        third onward joined by spaces with parentheses removed. With fewer
        segments the group is ``"unknown"`` and the whole label is used for
        both the metric type and the display name.
        """
        segments = col.split("/")
        if len(segments) < 4:
            group_label, type_label, shown = "unknown", col, col
        else:
            group_label = segments[1]
            type_label = segments[-1]
            # Drop both parenthesis characters in a single pass.
            shown = " ".join(segments[2:]).translate(str.maketrans("", "", "()"))
        return cls(
            id_string=col,
            column_name=col,
            display_name=shown,
            group=group_label,
            metric_type=type_label,
            altair_type="Q",
        )
100
+
101
+
102
class DataFormat(BaseModel):
    """Container describing the fields, computed fields and metrics of a frame.

    ``column_overrides`` maps a field's ``id_string`` to the DataFrame column
    that should back it. ``metric_prefix`` is the label prefix that marks a
    column as a metric.
    """

    fields: list[DataField] = Field(default_factory=list)
    computed_fields: list[ComputedField] = Field(default_factory=list)
    metrics: list[MetricDataField] = Field(default_factory=list)
    column_overrides: dict[str, str] = Field(default_factory=dict)
    metric_prefix: str = "eval/"

    @staticmethod
    def _lookup_column(
        id_string: str, df: pd.DataFrame, overrides: dict[str, str]
    ) -> str | None:
        """Return the override for ``id_string``, else ``id_string`` itself if
        it is a column of ``df``, else ``None``."""
        return overrides.get(id_string) or (
            id_string if id_string in df.columns else None
        )

    @staticmethod
    def _metric_columns(df: pd.DataFrame, prefix: str) -> list[str]:
        """Return the string column labels of ``df`` that start with ``prefix``.

        Non-string labels (e.g. integer column names) are skipped rather than
        raising ``AttributeError`` on ``startswith``.
        """
        return [
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(prefix)
        ]

    @classmethod
    def from_dict(
        cls,
        field_descriptions: dict[str, str],
        df: pd.DataFrame,
        column_overrides: dict[str, str] | None = None,
    ) -> DataFormat:
        """Build a format from an ``id_string -> description`` mapping.

        Each field's ``column_name`` is resolved from ``column_overrides``
        first and from ``df``'s columns second; it stays ``None`` if neither
        matches. Metrics are not populated by this constructor.
        """
        overrides = column_overrides or {}
        fields = [
            DataField(
                id_string=k,
                description=v,
                column_name=cls._lookup_column(k, df, overrides),
            )
            for k, v in field_descriptions.items()
        ]
        return cls(fields=fields, column_overrides=overrides)

    @classmethod
    def from_df(
        cls,
        df: pd.DataFrame,
        field_descriptions: dict[str, str] | None = None,
        computed_fields: list[ComputedField] | None = None,
        column_overrides: dict[str, str] | None = None,
        metric_prefix: str = "eval/",
    ) -> DataFormat:
        """Build a format from a DataFrame, discovering metric columns.

        When ``field_descriptions`` is ``None`` the fields of a default
        instance (``cls()``) are used, with any unresolved ``column_name``
        filled in from the overrides / frame columns. When ``computed_fields``
        is ``None`` the default instance's computed fields are used. Every
        column of ``df`` whose label starts with ``metric_prefix`` becomes a
        ``MetricDataField``.
        """
        overrides = column_overrides or {}
        # A default instance supplies the fallback fields / computed fields.
        defaults = cls()

        if field_descriptions is not None:
            fields = [
                DataField(
                    id_string=k,
                    description=v,
                    column_name=cls._lookup_column(k, df, overrides),
                )
                for k, v in field_descriptions.items()
            ]
        else:
            fields = [
                field.model_copy(
                    update={
                        "column_name": cls._lookup_column(
                            field.id_string, df, overrides
                        )
                    }
                )
                if field.column_name is None
                else field
                for field in defaults.fields
            ]

        cf_list = (
            computed_fields if computed_fields is not None else defaults.computed_fields
        )

        metrics = [
            MetricDataField.from_column_name(col)
            for col in cls._metric_columns(df, metric_prefix)
        ]

        return cls(
            fields=fields,
            computed_fields=cf_list,
            metrics=metrics,
            column_overrides=overrides,
            metric_prefix=metric_prefix,
        )

    @property
    def unresolved_fields(self) -> list[DataField]:
        """Fields that have no ``column_name`` set."""
        return [f for f in self.fields if not f.is_resolved]

    @computed_field
    @property
    def is_fully_resolved(self) -> bool:
        """Whether every field has a ``column_name`` (true when there are none)."""
        return all(f.is_resolved for f in self.fields)

    def prepare_for_plotting(
        self, df: pd.DataFrame, drop_unknown: bool = True
    ) -> pd.DataFrame:
        """Return a copy of ``df`` with computed fields added.

        Computed fields are applied in order, each seeing the columns written
        by the ones before it. With ``drop_unknown`` the result keeps only
        columns named by a field, computed field or metric, in their existing
        order.
        """
        result = df.copy()
        for cf in self.computed_fields:
            col_name = cf.column_name or cf.id_string
            result[col_name] = cf.apply(result)

        if drop_unknown:
            known_cols = set()
            for f in self.fields:
                known_cols.add(f.column_name or f.id_string)
            for cf in self.computed_fields:
                known_cols.add(cf.column_name or cf.id_string)
            for m in self.metrics:
                if m.column_name:
                    known_cols.add(m.column_name)
            keep_cols = [c for c in result.columns if c in known_cols]
            result = result[keep_cols]

        return result

    def get_metric(self, pattern: str) -> MetricDataField | None:
        """Return the first metric whose column or display name contains
        ``pattern`` as a substring, or ``None`` if there is no match."""
        for m in self.metrics:
            if pattern in (m.column_name or "") or pattern in (m.display_name or ""):
                return m
        return None

    def metric_col(self, pattern: str) -> str:
        """Return the column name of the first metric matching ``pattern``.

        Raises:
            ValueError: if no metric matches, or the match has no column name.
        """
        metric = self.get_metric(pattern)
        if metric is None:
            raise ValueError(f"No metric found matching '{pattern}'")
        if metric.column_name is None:
            raise ValueError(
                f"Metric matching '{pattern}' exists but has no column_name"
            )
        return metric.column_name

    def get_metrics(self, df: pd.DataFrame) -> list[str]:
        """Return the columns of ``df`` whose label starts with ``metric_prefix``."""
        return self._metric_columns(df, self.metric_prefix)

    def get_config_columns(self, use_computed: bool = True) -> list[str]:
        """Return the ``id_string`` of every config field.

        With ``use_computed``, config computed fields come first and plain
        fields listed as a source of a config computed field are left out.
        Without it, only the plain config fields are returned.
        """
        if use_computed:
            config_cols = [cf.id_string for cf in self.computed_fields if cf.is_config]

            computed_sources = set()
            for cf in self.computed_fields:
                if cf.is_config:
                    computed_sources.update(cf.source_columns)

            for f in self.fields:
                if f.is_config and f.id_string not in computed_sources:
                    config_cols.append(f.id_string)

            return config_cols
        else:
            return [f.id_string for f in self.fields if f.is_config]
dr_frames/types.py ADDED
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable, Sequence
4
+ from typing import cast
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ __all__ = [
10
+ "coerce_numeric_cols",
11
+ "coerce_string_cols",
12
+ "is_string_series",
13
+ ]
14
+
15
+
16
def is_string_series(series: pd.Series) -> bool:
    """Return True when every non-null value of ``series`` is a ``str``.

    A series with no non-null values is not considered a string series.
    """
    present = series.dropna()
    if present.empty:
        return False
    return all(isinstance(value, str) for value in present)
21
+
22
+
23
def coerce_numeric_cols(
    df: pd.DataFrame,
    columns: Sequence[str] | Iterable[str],
    dtype: type[float] | type[int] = float,
) -> pd.DataFrame:
    """Return a copy of ``df`` with the given columns converted to numbers.

    Values that cannot be parsed become missing. Columns named in ``columns``
    but absent from ``df`` are skipped.

    Args:
        df: Source frame; it is not modified.
        columns: Labels of the columns to convert.
        dtype: ``float`` (default) or ``int``. For ``int``, a column with
            missing values is stored as nullable ``"Int64"``, otherwise as
            plain ``int``.

    Raises:
        ValueError: with ``dtype=int``, if a column holds a value that is not
            (within floating-point tolerance) a finite whole number.
    """
    columns_list = list(columns)
    working = df.copy()
    for c in columns_list:
        if c not in working.columns:
            continue
        coerced = cast(pd.Series, pd.to_numeric(working[c], errors="coerce"))
        if dtype is int:
            values = coerced.dropna().to_numpy(dtype=float)
            # Compare against the *nearest* integer. Comparing against a
            # truncating cast would reject 2.9999999999 while accepting
            # 3.0000000001, and infinities cannot be cast at all.
            if not (
                np.isfinite(values).all()
                and np.isclose(values, np.round(values)).all()
            ):
                raise ValueError(
                    f"Column '{c}' contains non-integer values after coercion."
                )
            target_dtype: object = "Int64" if coerced.isna().any() else int
            if pd.api.types.is_float_dtype(coerced.dtype):
                # Snap near-integers to exact integers so the cast neither
                # truncates downward nor fails the safe "Int64" conversion.
                coerced = coerced.round()
        else:
            target_dtype = dtype
        working[c] = coerced.astype(target_dtype)
    return working
47
+
48
+
49
def coerce_string_cols(
    df: pd.DataFrame,
    columns: Sequence[str] | Iterable[str],
) -> pd.DataFrame:
    """Return a copy of ``df`` with the given columns cast to ``"string"`` dtype.

    Labels in ``columns`` that are not columns of ``df`` are ignored; the
    input frame itself is never modified.
    """
    requested = list(columns)
    result = df.copy()
    for label in requested:
        if label in result.columns:
            result[label] = result[label].astype("string")
    return result
@@ -0,0 +1,207 @@
1
+ Metadata-Version: 2.4
2
+ Name: dr-frames
3
+ Version: 0.1.0
4
+ Summary: Pandas/DataFrame utilities for data manipulation, filtering, aggregation, and schema management
5
+ Project-URL: Homepage, https://github.com/drothermel/dr_frames
6
+ Project-URL: Repository, https://github.com/drothermel/dr_frames
7
+ Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: aggregation,data-manipulation,dataframe,filtering,pandas,schema
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: pandas>=2.0.0
22
+ Requires-Dist: pydantic>=2.0.0
23
+ Provides-Extra: formatting
24
+ Requires-Dist: pyyaml>=6.0.0; extra == 'formatting'
25
+ Requires-Dist: rich>=13.0.0; extra == 'formatting'
26
+ Requires-Dist: tabulate>=0.9.0; extra == 'formatting'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # dr_frames
30
+
31
+ Pandas/DataFrame utilities for data manipulation, filtering, aggregation, and schema management.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install dr-frames
37
+ ```
38
+
39
+ For table formatting features (console, markdown, latex):
40
+ ```bash
41
+ pip install dr-frames[formatting]
42
+ ```
43
+
44
+ ## Quick Start
45
+
46
+ ```python
47
+ import pandas as pd
48
+ from dr_frames import (
49
+ coerce_numeric_cols,
50
+ filter_to_range,
51
+ move_cols_to_beginning,
52
+ select_subset,
53
+ )
54
+
55
+ df = pd.DataFrame({
56
+ "name": ["alice", "bob", "charlie"],
57
+ "value": ["1.0", "2.0", "3.0"],
58
+ "category": ["x", "y", "x"],
59
+ })
60
+
61
+ result = (
62
+ df.pipe(coerce_numeric_cols, ["value"])
63
+ .pipe(select_subset, {"category": "x"})
64
+ .pipe(filter_to_range, "value", 0.5, 2.5)
65
+ )
66
+ ```
67
+
68
+ ## Module Overview
69
+
70
+ | Module | Purpose | Key Functions |
71
+ |--------|---------|---------------|
72
+ | **columns** | Column selection & reordering | `move_cols_to_beginning`, `get_cols_by_prefix`, `strip_col_prefixes` |
73
+ | **filtering** | Row filtering | `select_subset`, `filter_to_range`, `make_filter_fxn` |
74
+ | **cells** | Cell-level operations | `ensure_column`, `map_column_with_fallback`, `force_set_cell` |
75
+ | **types** | Type coercion | `coerce_numeric_cols`, `coerce_string_cols` |
76
+ | **aggregation** | GroupBy & reduction | `aggregate_over_seeds`, `apply_aggregations`, `unique_non_null` |
77
+ | **parsing** | String list parsing | `parse_first_element`, `sum_list_elements`, `is_homogeneous` |
78
+ | **schema** | Data field metadata | `DataField`, `ComputedField`, `DataFormat` |
79
+ | **profiling** | Column auto-tagging | `DFColInfo`, `ColInfo`, `looks_like_json` |
80
+ | **formatting** | Table output | `format_table`, `format_coverage_table` |
81
+
82
+ ## Documentation
83
+
84
+ - [Full API Reference](docs/api.md)
85
+ - Module guides: [columns](docs/columns.md) | [filtering](docs/filtering.md) | [cells](docs/cells.md) | [types](docs/types.md) | [aggregation](docs/aggregation.md) | [parsing](docs/parsing.md) | [schema](docs/schema.md) | [profiling](docs/profiling.md) | [formatting](docs/formatting.md)
86
+ - [Recipes & Patterns](docs/recipes.md)
87
+
88
+ ### Auto-generated API Docs
89
+
90
+ ```bash
91
+ # Serve interactive docs locally
92
+ uv run pdoc dr_frames
93
+
94
+ # Generate static HTML
95
+ uv run pdoc dr_frames -o docs/api_html
96
+ ```
97
+
98
+ ## Quick Reference
99
+
100
+ ### Column Operations
101
+ ```python
102
+ from dr_frames import (
103
+ contained_cols, # cols that exist in df
104
+ remaining_cols, # cols NOT in a list
105
+ get_cols_by_prefix, # cols starting with prefix
106
+ get_cols_by_contains, # cols containing substring
107
+ move_cols_to_beginning, # reorder cols
108
+ move_cols_with_prefix_to_end,
109
+ strip_col_prefixes, # rename by removing prefix
110
+ drop_all_null_cols, # remove empty columns
111
+ )
112
+ ```
113
+
114
+ ### Filtering
115
+ ```python
116
+ from dr_frames import (
117
+ select_subset, # filter by exact column values
118
+ apply_filters_to_df, # filter by value lists
119
+ filter_to_value, # single value filter
120
+ filter_to_values, # multi-value filter
121
+ filter_to_range, # numeric range filter
122
+ filter_to_best_metric, # keep best per group
123
+ make_filter_fxn, # compose filters
124
+ )
125
+ ```
126
+
127
+ ### Cell Operations
128
+ ```python
129
+ from dr_frames import (
130
+ ensure_column, # add column if missing
131
+ fill_missing_values, # fillna with defaults dict
132
+ rename_columns, # safe rename (skips missing)
133
+ map_column_with_fallback,# map values, keep unmapped
134
+ apply_column_converters, # apply functions to columns
135
+ maybe_update_cell, # update if currently null
136
+ force_set_cell, # always update
137
+ masked_getter, # get value where mask is true
138
+ masked_setter, # set value where mask is true
139
+ )
140
+ ```
141
+
142
+ ### Type Coercion
143
+ ```python
144
+ from dr_frames import (
145
+ coerce_numeric_cols, # convert to float/int
146
+ coerce_string_cols, # convert to string dtype
147
+ is_string_series, # check if series is strings
148
+ )
149
+ ```
150
+
151
+ ### Aggregation
152
+ ```python
153
+ from dr_frames import (
154
+ aggregate_over_seeds, # mean/std/count by config
155
+ apply_aggregations, # flexible groupby
156
+ unique_non_null, # unique values excluding null
157
+ unique_by_col, # unique values in column
158
+ get_constant_cols, # cols with single value
159
+ fillna_with_defaults, # fill nulls from dict
160
+ maybe_pipe, # conditional pipe
161
+ )
162
+ ```
163
+
164
+ ### Parsing
165
+ ```python
166
+ from dr_frames import (
167
+ parse_list_string, # "[1,2,3]" -> [1,2,3]
168
+ parse_first_element, # "[1,2,3]" -> 1.0
169
+ sum_list_elements, # "[1,2,3]" -> 6.0
170
+ is_homogeneous, # "[1,1,1]" -> True
171
+ )
172
+ ```
173
+
174
+ ### Schema
175
+ ```python
176
+ from dr_frames import (
177
+ DataField, # field with metadata
178
+ ComputedField, # derived field
179
+ MetricDataField, # metric with group info
180
+ DataFormat, # container for fields
181
+ )
182
+ ```
183
+
184
+ ### Profiling
185
+ ```python
186
+ from dr_frames import (
187
+ DFColInfo, # catalog of column info
188
+ ColInfo, # single column metadata
189
+ looks_like_json, # detect JSON strings
190
+ looks_like_path, # detect file paths
191
+ infer_series_base_tag_type, # infer dtype tags
192
+ )
193
+ ```
194
+
195
+ ### Formatting (requires `[formatting]` extra)
196
+ ```python
197
+ from dr_frames import (
198
+ format_table, # render table in multiple formats
199
+ format_coverage_table, # show column coverage stats
200
+ FORMATTER_TYPES, # available formatters
201
+ OUTPUT_FORMATS, # available output formats
202
+ )
203
+ ```
204
+
205
+ ## License
206
+
207
+ MIT
@@ -0,0 +1,15 @@
1
+ dr_frames/__init__.py,sha256=tZ1ECxdSo7oUuX7gWQrsiUFNKf8OvA7-Ly33BzMrzFA,3516
2
+ dr_frames/aggregation.py,sha256=59Hl0Iw6Qgt4yl2mnrPgZ-5k9uaiUbkecTuSTNv4rbo,4151
3
+ dr_frames/cells.py,sha256=OtNSWN5qTknwwDuDXMujFQz8MjFdFpStsRIeIg6_xh8,7086
4
+ dr_frames/columns.py,sha256=mk3O5C4AjI31y3BC_8uPlLQfyxBMlDhNaKIbdXeiE-Y,3324
5
+ dr_frames/filtering.py,sha256=Mi-9R4dWXIMzszRySHq3RRRK7m8l9gpmp-5QiE2S76Y,2756
6
+ dr_frames/formatting.py,sha256=95GvlCu1PusB74ZTubdDqjOjVPqZn3l7-tE85M_2EcI,8309
7
+ dr_frames/parsing.py,sha256=_YteOY0Uvh1ndZesdaU8412H1cTSPhffB2gzR-eMEdY,1720
8
+ dr_frames/profiling.py,sha256=Wbg22et-gZGmTTeSYDo9GIzDHV_KpQP9Ne0QkyAHMVA,7029
9
+ dr_frames/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ dr_frames/schema.py,sha256=TDDP1Cv8L07ug8Sgo1me8vayX_fDhwOiR2hRFqDyY1g,7834
11
+ dr_frames/types.py,sha256=Ce1D0Cgf6bE47d9cl9G8OmrQeQMnMEL2g4_h-UhXvyA,1688
12
+ dr_frames-0.1.0.dist-info/METADATA,sha256=vOhfw7pPpVExWG4rrgptrIQEBzwrOkWfLUQPe06d7UM,6665
13
+ dr_frames-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
+ dr_frames-0.1.0.dist-info/licenses/LICENSE,sha256=6tUm1Q55M1UBMbbawzFlF0-DgCazM1BELo_5-RXA1K4,1075
15
+ dr_frames-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Danielle Rothermel
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.