dr-frames 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dr_frames/filtering.py ADDED
@@ -0,0 +1,96 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Mapping, Sequence
4
+ from typing import Any, cast
5
+
6
+ import pandas as pd
7
+
8
+ __all__ = [
9
+ "select_subset",
10
+ "apply_filters_to_df",
11
+ "filter_to_value",
12
+ "filter_to_values",
13
+ "filter_to_range",
14
+ "filter_to_best_metric",
15
+ "make_filter_fxn",
16
+ ]
17
+
18
+
19
def select_subset(
    df: pd.DataFrame,
    filters: Mapping[str, Any] | list[tuple[str, Any]] | None = None,
) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` that satisfy every equality filter.

    Args:
        df: Frame to filter; it is not modified.
        filters: ``{column: value}`` mapping or list of ``(column, value)``
            pairs. All conditions are AND-ed together. A null scalar value
            (``None``, ``NaN``, ``pd.NA``, ``pd.NaT``) selects the rows where
            the column is null. ``None`` or an empty collection keeps every
            row.

    Returns:
        A new DataFrame with the matching rows; index labels are preserved.

    Raises:
        AssertionError: If a filter names a column that is not in ``df``
            (only while assertions are enabled, i.e. not under ``python -O``).
    """
    if filters is None:
        filters = []
    items = filters.items() if isinstance(filters, Mapping) else filters
    mask = pd.Series(True, index=df.index)
    for column, value in items:
        assert column in df.columns, f"Column '{column}' not present in DataFrame."
        # ``== <null>`` never matches, so every null scalar has to go through
        # isna(). is_scalar() guards against list-like values, for which
        # pd.isna() would return an array instead of a bool.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            mask &= df[column].isna()
        else:
            mask &= df[column] == value
    return df.loc[mask].copy()
34
+
35
+
36
def apply_filters_to_df(
    df: pd.DataFrame, filters: dict[str, Sequence[Any]]
) -> pd.DataFrame:
    """Keep only rows whose value is in the allowed set for each filtered column.

    Args:
        df: Frame to filter; it is not modified.
        filters: Mapping of column name to the values allowed in that column.
            Entries naming a column that ``df`` does not have are skipped.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    result = df.copy()
    known_columns = set(result.columns.tolist())
    for column, allowed in filters.items():
        if column in known_columns:
            keep = result[column].isin(allowed)
            result = cast(pd.DataFrame, result[keep])
    return result.reset_index(drop=True)
46
+
47
+
48
def filter_to_value(
    df: pd.DataFrame, column: str, value: float | str | None
) -> pd.DataFrame:
    """Filter to rows matching a specific value. Use None (or NaN) to match NaN values.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the column to compare.
        value: Value to match. ``None`` and float ``NaN`` both select the rows
            where the column is null, because ``== NaN`` never matches.

    Returns:
        A copy of the matching rows with their original index labels.
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return df[df[column].isna()].copy()
    return df[df[column] == value].copy()
55
+
56
+
57
def filter_to_values(
    df: pd.DataFrame, column: str, values: list[float | str | None]
) -> pd.DataFrame:
    """Filter to rows matching any value in list. Use None in list to include NaN.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the column to test.
        values: Accepted values; a ``None`` entry additionally selects rows
            where the column is null.

    Returns:
        A copy of the matching rows with their original index labels.
    """
    series = df[column]
    if None not in values:
        return df[series.isin(values)].copy()
    # None is handled through isna(); the remaining values go through isin().
    concrete = [item for item in values if item is not None]
    keep = series.isna() | series.isin(concrete)
    return df[keep].copy()
67
+
68
+
69
def filter_to_range(
    df: pd.DataFrame, column: str, min_val: float, max_val: float
) -> pd.DataFrame:
    """Return a copy of the rows where ``min_val <= df[column] <= max_val``.

    Both bounds are inclusive. Null values in the column never match.
    """
    in_range = df[column].between(min_val, max_val)
    return df[in_range].copy()
73
+
74
+
75
def filter_to_best_metric(
    df: pd.DataFrame,
    group_cols: list[str],
    metric_col: str,
    lower_is_better: bool = True,
) -> pd.DataFrame:
    """Return, for each group, the single row with the best metric value.

    Args:
        df: Frame to reduce; it is not modified.
        group_cols: Columns that define the groups.
        metric_col: Column holding the metric to optimise.
        lower_is_better: Pick the minimum when True, the maximum otherwise.

    Returns:
        A copy of the selected rows, one per group that has at least one
        non-null metric, keeping their original index labels.
    """
    # A row with a null metric can never be the best one. Dropping such rows
    # up front keeps groups whose metric is entirely null from producing a
    # NaN label (or an error) in idxmin/idxmax, which would break df.loc below.
    scored = df[df[metric_col].notna()]
    grouped = scored.groupby(group_cols)[metric_col]
    idx = grouped.idxmin() if lower_is_better else grouped.idxmax()
    return df.loc[idx].copy()
86
+
87
+
88
def make_filter_fxn(
    filters: list[tuple[Callable, ...]],
) -> Callable[[pd.DataFrame], pd.DataFrame]:
    """Compose a list of filter steps into a single callable.

    Args:
        filters: Steps of the form ``(fn, *extra_args)``. Each step is run as
            ``fn(df, *extra_args)`` and its result is fed to the next step.

    Returns:
        A function that applies every step in order and returns the final
        result.
    """

    def apply(df: pd.DataFrame) -> pd.DataFrame:
        current = df
        for step in filters:
            func, *extra_args = step
            current = func(current, *extra_args)
        return current

    return apply
@@ -0,0 +1,257 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable
4
+ from typing import TYPE_CHECKING, Any, Literal
5
+
6
+ import pandas as pd
7
+
8
+ if TYPE_CHECKING:
9
+ from rich.table import Table
10
+
11
+ JustifyMethod = Literal["default", "left", "center", "right", "full"]
12
+
13
+ __all__ = [
14
+ "format_table",
15
+ "format_coverage_table",
16
+ "FORMATTER_TYPES",
17
+ "OUTPUT_FORMATS",
18
+ ]
19
+
20
# Maps the public ``output_format`` names to the ``tablefmt`` value handed to
# tabulate. NOTE(review): "csv" maps to tabulate's "simple" layout, which is
# not comma-separated output - confirm this is intended.
OUTPUT_FORMATS = dict(
    console="grid",
    markdown="pipe",
    latex="latex",
    plain="plain",
    csv="simple",
)

# Named value formatters. Every formatter renders ``None`` as the text "None";
# optional keyword arguments (precision, max_length) can be set per column
# through the column config.
FORMATTER_TYPES: dict[str, Callable] = {
    "scientific": lambda x, precision=2: (
        "None" if x is None else f"{x:.{precision}e}"
    ),
    "decimal": lambda x, precision=3: (
        "None" if x is None else f"{x:.{precision}f}"
    ),
    "integer": lambda x: "None" if x is None else f"{x:,.0f}",
    "comma": lambda x: "None" if x is None else f"{x:,}",
    # Cut long text to max_length characters and mark the cut with "...".
    "truncate": lambda x, max_length=50: (
        "None"
        if x is None
        else (str(x)[:max_length] + "..." if len(str(x)) > max_length else str(x))
    ),
    "string": lambda x: "None" if x is None else str(x),
}

# Column config used by format_coverage_table: keys are the row-dict keys,
# "header" is the displayed title, the remaining entries are formatter kwargs.
COVERAGE_TABLE_CONFIG = {
    "index": dict(header="#", formatter="integer"),
    "column": dict(header="Column", formatter="truncate", max_length=35),
    "coverage": dict(header="Coverage %", formatter="decimal", precision=1),
}
48
+
49
+
50
def format_table(
    data: list[dict] | pd.DataFrame | list[list],
    headers: list[str] | None = None,
    output_format: str = "console",
    column_config: dict[str, dict] | None = None,
    title: str | None = None,
    table_style: str = "lines",
    disable_numparse: bool = True,
) -> str | Table:
    """Format tabular data as a rich table or as tabulate text.

    Args:
        data: Rows as a list of dicts, a DataFrame, or a list of lists.
        headers: Explicit header row; when omitted, headers come from the
            column config or the column names.
        output_format: ``"console"`` builds a rich ``Table``; any other value
            is looked up in ``OUTPUT_FORMATS`` (falling back to ``"grid"``)
            and rendered with tabulate.
        column_config: Per-column settings keyed by column name (``header``,
            ``formatter`` and formatter keyword arguments).
        title: Table title; only used for console output.
        table_style: Rich table style; only used for console output.
        disable_numparse: Forwarded to tabulate for non-console output.

    Returns:
        A rich ``Table`` for console output, otherwise the tabulate string.
    """
    config = column_config if column_config else {}
    column_names = _get_column_names(data)
    rows = _apply_column_formatting(_preprocess_data(data), config, column_names)
    final_headers = _resolve_headers(headers, column_names, config)

    if output_format != "console":
        # Imported lazily so tabulate is only needed for text output.
        from tabulate import tabulate

        return tabulate(
            rows,
            headers=final_headers,
            tablefmt=OUTPUT_FORMATS.get(output_format, "grid"),
            disable_numparse=disable_numparse,
        )

    return _create_rich_table(
        rows, final_headers, config, column_names, title, table_style
    )
79
+
80
+
81
def format_coverage_table(
    df: pd.DataFrame,
    title: str = "Column Coverage",
    output_format: str = "console",
    table_style: str = "lines",
    disable_numparse: bool = True,
) -> str:
    """Build a text report of the percentage of non-null values per column.

    Args:
        df: Frame to summarise.
        title: Text used in the heading line above the table.
        output_format: Forwarded to ``format_table``.
        table_style: Forwarded to ``format_table``.
        disable_numparse: Forwarded to ``format_table``.

    Returns:
        The heading line followed by the rendered table, as one string.
    """
    row_count = len(df)
    coverage_data = []
    for position, col in enumerate(df.columns, start=1):
        # An empty frame has no rows to divide by; report zero coverage.
        coverage = df[col].notna().sum() / row_count * 100 if row_count else 0
        coverage_data.append(
            {"index": position, "column": col, "coverage": coverage}
        )

    table_result = format_table(
        data=coverage_data,
        output_format=output_format,
        column_config=COVERAGE_TABLE_CONFIG,
        table_style=table_style,
        disable_numparse=disable_numparse,
    )

    if isinstance(table_result, str):
        rendered = table_result
    else:
        # Console output is a rich Table, and str() on it only yields the
        # object repr. Render it through a capturing Console to get the text.
        from rich.console import Console

        console = Console()
        with console.capture() as capture:
            console.print(table_result)
        rendered = capture.get()

    return f"{title} ({len(df.columns)} columns):\n{rendered}"
106
+
107
+
108
+ def _preprocess_data(data: list[dict] | pd.DataFrame | list[list]) -> list[list]:
109
+ if isinstance(data, pd.DataFrame):
110
+ return data.to_numpy().tolist()
111
+ elif isinstance(data, list) and len(data) > 0:
112
+ if isinstance(data[0], dict):
113
+ # Build stable union of all keys: preserve first-row order, then append new keys
114
+ keys = list(data[0].keys())
115
+ keys_set = set(keys)
116
+ for row in data[1:]:
117
+ for key in row.keys(): # type: ignore[union-attr]
118
+ if key not in keys_set:
119
+ keys.append(key)
120
+ keys_set.add(key)
121
+ return [[row.get(key) for key in keys] for row in data] # type: ignore[union-attr]
122
+ else:
123
+ return list(data) # type: ignore[arg-type]
124
+ return []
125
+
126
+
127
+ def _get_column_names(data: list[dict] | pd.DataFrame | list[list]) -> list[str]:
128
+ if isinstance(data, pd.DataFrame):
129
+ return list(data.columns)
130
+ elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
131
+ # Build stable union of all keys: preserve first-row order, then append new keys
132
+ keys = list(data[0].keys())
133
+ keys_set = set(keys)
134
+ for row in data[1:]:
135
+ for key in row.keys(): # type: ignore[union-attr]
136
+ if key not in keys_set:
137
+ keys.append(key)
138
+ keys_set.add(key)
139
+ return keys
140
+ else:
141
+ return []
142
+
143
+
144
def _apply_column_formatting(
    processed_data: list[list],
    config: dict[str, dict],
    column_names: list[str] | None = None,
) -> list[list]:
    """Format every cell according to the config of its column.

    Cells are matched to column names by position; a cell beyond the known
    names (or when no names are given) is formatted without a column name.
    Empty input is returned unchanged.
    """
    if not processed_data:
        return processed_data
    names = column_names or []
    return [
        [
            _format_value(
                cell, names[position] if position < len(names) else None, config
            )
            for position, cell in enumerate(row)
        ]
        for row in processed_data
    ]
163
+
164
+
165
def _format_value(
    value: Any,
    col_name: str | None,
    config: dict[str, dict],
) -> str:
    """Render one cell as text using the formatter configured for its column.

    Without a configured column the value is rendered with ``str`` (``None``
    becomes "None"). The same plain rendering is the fallback when the
    formatter raises ``TypeError`` or ``ValueError``.
    """
    if not (col_name and col_name in config):
        return "None" if value is None else str(value)

    col_config = config[col_name]
    # Unknown formatter names fall back to the plain string formatter.
    formatter = FORMATTER_TYPES.get(
        col_config.get("formatter", "string"), FORMATTER_TYPES["string"]
    )
    # Everything besides the two reserved keys is passed to the formatter.
    formatter_kwargs = {
        key: setting
        for key, setting in col_config.items()
        if key not in ["header", "formatter"]
    }
    try:
        return formatter(value, **formatter_kwargs)
    except (TypeError, ValueError):
        return "None" if value is None else str(value)
182
+
183
+
184
+ def _resolve_headers(
185
+ headers: list[str] | None, column_names: list[str], config: dict[str, dict]
186
+ ) -> list[str]:
187
+ if headers is not None:
188
+ return headers
189
+ if config and column_names:
190
+ result_headers = []
191
+ for col_name in column_names:
192
+ if col_name in config:
193
+ result_headers.append(config[col_name].get("header", col_name))
194
+ else:
195
+ result_headers.append(col_name)
196
+ return result_headers
197
+ if column_names:
198
+ return column_names
199
+ return []
200
+
201
+
202
def _create_rich_table(
    formatted_data: list[list],
    headers: list[str],
    config: dict[str, dict],
    column_names: list[str],
    title: str | None = None,
    table_style: str = "lines",
) -> Table:
    """Build a rich ``Table`` from already formatted rows.

    ``table_style="zebra"`` alternates row styles; any other value draws
    lines between rows. Column justification and colour come from the
    formatter configured for each column.
    """
    # Imported lazily so rich is only needed for console output.
    from rich.table import Table

    table_kwargs: dict = {
        "title": title,
        "show_header": True,
        "header_style": "bold magenta",
    }
    if table_style == "zebra":
        table_kwargs["row_styles"] = ["", "dim"]
    else:
        table_kwargs["show_lines"] = True
    table = Table(**table_kwargs)

    for position, header in enumerate(headers):
        col_config = config.get(_get_column_name_for_index(column_names, position), {})
        table.add_column(
            header,
            justify=_get_rich_justify(col_config),
            style=_get_rich_style(col_config),
        )

    for row in formatted_data:
        table.add_row(*map(str, row))

    return table
235
+
236
+
237
+ def _get_column_name_for_index(column_names: list[str], index: int) -> str:
238
+ return column_names[index] if index < len(column_names) else f"col_{index}"
239
+
240
+
241
+ def _get_rich_justify(col_config: dict) -> JustifyMethod:
242
+ formatter = col_config.get("formatter", "string")
243
+ if formatter in ["scientific", "decimal", "integer", "comma"]:
244
+ return "right"
245
+ return "left"
246
+
247
+
248
+ def _get_rich_style(col_config: dict) -> str | None:
249
+ formatter = col_config.get("formatter", "string")
250
+ style_map = {
251
+ "scientific": "yellow",
252
+ "decimal": "green",
253
+ "integer": "cyan",
254
+ "comma": "cyan",
255
+ "truncate": "dim",
256
+ }
257
+ return style_map.get(formatter)
dr_frames/parsing.py ADDED
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+
5
+ import pandas as pd
6
+
7
+ __all__ = [
8
+ "parse_first_element",
9
+ "sum_list_elements",
10
+ "is_homogeneous",
11
+ "parse_list_string",
12
+ ]
13
+
14
+
15
def parse_list_string(val: str) -> list | None:
    """Parse a Python-literal string into a list.

    A literal that is not a list is wrapped in a one-element list. Null input
    and text that is not a valid literal give ``None``.
    """
    if pd.isna(val):
        return None
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return None
    return parsed if isinstance(parsed, list) else [parsed]
25
+
26
+
27
def parse_first_element(val: str) -> float:
    """Parse a literal string and return its first element as a float.

    A list literal yields its first item; any other literal is converted
    directly. ``NaN`` is returned for null input, an empty list, text that is
    not a valid literal, or a value that cannot be converted to float.
    """
    if pd.isna(val):
        return float("nan")
    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, list):
            return float(parsed[0]) if parsed else float("nan")
        return float(parsed)
    # TypeError covers float() on None, dicts or nested lists, which would
    # otherwise escape instead of producing NaN like every other bad input.
    except (ValueError, SyntaxError, TypeError):
        return float("nan")
39
+
40
+
41
def sum_list_elements(val: str) -> float:
    """Parse a literal string and return the sum of its elements as a float.

    A list literal yields the sum of its items converted to float; any other
    literal is converted directly. ``NaN`` is returned for null input, an
    empty list, text that is not a valid literal, or values that cannot be
    converted to float.
    """
    if pd.isna(val):
        return float("nan")
    try:
        parsed = ast.literal_eval(val)
        if not isinstance(parsed, list):
            return float(parsed)
        if not parsed:
            return float("nan")
        return float(sum(float(item) for item in parsed))
    # One handler for both branches: TypeError from float() on a non-list
    # literal (e.g. a dict) used to escape, while list items were guarded.
    except (ValueError, SyntaxError, TypeError):
        return float("nan")
56
+
57
+
58
def is_homogeneous(val: str) -> bool:
    """Tell whether a literal string describes a single repeated value.

    A non-list literal counts as homogeneous. A list is homogeneous when it
    is non-empty and all items are equal. Null input, invalid literals and
    lists with unhashable items give ``False``.
    """
    if pd.isna(val):
        return False
    try:
        parsed = ast.literal_eval(val)
        if not isinstance(parsed, list):
            return True
        return len(parsed) > 0 and len(set(parsed)) == 1
    except (ValueError, SyntaxError, TypeError):
        return False
dr_frames/profiling.py ADDED
@@ -0,0 +1,246 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections.abc import Callable
5
+ from pathlib import PurePath
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+ from pydantic import BaseModel, Field
10
+
11
+ from .aggregation import unique_non_null
12
+
13
+ __all__ = [
14
+ "ColInfo",
15
+ "DFColInfo",
16
+ "looks_like_json",
17
+ "looks_like_path",
18
+ "infer_series_base_tag_type",
19
+ "infer_tags_from_series_sample",
20
+ "infer_col_name_contains_tags",
21
+ "infer_col_name_suffix_tags",
22
+ "infer_col_name_prefix_tags",
23
+ ]
24
+
25
+
26
class ColInfo(BaseModel):
    """Name, dtype and inferred tags of a single DataFrame column."""

    # Catalog whose rule tables are read when tags are inferred.
    catalog: DFColInfo
    name: str
    dtype: str | None = None
    tags: set[str] = Field(default_factory=set)

    model_config = {"arbitrary_types_allowed": True}

    def update_tags(self, series: pd.Series) -> None:
        """Add every tag inferred from the column's dtype, name and values.

        Args:
            series: The column data for this ``ColInfo``.
        """
        rules = self.catalog
        inferred_groups = (
            infer_series_base_tag_type(
                series, rules.pd_type_to_tags, rules.pd_default_type
            ),
            infer_col_name_contains_tags(self.name, rules.col_name_contains_map),
            infer_col_name_suffix_tags(self.name, rules.col_name_suffix_map),
            infer_col_name_prefix_tags(self.name, rules.col_name_prefix_map),
            infer_tags_from_series_sample(series, rules.path_like_extensions),
        )
        for group in inferred_groups:
            self.tags.update(group)

    def has_tag(self, tag: str) -> bool:
        """Return True when ``tag`` has been recorded for this column."""
        return tag in self.tags

    def add_tags(self, extra: list[str]) -> None:
        """Record additional tags for this column."""
        self.tags.update(extra)
72
+
73
+
74
class DFColInfo(BaseModel):
    """Per-column ``ColInfo`` records plus the rule tables used to tag them."""

    # Column name -> info record; filled by update_from_df.
    columns: dict[str, ColInfo] = Field(default_factory=dict)

    # (dtype predicate, tags) pairs; the tags of every matching predicate apply.
    pd_type_to_tags: list[tuple[Callable, list[str]]] = Field(
        default_factory=lambda: [
            (pd.api.types.is_bool_dtype, ["bool"]),
            (pd.api.types.is_integer_dtype, ["int", "numeric"]),
            (pd.api.types.is_float_dtype, ["float", "numeric"]),
            (pd.api.types.is_datetime64_any_dtype, ["datetime"]),
            (pd.api.types.is_timedelta64_dtype, ["timedelta"]),
            (lambda dt: isinstance(dt, pd.CategoricalDtype), ["categorical"]),
        ]
    )
    # Tags used when no dtype predicate matches.
    pd_default_type: list[str] = Field(default_factory=lambda: ["str"])

    # Substrings of the column name -> tag.
    col_name_contains_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("config", "kwargs", "settings", "params"): "config",
        }
    )
    # Column-name suffixes -> tag.
    col_name_suffix_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("_path", "_dir"): "path",
            ("_id",): "id",
            ("_tag", "_tags"): "categorical",
        }
    )
    # Column-name prefixes -> tag.
    col_name_prefix_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("is_", "has_"): "bool_like",
            ("metric_",): "metric",
        }
    )

    # File extensions that mark a sampled string value as path-like.
    path_like_extensions: set[str] = Field(
        default_factory=lambda: {
            ".json",
            ".jsonl",
            ".csv",
            ".tsv",
            ".parquet",
            ".txt",
            ".yaml",
            ".yml",
            ".log",
        }
    )

    def update_from_df(self, df: pd.DataFrame) -> None:
        """Create (or replace) a tagged ``ColInfo`` for every column of ``df``."""
        for col_name in df.columns:
            entry = ColInfo(catalog=self, name=col_name)
            entry.update_tags(df[col_name])
            self.columns[col_name] = entry

    def get(self, name: str) -> ColInfo | None:
        """Return the record for ``name``, or ``None`` when it is unknown."""
        try:
            return self.columns[name]
        except KeyError:
            return None

    def names_with_tag(self, tag: str) -> list[str]:
        """Return the names of all recorded columns that carry ``tag``."""
        matching = []
        for col_name, entry in self.columns.items():
            if entry.has_tag(tag):
                matching.append(col_name)
        return matching
132
+
133
+
134
def infer_tags_from_series_sample(
    series: pd.Series,
    path_like_extensions: set[str],
    sample_size: int = 10,
) -> set[str]:
    """Infer ``"path"`` / ``"json"`` tags from a sample of the column's values.

    Args:
        series: Column to inspect.
        path_like_extensions: Extensions that mark a value as path-like.
        sample_size: Maximum number of unique non-null values to inspect.

    Returns:
        A set containing ``"path"`` and/or ``"json"`` when at least one
        sampled value looks like one; empty otherwise.
    """
    found: set[str] = set()
    candidates = unique_non_null(series)
    if len(candidates) == 0:
        return found

    # Only the first sample_size unique non-null values are examined.
    sample = candidates[:sample_size]

    # A single matching value is enough to assign the tag.
    if any(looks_like_path(item, path_like_extensions) for item in sample):
        found.add("path")
    if any(looks_like_json(item) for item in sample):
        found.add("json")
    return found
162
+
163
+
164
def infer_col_name_contains_tags(
    name: str,
    col_name_contains_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose substrings occur in ``name`` (case-insensitive).

    Args:
        name: Column name to inspect.
        col_name_contains_map: Maps a tuple of substrings to the tag assigned
            when any of them occurs in the name.
    """
    lowered = name.lower()
    return {
        tag
        for needles, tag in col_name_contains_map.items()
        if any(needle.lower() in lowered for needle in needles)
    }
175
+
176
+
177
def infer_col_name_suffix_tags(
    name: str,
    col_name_suffix_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose suffixes end ``name`` (case-insensitive).

    Args:
        name: Column name to inspect.
        col_name_suffix_map: Maps a tuple of suffixes to the tag assigned when
            the name ends with any of them.
    """
    lowered = name.lower()
    matched: set[str] = set()
    for suffixes, tag in col_name_suffix_map.items():
        # str.endswith accepts a tuple and is False for an empty one.
        if lowered.endswith(tuple(suffix.lower() for suffix in suffixes)):
            matched.add(tag)
    return matched
189
+
190
+
191
def infer_col_name_prefix_tags(
    name: str,
    col_name_prefix_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose prefixes start ``name`` (case-insensitive).

    Args:
        name: Column name to inspect.
        col_name_prefix_map: Maps a tuple of prefixes to the tag assigned when
            the name starts with any of them.
    """
    lowered = name.lower()
    return {
        tag
        for prefixes, tag in col_name_prefix_map.items()
        if any(lowered.startswith(prefix.lower()) for prefix in prefixes)
    }
203
+
204
+
205
def infer_series_base_tag_type(
    series: pd.Series,
    pd_type_to_tags: list[tuple[Callable, list[str]]],
    pd_default_type: list[str],
) -> set[str]:
    """Derive base type tags from the series dtype.

    Args:
        series: Column to inspect.
        pd_type_to_tags: ``(predicate, tags)`` pairs; each predicate is called
            with the dtype and the tags of every match are collected.
        pd_default_type: Tags used when no predicate matches.

    Returns:
        The collected tags, plus ``"nullable"`` when the series has any null.
    """
    dtype = series.dtype
    collected: set[str] = {
        tag
        for predicate, tag_names in pd_type_to_tags
        if predicate(dtype)
        for tag in tag_names
    }
    if not collected:
        collected.update(pd_default_type)
    if series.isna().any():
        collected.add("nullable")
    return collected
219
+
220
+
221
def looks_like_path(value: Any, path_extensions: set[str]) -> bool:
    """Heuristically decide whether ``value`` is a filesystem path string.

    A non-blank string counts as a path when it contains ``/`` or ``\\``, or
    when its lower-cased suffix is in ``path_extensions``.

    Args:
        value: Candidate value of any type; non-strings are never paths.
        path_extensions: Accepted suffixes, including the leading dot.
    """
    # The type check comes first: truth-testing arbitrary values (pd.NA,
    # numpy arrays) can raise, and they cannot be paths anyway.
    if not isinstance(value, str):
        return False
    normalized = value.strip()
    if not normalized:
        return False
    if any(sep in normalized for sep in ("/", "\\")):
        return True
    try:
        suffix = PurePath(normalized).suffix.lower()
    except (TypeError, ValueError):
        return False
    return suffix in path_extensions
232
+
233
+
234
def looks_like_json(value: Any) -> bool:
    """Heuristically decide whether ``value`` is a JSON object or array string.

    Only strings that start with ``{`` or ``[`` (after stripping whitespace)
    and parse successfully are accepted.

    Args:
        value: Candidate value of any type; non-strings are never JSON.
    """
    # The type check comes first: truth-testing arbitrary values (pd.NA,
    # numpy arrays) can raise, and they cannot be JSON text anyway.
    if not isinstance(value, str):
        return False
    trimmed = value.strip()
    # Cheap prefix test avoids parsing plain scalars and ordinary text.
    if not trimmed.startswith(("{", "[")):
        return False
    try:
        parsed = json.loads(trimmed)
    # Malformed text raises JSONDecodeError (a ValueError); pathologically
    # deep nesting can raise RecursionError.
    except (ValueError, RecursionError):
        return False
    return isinstance(parsed, (dict, list))
dr_frames/py.typed ADDED
File without changes