dr-frames 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dr_frames/__init__.py +161 -0
- dr_frames/aggregation.py +138 -0
- dr_frames/cells.py +262 -0
- dr_frames/columns.py +108 -0
- dr_frames/filtering.py +96 -0
- dr_frames/formatting.py +257 -0
- dr_frames/parsing.py +69 -0
- dr_frames/profiling.py +246 -0
- dr_frames/py.typed +0 -0
- dr_frames/schema.py +244 -0
- dr_frames/types.py +61 -0
- dr_frames-0.1.0.dist-info/METADATA +207 -0
- dr_frames-0.1.0.dist-info/RECORD +15 -0
- dr_frames-0.1.0.dist-info/WHEEL +4 -0
- dr_frames-0.1.0.dist-info/licenses/LICENSE +21 -0
dr_frames/filtering.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
4
|
+
from typing import Any, cast
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"select_subset",
|
|
10
|
+
"apply_filters_to_df",
|
|
11
|
+
"filter_to_value",
|
|
12
|
+
"filter_to_values",
|
|
13
|
+
"filter_to_range",
|
|
14
|
+
"filter_to_best_metric",
|
|
15
|
+
"make_filter_fxn",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def select_subset(
    df: pd.DataFrame,
    filters: Mapping[str, Any] | list[tuple[str, Any]] | None = None,
) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` that satisfy every equality filter.

    Args:
        df: Frame to filter.
        filters: ``column -> value`` pairs, given as a mapping or as a list of
            tuples. A missing scalar (``None``, NaN, ``pd.NA``, ``pd.NaT``)
            selects the rows where the column is null. ``None`` or an empty
            collection applies no filtering.

    Returns:
        A new DataFrame holding the matching rows, with their original index.

    Raises:
        AssertionError: If a filter names a column that is not in ``df``.
    """
    if filters is None:
        filters = []
    items = filters.items() if isinstance(filters, Mapping) else filters
    mask = pd.Series(True, index=df.index)
    for column, value in items:
        if column not in df.columns:
            # Raised explicitly instead of via ``assert`` so the check survives
            # ``python -O``; the exception type seen by callers is unchanged.
            raise AssertionError(f"Column '{column}' not present in DataFrame.")
        # ``== NaN`` never matches, so every missing scalar is routed to isna().
        if pd.api.types.is_scalar(value) and pd.isna(value):
            mask &= df[column].isna()
        else:
            mask &= df[column] == value
    return df.loc[mask].copy()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def apply_filters_to_df(
    df: pd.DataFrame, filters: dict[str, Sequence[Any]]
) -> pd.DataFrame:
    """Keep the rows whose values are among the allowed values for each column.

    Filters naming a column that ``df`` does not have are ignored. The input
    frame is left untouched; the result is a new frame with a fresh 0..n-1 index.
    """
    known_columns = set(df.columns.tolist())
    keep = pd.Series(True, index=df.index)
    for column, allowed in filters.items():
        if column in known_columns:
            keep &= df[column].isin(allowed)
    return df.loc[keep].reset_index(drop=True)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def filter_to_value(
    df: pd.DataFrame, column: str, value: float | str | None
) -> pd.DataFrame:
    """Filter to rows matching a specific value. Use None (or NaN) to match NaN values.

    Returns a copy of the matching rows with their original index.
    """
    # ``column == NaN`` is False for every row, so a NaN request is treated like
    # None and answered with isna() instead of silently returning nothing.
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return df[df[column].isna()].copy()
    return df[df[column] == value].copy()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def filter_to_values(
    df: pd.DataFrame, column: str, values: list[float | str | None]
) -> pd.DataFrame:
    """Return a copy of the rows whose ``column`` value is one of ``values``.

    A ``None`` entry in ``values`` additionally selects rows where the column
    is null.
    """
    series = df[column]
    if None not in values:
        return df[series.isin(values)].copy()
    concrete = [candidate for candidate in values if candidate is not None]
    return df[series.isna() | series.isin(concrete)].copy()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def filter_to_range(
    df: pd.DataFrame, column: str, min_val: float, max_val: float
) -> pd.DataFrame:
    """Return a copy of the rows where ``min_val <= df[column] <= max_val``."""
    # Series.between is inclusive on both ends by default.
    in_range = df[column].between(min_val, max_val)
    return df[in_range].copy()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def filter_to_best_metric(
    df: pd.DataFrame,
    group_cols: list[str],
    metric_col: str,
    lower_is_better: bool = True,
) -> pd.DataFrame:
    """Keep, for each group, the single row with the best ``metric_col`` value.

    Args:
        df: Frame to reduce.
        group_cols: Columns that define the groups.
        metric_col: Column holding the metric being compared.
        lower_is_better: Pick the minimum when True, the maximum otherwise.

    Returns:
        A copy of the selected rows. Groups whose metric is entirely missing
        contribute no row.
    """
    # Rows without a metric can never be the best one. Dropping them first also
    # stops an all-NaN group from producing a NaN label that ``df.loc`` cannot
    # resolve.
    scored = df[df[metric_col].notna()]
    per_group = scored.groupby(group_cols)[metric_col]
    idx = per_group.idxmin() if lower_is_better else per_group.idxmax()
    return df.loc[idx].copy()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def make_filter_fxn(
    filters: list[tuple[Callable, ...]],
) -> Callable[[pd.DataFrame], pd.DataFrame]:
    """Build a function that runs a sequence of filter steps over a DataFrame.

    Each entry of ``filters`` is ``(fn, *extra_args)``; the returned callable
    feeds its input through ``fn(current, *extra_args)`` for every entry, in
    order, and returns the final result.
    """

    def apply(df: pd.DataFrame) -> pd.DataFrame:
        current = df
        for entry in filters:
            step, *extra_args = entry
            current = step(current, *extra_args)
        return current

    return apply
|
dr_frames/formatting.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
JustifyMethod = Literal["default", "left", "center", "right", "full"]
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"format_table",
|
|
15
|
+
"format_coverage_table",
|
|
16
|
+
"FORMATTER_TYPES",
|
|
17
|
+
"OUTPUT_FORMATS",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
# Output-format name -> table format string handed to ``tabulate``.
OUTPUT_FORMATS = {
    "console": "grid",
    "markdown": "pipe",
    "latex": "latex",
    "plain": "plain",
    "csv": "simple",
}

# Formatter name -> callable turning one cell value into text.
# Every formatter renders ``None`` as the literal string "None".
FORMATTER_TYPES: dict[str, Callable] = {
    "scientific": lambda x, precision=2: (
        "None" if x is None else f"{x:.{precision}e}"
    ),
    "decimal": lambda x, precision=3: "None" if x is None else f"{x:.{precision}f}",
    "integer": lambda x: "None" if x is None else f"{x:,.0f}",
    "comma": lambda x: "None" if x is None else f"{x:,}",
    # Text longer than max_length is cut and suffixed with "...".
    "truncate": lambda x, max_length=50: (
        "None"
        if x is None
        else (str(x)[:max_length] + "..." if len(str(x)) > max_length else str(x))
    ),
    "string": lambda x: "None" if x is None else str(x),
}

# Per-column settings for the coverage table: header text, formatter name and
# any extra keyword arguments for that formatter.
COVERAGE_TABLE_CONFIG = {
    "index": {"header": "#", "formatter": "integer"},
    "column": {"header": "Column", "formatter": "truncate", "max_length": 35},
    "coverage": {"header": "Coverage %", "formatter": "decimal", "precision": 1},
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def format_table(
    data: list[dict] | pd.DataFrame | list[list],
    headers: list[str] | None = None,
    output_format: str = "console",
    column_config: dict[str, dict] | None = None,
    title: str | None = None,
    table_style: str = "lines",
    disable_numparse: bool = True,
) -> str | Table:
    """Format tabular data as a rich ``Table`` or as a ``tabulate`` string.

    Args:
        data: Rows as a list of dicts, a DataFrame, or a list of lists.
        headers: Explicit header labels; when omitted they come from
            ``column_config`` and the column names.
        output_format: "console" builds a rich table; any other value is looked
            up in ``OUTPUT_FORMATS`` (falling back to "grid") and rendered with
            ``tabulate``.
        column_config: Per-column settings (header, formatter, formatter kwargs).
        title: Table title; only used for the console table.
        table_style: Console table style; only used for the console table.
        disable_numparse: Forwarded to ``tabulate``.

    Returns:
        A rich ``Table`` for "console", otherwise the rendered string.
    """
    raw_rows = _preprocess_data(data)
    names = _get_column_names(data)
    settings = column_config or {}
    rows = _apply_column_formatting(raw_rows, settings, names)
    resolved_headers = _resolve_headers(headers, names, settings)

    if output_format != "console":
        from tabulate import tabulate

        return tabulate(
            rows,
            headers=resolved_headers,
            tablefmt=OUTPUT_FORMATS.get(output_format, "grid"),
            disable_numparse=disable_numparse,
        )

    return _create_rich_table(
        rows, resolved_headers, settings, names, title, table_style
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def format_coverage_table(
    df: pd.DataFrame,
    title: str = "Column Coverage",
    output_format: str = "console",
    table_style: str = "lines",
    disable_numparse: bool = True,
) -> str:
    """Render the percentage of non-null values in each column as text.

    Args:
        df: Frame whose columns are summarised.
        title: Text of the heading line placed above the table.
        output_format: Passed to ``format_table``.
        table_style: Passed to ``format_table``.
        disable_numparse: Passed to ``format_table``.

    Returns:
        A heading line ``"<title> (<n> columns):"`` followed by the table text.
    """
    row_count = len(df)
    coverage_data = []
    for i, col in enumerate(df.columns):
        # An empty frame has nothing to cover; report 0 instead of dividing by zero.
        if row_count == 0:
            coverage = 0
        else:
            coverage = df[col].notna().sum() / row_count * 100
        coverage_data.append({"index": i + 1, "column": col, "coverage": coverage})

    table_result = format_table(
        data=coverage_data,
        output_format=output_format,
        column_config=COVERAGE_TABLE_CONFIG,
        table_style=table_style,
        disable_numparse=disable_numparse,
    )
    if isinstance(table_result, str):
        rendered = table_result
    else:
        # ``str()`` of a rich Table yields only the default object repr, so the
        # table is printed through a Console writing into an in-memory buffer.
        from io import StringIO

        from rich.console import Console

        buffer = StringIO()
        Console(file=buffer).print(table_result)
        rendered = buffer.getvalue()
    return f"{title} ({len(df.columns)} columns):\n{rendered}"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _preprocess_data(data: list[dict] | pd.DataFrame | list[list]) -> list[list]:
|
|
109
|
+
if isinstance(data, pd.DataFrame):
|
|
110
|
+
return data.to_numpy().tolist()
|
|
111
|
+
elif isinstance(data, list) and len(data) > 0:
|
|
112
|
+
if isinstance(data[0], dict):
|
|
113
|
+
# Build stable union of all keys: preserve first-row order, then append new keys
|
|
114
|
+
keys = list(data[0].keys())
|
|
115
|
+
keys_set = set(keys)
|
|
116
|
+
for row in data[1:]:
|
|
117
|
+
for key in row.keys(): # type: ignore[union-attr]
|
|
118
|
+
if key not in keys_set:
|
|
119
|
+
keys.append(key)
|
|
120
|
+
keys_set.add(key)
|
|
121
|
+
return [[row.get(key) for key in keys] for row in data] # type: ignore[union-attr]
|
|
122
|
+
else:
|
|
123
|
+
return list(data) # type: ignore[arg-type]
|
|
124
|
+
return []
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _get_column_names(data: list[dict] | pd.DataFrame | list[list]) -> list[str]:
|
|
128
|
+
if isinstance(data, pd.DataFrame):
|
|
129
|
+
return list(data.columns)
|
|
130
|
+
elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
|
131
|
+
# Build stable union of all keys: preserve first-row order, then append new keys
|
|
132
|
+
keys = list(data[0].keys())
|
|
133
|
+
keys_set = set(keys)
|
|
134
|
+
for row in data[1:]:
|
|
135
|
+
for key in row.keys(): # type: ignore[union-attr]
|
|
136
|
+
if key not in keys_set:
|
|
137
|
+
keys.append(key)
|
|
138
|
+
keys_set.add(key)
|
|
139
|
+
return keys
|
|
140
|
+
else:
|
|
141
|
+
return []
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _apply_column_formatting(
    processed_data: list[list],
    config: dict[str, dict],
    column_names: list[str] | None = None,
) -> list[list]:
    """Format every cell using the configuration of the column it sits in.

    Cells are matched to a column name by position; cells beyond the known
    names (or all cells when no names are given) are formatted without a name.
    Empty input is returned unchanged.
    """
    if not processed_data:
        return processed_data
    names = column_names or []
    return [
        [
            _format_value(cell, names[pos] if pos < len(names) else None, config)
            for pos, cell in enumerate(row)
        ]
        for row in processed_data
    ]
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _format_value(
    value: Any,
    col_name: str | None,
    config: dict[str, dict],
) -> str:
    """Format one cell according to its column's configuration.

    Columns without a configuration use plain ``str()`` (``None`` becomes
    "None"). Otherwise the configured formatter is called with the remaining
    config entries as keyword arguments; if it raises ``TypeError`` or
    ``ValueError`` the plain rendering is used instead.
    """
    if not col_name or col_name not in config:
        return "None" if value is None else str(value)

    spec = config[col_name]
    render = FORMATTER_TYPES.get(
        spec.get("formatter", "string"), FORMATTER_TYPES["string"]
    )
    # Everything except the two reserved keys is a formatter keyword argument.
    options = {
        key: option
        for key, option in spec.items()
        if key != "header" and key != "formatter"
    }
    try:
        return render(value, **options)
    except (TypeError, ValueError):
        return "None" if value is None else str(value)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _resolve_headers(
|
|
185
|
+
headers: list[str] | None, column_names: list[str], config: dict[str, dict]
|
|
186
|
+
) -> list[str]:
|
|
187
|
+
if headers is not None:
|
|
188
|
+
return headers
|
|
189
|
+
if config and column_names:
|
|
190
|
+
result_headers = []
|
|
191
|
+
for col_name in column_names:
|
|
192
|
+
if col_name in config:
|
|
193
|
+
result_headers.append(config[col_name].get("header", col_name))
|
|
194
|
+
else:
|
|
195
|
+
result_headers.append(col_name)
|
|
196
|
+
return result_headers
|
|
197
|
+
if column_names:
|
|
198
|
+
return column_names
|
|
199
|
+
return []
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _create_rich_table(
    formatted_data: list[list],
    headers: list[str],
    config: dict[str, dict],
    column_names: list[str],
    title: str | None = None,
    table_style: str = "lines",
) -> Table:
    """Build a rich ``Table`` from already formatted rows.

    ``table_style="zebra"`` alternates plain and dim rows; any other style
    draws a line between rows. Column justification and colour come from the
    formatter configured for each column.
    """
    from rich.table import Table

    if table_style == "zebra":
        style_options = {"row_styles": ["", "dim"]}
    else:
        style_options = {"show_lines": True}
    table = Table(
        title=title,
        show_header=True,
        header_style="bold magenta",
        **style_options,
    )

    for position, header in enumerate(headers):
        col_config = config.get(
            _get_column_name_for_index(column_names, position), {}
        )
        table.add_column(
            header,
            justify=_get_rich_justify(col_config),
            style=_get_rich_style(col_config),
        )

    for row in formatted_data:
        table.add_row(*map(str, row))

    return table
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _get_column_name_for_index(column_names: list[str], index: int) -> str:
|
|
238
|
+
return column_names[index] if index < len(column_names) else f"col_{index}"
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _get_rich_justify(col_config: dict) -> JustifyMethod:
|
|
242
|
+
formatter = col_config.get("formatter", "string")
|
|
243
|
+
if formatter in ["scientific", "decimal", "integer", "comma"]:
|
|
244
|
+
return "right"
|
|
245
|
+
return "left"
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _get_rich_style(col_config: dict) -> str | None:
|
|
249
|
+
formatter = col_config.get("formatter", "string")
|
|
250
|
+
style_map = {
|
|
251
|
+
"scientific": "yellow",
|
|
252
|
+
"decimal": "green",
|
|
253
|
+
"integer": "cyan",
|
|
254
|
+
"comma": "cyan",
|
|
255
|
+
"truncate": "dim",
|
|
256
|
+
}
|
|
257
|
+
return style_map.get(formatter)
|
dr_frames/parsing.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"parse_first_element",
|
|
9
|
+
"sum_list_elements",
|
|
10
|
+
"is_homogeneous",
|
|
11
|
+
"parse_list_string",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_list_string(val: str) -> list | None:
    """Parse a Python literal string into a list.

    A list literal is returned as is and any other literal is wrapped in a
    one-element list. Missing input or an unparseable string gives ``None``.
    """
    if pd.isna(val):
        return None
    try:
        literal = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return None
    return literal if isinstance(literal, list) else [literal]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_first_element(val: str) -> float:
    """Return the first element of a list literal, or the scalar itself, as a float.

    Missing input, an empty list, and anything that cannot be parsed or
    converted to ``float`` all give NaN.
    """
    if pd.isna(val):
        return float("nan")
    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, list):
            if len(parsed) > 0:
                return float(parsed[0])
            return float("nan")
        return float(parsed)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers float() on a non-numeric literal such as None or a
        # nested list, which would otherwise escape as an exception.
        return float("nan")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def sum_list_elements(val: str) -> float:
    """Return the sum of a list literal's elements, or the scalar itself, as a float.

    Missing input, an empty list, and anything that cannot be parsed or
    converted to ``float`` all give NaN.
    """
    if pd.isna(val):
        return float("nan")
    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, list):
            if len(parsed) > 0:
                return float(sum(float(item) for item in parsed))
            return float("nan")
        return float(parsed)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers float() on a non-numeric literal (None, a dict, a
        # nested list), both for list items and for a bare scalar.
        return float("nan")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def is_homogeneous(val: str) -> bool:
    """Tell whether a literal string holds a single distinct value.

    A non-empty list literal is homogeneous when all its elements are equal;
    any non-list literal counts as homogeneous. Missing input, an empty list,
    unparseable text and lists with unhashable elements give ``False``.
    """
    if pd.isna(val):
        return False
    try:
        literal = ast.literal_eval(val)
        if not isinstance(literal, list):
            return True
        if not literal:
            return False
        return len(set(literal)) == 1
    except (ValueError, SyntaxError, TypeError):
        return False
|
dr_frames/profiling.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from pathlib import PurePath
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from .aggregation import unique_non_null
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ColInfo",
|
|
15
|
+
"DFColInfo",
|
|
16
|
+
"looks_like_json",
|
|
17
|
+
"looks_like_path",
|
|
18
|
+
"infer_series_base_tag_type",
|
|
19
|
+
"infer_tags_from_series_sample",
|
|
20
|
+
"infer_col_name_contains_tags",
|
|
21
|
+
"infer_col_name_suffix_tags",
|
|
22
|
+
"infer_col_name_prefix_tags",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ColInfo(BaseModel):
    """Name and inferred tags of one DataFrame column."""

    # Catalog that supplies the inference rules read by update_tags().
    catalog: DFColInfo
    name: str
    dtype: str | None = None
    tags: set[str] = Field(default_factory=set)

    model_config = {"arbitrary_types_allowed": True}

    def update_tags(self, series: pd.Series) -> None:
        """Add every tag inferred from ``series`` and from this column's name."""
        rules = self.catalog
        collected = self.tags

        # Tags from the pandas dtype (plus "nullable" when values are missing).
        collected.update(
            infer_series_base_tag_type(
                series, rules.pd_type_to_tags, rules.pd_default_type
            )
        )
        # Tags from the column name: substring, suffix, then prefix rules.
        collected.update(
            infer_col_name_contains_tags(self.name, rules.col_name_contains_map)
        )
        collected.update(
            infer_col_name_suffix_tags(self.name, rules.col_name_suffix_map)
        )
        collected.update(
            infer_col_name_prefix_tags(self.name, rules.col_name_prefix_map)
        )
        # Tags from a sample of the actual values (path-like / JSON-like).
        collected.update(
            infer_tags_from_series_sample(series, rules.path_like_extensions)
        )

    def has_tag(self, tag: str) -> bool:
        """Return True when ``tag`` is among this column's tags."""
        return tag in self.tags

    def add_tags(self, extra: list[str]) -> None:
        """Add the given tags to this column."""
        self.tags.update(extra)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class DFColInfo(BaseModel):
    """Per-column ``ColInfo`` entries together with the rules used to infer tags."""

    # Column name -> inferred info; filled by update_from_df().
    columns: dict[str, ColInfo] = Field(default_factory=dict)
    # (dtype predicate, tags) pairs; each predicate accepting a dtype adds its tags.
    pd_type_to_tags: list[tuple[Callable, list[str]]] = Field(
        default_factory=lambda: [
            (pd.api.types.is_bool_dtype, ["bool"]),
            (pd.api.types.is_integer_dtype, ["int", "numeric"]),
            (pd.api.types.is_float_dtype, ["float", "numeric"]),
            (pd.api.types.is_datetime64_any_dtype, ["datetime"]),
            (pd.api.types.is_timedelta64_dtype, ["timedelta"]),
            (lambda dt: isinstance(dt, pd.CategoricalDtype), ["categorical"]),
        ]
    )
    # Tags used when no dtype predicate contributed any tag.
    pd_default_type: list[str] = Field(default_factory=lambda: ["str"])

    # Name rules: a tuple of substrings / suffixes / prefixes mapped to one tag.
    col_name_contains_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("config", "kwargs", "settings", "params"): "config",
        }
    )
    col_name_suffix_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("_path", "_dir"): "path",
            ("_id",): "id",
            ("_tag", "_tags"): "categorical",
        }
    )
    col_name_prefix_map: dict[tuple[str, ...], str] = Field(
        default_factory=lambda: {
            ("is_", "has_"): "bool_like",
            ("metric_",): "metric",
        }
    )

    # File extensions that make a sampled string value count as path-like.
    path_like_extensions: set[str] = Field(
        default_factory=lambda: {
            ".json",
            ".jsonl",
            ".csv",
            ".tsv",
            ".parquet",
            ".txt",
            ".yaml",
            ".yml",
            ".log",
        }
    )

    def update_from_df(self, df: pd.DataFrame) -> None:
        """Create or replace the entry of every column in ``df`` and infer its tags."""
        for col_name in df.columns:
            entry = ColInfo(catalog=self, name=col_name)
            entry.update_tags(df[col_name])
            self.columns[col_name] = entry

    def get(self, name: str) -> ColInfo | None:
        """Return the entry for ``name``, or ``None`` when the column is unknown."""
        return self.columns.get(name)

    def names_with_tag(self, tag: str) -> list[str]:
        """Return the names of all columns carrying ``tag``, in insertion order."""
        matching: list[str] = []
        for col_name, entry in self.columns.items():
            if entry.has_tag(tag):
                matching.append(col_name)
        return matching
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def infer_tags_from_series_sample(
    series: pd.Series,
    path_like_extensions: set[str],
    sample_size: int = 10,
) -> set[str]:
    """Infer "path" / "json" tags from a sample of the series' distinct values.

    Up to ``sample_size`` unique non-null values are inspected; a tag is added
    as soon as at least one sampled value looks like a path or like JSON.
    """
    tags: set[str] = set()
    candidates = unique_non_null(series)
    if len(candidates) == 0:
        return tags

    sample = candidates[: min(sample_size, len(candidates))]

    saw_path = False
    saw_json = False
    for item in sample:
        saw_path = looks_like_path(item, path_like_extensions) or saw_path
        saw_json = looks_like_json(item) or saw_json

    if saw_path:
        tags.add("path")
    if saw_json:
        tags.add("json")
    return tags
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def infer_col_name_contains_tags(
    name: str,
    col_name_contains_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose substrings occur in ``name`` (case-insensitive)."""
    lowered = name.lower()
    return {
        tag
        for fragments, tag in col_name_contains_map.items()
        for fragment in fragments
        if fragment.lower() in lowered
    }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def infer_col_name_suffix_tags(
    name: str,
    col_name_suffix_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose suffixes end ``name`` (case-insensitive)."""
    lowered = name.lower()
    return {
        tag
        for suffixes, tag in col_name_suffix_map.items()
        if any(lowered.endswith(suffix.lower()) for suffix in suffixes)
    }
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def infer_col_name_prefix_tags(
    name: str,
    col_name_prefix_map: dict[tuple[str, ...], str],
) -> set[str]:
    """Return the tags whose prefixes start ``name`` (case-insensitive)."""
    lowered = name.lower()
    return {
        tag
        for prefixes, tag in col_name_prefix_map.items()
        if any(lowered.startswith(prefix.lower()) for prefix in prefixes)
    }
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def infer_series_base_tag_type(
    series: pd.Series,
    pd_type_to_tags: list[tuple[Callable, list[str]]],
    pd_default_type: list[str],
) -> set[str]:
    """Derive base tags from the dtype of ``series``.

    Every predicate in ``pd_type_to_tags`` that accepts the dtype contributes
    its tags; if none contributes anything, ``pd_default_type`` is used.
    "nullable" is added when the series contains missing values.
    """
    dtype = series.dtype
    tags: set[str] = set()
    for accepts, labels in pd_type_to_tags:
        if accepts(dtype):
            tags.update(labels)
    if not tags:
        tags.update(pd_default_type)
    if series.isna().any():
        tags.add("nullable")
    return tags
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def looks_like_path(value: Any, path_extensions: set[str]) -> bool:
    """Heuristically decide whether ``value`` is a filesystem path.

    Only non-blank strings qualify. A string counts as a path when it contains
    a ``/`` or ``\\`` separator, or when its lower-cased file suffix is one of
    ``path_extensions``.
    """
    if not value or not isinstance(value, str) or value.isspace():
        return False
    candidate = value.strip()
    if "/" in candidate or "\\" in candidate:
        return True
    try:
        extension = PurePath(candidate).suffix.lower()
    except (TypeError, ValueError):
        return False
    return extension in path_extensions
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def looks_like_json(value: Any) -> bool:
    """Tell whether ``value`` is a string holding a JSON object or array.

    The text must start with ``{`` or ``[`` once surrounding whitespace is
    removed, and must parse with ``json.loads`` into a dict or list.
    """
    if not value or not isinstance(value, str):
        return False
    text = value.strip()
    if not text:
        return False
    # Cheap pre-check so ordinary strings are rejected without running the parser.
    if not text.startswith(("{", "[")):
        return False
    try:
        decoded = json.loads(text)
    except Exception:
        return False
    return isinstance(decoded, (dict, list))
|
dr_frames/py.typed
ADDED
|
File without changes
|