macrodata-refiner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- macrodata_refiner-0.1.0.dist-info/METADATA +25 -0
- macrodata_refiner-0.1.0.dist-info/RECORD +72 -0
- macrodata_refiner-0.1.0.dist-info/WHEEL +5 -0
- macrodata_refiner-0.1.0.dist-info/entry_points.txt +2 -0
- macrodata_refiner-0.1.0.dist-info/licenses/LICENSE +214 -0
- macrodata_refiner-0.1.0.dist-info/top_level.txt +1 -0
- refiner/__init__.py +65 -0
- refiner/cli/__init__.py +1 -0
- refiner/cli/auth.py +147 -0
- refiner/cli/main.py +51 -0
- refiner/cli/ui.py +26 -0
- refiner/expressions.py +556 -0
- refiner/io/__init__.py +9 -0
- refiner/io/datafile.py +72 -0
- refiner/io/datafolder.py +145 -0
- refiner/io/fileset.py +172 -0
- refiner/ledger/__init__.py +10 -0
- refiner/ledger/backend/__init__.py +10 -0
- refiner/ledger/backend/base.py +60 -0
- refiner/ledger/backend/cloud.py +85 -0
- refiner/ledger/backend/fs.py +267 -0
- refiner/ledger/config.py +49 -0
- refiner/ledger/policy.py +144 -0
- refiner/ledger/shard.py +126 -0
- refiner/metrics.py +115 -0
- refiner/pipeline.py +381 -0
- refiner/platform/__init__.py +28 -0
- refiner/platform/auth.py +72 -0
- refiner/platform/client.py +303 -0
- refiner/platform/cloud/__init__.py +1 -0
- refiner/platform/cloud/models.py +73 -0
- refiner/platform/cloud/serialize.py +34 -0
- refiner/platform/config.py +14 -0
- refiner/platform/http.py +97 -0
- refiner/platform/manifest.py +167 -0
- refiner/platform/telemetry/__init__.py +5 -0
- refiner/platform/telemetry/emitter.py +267 -0
- refiner/platform/telemetry/metric_helpers.py +117 -0
- refiner/processors/__init__.py +21 -0
- refiner/processors/step.py +197 -0
- refiner/py.typed +0 -0
- refiner/runtime/__init__.py +1 -0
- refiner/runtime/errors.py +8 -0
- refiner/runtime/execution/__init__.py +19 -0
- refiner/runtime/execution/engine.py +331 -0
- refiner/runtime/execution/row_queue.py +75 -0
- refiner/runtime/execution/row_steps.py +104 -0
- refiner/runtime/execution/vectorized.py +114 -0
- refiner/runtime/launchers/__init__.py +11 -0
- refiner/runtime/launchers/base.py +152 -0
- refiner/runtime/launchers/cloud.py +92 -0
- refiner/runtime/launchers/local.py +294 -0
- refiner/runtime/metrics_context.py +143 -0
- refiner/runtime/planning.py +289 -0
- refiner/runtime/resources/__init__.py +10 -0
- refiner/runtime/resources/cpu.py +52 -0
- refiner/runtime/resources/memory.py +39 -0
- refiner/runtime/types.py +16 -0
- refiner/runtime/worker/__init__.py +3 -0
- refiner/runtime/worker/entrypoint.py +142 -0
- refiner/runtime/worker/runner.py +242 -0
- refiner/sources/__init__.py +23 -0
- refiner/sources/base.py +74 -0
- refiner/sources/items.py +69 -0
- refiner/sources/readers/__init__.py +17 -0
- refiner/sources/readers/base.py +146 -0
- refiner/sources/readers/csv.py +307 -0
- refiner/sources/readers/jsonl.py +111 -0
- refiner/sources/readers/parquet.py +228 -0
- refiner/sources/readers/utils.py +114 -0
- refiner/sources/row.py +155 -0
- refiner/sources/task.py +40 -0
refiner/expressions.py
ADDED
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import builtins
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime as datetime_cls
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
import pyarrow.compute as pc
|
|
10
|
+
|
|
11
|
+
# Names of the compute functions reported by the installed pyarrow, captured once at import time.
_ARROW_FUNCTIONS = frozenset(pc.list_functions())
|
|
12
|
+
# True when a "floor_divide" kernel is registered; eval_expr_arrow otherwise falls back to floor(divide(...)).
_HAS_FLOOR_DIVIDE_KERNEL = "floor_divide" in _ARROW_FUNCTIONS
|
|
13
|
+
# True when a "mod" kernel is registered; eval_expr_arrow otherwise derives the remainder from divide/floor/multiply.
_HAS_MOD_KERNEL = "mod" in _ARROW_FUNCTIONS
|
|
14
|
+
# True when a "maximum" kernel is registered; the clip lower bound otherwise uses "max_element_wise".
_HAS_MAXIMUM_KERNEL = "maximum" in _ARROW_FUNCTIONS
|
|
15
|
+
# True when a "minimum" kernel is registered; the clip upper bound otherwise uses "min_element_wise".
_HAS_MINIMUM_KERNEL = "minimum" in _ARROW_FUNCTIONS
|
|
16
|
+
# Options handed to the element-wise min/max fallbacks in clip (skip_nulls=False; presumably so a null operand yields null - confirm).
_ELEMENT_WISE_KEEP_NULLS = pc.ElementWiseAggregateOptions(skip_nulls=False)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _as_expr(value: Any) -> "Expr":
    """Coerce ``value`` into an ``Expr``.

    An existing ``Expr`` is handed back untouched; anything else is wrapped in
    a ``lit`` node that carries the value as its only argument.
    """
    already_expression = isinstance(value, Expr)
    return value if already_expression else Expr(op="lit", args=(value,))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def value_to_code(value: Any) -> str:
    """Render ``value`` as Python source text.

    Lists, tuples and dicts are rendered recursively so that nested ``Expr``
    nodes become expression code instead of their ``repr``. ``Expr`` values are
    delegated to ``expr_to_code``. Everything else falls back to ``repr``.

    Non-finite floats are emitted as ``float('nan')`` / ``float('inf')`` /
    ``float('-inf')`` because their plain ``repr`` (``nan``, ``inf``) is a bare
    name, not a valid Python literal.

    Args:
        value: Any Python value, container of values, or ``Expr``.

    Returns:
        Source text for ``value``.
    """
    if isinstance(value, float) and (value != value or abs(value) == float("inf")):
        # `value != value` is the NaN test; go through float() so float
        # subclasses still produce the plain 'nan' / 'inf' spelling.
        return f"float({repr(float(value))!r})"
    if value is None or isinstance(value, (str, int, float, bool)):
        return repr(value)
    if isinstance(value, list):
        return "[" + ", ".join(value_to_code(item) for item in value) + "]"
    if isinstance(value, tuple):
        inner = ", ".join(value_to_code(item) for item in value)
        # A one-element tuple needs the trailing comma to stay a tuple.
        return f"({inner},)" if len(value) == 1 else f"({inner})"
    if isinstance(value, dict):
        pairs = ", ".join(
            f"{value_to_code(key)}: {value_to_code(item)}"
            for key, item in value.items()
        )
        return "{" + pairs + "}"
    if isinstance(value, Expr):
        return expr_to_code(value)
    return repr(value)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Infix operator token for every binary op name.
_BINARY_OPERATOR_TOKENS = {
    "add": "+",
    "sub": "-",
    "mul": "*",
    "div": "/",
    "floordiv": "//",
    "mod": "%",
    "eq": "==",
    "ne": "!=",
    "lt": "<",
    "le": "<=",
    "gt": ">",
    "ge": ">=",
    "and": "&",
    "or": "|",
}

# Dotted method path, relative to the receiver, for every op rendered as
# ``<receiver>.<path>(<remaining args>)``.
_METHOD_CALL_PATHS = {
    "is_null": "is_null",
    "is_not_null": "is_not_null",
    "fill_null": "fill_null",
    "null_if": "null_if",
    "abs": "abs",
    "floor": "floor",
    "ceil": "ceil",
    "round": "round",
    "str_lower": "str.lower",
    "str_upper": "str.upper",
    "str_strip": "str.strip",
    "str_len": "str.len",
    "str_contains": "str.contains",
    "str_startswith": "str.startswith",
    "str_endswith": "str.endswith",
    "str_regex_contains": "str.regex_contains",
    "str_replace": "str.replace",
    "str_regex_replace": "str.regex_replace",
    "datetime_year": "datetime.year",
    "datetime_month": "datetime.month",
    "datetime_day": "datetime.day",
    "datetime_hour": "datetime.hour",
    "datetime_to_date": "datetime.to_date",
}


def _receiver_to_code(operand: Any) -> str:
    """Render ``operand`` for a position that must evaluate to an ``Expr``.

    Literals are wrapped in ``lit(...)``: a bare Python literal used as a
    method receiver, left operand or ``~`` operand would either be a syntax
    error (``1.abs()``) or run plain Python semantics (``~True`` is ``-2``).
    """
    code = expr_to_code(operand)
    if not isinstance(operand, Expr) or operand.op == "lit":
        return f"lit({code})"
    return code


def expr_to_code(value: Any) -> str:
    """Render an expression tree as Python source built from the public helpers.

    Args:
        value: An ``Expr`` node. Any other value is delegated to
            ``value_to_code``.

    Returns:
        Source text using ``col``, ``lit``, ``coalesce``, ``if_else``, infix
        operators and ``Expr`` methods. An op with no dedicated rendering falls
        back to the code form of the node's ``to_plan()`` dictionary.
    """
    if not isinstance(value, Expr):
        return value_to_code(value)

    op = value.op
    args = value.args

    if op == "col":
        return f"col({args[0]!r})"
    if op == "lit":
        # A top-level literal stays a plain Python value.
        return value_to_code(args[0])
    if op in ("coalesce", "if_else"):
        rendered = ", ".join(expr_to_code(arg) for arg in args)
        return f"{op}({rendered})"

    if op in _BINARY_OPERATOR_TOKENS:
        left = _receiver_to_code(args[0])
        right = expr_to_code(args[1])
        return f"({left} {_BINARY_OPERATOR_TOKENS[op]} {right})"
    if op == "not":
        return f"(~{_receiver_to_code(args[0])})"

    if op == "is_in":
        # The candidate values are stored as a tuple but rendered as a list.
        candidates = value_to_code(list(args[1]))
        return f"{_receiver_to_code(args[0])}.is_in({candidates})"
    if op == "clip":
        kwargs: list[str] = []
        if args[1] is not None:
            kwargs.append(f"min_value={expr_to_code(args[1])}")
        if args[2] is not None:
            kwargs.append(f"max_value={expr_to_code(args[2])}")
        return f"{_receiver_to_code(args[0])}.clip({', '.join(kwargs)})"

    method_path = _METHOD_CALL_PATHS.get(op)
    if method_path is not None:
        rendered = ", ".join(expr_to_code(arg) for arg in args[1:])
        return f"{_receiver_to_code(args[0])}.{method_path}({rendered})"

    return value_to_code(value.to_plan())
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def with_columns_assignments_to_code(assignments: dict[str, Any]) -> str:
    """Render column assignments as a keyword-argument list.

    Each entry becomes ``name=<expression code>``. A name that cannot be used
    as a keyword argument (not an identifier, or a reserved word) is emitted
    as ``**{'name': <expression code>}`` so the result is still valid Python.
    Entries keep the iteration order of ``assignments``.

    Args:
        assignments: Mapping from output column name to an ``Expr`` or a plain
            value.

    Returns:
        The comma-separated argument text; an empty string for no assignments.
    """
    import keyword

    rendered: list[str] = []
    for name, expr in assignments.items():
        code = expr_to_code(expr)
        usable_as_keyword = (
            isinstance(name, str)
            and name.isidentifier()
            and not keyword.iskeyword(name)
        )
        if usable_as_keyword:
            rendered.append(f"{name}={code}")
        else:
            rendered.append(f"**{{{name!r}: {code}}}")
    return ", ".join(rendered)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@dataclass(frozen=True, slots=True)
|
|
163
|
+
class Expr:
|
|
164
|
+
op: builtins.str
|
|
165
|
+
args: tuple[Any, ...]
|
|
166
|
+
|
|
167
|
+
@property
|
|
168
|
+
def str(self) -> "StringExpr":
|
|
169
|
+
return StringExpr(self)
|
|
170
|
+
|
|
171
|
+
@property
|
|
172
|
+
def datetime(self) -> "DateTimeExpr":
|
|
173
|
+
return DateTimeExpr(self)
|
|
174
|
+
|
|
175
|
+
def is_null(self) -> "Expr":
|
|
176
|
+
return Expr(op="is_null", args=(self,))
|
|
177
|
+
|
|
178
|
+
def is_not_null(self) -> "Expr":
|
|
179
|
+
return Expr(op="is_not_null", args=(self,))
|
|
180
|
+
|
|
181
|
+
def is_in(self, values: list[Any] | tuple[Any, ...]) -> "Expr":
|
|
182
|
+
return Expr(op="is_in", args=(self, tuple(values)))
|
|
183
|
+
|
|
184
|
+
def between(self, lower: Any, upper: Any) -> "Expr":
|
|
185
|
+
return (self >= _as_expr(lower)) & (self <= _as_expr(upper))
|
|
186
|
+
|
|
187
|
+
def fill_null(self, value: Any) -> "Expr":
|
|
188
|
+
return Expr(op="fill_null", args=(self, _as_expr(value)))
|
|
189
|
+
|
|
190
|
+
def null_if(self, value: Any) -> "Expr":
|
|
191
|
+
return Expr(op="null_if", args=(self, _as_expr(value)))
|
|
192
|
+
|
|
193
|
+
def abs(self) -> "Expr":
|
|
194
|
+
return Expr(op="abs", args=(self,))
|
|
195
|
+
|
|
196
|
+
def floor(self) -> "Expr":
|
|
197
|
+
return Expr(op="floor", args=(self,))
|
|
198
|
+
|
|
199
|
+
def ceil(self) -> "Expr":
|
|
200
|
+
return Expr(op="ceil", args=(self,))
|
|
201
|
+
|
|
202
|
+
def round(self, ndigits: int = 0) -> "Expr":
|
|
203
|
+
return Expr(op="round", args=(self, int(ndigits)))
|
|
204
|
+
|
|
205
|
+
def clip(
|
|
206
|
+
self, min_value: Any | None = None, max_value: Any | None = None
|
|
207
|
+
) -> "Expr":
|
|
208
|
+
if min_value is None and max_value is None:
|
|
209
|
+
raise ValueError("clip requires min_value and/or max_value")
|
|
210
|
+
return Expr(
|
|
211
|
+
op="clip",
|
|
212
|
+
args=(
|
|
213
|
+
self,
|
|
214
|
+
_as_expr(min_value) if min_value is not None else None,
|
|
215
|
+
_as_expr(max_value) if max_value is not None else None,
|
|
216
|
+
),
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
def to_plan(self) -> dict[builtins.str, Any]:
|
|
220
|
+
def _serialize(v: Any) -> Any:
|
|
221
|
+
if isinstance(v, Expr):
|
|
222
|
+
return v.to_plan()
|
|
223
|
+
if isinstance(v, (list, tuple)):
|
|
224
|
+
return [_serialize(x) for x in v]
|
|
225
|
+
if isinstance(v, dict):
|
|
226
|
+
return {str(k): _serialize(val) for k, val in v.items()}
|
|
227
|
+
if isinstance(v, datetime_cls):
|
|
228
|
+
return v.isoformat()
|
|
229
|
+
return v
|
|
230
|
+
|
|
231
|
+
return {"op": self.op, "args": [_serialize(v) for v in self.args]}
|
|
232
|
+
|
|
233
|
+
def to_code(self) -> builtins.str:
|
|
234
|
+
return expr_to_code(self)
|
|
235
|
+
|
|
236
|
+
def __add__(self, other: Any) -> "Expr":
|
|
237
|
+
return Expr(op="add", args=(self, _as_expr(other)))
|
|
238
|
+
|
|
239
|
+
def __sub__(self, other: Any) -> "Expr":
|
|
240
|
+
return Expr(op="sub", args=(self, _as_expr(other)))
|
|
241
|
+
|
|
242
|
+
def __mul__(self, other: Any) -> "Expr":
|
|
243
|
+
return Expr(op="mul", args=(self, _as_expr(other)))
|
|
244
|
+
|
|
245
|
+
def __truediv__(self, other: Any) -> "Expr":
|
|
246
|
+
return Expr(op="div", args=(self, _as_expr(other)))
|
|
247
|
+
|
|
248
|
+
def __floordiv__(self, other: Any) -> "Expr":
|
|
249
|
+
return Expr(op="floordiv", args=(self, _as_expr(other)))
|
|
250
|
+
|
|
251
|
+
def __mod__(self, other: Any) -> "Expr":
|
|
252
|
+
return Expr(op="mod", args=(self, _as_expr(other)))
|
|
253
|
+
|
|
254
|
+
def __eq__(self, other: object) -> "Expr": # type: ignore[override]
|
|
255
|
+
return Expr(op="eq", args=(self, _as_expr(other)))
|
|
256
|
+
|
|
257
|
+
def __ne__(self, other: object) -> "Expr": # type: ignore[override]
|
|
258
|
+
return Expr(op="ne", args=(self, _as_expr(other)))
|
|
259
|
+
|
|
260
|
+
def __lt__(self, other: Any) -> "Expr":
|
|
261
|
+
return Expr(op="lt", args=(self, _as_expr(other)))
|
|
262
|
+
|
|
263
|
+
def __le__(self, other: Any) -> "Expr":
|
|
264
|
+
return Expr(op="le", args=(self, _as_expr(other)))
|
|
265
|
+
|
|
266
|
+
def __gt__(self, other: Any) -> "Expr":
|
|
267
|
+
return Expr(op="gt", args=(self, _as_expr(other)))
|
|
268
|
+
|
|
269
|
+
def __ge__(self, other: Any) -> "Expr":
|
|
270
|
+
return Expr(op="ge", args=(self, _as_expr(other)))
|
|
271
|
+
|
|
272
|
+
def __and__(self, other: Any) -> "Expr":
|
|
273
|
+
return Expr(op="and", args=(self, _as_expr(other)))
|
|
274
|
+
|
|
275
|
+
def __or__(self, other: Any) -> "Expr":
|
|
276
|
+
return Expr(op="or", args=(self, _as_expr(other)))
|
|
277
|
+
|
|
278
|
+
def __invert__(self) -> "Expr":
|
|
279
|
+
return Expr(op="not", args=(self,))
|
|
280
|
+
|
|
281
|
+
def __bool__(self) -> bool:
|
|
282
|
+
raise TypeError(
|
|
283
|
+
"Expr cannot be coerced to bool; use '&', '|' and '~' to compose predicates."
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@dataclass(frozen=True, slots=True)
class StringExpr:
    """String-operation namespace wrapped around a base expression.

    Every method returns a new ``Expr`` whose first argument is ``base`` and
    whose remaining arguments are the values passed to the method.
    """

    # Expression the string operations are applied to.
    base: Expr

    def _node(self, op: str, *extra: Any) -> Expr:
        """Build ``Expr(op, (base, *extra))``."""
        return Expr(op=op, args=(self.base, *extra))

    def lower(self) -> Expr:
        """Build a ``str_lower`` node."""
        return self._node("str_lower")

    def upper(self) -> Expr:
        """Build a ``str_upper`` node."""
        return self._node("str_upper")

    def strip(self) -> Expr:
        """Build a ``str_strip`` node."""
        return self._node("str_strip")

    def len(self) -> Expr:
        """Build a ``str_len`` node."""
        return self._node("str_len")

    def contains(self, pattern: str) -> Expr:
        """Build a ``str_contains`` node for ``pattern``."""
        return self._node("str_contains", pattern)

    def startswith(self, prefix: str) -> Expr:
        """Build a ``str_startswith`` node for ``prefix``."""
        return self._node("str_startswith", prefix)

    def endswith(self, suffix: str) -> Expr:
        """Build a ``str_endswith`` node for ``suffix``."""
        return self._node("str_endswith", suffix)

    def regex_contains(self, pattern: str) -> Expr:
        """Build a ``str_regex_contains`` node for ``pattern``."""
        return self._node("str_regex_contains", pattern)

    def replace(self, pattern: str, replacement: str) -> Expr:
        """Build a ``str_replace`` node for ``pattern`` and ``replacement``."""
        return self._node("str_replace", pattern, replacement)

    def regex_replace(self, pattern: str, replacement: str) -> Expr:
        """Build a ``str_regex_replace`` node for ``pattern`` and ``replacement``."""
        return self._node("str_regex_replace", pattern, replacement)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
@dataclass(frozen=True, slots=True)
class DateTimeExpr:
    """Date/time-operation namespace wrapped around a base expression.

    Every method returns a new single-argument ``Expr`` over ``base``.
    """

    # Expression the date/time operations are applied to.
    base: Expr

    def _node(self, op: str) -> Expr:
        """Build ``Expr(op, (base,))``."""
        return Expr(op=op, args=(self.base,))

    def year(self) -> Expr:
        """Build a ``datetime_year`` node."""
        return self._node("datetime_year")

    def month(self) -> Expr:
        """Build a ``datetime_month`` node."""
        return self._node("datetime_month")

    def day(self) -> Expr:
        """Build a ``datetime_day`` node."""
        return self._node("datetime_day")

    def hour(self) -> Expr:
        """Build a ``datetime_hour`` node."""
        return self._node("datetime_hour")

    def to_date(self) -> Expr:
        """Build a ``datetime_to_date`` node."""
        return self._node("datetime_to_date")
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def col(name: str) -> Expr:
    """Build an expression node that refers to the column called ``name``."""
    column_args = (name,)
    return Expr(op="col", args=column_args)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def lit(value: Any) -> Expr:
    """Build a literal expression node carrying ``value`` as-is."""
    literal_args = (value,)
    return Expr(op="lit", args=literal_args)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def coalesce(*values: Any) -> Expr:
    """Build a ``coalesce`` node over ``values``.

    Args:
        *values: One or more expressions or plain values; plain values are
            wrapped as literals.

    Returns:
        The ``coalesce`` expression node.

    Raises:
        ValueError: If no values are given. An empty node could only fail
            later, at evaluation time, with a less direct error.
    """
    if not values:
        raise ValueError("coalesce requires at least one value")
    return Expr(op="coalesce", args=tuple(_as_expr(v) for v in values))
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def if_else(condition: Any, on_true: Any, on_false: Any) -> Expr:
    """Build an ``if_else`` node from a condition and its two branches.

    Each of the three operands is coerced with ``_as_expr``, so plain values
    are wrapped as literals.
    """
    operands = tuple(_as_expr(part) for part in (condition, on_true, on_false))
    return Expr(op="if_else", args=operands)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _call(name: str, *args: Any, **kwargs: Any) -> Any:
    """Invoke the Arrow compute function ``name`` through ``pc.call_function``.

    Positional arguments are forwarded as a list; keyword arguments (for
    example ``options=``) are forwarded unchanged.
    """
    positional = [*args]
    return pc.call_function(name, positional, **kwargs)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _null_scalar_like(value: pa.Array | pa.ChunkedArray | pa.Scalar) -> pa.Scalar:
    """Return a null scalar that has the same Arrow type as ``value``."""
    matching_type = value.type
    return pa.scalar(None, type=matching_type)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# Ops evaluated by applying one Arrow kernel to the single evaluated operand.
_UNARY_ARROW_KERNELS = {
    "abs": "abs",
    "floor": "floor",
    "ceil": "ceil",
    "not": "invert",
    "is_null": "is_null",
    "is_not_null": "is_valid",
    "str_lower": "utf8_lower",
    "str_upper": "utf8_upper",
    "str_strip": "utf8_trim_whitespace",
    "str_len": "utf8_length",
    "datetime_year": "year",
    "datetime_month": "month",
    "datetime_day": "day",
    "datetime_hour": "hour",
}

# Ops evaluated with MatchSubstringOptions built from their second argument.
_SUBSTRING_MATCH_KERNELS = {
    "str_contains": "match_substring",
    "str_startswith": "starts_with",
    "str_endswith": "ends_with",
    "str_regex_contains": "match_substring_regex",
}

# Ops evaluated with ReplaceSubstringOptions built from their 2nd/3rd arguments.
_SUBSTRING_REPLACE_KERNELS = {
    "str_replace": "replace_substring",
    "str_regex_replace": "replace_substring_regex",
}

# Binary ops whose kernel name does not depend on the installed Arrow version.
_FIXED_BINARY_KERNELS = {
    "add": "add",
    "sub": "subtract",
    "mul": "multiply",
    "div": "divide",
    "eq": "equal",
    "ne": "not_equal",
    "lt": "less",
    "le": "less_equal",
    "gt": "greater",
    "ge": "greater_equal",
    "and": "and_kleene",
    "or": "or_kleene",
}


def eval_expr_arrow(
    expr: Expr, table: pa.Table | pa.RecordBatch
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
    """Evaluate an expression tree against ``table`` with Arrow compute kernels.

    Args:
        expr: Root node of the expression tree.
        table: Table or record batch that ``col`` nodes read their column from.

    Returns:
        The evaluated array, chunked array or scalar.

    Raises:
        ValueError: If ``expr.op`` is not a supported operation.
    """
    op = expr.op
    operands = expr.args

    def evaluate(node: Expr) -> Any:
        return eval_expr_arrow(node, table)

    if op == "col":
        return table.column(str(operands[0]))
    if op == "lit":
        return pa.scalar(operands[0])
    if op == "coalesce":
        return _call("coalesce", *[evaluate(item) for item in operands])
    if op == "if_else":
        predicate = evaluate(operands[0])
        when_true = evaluate(operands[1])
        when_false = evaluate(operands[2])
        return _call("if_else", predicate, when_true, when_false)
    if op == "is_in":
        lookup = pc.SetLookupOptions(value_set=pa.array(list(operands[1])))
        return _call("is_in", evaluate(operands[0]), options=lookup)
    if op == "fill_null":
        return _call("coalesce", evaluate(operands[0]), evaluate(operands[1]))
    if op == "null_if":
        subject = evaluate(operands[0])
        sentinel = evaluate(operands[1])
        matches = _call("equal", subject, sentinel)
        return _call("if_else", matches, _null_scalar_like(subject), subject)
    if op == "round":
        rounding = pc.RoundOptions(int(operands[1]))
        return _call("round", evaluate(operands[0]), options=rounding)

    if op == "clip":
        clipped = evaluate(operands[0])
        lower_bound, upper_bound = operands[1], operands[2]
        if lower_bound is not None:
            lower_value = evaluate(lower_bound)
            if _HAS_MAXIMUM_KERNEL:
                clipped = _call("maximum", clipped, lower_value)
            else:
                # Fallback used when no `maximum` kernel is registered.
                clipped = _call(
                    "max_element_wise",
                    clipped,
                    lower_value,
                    options=_ELEMENT_WISE_KEEP_NULLS,
                )
        if upper_bound is not None:
            upper_value = evaluate(upper_bound)
            if _HAS_MINIMUM_KERNEL:
                clipped = _call("minimum", clipped, upper_value)
            else:
                # Fallback used when no `minimum` kernel is registered.
                clipped = _call(
                    "min_element_wise",
                    clipped,
                    upper_value,
                    options=_ELEMENT_WISE_KEEP_NULLS,
                )
        return clipped

    if op in _FIXED_BINARY_KERNELS or op in ("floordiv", "mod"):
        left = evaluate(operands[0])
        right = evaluate(operands[1])
        # The availability flags are read on every call, not cached in a table.
        if op == "floordiv":
            kernel = "floor_divide" if _HAS_FLOOR_DIVIDE_KERNEL else None
        elif op == "mod":
            kernel = "mod" if _HAS_MOD_KERNEL else None
        else:
            kernel = _FIXED_BINARY_KERNELS[op]
        if kernel is not None:
            return _call(kernel, left, right)
        # NOTE(review): the fallbacks below floor the result of `divide`; if
        # Arrow's integer `divide` truncates toward zero, negative integer
        # operands would not match Python's `//` and `%` - confirm.
        if op == "floordiv":
            return _call("floor", _call("divide", left, right))
        if op == "mod":
            floored = _call("floor", _call("divide", left, right))
            return _call("subtract", left, _call("multiply", floored, right))
        raise ValueError(f"Unsupported binary expression op: {op}")

    unary_kernel = _UNARY_ARROW_KERNELS.get(op)
    if unary_kernel is not None:
        return _call(unary_kernel, evaluate(operands[0]))

    match_kernel = _SUBSTRING_MATCH_KERNELS.get(op)
    if match_kernel is not None:
        match_options = pc.MatchSubstringOptions(pattern=str(operands[1]))
        return _call(match_kernel, evaluate(operands[0]), options=match_options)

    replace_kernel = _SUBSTRING_REPLACE_KERNELS.get(op)
    if replace_kernel is not None:
        replace_options = pc.ReplaceSubstringOptions(
            pattern=str(operands[1]),
            replacement=str(operands[2]),
        )
        return _call(replace_kernel, evaluate(operands[0]), options=replace_options)

    if op == "datetime_to_date":
        return pc.cast(evaluate(operands[0]), target_type=pa.date32())

    raise ValueError(f"Unsupported expression op: {op}")
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# Names exported by `from ... import *`. NOTE(review): value_to_code, expr_to_code and
# with_columns_assignments_to_code are not listed here - confirm that is intended.
__all__ = [
|
|
548
|
+
"Expr",
|
|
549
|
+
"StringExpr",
|
|
550
|
+
"DateTimeExpr",
|
|
551
|
+
"col",
|
|
552
|
+
"lit",
|
|
553
|
+
"coalesce",
|
|
554
|
+
"if_else",
|
|
555
|
+
"eval_expr_arrow",
|
|
556
|
+
]
|
refiner/io/__init__.py
ADDED
refiner/io/datafile.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Mapping
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Any, TypeAlias, Union, cast
|
|
6
|
+
|
|
7
|
+
from fsspec import AbstractFileSystem, url_to_fs
|
|
8
|
+
from fsspec.implementations.local import LocalFileSystem
|
|
9
|
+
|
|
10
|
+
DataFileLike: TypeAlias = Union[str, "DataFile"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True, slots=True)
|
|
14
|
+
class DataFile:
|
|
15
|
+
"""A minimal (fs, path) file abstraction with a small normalization factory.
|
|
16
|
+
|
|
17
|
+
Notes:
|
|
18
|
+
- `path` is stored in the form expected by `fs.open/fs.exists` (no protocol required).
|
|
19
|
+
- `resolve()` accepts `str` URL/path or `DataFile` (pass-through).
|
|
20
|
+
- If `fs` is provided to `resolve()`, it wins and `storage_options` is ignored.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
fs: AbstractFileSystem
|
|
24
|
+
path: str
|
|
25
|
+
|
|
26
|
+
@classmethod
|
|
27
|
+
def resolve(
|
|
28
|
+
cls,
|
|
29
|
+
data: DataFileLike,
|
|
30
|
+
*,
|
|
31
|
+
fs: AbstractFileSystem | None = None,
|
|
32
|
+
storage_options: Mapping[str, Any] | None = None,
|
|
33
|
+
) -> "DataFile":
|
|
34
|
+
"""Resolve a string URL/path into a `DataFile`, or pass through an existing `DataFile`.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
data: A `str` URL/path or an existing `DataFile`.
|
|
38
|
+
fs: Optional initialized filesystem to use. If provided, `storage_options` is ignored.
|
|
39
|
+
storage_options: Optional fsspec filesystem init options (used only when `fs` is not provided).
|
|
40
|
+
"""
|
|
41
|
+
if isinstance(data, cls):
|
|
42
|
+
return data
|
|
43
|
+
|
|
44
|
+
# simple string url/path
|
|
45
|
+
if isinstance(data, str):
|
|
46
|
+
if fs is not None:
|
|
47
|
+
# Best-effort strip protocol so `.path` is in the form expected by `fs.open/fs.exists`.
|
|
48
|
+
path = fs._strip_protocol(data) # type: ignore[attr-defined]
|
|
49
|
+
return cls(fs=fs, path=path)
|
|
50
|
+
|
|
51
|
+
next_fs, path = url_to_fs(
|
|
52
|
+
data, **cast(Mapping[str, Any], storage_options or {})
|
|
53
|
+
)
|
|
54
|
+
return cls(fs=next_fs, path=path)
|
|
55
|
+
|
|
56
|
+
raise TypeError("DataFileLike must be: str | DataFile")
|
|
57
|
+
|
|
58
|
+
def open(self, mode: str = "rt", **kwargs):
|
|
59
|
+
return self.fs.open(self.path, mode=mode, **kwargs)
|
|
60
|
+
|
|
61
|
+
def exists(self) -> bool:
|
|
62
|
+
return self.fs.exists(self.path)
|
|
63
|
+
|
|
64
|
+
@property
|
|
65
|
+
def is_local(self) -> bool:
|
|
66
|
+
return isinstance(self.fs, LocalFileSystem)
|
|
67
|
+
|
|
68
|
+
def __str__(self) -> str:
|
|
69
|
+
try:
|
|
70
|
+
return self.fs.unstrip_protocol(self.path)
|
|
71
|
+
except Exception:
|
|
72
|
+
return self.path
|