dftly 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dftly/__init__.py +13 -0
- dftly/grammar.lark +88 -0
- dftly/nodes.py +147 -0
- dftly/parser.py +588 -0
- dftly/polars.py +237 -0
- dftly-0.0.1.dist-info/METADATA +807 -0
- dftly-0.0.1.dist-info/RECORD +10 -0
- dftly-0.0.1.dist-info/WHEEL +5 -0
- dftly-0.0.1.dist-info/licenses/LICENSE +21 -0
- dftly-0.0.1.dist-info/top_level.txt +1 -0
dftly/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""dftly - DataFrame Transformation Language parser."""
|
|
2
|
+
|
|
3
|
+
from .nodes import Column, Expression, Literal
|
|
4
|
+
from .parser import Parser, from_yaml, parse
|
|
5
|
+
|
|
6
|
+
# Names re-exported as the package's public API: the node classes imported
# from .nodes, followed by the parser class and helpers imported from .parser.
__all__ = ["Column", "Expression", "Literal", "Parser", "parse", "from_yaml"]
|
dftly/grammar.lark
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
// Terminal (token) definitions for the dftly expression grammar.
// A ".N" suffix on a terminal name is a Lark terminal priority; a trailing
// "i" on a string or regex makes the match case-insensitive.

// Whitespace is imported from Lark's common library and skipped everywhere.
%import common.WS
%ignore WS
// Single- or double-quoted string; a backslash escapes the next character.
STRING: /'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"/

// Operator symbols and the keywords used by casts and conditionals.
PLUS: "+"
MINUS: "-"
AT: "@"
AS: /as/i
IF: /if/i
ELSE: /else/i
// Symbolic boolean operators.
AND_SYM: "&&"
OR_SYM: "||"
NOT_SYM: "!"
// "not" + whitespace + "match" is lexed as one token.
NOT_MATCH.2: /not\s+match/i
// Word forms of the boolean operators, given priority 2.
AND.2: "and"i
OR.2: "or"i
NOT.2: "not"i
// Identifier: a letter or underscore, then letters, digits or underscores.
NAME: /[A-Za-z_][A-Za-z0-9_]*/
IN: /in/i
// Unsigned integer or decimal; no sign and no exponent form.
NUMBER: /\d+(?:\.\d+)?/
// Unquoted regex operands: a parenthesised run containing no whitespace, or
// a run of characters that are neither whitespace nor parentheses.
REGEX_PAREN_TOKEN.2: /\([^\s]+\)/
REGEX_TOKEN: /[^\s()]+/
LPAR: "("
RPAR: ")"
// Keywords of the regex extract / match forms, all with priority 2.
EXTRACT.2: /extract/i
GROUP.2: /group/i
OF.2: /of/i
FROM.2: /from/i
MATCH.2: /match/i
AGAINST.2: /against/i
|
|
31
|
+
|
|
32
|
+
// Parser rules. Nesting runs from loosest to tightest binding:
// conditional, or, and, not, "in" membership, "+" / "-", "@", primary.
// A leading "?" makes Lark inline the rule when it has a single child, and
// "-> name" sets the name of the tree node produced by that alternative.
start: expr

?expr: conditional

// "<value> if <condition> else <value>"; the else branch may itself be a
// conditional because it recurses through expr.
conditional: bool_expr IF bool_expr ELSE expr -> ifexpr
    | bool_expr

// Left-recursive boolean operators; each accepts the word or symbol form.
?bool_expr: bool_expr (OR|OR_SYM) bool_term -> or_expr
    | bool_term

?bool_term: bool_term (AND|AND_SYM) bool_factor -> and_expr
    | bool_factor

?bool_factor: (NOT|NOT_SYM) bool_factor -> not_expr
    | in_expr

// Membership test against a "{...}" set literal or a two-endpoint range.
?in_expr: additive IN set_literal -> value_in_set
    | additive IN range_literal -> value_in_range
    | additive

// The PLUS / MINUS alternatives carry no alias, so their node is "additive".
?additive: additive PLUS multiplicative
    | additive MINUS multiplicative
    | multiplicative

// Despite its name, this level only defines the "@" operator.
?multiplicative: multiplicative AT unary -> resolve_ts
    | unary

// No unary operators are defined at this level; it passes through to primary.
?unary: primary

// Atoms. The two "NAME AS ..." forms are told apart by the right-hand
// operand: a STRING gives parse_as_format, a NAME gives cast.
primary: call_expr
    | regex_extract
    | regex_match
    | NAME AS STRING -> parse_as_format
    | NAME AS NAME -> cast
    | NUMBER -> number
    | STRING -> string
    | NAME -> name
    | group

group: "(" expr ")" -> paren_expr

// Function call with an optional comma-separated argument list.
call_expr: NAME "(" [args] ")" -> func
args: expr ("," expr)* -> arg_list

set_literal: "{" [args] "}" -> literal_set

// The alias is chosen by the bracket pair. NOTE(review): presumably "[" / "]"
// mark an inclusive endpoint and "(" / ")" an exclusive one - confirm
// against the transformer.
range_literal: "[" expr "," expr "]" -> range_inc
    | "[" expr "," expr ")" -> range_ie
    | "(" expr "," expr "]" -> range_ei
    | "(" expr "," expr ")" -> range_exc

// "extract [group N of] <regex> from <expr>"; the group clause is optional.
regex_extract: EXTRACT (GROUP NUMBER OF)? regex FROM expr
// Both the MATCH and the NOT_MATCH form produce a regex_match node.
regex_match: MATCH regex AGAINST expr -> regex_match
    | NOT_MATCH regex AGAINST expr -> regex_match
// A regex operand is one of the two unquoted token forms or a quoted STRING.
regex: REGEX_PAREN_TOKEN
    | REGEX_TOKEN
    | STRING
|
dftly/nodes.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import (
|
|
5
|
+
Any,
|
|
6
|
+
ClassVar,
|
|
7
|
+
Dict,
|
|
8
|
+
List,
|
|
9
|
+
Mapping,
|
|
10
|
+
Optional,
|
|
11
|
+
TYPE_CHECKING,
|
|
12
|
+
Union,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING: # pragma: no cover - imported for type checking only
|
|
16
|
+
from .parser import Parser
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class NodeBase:
    """Shared validation and construction helpers for node dataclasses.

    Subclasses set ``KEY`` to the single top-level key of their mapping form
    and may override ``_validate_map`` to normalise the payload found under
    that key before it reaches the constructor.
    """

    KEY: ClassVar[str]

    @staticmethod
    def _validate_keys(
        mapping: Mapping[str, Any],
        allowed: set[str],
        *,
        label: str,
        required: Optional[set[str]] = None,
    ) -> None:
        """Check the keys of ``mapping`` against ``allowed`` and ``required``.

        Args:
            mapping: The mapping whose keys are checked.
            allowed: Every key that may appear in ``mapping``.
            label: Noun used in the error messages.
            required: Keys that must appear; ``None`` means no key is required.

        Raises:
            ValueError: If ``mapping`` has a key outside ``allowed`` (checked
                first) or lacks a key listed in ``required``.
        """
        present = set(mapping)
        unexpected = present - allowed
        if unexpected:
            raise ValueError(f"invalid {label} keys: {unexpected}")
        absent = (required or set()) - present
        if absent:
            raise ValueError(f"{label} missing required keys: {absent}")

    @classmethod
    def _validate_map(cls, value: Any, **kwargs: Any) -> Any:
        """Return ``value`` unchanged; subclasses override this to validate it."""
        return value

    @classmethod
    def from_mapping(cls, mapping: Mapping[str, Any], **kwargs: Any) -> "NodeBase":
        """Build a node from its ``{KEY: payload}`` mapping form.

        ``mapping`` must contain exactly the key ``cls.KEY``. The payload is
        passed through ``_validate_map`` together with ``kwargs``. A mapping
        result is expanded as keyword arguments to the constructor; any other
        result is passed as the single positional argument.

        Raises:
            ValueError: If ``mapping`` has extra keys or lacks ``cls.KEY``.
        """
        key = cls.KEY
        cls._validate_keys(mapping, {key}, label=f"{key} mapping", required={key})
        payload = cls._validate_map(mapping[key], **kwargs)
        return cls(**payload) if isinstance(payload, Mapping) else cls(payload)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class Literal(NodeBase):
    """Node holding a constant value.

    NOTE(review): ``NodeBase.from_mapping`` expands a mapping payload as
    keyword arguments, so ``{"literal": {...}}`` is not stored as ``value``
    as-is - confirm that this is intended for mapping-valued literals.
    """

    KEY: ClassVar[str] = "literal"

    # The wrapped constant; no type restriction is applied to it here.
    value: Any

    def to_dict(self) -> Dict[str, Any]:
        """Return the mapping form ``{"literal": value}``."""
        return dict(literal=self.value)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class Column(NodeBase):
    """Node referring to a dataframe column by name, with an optional type."""

    KEY: ClassVar[str] = "column"

    name: str
    type: Optional[str] = None

    def __post_init__(self) -> None:
        """Reject a non-string ``name`` and a ``type`` that is neither ``None`` nor a string.

        Raises:
            TypeError: If either field has the wrong type.
        """
        if not isinstance(self.name, str):
            raise TypeError("column name must be a string")
        if not (self.type is None or isinstance(self.type, str)):
            raise TypeError("column type must be a string")

    def to_dict(self) -> Dict[str, Any]:
        """Return the mapping form; ``"type"`` is omitted when it is ``None``."""
        if self.type is None:
            return {"column": {"name": self.name}}
        return {"column": {"name": self.name, "type": self.type}}

    @classmethod
    def _validate_map(
        cls,
        value: Any,
        *,
        input_schema: Optional[Mapping[str, Optional[str]]] = None,
    ) -> Mapping[str, Any]:
        """Normalise a column payload to ``{"name": ..., "type": ...}``.

        Args:
            value: Either the column name as a string, or a mapping with a
                required ``"name"`` key and an optional ``"type"`` key.
            input_schema: Optional name-to-type lookup used to fill in the
                type when the payload does not provide one.

        Raises:
            TypeError: If ``value`` is neither a string nor a mapping.
            ValueError: If a mapping payload has unknown keys or no ``"name"``.
        """
        if isinstance(value, str):
            column_name, declared = value, None
        elif isinstance(value, Mapping):
            cls._validate_keys(
                value, {"name", "type"}, label="column", required={"name"}
            )
            column_name, declared = value["name"], value.get("type")
        else:
            raise TypeError("column value must be a string or mapping")

        # An explicit type in the payload wins; the schema is only a fallback.
        if declared is None and input_schema is not None:
            declared = input_schema.get(column_name)
        return {"name": column_name, "type": declared}
|
|
111
|
+
raise TypeError("column value must be a string or mapping")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class Expression(NodeBase):
    """Node for an expression: a type name plus its arguments."""

    KEY: ClassVar[str] = "expression"

    type: str
    arguments: Union[List[Any], Dict[str, Any]]

    def __post_init__(self) -> None:
        """Check that ``type`` is a string and ``arguments`` is a list or dict.

        Raises:
            TypeError: If either field has the wrong type.
        """
        if not isinstance(self.type, str):
            raise TypeError("expression type must be a string")
        if not isinstance(self.arguments, (list, dict)):
            raise TypeError("expression arguments must be list or dict")

    def to_dict(self) -> Dict[str, Any]:
        """Return the mapping form with ``"type"`` and ``"arguments"`` under ``"expression"``."""
        payload = {"type": self.type, "arguments": self.arguments}
        return {"expression": payload}

    @classmethod
    def _validate_map(
        cls,
        value: Any,
        *,
        parser: "Parser",
    ) -> Mapping[str, Any]:
        """Validate an expression payload and parse its arguments.

        Args:
            value: Mapping with a required ``"type"`` key and an optional
                ``"arguments"`` key, which defaults to an empty list.
            parser: Parser whose ``_parse_arguments`` method is applied to
                the raw arguments.

        Raises:
            TypeError: If ``value`` is not a mapping.
            ValueError: If ``value`` has unknown keys or no ``"type"``.
        """
        if not isinstance(value, Mapping):
            raise TypeError("expression value must be a mapping")
        cls._validate_keys(
            value, {"type", "arguments"}, label="expression", required={"type"}
        )
        return {
            "type": value["type"],
            "arguments": parser._parse_arguments(value.get("arguments", [])),
        }
|