llparse 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llparse/constants.py ADDED
@@ -0,0 +1,48 @@
1
# String constants shared by the code generators in this package.
CONTAINER_KEY = "c"
LABEL_PREFIX = ""
STATE_PREFIX = "s_n_"
STATE_ERROR = "s_error"
BLOB_PREFIX = "llparse_blob"
ARG_STATE = "state"
ARG_POS = "p"
ARG_ENDPOS = "endp"
VAR_MATCH = "match"

# MatchSequence

SEQUENCE_COMPLETE = "kMatchComplete"
SEQUENCE_MISMATCH = "kMatchMismatch"
SEQUENCE_PAUSE = "kMatchPause"


# Each entry maps an integer type name ("i8" .. "i64") to a (min, max) pair
# of C literal strings.
# I thought it might be a little bit faster to use tuples instead of lists - Vizonex
SIGNED_LIMITS: dict[str, tuple[str, str]] = {
    "i8": ("-0x80", "0x7f"),
    "i16": ("-0x8000", "0x7fff"),
    "i32": ("(-0x7fffffff - 1)", "0x7fffffff"),
    "i64": ("(-0x7fffffffffffffffLL - 1)", "0x7fffffffffffffffLL"),
}

# TODO (Vizonex) : Propose changes to the llparse
# typescript program, which sets "i8" twice,
# which I believe is an error and a mistake
UNSIGNED_LIMITS: dict[str, tuple[str, str]] = {
    "i8": ("0", "0xff"),
    "i16": ("0", "0xffff"),
    "i32": ("0", "0xffffffff"),
    "i64": ("0ULL", "0xffffffffffffffffULL"),
}

# C type names for the unsigned interpretation of each integer type.
# These must be the ``uintN_t`` types: the maxima in UNSIGNED_LIMITS
# (e.g. 0xff for "i8") do not fit in the corresponding signed type.
UNSIGNED_TYPES: dict[str, str] = {
    "i8": "uint8_t",
    "i16": "uint16_t",
    "i32": "uint32_t",
    "i64": "uint64_t",
}

# C type names for the signed interpretation of each integer type.
SIGNED_TYPES: dict[str, str] = {
    "i8": "int8_t",
    "i16": "int16_t",
    "i32": "int32_t",
    "i64": "int64_t",
}
@@ -0,0 +1,311 @@
1
+ """Used to build apis and more from llparse WARNING: This may or may not be stable yet!!!"""
2
+
3
+ from contextlib import contextmanager
4
+ from typing import Optional
5
+ from .frontend import IFrontendResult
6
+ from .pyfront.front import Match
7
+ from .pyfront.nodes import Invoke
8
+
9
+ # Inspired by Cython's Writer
10
+
11
+ # You will notice many similarities
12
+ # because there was no way for me to optimize the
13
+ # originals further than what was given by Cython itself.
14
+ # This will be used to help me write my own custom
15
+ # Finite Machine Parts
16
+
17
+ # Template for the C macro CALLBACK_MAYBE written into the generated API source.
+ # It contains two "%s" placeholders; ApiCompiler.build_C fills both with the
+ # public prefix through CodeWriter.putline_with_format.
+ # The expanded macro assigns to a variable named `err`, so the code that
+ # expands it has to declare `err` first (the generated callbacks emit "int err;").
+ VA_ARGS_CALLBACK = """#define CALLBACK_MAYBE(PARSER, NAME, ...) \\
18
+ do { \\
19
+ %s_settings_t* settings; \\
20
+ settings = (%s_settings_t*) (PARSER)->settings; \\
21
+ if (settings == NULL || settings->NAME == NULL) { \\
22
+ err = 0; \\
23
+ break; \\
24
+ } \\
25
+ err = settings->NAME(__VA_ARGS__); \\
26
+ } while (0)"""
27
+
28
+
29
class LineWriter:
    """Accumulate text fragments into a list of finished lines.

    ``s`` holds the line currently being assembled and ``lines`` holds every
    line that has already been terminated with :meth:`newline`.
    """

    def __init__(self) -> None:
        self.lines: list[str] = []
        self.s = ""

    def put(self, s: str):
        """Append ``s`` to the pending line without terminating it."""
        self.s = self.s + s

    def newline(self):
        """Move the pending line into ``lines`` and start a new empty one."""
        finished, self.s = self.s, ""
        self.lines.append(finished)

    def putline(self, s: str):
        """Append ``s`` to the pending line and terminate it right away."""
        self.put(s)
        self.newline()
44
+
45
+
46
class CodeWriter:
    """Used as a baseplate for writing code...

    Lines are collected in a ``LineWriter`` (``self.lw``). Methods that start
    a line prefix it with ``line_indent`` repeated once per active
    :meth:`indent` level.
    """

    # Text emitted once per indentation level; subclasses may override it.
    line_indent: str = " "

    def __init__(self) -> None:
        # Current nesting depth, i.e. how many ``indent()`` blocks are open.
        self._indentures = 0
        self.lw = LineWriter()

    def __indent(self):
        self._indentures += 1

    def __dedent(self):
        self._indentures -= 1

    @contextmanager
    def indent(self):
        """Used to mirror/mimic programming with indentures and to make everything cleaner and easier to read

        The indentation level is raised for the duration of the ``with``
        block and is always restored afterwards, even when the block raises.
        """
        self.__indent()
        try:
            yield
        finally:
            # Without this, an exception inside the block would leave every
            # later line of this writer over-indented.
            self.__dedent()

    def startline(self, s: str):
        """Begin a line: write the current indentation and ``s`` without terminating the line."""
        self.lw.put((self._indentures * self.line_indent) + s)

    def put(self, s: str):
        """Append ``s`` to the pending line as-is (no indentation added)."""
        self.lw.put(s)

    def putline_with_format(self, s: str, *args):
        """Makes a cleaner format than what would've been used to workaround formatting with curly brackets `{}`

        ``s`` is a printf-style template; ``args`` are applied with ``%``
        before the indented line is written.
        """
        self.lw.putline((self._indentures * self.line_indent) + s % args)

    def putline(self, s: str):
        """Write ``s`` as a complete line prefixed with the current indentation."""
        self.lw.putline((self._indentures * self.line_indent) + s)

    def endline(self, s: str = ""):
        """Append ``s`` (no indentation) to the pending line and terminate it.

        Called with no argument on an empty pending line this emits a blank line.
        """
        self.lw.putline(s)

    @property
    def lines(self) -> list[str]:
        """All lines terminated so far."""
        return self.lw.lines

    @property
    def code(self) -> str:
        """The terminated lines joined with newline characters."""
        return "\n".join(self.lines)
91
+
92
+
93
class CythonWriter(CodeWriter):
    """Code writer variant used when emitting Cython sources (coming soon...)."""

    # Text emitted once per indentation level.
    line_indent: str = " "
97
+
98
+
99
+ # TODO Vizonex Maybe see if Indutny would like to use a special codewriter to
100
+ # help with building llparse's c code in typescript It would be less prone to
101
+ # compile-time errors
102
+
103
+
104
+ class MainCompiler:
105
+ """Used to Create APIS like those seen in llhttp"""
106
+
+ # Holds an IFrontendResult plus two sets of callback names that
+ # get_user_callbacks() fills in: data_cb and cb.
107
+ def __init__(self, info: IFrontendResult) -> None:
108
+ self.data_cb: set[str] = set()
109
+ """Used to identify span related Callbacks"""
110
+ self.cb: set[str] = set()
111
+ """Used to identify match callbacks that are use-handled"""
112
+ self.info = info
113
+
+ # Populates self.data_cb and self.cb in place; nothing is returned.
114
+ def get_user_callbacks(self):
+ # Span callbacks: the name of every callback attached to every span.
115
+ for s in self.info.spans:
116
+ self.data_cb.update(cb.ref.name for cb in s.callbacks)
+ # Match callbacks: walk the slots built from each resumption target and
+ # record the code name whenever a slot points at an Invoke node whose
+ # code is a Match.
117
+ for s in self.info.resumptionTargets:
118
+ for slot in s.ref.buildSlots():
119
+ if isinstance(slot.node.ref, Invoke) and isinstance(
120
+ slot.node.ref.code.ref, Match
121
+ ):
122
+ self.cb.add(slot.node.ref.code.ref.name)
123
+ # Some nodes like to hide themselves inside other nodes, so the simplest solution is to repeat the same check one level deeper (on the slots of the slot's node)...
124
+ for _slot in slot.node.ref.getSlots():
125
+ if isinstance(_slot.node.ref, Invoke) and isinstance(
126
+ _slot.node.ref.code.ref, Match
127
+ ):
128
+ self.cb.add(_slot.node.ref.code.ref.name)
129
+
130
+
131
class ApiCompiler(MainCompiler):
    """Builds external api assuming that our prefix is an internal one

    ``self.info.prefix`` is treated as the prefix of the already generated
    parser and ``new_prefix`` as the prefix of the public wrapper. Three text
    artifacts can be produced: a C source (:meth:`build_C`), a C header
    (:meth:`build_H`) and a Cython ``.pxd`` (:meth:`build_pxd`).

    Callback names are always emitted in sorted order so that the settings
    struct has the same field order in every artifact and on every run.
    """

    def __init__(
        self, new_preifx: str, info: IFrontendResult, header_name: Optional[str] = None
    ) -> None:
        """Collect the user callbacks of ``info`` and remember the naming options.

        :param new_preifx: public prefix used for every generated symbol
            (the misspelled parameter name is kept for keyword callers).
        :param info: frontend result describing the parser.
        :param header_name: base name (without ``.h``) of the header that the
            C source includes and the ``.pxd`` refers to; defaults to the
            public prefix when omitted or empty.
        """
        self.new_prefix = new_preifx
        super().__init__(info)
        self.get_user_callbacks()
        self.header_name = header_name

    def _field_name(self, callback: str) -> str:
        """Return the settings-struct field name for a callback symbol.

        The internal prefix is removed from the front and any leading or
        trailing underscores are stripped.
        """
        return callback.removeprefix(self.info.prefix).strip("_")

    def build_C(self):
        """Return the C source text of the public API wrapper."""
        # I find using codewriters to be more elegant so we will be using that instead - Vizonex
        prefix = self.info.prefix
        writer = CodeWriter()
        writer.putline(
            f'#include <stdlib.h>\n#include <stdio.h>\n#include <string.h>\n#include "{self.header_name or self.new_prefix}.h"'
        )
        writer.endline()
        writer.putline("/* Inspired by llhttp */")
        writer.endline()
        writer.putline_with_format(VA_ARGS_CALLBACK, self.new_prefix, self.new_prefix)
        writer.endline()
        writer.endline()
        writer.putline(f"void {self.new_prefix}_init({self.new_prefix}_t* parser,")
        with writer.indent():
            writer.putline(f"const {self.new_prefix}_settings_t* settings) " + "{")
            writer.putline(f"{prefix}_init(parser);")
            writer.putline("parser->settings = (void*) settings;")
        writer.endline("}")
        writer.putline("/* Callbacks */")

        # This is where everything comes together and makes sense
        # Span (data) callbacks forward the parser, the start pointer and the length.
        for data in sorted(self.data_cb):
            writer.putline(
                f"int {data}({self.new_prefix}_t* s, const char* p, const char* endp) "
                + "{"
            )
            with writer.indent():
                writer.putline("int err;")
                writer.putline(
                    f"CALLBACK_MAYBE(s, {self._field_name(data)}, s, p, endp - p);"
                )
                writer.putline("return err;")
            writer.putline("}")
            writer.endline()
            writer.endline()

        # Plain match callbacks only forward the parser.
        for data in sorted(self.cb):
            writer.putline(
                f"int {data}({self.new_prefix}_t* s, const char* p, const char* endp) "
                + "{"
            )
            with writer.indent():
                writer.putline("int err;")
                writer.putline(f"CALLBACK_MAYBE(s, {self._field_name(data)}, s);")
                writer.putline("return err;")
            writer.putline("}")
            writer.endline()
            writer.endline()

        writer.putline(
            f"int {self.new_prefix}_execute({self.new_prefix}_t* parser, const char* data, size_t len) "
            + "{"
        )
        with writer.indent():
            writer.putline(f"return {prefix}_execute(parser, data, data + len);")
        writer.putline("}")

        # Reset Parser

        writer.putline(
            f"void {self.new_prefix}_settings_init({self.new_prefix}_settings_t* settings)"
            + " {"
        )
        writer.putline("\tmemset(settings, 0, sizeof(*settings));\n}")

        return writer.code

    def build_H(self, headerguard: Optional[str] = None):
        """Builds Headerfile api extensions...

        :param headerguard: stem of the include guard macro; defaults to the
            upper-cased public prefix when omitted or empty.
        :return: the header text.
        """
        writer = CodeWriter()
        headerguard = self.new_prefix.upper() if not headerguard else headerguard
        writer.putline(f"\n#ifndef {headerguard}_API_H_")
        writer.putline(f"#define {headerguard}_API_H_")
        writer.endline()
        writer.putline_with_format(
            "typedef %s_t %s_t;", self.info.prefix, self.new_prefix
        )
        writer.putline(
            f"typedef struct {self.new_prefix}_settings_s {self.new_prefix}_settings_t;"
        )
        writer.endline()
        writer.putline(
            f"typedef int (*{self.new_prefix}_data_cb)({self.new_prefix}_t*, const char *at, size_t length);"
        )
        writer.putline(f"typedef int (*{self.new_prefix}_cb)({self.new_prefix}_t*);")
        writer.endline()
        writer.putline(f"struct {self.new_prefix}_settings_s " + "{")
        with writer.indent():
            # Sets have no stable iteration order; sort so the struct layout
            # is reproducible and matches the order used by build_pxd().
            for data in sorted(self.data_cb):
                writer.putline(f"{self.new_prefix}_data_cb {self._field_name(data)};")
            for data in sorted(self.cb):
                writer.putline(f"{self.new_prefix}_cb {self._field_name(data)};")
        writer.putline("};")
        writer.endline()
        writer.putline(
            f"int {self.new_prefix}_execute({self.new_prefix}_t* parser, const char* data, size_t len);"
        )
        writer.putline(f"void {self.new_prefix}_init({self.new_prefix}_t* parser,")
        with writer.indent():
            writer.putline(f"const {self.new_prefix}_settings_t* settings);")
        writer.putline(
            f"void {self.new_prefix}_settings_init({self.new_prefix}_settings_t* settings);"
        )
        writer.putline(f"#endif /* {headerguard}_API_H_ */")
        return writer.code

    def build_pxd(self):
        """Return the Cython ``.pxd`` declarations matching the generated header.

        :raises Exception: when a property has a type other than
            ``i8``, ``i16``, ``i32``, ``i64`` or ``ptr``.
        """
        # Cython spelling of every supported property type.
        property_types = {
            "i8": "uint8_t",
            "i16": "uint16_t",
            "i32": "uint32_t",
            "i64": "uint64_t",
            "ptr": "void*",
        }
        writer = CythonWriter()

        writer.putline("#cython: language_level = 3")
        writer.endline()
        writer.putline("from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t")
        writer.endline()
        writer.putline("# Automatically generated in pyparse a parody of llparse")
        writer.putline(f'cdef extern from "{self.header_name or self.new_prefix}.h":')
        with writer.indent():
            writer.putline(f"struct {self.new_prefix}_t:")
            with writer.indent():
                for prop in self.info.properties():
                    if prop.ty not in property_types:
                        raise Exception(f'Unknown state property type: "{prop.ty}"')
                    writer.putline("%s %s" % (property_types[prop.ty], prop.name))
                writer.putline("void* data")

            writer.endline()
            writer.putline(
                f"ctypedef int (*{self.new_prefix}_data_cb)({self.new_prefix}_t*, const char *at, size_t length)"
            )
            writer.putline(
                f"ctypedef int (*{self.new_prefix}_cb)({self.new_prefix}_t*)"
            )
            writer.endline()
            writer.putline(f"struct {self.new_prefix}_settings_t:")
            with writer.indent():
                for data in sorted(self.data_cb):
                    writer.putline(f"{self.new_prefix}_data_cb {self._field_name(data)}")
                for data in sorted(self.cb):
                    writer.putline(f"{self.new_prefix}_cb {self._field_name(data)}")
            writer.endline()
            writer.putline(
                f"int {self.new_prefix}_execute({self.new_prefix}_t* parser, const char* data, size_t len)"
            )
            writer.putline(
                f"void {self.new_prefix}_init({self.new_prefix}_t* parser, const {self.new_prefix}_settings_t* settings)"
            )
            # build_C()/build_H() also define and declare this function, so
            # Cython code needs the declaration to be able to call it.
            writer.putline(
                f"void {self.new_prefix}_settings_init({self.new_prefix}_settings_t* settings)"
            )
            writer.endline()
        return writer.code
llparse/debug.py ADDED
@@ -0,0 +1,23 @@
1
+ from .pybuilder import Node
2
+
3
+
4
class Debugger:
    """Debugging helper that walks a node graph and prints what it visits."""

    @staticmethod
    def getAllNodes(root: "Node"):
        """Collect every node reachable from ``root`` through ``getEdges()``.

        The name of each visited node is printed. ``root`` itself is only part
        of the result when some edge leads back to it.

        NOTE(review): the extra print for a node named "nmethods" looks like
        leftover ad-hoc debugging - confirm before removing.

        :return: list of the discovered nodes, in no particular order.
        """
        discovered: set[Node] = set()
        pending: list[Node] = [root]

        while pending:
            current = pending.pop()
            print(current.name)
            if current.name == "nmethods":
                print(current.getEdges())

            outgoing = current.getEdges()
            if not outgoing:
                continue

            for slot in outgoing:
                target = slot.node
                if target not in discovered:
                    discovered.add(target)
                    pending.append(target)

        return list(discovered)
llparse/dot.py ADDED
@@ -0,0 +1,213 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Union
3
+
4
+ from .pybuilder.main_code import Edge, Node
5
+
6
+ # TODO: Fix all graphs and more It's currently broken...
7
+
8
+ COLOR_ADVANCE = "black"
9
+ COLOR_NO_ADVANCE = "blue"
10
+ COLOR_INVOKE = "green"
11
+ COLOR_OTHERWISE = "red"
12
+
13
+
14
class Dot:
    """Used to create a graphviz of your parser

    Nodes are expected to be iterable (yielding their edges), to expose
    ``name`` and ``getOtherwiseEdge()``; edges expose ``node``, ``key`` and
    ``noAdvance``.
    """

    def __init__(self) -> None:
        # Escaped identifier already assigned to each node.
        self.idCache: dict[Node, str] = {}
        # Unescaped identifiers handed out so far; used to keep them unique.
        self.ns: set[str] = set()

    def dump_to_file(self, filename: "Union[str, Path]", root: "Node"):
        """Write the DOT text for the graph rooted at ``root`` to ``filename``."""
        # Use a context manager so the handle is closed (and flushed) even on
        # interpreters without reference counting.
        with open(filename, "w") as stream:
            stream.write(self.build(root))

    def build(self, root: "Node"):
        """Return the complete ``digraph`` text for every node reachable from ``root``."""
        parts = ["digraph {\n", ' concentrate="true"\n']
        parts.extend(self.buildNode(node) for node in self.enumerateNodes(root))
        parts.append("}\n")
        return "".join(parts)

    def enumerateNodes(self, root: "Node"):
        """Return the set of nodes reachable from ``root`` (``root`` included)."""
        queue = [root]
        seen: set[Node] = set()

        while queue:
            node = queue.pop()
            if node in seen:
                continue

            seen.add(node)

            for edge in node:
                queue.append(edge.node)

            otherwise = node.getOtherwiseEdge()
            if otherwise:
                queue.append(otherwise.node)

        return seen

    def buildNode(self, node: "Node"):
        """Return the DOT edge statements for all outgoing edges of ``node``."""
        edges = list(node)
        otherwise = node.getOtherwiseEdge()
        if otherwise:
            edges.append(otherwise)

        # Edges grouped by target node, split by whether they advance.
        advance: dict[Node, list[Edge]] = {}
        noAdvance: dict[Node, list[Edge]] = {}

        for edge in edges:
            targets = noAdvance if edge.noAdvance else advance
            targets.setdefault(edge.node, []).append(edge)

        return self.buildEdgeMap(node, advance, "advance") + self.buildEdgeMap(
            node, noAdvance, "noAdvance"
        )

    @staticmethod
    def _single_key(key):
        """Return the one element of a single-element edge key.

        ``bytes`` and ``list`` keys yield their first item, ``str`` keys the
        first byte of their encoded form; anything else is returned unchanged.
        """
        if isinstance(key, (bytes, list)):
            return key[0]
        if isinstance(key, str):
            return key.encode()[0]
        return key

    def buildEdgeMap(self, node: "Node", Map: "dict[Node, list[Edge]]", kind: str):
        """Return one DOT edge statement per target in ``Map``.

        :param node: source node of every edge in ``Map``.
        :param Map: edges grouped by target node.
        :param kind: ``"noAdvance"`` selects the no-advance colour; any other
            value selects the advance colour.
        """
        res = ""
        for target, edges in Map.items():
            otherwise: list[Edge] = []
            single: list[Edge] = []
            sequence: list[Edge] = []
            code: list[Edge] = []

            for edge in edges:
                key = edge.key
                # Integer keys are tested first: a truthiness test would
                # misfile the perfectly valid invoke code 0 as "otherwise".
                if isinstance(key, int):
                    code.append(edge)
                elif not key:
                    otherwise.append(edge)
                elif len(key) == 1:
                    single.append(edge)
                else:
                    sequence.append(edge)
            labels: list[str] = []

            # end:int node:Node start:int
            ranges: list[dict[str, Union[int, Node]]] = []

            firstKey: Optional[int] = None
            lastKey: Optional[int] = None

            # Collapse runs of consecutive single-character keys into ranges.
            for edge in single:
                key = self._single_key(edge.key)

                # Compare against None explicitly so that byte 0 can start
                # or extend a range.
                if lastKey is not None and lastKey == key - 1:
                    lastKey = key
                    continue

                if lastKey is not None:
                    ranges.append({"start": firstKey, "end": lastKey, "node": target})

                firstKey = key
                lastKey = key

            if lastKey is not None:
                ranges.append({"start": firstKey, "end": lastKey, "node": target})

            for _range in ranges:
                labels.append(self.buildRangeLabel(node, _range))

            for edge in sequence:
                labels.append(self.buildEdgeLabel(node, edge))

            for edge in code:
                labels.append(self.buildInvokeLabel(node, edge))

            for edge in otherwise:
                labels.append(self.buildOtherwiseLabel(node, edge))

            color = COLOR_NO_ADVANCE if kind == "noAdvance" else COLOR_ADVANCE
            label = "|".join(labels)

            res += (
                f' "{self.id(node)}" -> "{self.id(target)}"'
                f'[label="{label}" color="{color}" decorate=true];\n'
            )

        return res

    def buildRangeLabel(self, node: "Node", _range: "dict[str, Union[int, Node]]"):
        """Return ``start`` or ``start:end`` for a range of character codes."""
        start = self.buildChar(_range["start"])
        end = self.buildChar(_range["end"])
        return start if _range["start"] == _range["end"] else f"{start}:{end}"

    def buildEdgeLabel(self, node: "Node", edge: "Edge"):
        """Return the label of a multi-character (sequence) edge."""
        return f"{self.buildBuffer(edge.key)}"

    def buildInvokeLabel(self, node: "Node", edge: "Edge"):
        """Return the label of an edge keyed by an integer code."""
        return f"code={int(edge.key)}"

    def buildOtherwiseLabel(self, node: "Node", edge: "Edge"):
        """Return ``otherwise`` for a no-advance edge and ``skipTo`` for the rest."""
        return "otherwise" if edge.noAdvance else "skipTo"

    def buildChar(self, code: int):
        """Return a label fragment for one character code.

        Newline, carriage return and tab are shown as escape sequences,
        printable ASCII as the character itself, everything else as hex.
        Non-integer input is converted with ``ord`` first.
        """
        if not isinstance(code, int):
            code = ord(code)
        if code == 0x0A:
            return self.escape("'\\n'")
        if code == 0x0D:
            return self.escape("'\\r'")
        if code == 0x09:
            return self.escape("'\\t'")

        if 0x20 <= code and code <= 0x7E:
            return self.escape(chr(code))
        # I Don't know how accurate this is but it was worth a shot
        res = hex(code)
        return res

    def buildBuffer(self, buffer: bytes):
        """Return ``buffer`` as a single-quoted label fragment.

        ``bytes`` input is decoded first; ``str`` input is used as-is.
        Backslashes are escaped before the control characters so that the
        escapes added here are not escaped a second time, and double quotes
        are escaped because the fragment ends up inside a double-quoted DOT
        attribute.
        """
        s = buffer.decode() if isinstance(buffer, bytes) else buffer
        escaped = (
            s.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\t", "\\t")
            .replace("\r", "\\r")
        )
        return "'" + escaped + "'"

    def id(self, node: "Node"):
        """Return the escaped, unique DOT identifier of ``node``.

        The first node with a given name keeps that name; later nodes with
        the same name receive the first unused ``_<n>`` suffix. Results are
        cached so a node always maps to the same identifier.
        """
        cached = self.idCache.get(node)
        if cached is not None:
            return cached

        res = node.name
        if res in self.ns:
            # Search for the first suffix that is NOT taken yet; stopping on
            # a taken one would hand the same identifier to two nodes.
            suffix = 0
            while f"{res}_{suffix}" in self.ns:
                suffix += 1
            res = f"{res}_{suffix}"

        self.ns.add(res)
        res = self.escape(res)
        self.idCache[node] = res
        return res

    def escape(self, value: str):
        """Return ``value`` wrapped in single quotes with ``\\`` and ``"`` backslash-escaped."""
        return "'" + value.replace("\\", "\\\\").replace('"', '\\"') + "'"
212
+
213
+ # TODO FIX ALL BUFFERS BACK TO STRINGS!
llparse/enumerator.py ADDED
@@ -0,0 +1,20 @@
1
+ from .pyfront.front import IWrap
2
+ from .pyfront.nodes import Node
3
+
4
+
5
class Enumerator:
    """Collects the wrapped nodes reachable from a starting node."""

    @staticmethod
    def getAllNodes(root: "IWrap[Node]"):
        """Return every node reachable from ``root`` through ``ref.getSlots()``.

        ``root`` itself is only included when some slot leads back to it.
        The result is a list in no particular order.
        """
        discovered: set[IWrap[Node]] = set()
        pending: list[IWrap[Node]] = [root]

        while pending:
            current = pending.pop()
            for slot in current.ref.getSlots():
                child = slot.node
                if child not in discovered:
                    discovered.add(child)
                    pending.append(child)

        return list(discovered)
20
+ return list(nodes)