boho 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
boho-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GregorBokal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
boho-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.4
2
+ Name: boho
3
+ Version: 0.1.0
4
+ Summary: A self-hosting parser generator with a modal lexer and LR(1) parser
5
+ Author-email: Gregor Bokal <gregor.bokal@gimb.org>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/GregorBokal/boho-parser-generator?tab=readme-ov-file
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Topic :: Software Development :: Compilers
10
+ Classifier: Topic :: Text Processing :: General
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: greenery>=4.2
15
+ Dynamic: license-file
16
+
17
+ # Boho
18
+
19
+ A self-hosting parser generator for Python. Define your grammar in a concise EBNF-based metalanguage and Boho will generate a modal lexer (DFA-based) and an LR(1) parser that produce a clean syntax tree.
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install boho
25
+ ```
26
+
27
+ ## Quick start
28
+
29
+ ```python
30
+ from boho import Boho
31
+ from boho.interpreter import Interpreter
32
+
33
+ grammar = '''
34
+ start: sum
35
+
36
+ sum: sum "+" prod
37
+ | prod
38
+
39
+ prod: prod "*" INT
40
+ | INT
41
+
42
+ INT: @INT
43
+
44
+ %ignore " "
45
+ '''
46
+
47
+ b = Boho(grammar)
48
+ tree = b("2 + 3 * 4")
49
+ print(tree.pretty())
50
+ ```
51
+
52
+ Output:
53
+
54
+ ```
55
+ start:
56
+ sum:
57
+ sum:
58
+ prod:
59
+ 'INT' '2'
60
+ prod:
61
+ prod:
62
+ 'INT' '3'
63
+ 'INT' '4'
64
+ ```
65
+
66
+ ### Writing an interpreter
67
+
68
+ Subclass `Interpreter` and define methods matching your nonterminal names:
69
+
70
+ ```python
71
+ class Calc(Interpreter):
72
+ def start(self, tree):
73
+ return self(tree[0])
74
+
75
+ def sum(self, tree):
76
+ return sum(self(c) for c in tree)
77
+
78
+ def prod(self, tree):
79
+ result = int(self(tree[0]))
80
+ for i in range(1, len(tree)):
81
+ result *= int(self(tree[i]))
82
+ return result
83
+
84
+ calc = Calc()
85
+ print(calc(tree)) # 14
86
+ ```
87
+
88
+ ## The Boho metalanguage
89
+
90
+ ### Terminal definitions
91
+
92
+ Terminals are named in `UPPER_CASE` and can be described three ways:
93
+
94
+ ```
95
+ PLUS: "+" // string literal
96
+ NUMBER: /\d+(\.\d+)?/ // regular expression
97
+ STRING: @STR // built-in description (@INT, @FLOAT, @STR)
98
+ ```
99
+
100
+ Terminal descriptions can also be used directly (unnamed) in grammar rules -- they will be pruned from the syntax tree.
101
+
102
+ Prefixing a terminal name with `_` (e.g. `_WHITESPACE`) prunes it from the tree despite being named.
103
+
104
+ ### Grammar rules
105
+
106
+ Nonterminals use `lower_case` names. Alternatives are separated with `|`:
107
+
108
+ ```
109
+ value: NAME | NUMBER
110
+ assignment: NAME "=" value
111
+ ```
112
+
113
+ EBNF extensions:
114
+
115
+ ```
116
+ items: item+ // one or more
117
+ list: (item ",")* item // zero or more (with grouping)
118
+ optional: modifier? // optional
119
+ ```
120
+
121
+ Inline aliases with `->`:
122
+
123
+ ```
124
+ expr: term "+" term -> addition
125
+ | term "-" term -> subtraction
126
+ ```
127
+
128
+ ### Fake terminals
129
+
130
+ A name like `COMMENT_` (uppercase ending with `_`) defines a fake terminal -- described like a nonterminal but collapsed into a single token. Useful for structures that regular expressions cannot describe (e.g. nested block comments).
131
+
132
+ ### Lexer modes
133
+
134
+ Following ANTLR's approach, a modal lexer is supported. Terminals before the first `#mode` belong to all modes.
135
+
136
+ ```
137
+ LBRACE: "{" -> +inner // push mode
138
+ RBRACE: "}" -> - // pop mode
139
+
140
+ #inner
141
+ CONTENT: /[^{}]+/
142
+ ```
143
+
144
+ | Syntax | Effect |
145
+ |------------|----------------------|
146
+ | `-> +mode` | push mode onto stack |
147
+ | `-> -` | pop one mode |
148
+ | `-> -N` | pop N modes |
149
+ | `-> --` | clear the stack |
150
+ | `-> mode` | replace top of stack |
151
+
152
+ ### Ignoring tokens
153
+
154
+ ```
155
+ %ignore " "
156
+ %ignore /\/\/[^\n]*/ // ignore line comments
157
+ ```
158
+
159
+ ## Project structure
160
+
161
+ ```
162
+ boho/
163
+ __init__.py # exports the Boho class
164
+ boho.py # main orchestrator
165
+ lexer.py # modal finite-automaton lexer
166
+ lexer_generator.py # terminal descriptions -> lexer DFA
167
+ parser.py # LR(1) shift-reduce parser
168
+ parser_generator.py # grammar -> LR(1) parse tables
169
+ grammar_interpreter.py # interprets the Boho metalanguage
170
+ interpreter.py # base Interpreter class (visitor pattern)
171
+ objects.py # Token, Tree, LR1Item dataclasses
172
+ regex.py # regex-to-DFA via greenery
173
+ grammars.py # pre-compiled Boho grammar tables
174
+ docs/ # English documentation
175
+ slo-dokumentacija/ # Slovenian documentation
176
+ examples/ # usage examples
177
+ tests/ # test suite
178
+ ```
179
+
180
+ ## How it works
181
+
182
+ 1. Your grammar string is parsed by Boho's own (bootstrapped) parser.
183
+ 2. Terminal descriptions are compiled into merged DFAs for a modal lexer.
184
+ 3. Grammar rules are compiled into LR(1) parse tables.
185
+ 4. At runtime, input text is tokenized by the lexer and then parsed into a `Tree` of `Token` leaves.
186
+
187
+ Boho is self-hosting -- its own metalanguage is specified in Boho (see `examples/boho_in_boho.py`).
188
+
189
+ ## API
190
+
191
+ ### `Boho(grammar, log=False)`
192
+
193
+ Create a parser from a grammar string. Set `log=True` to print the generated lexer and parser tables.
194
+
195
+ ### `boho(text, log=False) -> Tree`
196
+
197
+ Parse input text. Returns a `Tree` with `Token` leaves. Set `log=True` for step-by-step tracing.
198
+
199
+ ### `Tree`
200
+
201
+ - `tree.name` -- nonterminal name
202
+ - `tree.children` -- list of `Tree` / `Token` children
203
+ - `tree.value` -- concatenated text of all descendant tokens
204
+ - `tree.pretty()` -- indented string representation
205
+ - Supports iteration and indexing (`tree[0]`, `for child in tree`)
206
+
207
+ ### `Token`
208
+
209
+ - `token.name` -- terminal name
210
+ - `token.value` -- matched text
211
+ - `token.line`, `token.col` -- source location
212
+
213
+ ### `Interpreter`
214
+
215
+ Base class for tree walkers. Subclass it and define methods named after your nonterminals. The default behavior for unhandled nodes: tokens return their value, trees return a list of children's results.
216
+
217
+ ## Dependencies
218
+
219
+ - [greenery](https://github.com/qntm/greenery) -- regex-to-FSM conversion
220
+ - Python 3.10+ (uses `match` statements and `X | Y` type unions)
221
+
222
+ ## License
223
+
224
+ MIT
boho-0.1.0/README.md ADDED
@@ -0,0 +1,208 @@
1
+ # Boho
2
+
3
+ A self-hosting parser generator for Python. Define your grammar in a concise EBNF-based metalanguage and Boho will generate a modal lexer (DFA-based) and an LR(1) parser that produce a clean syntax tree.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install boho
9
+ ```
10
+
11
+ ## Quick start
12
+
13
+ ```python
14
+ from boho import Boho
15
+ from boho.interpreter import Interpreter
16
+
17
+ grammar = '''
18
+ start: sum
19
+
20
+ sum: sum "+" prod
21
+ | prod
22
+
23
+ prod: prod "*" INT
24
+ | INT
25
+
26
+ INT: @INT
27
+
28
+ %ignore " "
29
+ '''
30
+
31
+ b = Boho(grammar)
32
+ tree = b("2 + 3 * 4")
33
+ print(tree.pretty())
34
+ ```
35
+
36
+ Output:
37
+
38
+ ```
39
+ start:
40
+ sum:
41
+ sum:
42
+ prod:
43
+ 'INT' '2'
44
+ prod:
45
+ prod:
46
+ 'INT' '3'
47
+ 'INT' '4'
48
+ ```
49
+
50
+ ### Writing an interpreter
51
+
52
+ Subclass `Interpreter` and define methods matching your nonterminal names:
53
+
54
+ ```python
55
+ class Calc(Interpreter):
56
+ def start(self, tree):
57
+ return self(tree[0])
58
+
59
+ def sum(self, tree):
60
+ return sum(self(c) for c in tree)
61
+
62
+ def prod(self, tree):
63
+ result = int(self(tree[0]))
64
+ for i in range(1, len(tree)):
65
+ result *= int(self(tree[i]))
66
+ return result
67
+
68
+ calc = Calc()
69
+ print(calc(tree)) # 14
70
+ ```
71
+
72
+ ## The Boho metalanguage
73
+
74
+ ### Terminal definitions
75
+
76
+ Terminals are named in `UPPER_CASE` and can be described three ways:
77
+
78
+ ```
79
+ PLUS: "+" // string literal
80
+ NUMBER: /\d+(\.\d+)?/ // regular expression
81
+ STRING: @STR // built-in description (@INT, @FLOAT, @STR)
82
+ ```
83
+
84
+ Terminal descriptions can also be used directly (unnamed) in grammar rules -- they will be pruned from the syntax tree.
85
+
86
+ Prefixing a terminal name with `_` (e.g. `_WHITESPACE`) prunes it from the tree despite being named.
87
+
88
+ ### Grammar rules
89
+
90
+ Nonterminals use `lower_case` names. Alternatives are separated with `|`:
91
+
92
+ ```
93
+ value: NAME | NUMBER
94
+ assignment: NAME "=" value
95
+ ```
96
+
97
+ EBNF extensions:
98
+
99
+ ```
100
+ items: item+ // one or more
101
+ list: (item ",")* item // zero or more (with grouping)
102
+ optional: modifier? // optional
103
+ ```
104
+
105
+ Inline aliases with `->`:
106
+
107
+ ```
108
+ expr: term "+" term -> addition
109
+ | term "-" term -> subtraction
110
+ ```
111
+
112
+ ### Fake terminals
113
+
114
+ A name like `COMMENT_` (uppercase ending with `_`) defines a fake terminal -- described like a nonterminal but collapsed into a single token. Useful for structures that regular expressions cannot describe (e.g. nested block comments).
115
+
116
+ ### Lexer modes
117
+
118
+ Following ANTLR's approach, a modal lexer is supported. Terminals before the first `#mode` belong to all modes.
119
+
120
+ ```
121
+ LBRACE: "{" -> +inner // push mode
122
+ RBRACE: "}" -> - // pop mode
123
+
124
+ #inner
125
+ CONTENT: /[^{}]+/
126
+ ```
127
+
128
+ | Syntax | Effect |
129
+ |------------|----------------------|
130
+ | `-> +mode` | push mode onto stack |
131
+ | `-> -` | pop one mode |
132
+ | `-> -N` | pop N modes |
133
+ | `-> --` | clear the stack |
134
+ | `-> mode` | replace top of stack |
135
+
136
+ ### Ignoring tokens
137
+
138
+ ```
139
+ %ignore " "
140
+ %ignore /\/\/[^\n]*/ // ignore line comments
141
+ ```
142
+
143
+ ## Project structure
144
+
145
+ ```
146
+ boho/
147
+ __init__.py # exports the Boho class
148
+ boho.py # main orchestrator
149
+ lexer.py # modal finite-automaton lexer
150
+ lexer_generator.py # terminal descriptions -> lexer DFA
151
+ parser.py # LR(1) shift-reduce parser
152
+ parser_generator.py # grammar -> LR(1) parse tables
153
+ grammar_interpreter.py # interprets the Boho metalanguage
154
+ interpreter.py # base Interpreter class (visitor pattern)
155
+ objects.py # Token, Tree, LR1Item dataclasses
156
+ regex.py # regex-to-DFA via greenery
157
+ grammars.py # pre-compiled Boho grammar tables
158
+ docs/ # English documentation
159
+ slo-dokumentacija/ # Slovenian documentation
160
+ examples/ # usage examples
161
+ tests/ # test suite
162
+ ```
163
+
164
+ ## How it works
165
+
166
+ 1. Your grammar string is parsed by Boho's own (bootstrapped) parser.
167
+ 2. Terminal descriptions are compiled into merged DFAs for a modal lexer.
168
+ 3. Grammar rules are compiled into LR(1) parse tables.
169
+ 4. At runtime, input text is tokenized by the lexer and then parsed into a `Tree` of `Token` leaves.
170
+
171
+ Boho is self-hosting -- its own metalanguage is specified in Boho (see `examples/boho_in_boho.py`).
172
+
173
+ ## API
174
+
175
+ ### `Boho(grammar, log=False)`
176
+
177
+ Create a parser from a grammar string. Set `log=True` to print the generated lexer and parser tables.
178
+
179
+ ### `boho(text, log=False) -> Tree`
180
+
181
+ Parse input text. Returns a `Tree` with `Token` leaves. Set `log=True` for step-by-step tracing.
182
+
183
+ ### `Tree`
184
+
185
+ - `tree.name` -- nonterminal name
186
+ - `tree.children` -- list of `Tree` / `Token` children
187
+ - `tree.value` -- concatenated text of all descendant tokens
188
+ - `tree.pretty()` -- indented string representation
189
+ - Supports iteration and indexing (`tree[0]`, `for child in tree`)
190
+
191
+ ### `Token`
192
+
193
+ - `token.name` -- terminal name
194
+ - `token.value` -- matched text
195
+ - `token.line`, `token.col` -- source location
196
+
197
+ ### `Interpreter`
198
+
199
+ Base class for tree walkers. Subclass it and define methods named after your nonterminals. The default behavior for unhandled nodes: tokens return their value, trees return a list of children's results.
200
+
201
+ ## Dependencies
202
+
203
+ - [greenery](https://github.com/qntm/greenery) -- regex-to-FSM conversion
204
+ - Python 3.10+ (uses `match` statements and `X | Y` type unions)
205
+
206
+ ## License
207
+
208
+ MIT
@@ -0,0 +1 @@
1
+ from .boho import Boho
@@ -0,0 +1,38 @@
1
+ from .grammar_interpreter import interpret
2
+ from .lexer import Lexer
3
+ from .parser import Parser
4
+ from .lexer_generator import generate as prepare_lex
5
+ from .parser_generator import generate as prepare_pars
6
+
7
+
8
+ class Boho:
9
+
10
+ def __init__(self, grammar: str, log: bool = False):
11
+ (
12
+ self.tokens,
13
+ self.grammar,
14
+ self.ignore_dict
15
+ ) = interpret(grammar)
16
+ self.lex_table = prepare_lex(self.tokens, log=log)
17
+ self.pars_table = prepare_pars(self.grammar, log=log)
18
+ for mode in self.ignore_dict:
19
+ for description in self.ignore_dict[mode]:
20
+ if (
21
+ (len(description) > 4) or
22
+ (description[0] not in '\'"') or
23
+ (len(description) == 4 and description[1] != '\\')
24
+ ):
25
+ if '' in self.pars_table:
26
+ self.pars_table[''].append(description)
27
+ else:
28
+ self.pars_table[''] = [description]
29
+ continue
30
+ if '' in self.lex_table[mode]:
31
+ self.lex_table[mode][''].append(description[1:-1])
32
+ else:
33
+ self.lex_table[mode][''] = [description[1:-1]]
34
+ self.lexer = Lexer(self.lex_table)
35
+ self.parser = Parser(self.pars_table)
36
+
37
+ def __call__(self, text: str, log: bool = False):
38
+ return self.parser(self.lexer(text, log), log)
@@ -0,0 +1,167 @@
1
+ from .grammars import boho_grammar
2
+ from .lexer import Lexer
3
+ from .parser import Parser
4
+ from .interpreter import Interpreter
5
+ from .objects import (
6
+ TerminalList,
7
+ TerminalDescription as Terminals,
8
+ Grammar,
9
+ Tree, unnamed
10
+ )
11
+ from typing import Dict, List
12
+
13
# Letters used to build the names of generated helper rules.
alphabet = 'abcdefghijklmnopqrstuvwxyz'


class GrammarInterpreter(Interpreter):
    """Walk a parsed grammar tree and collect terminals, rules and ignores.

    State gathered during a walk:
      * ``terminal_list`` -- terminals added while no mode is active
      * ``terminals``     -- per-mode lists of terminals
      * ``grammar``       -- rule name -> stored alternatives
      * ``ignore_dict``   -- per-mode lists of ignored descriptions
      * ``current_mode``  -- mode set by the most recent ``mode`` node
      * ``i``             -- counter behind generated rule names
    """

    def __init__(self):
        self.terminal_list: TerminalList = []
        self.terminals: Terminals = {}
        self.grammar: Grammar = {}
        self.ignore_dict: Dict[str, List[str]] = {}
        self.current_mode: str = ''
        self.i: int = 0

    def start(self, tree):
        """Interpret every statement of ``tree`` from a clean state.

        Returns:
            The tuple ``(terminals, grammar, ignore_dict)``.
        """
        # Reset all state so one instance can interpret several grammars.
        self.i = 0
        self.current_mode = ''
        self.ignore_dict = {'': []}
        self.grammar = {}
        self.terminals = {}
        self.terminal_list = []

        for statement in tree:
            self(statement)

        if not self.terminals:
            # No mode was declared: everything lives in the '' mode.
            self.terminals[''] = self.terminal_list
        else:
            # Terminals collected outside any mode are added to every mode.
            for mode_terminals in self.terminals.values():
                mode_terminals.extend(self.terminal_list)

        return self.terminals, self.grammar, self.ignore_dict

    def terminal(self, tree):
        """Register a terminal built from the children of ``tree``.

        Child 1 yields the description and child 0 the first action entry;
        a third child, when present, contributes further action entries.
        """
        pattern = self(tree[1])
        action = [self(tree[0])]
        if len(tree) > 2:
            action += self(tree[2])
        self.add_terminal(pattern, action)

    def add_terminal(self, *new):
        """Append the argument tuple ``new`` to the active list, once.

        The active list is the current mode's list when a mode is set,
        otherwise the mode-less ``terminal_list``.
        """
        if self.current_mode:
            target = self.terminals[self.current_mode]
        else:
            target = self.terminal_list
        if new not in target:
            target.append(new)

    def description(self, token):
        """Return the text of a description token."""
        return token.value

    def operations(self, tree):
        """Concatenate the results of all children into one list."""
        collected = []
        for child in tree:
            collected += self(child)
        return collected

    @staticmethod
    def push_mode(token):
        """Return ``[name]``: the token text without its first character."""
        return [token.value[1:]]

    @staticmethod
    def pop_mode(token):
        """Return ``[int(value)]`` when the node has children, else ``[1]``."""
        return [int(token.value)] if token.children else [1]

    @staticmethod
    def reset_mode(*args):
        """Return ``[0]`` regardless of the arguments."""
        return [0]

    @staticmethod
    def change_mode(token):
        """Return ``[1, name]`` with ``name`` taken from the token text."""
        return [1, token.value]

    def nonterminal(self, tree):
        """Store the result of child 1 under the name given by child 0."""
        name = self(tree[0])
        self.grammar[name] = self(tree[1])

    def option(self, tree):
        """Return one alternative as a tuple of units.

        When the node has more than two children, the units are stored in
        the grammar under the name produced by child 1 and a one-element
        tuple holding that name is returned instead.
        """
        units = self(tree[0])
        if len(tree) <= 2:
            return tuple(units)
        alias = self(tree[1])
        self.grammar[alias] = units
        return (alias,)

    def unit(self, tree):
        """Return the symbol for an atom, expanding any quantifier.

        An atom whose first character is in ``unnamed`` is registered as a
        terminal. A quantified atom is replaced by a generated rule name:
        ``?`` -> atom or empty, ``*`` -> left-recursive repetition or
        empty, ``+`` -> left-recursive repetition or a single atom.
        """
        symbol = self(tree[0])
        if symbol[0] in unnamed:
            self.add_terminal(symbol, [symbol])

        if len(tree) <= 1:
            return symbol

        quantifier = tree[1][0].value
        name = self.random_name()
        if quantifier == '?':
            self.grammar[name] = [(symbol,), ()]
        elif quantifier == '*':
            self.grammar[name] = [(name, symbol), ()]
        elif quantifier == '+':
            self.grammar[name] = [(name, symbol), (symbol,)]
        return name

    def atom(self, tree):
        """Return a string result as is; wrap anything else in a new rule."""
        result = self(tree[0])
        if isinstance(result, str):
            return result
        name = self.random_name()
        self.grammar[name] = result
        return name

    def random_name(self):
        """Return the next generated name that is not yet a grammar key."""
        candidate = self.next_name()
        while candidate in self.grammar:
            self.i += 1
            candidate = self.next_name()
        return candidate

    def next_name(self):
        """Encode ``self.i`` as ``'_'`` plus letters, lowest digit first.

        A counter that is not positive yields ``'_a'``.
        """
        base = len(alphabet)
        letters = []
        remaining = self.i
        while remaining > 0:
            remaining, digit = divmod(remaining, base)
            letters.append(alphabet[digit])
        suffix = ''.join(letters) if letters else 'a'
        return '_' + suffix

    def mode(self, token):
        """Switch to the mode named by ``token`` with fresh, empty lists."""
        self.current_mode = token.value
        self.terminals[self.current_mode] = []
        self.ignore_dict[self.current_mode] = []

    def ignore(self, node):
        """Record the description of an ignore statement for the current mode.

        For a ``Tree`` node, a description longer than four characters, one
        not starting with a quote, or a four-character one without a
        backslash after the opening quote is also registered as a terminal.
        """
        terminal = self(node[0])
        if isinstance(node, Tree) and (
            len(terminal) > 4
            or terminal[0] not in '\'"'
            or (len(terminal) == 4 and terminal[1] != '\\')
        ):
            self.add_terminal(terminal, [terminal])
        # NOTE(review): kept at method level, i.e. executed for every node;
        # confirm against the upstream file's indentation.
        self.ignore_dict[self.current_mode].append(terminal)
156
+
157
+
158
# Module-level pipeline used by interpret(): ``boho_grammar`` unpacks into
# two items, the first given to Lexer and the second to Parser.
l, p = boho_grammar
lexer, parser = Lexer(l), Parser(p)
interpreter = GrammarInterpreter()
162
+
163
+
164
def interpret(grammar: str):
    """Run ``grammar`` through the module-level lexer, parser and interpreter.

    Args:
        grammar: Grammar source text.

    Returns:
        The value the module-level ``interpreter`` produces for the parsed
        tree.
    """
    return interpreter(parser(lexer(grammar)))