PyPI - boho - Versions diffs - 0.1.0__tar.gz - Mend

boho 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

boho-0.1.0/LICENSE +21 -0
boho-0.1.0/PKG-INFO +224 -0
boho-0.1.0/README.md +208 -0
boho-0.1.0/boho/__init__.py +1 -0
boho-0.1.0/boho/boho.py +38 -0
boho-0.1.0/boho/grammar_interpreter.py +167 -0
boho-0.1.0/boho/grammars.py +379 -0
boho-0.1.0/boho/interpreter.py +12 -0
boho-0.1.0/boho/lexer.py +230 -0
boho-0.1.0/boho/lexer_generator.py +266 -0
boho-0.1.0/boho/objects.py +80 -0
boho-0.1.0/boho/parser.py +137 -0
boho-0.1.0/boho/parser_generator.py +244 -0
boho-0.1.0/boho/regex.py +42 -0
boho-0.1.0/boho.egg-info/PKG-INFO +224 -0
boho-0.1.0/boho.egg-info/SOURCES.txt +24 -0
boho-0.1.0/boho.egg-info/dependency_links.txt +1 -0
boho-0.1.0/boho.egg-info/requires.txt +1 -0
boho-0.1.0/boho.egg-info/top_level.txt +1 -0
boho-0.1.0/pyproject.toml +24 -0
boho-0.1.0/setup.cfg +4 -0
boho-0.1.0/tests/test_interpreter.py +145 -0
boho-0.1.0/tests/test_lexer.py +482 -0
boho-0.1.0/tests/test_lexer_generator.py +560 -0
boho-0.1.0/tests/test_parser.py +692 -0
boho-0.1.0/tests/test_parser_generator.py +736 -0

boho-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 GregorBokal
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

boho-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,224 @@
+Metadata-Version: 2.4
+Name: boho
+Version: 0.1.0
+Summary: A self-hosting parser generator with a modal lexer and LR(1) parser
+Author-email: Gregor Bokal <gregor.bokal@gimb.org>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/GregorBokal/boho-parser-generator?tab=readme-ov-file
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Compilers
+Classifier: Topic :: Text Processing :: General
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: greenery>=4.2
+Dynamic: license-file
+# Boho
+A self-hosting parser generator for Python. Define your grammar in a concise EBNF-based metalanguage and Boho will generate a modal lexer (DFA-based) and an LR(1) parser that produce a clean syntax tree.
+## Installation
+```bash
+pip install boho
+```
+## Quick start
+```python
+from boho import Boho
+from boho.interpreter import Interpreter
+grammar = '''
+start: sum
+sum: sum "+" prod
+   | prod
+prod: prod "*" INT
+    | INT
+INT: @INT
+%ignore " "
+'''
+b = Boho(grammar)
+tree = b("2 + 3 * 4")
+print(tree.pretty())
+```
+Output:
+```
+start:
+  sum:
+    sum:
+      prod:
+        'INT' '2'
+    prod:
+      prod:
+        'INT' '3'
+      'INT' '4'
+```
+### Writing an interpreter
+Subclass `Interpreter` and define methods matching your nonterminal names:
+```python
+class Calc(Interpreter):
+    def start(self, tree):
+        return self(tree[0])
+    def sum(self, tree):
+        return sum(self(c) for c in tree)
+    def prod(self, tree):
+        result = int(self(tree[0]))
+        for i in range(1, len(tree)):
+            result *= int(self(tree[i]))
+        return result
+calc = Calc()
+print(calc(tree))  # 14
+```
+## The Boho metalanguage
+### Terminal definitions
+Terminals are named in `UPPER_CASE` and can be described in three ways:
+```
+PLUS: "+"               // string literal
+NUMBER: /\d+(\.\d+)?/   // regular expression
+STRING: @STR             // built-in description (@INT, @FLOAT, @STR)
+```
+Terminal descriptions can also be used directly (unnamed) in grammar rules -- they will be pruned from the syntax tree.
+Prefixing a terminal name with `_` (e.g. `_WHITESPACE`) prunes it from the tree despite being named.
+### Grammar rules
+Nonterminals use `lower_case` names. Alternatives are separated with `|`:
+```
+value: NAME | NUMBER
+assignment: NAME "=" value
+```
+EBNF extensions:
+```
+items: item+              // one or more
+list: (item ",")*  item   // zero or more (with grouping)
+optional: modifier?       // optional
+```
+Inline aliases with `->`:
+```
+expr: term "+" term -> addition
+    | term "-" term -> subtraction
+```
+### Fake terminals
+A name like `COMMENT_` (uppercase, ending with `_`) defines a fake terminal: it is described like a nonterminal, but its match is collapsed into a single token. This is useful for structures that regular expressions cannot describe (e.g. nested block comments).
+### Lexer modes
+Following ANTLR's approach, a modal lexer is supported. Terminals before the first `#mode` belong to all modes.
+```
+LBRACE: "{" -> +inner    // push mode
+RBRACE: "}" -> -         // pop mode
+#inner
+CONTENT: /[^{}]+/
+```
+| Syntax     | Effect               |
+|------------|----------------------|
+| `-> +mode` | push mode onto stack |
+| `-> -`     | pop one mode         |
+| `-> -N`    | pop N modes          |
+| `-> --`    | clear the stack      |
+| `-> mode`  | replace top of stack |
+### Ignoring tokens
+```
+%ignore " "
+%ignore /\/\/[^\n]*/    // ignore line comments
+```
+## Project structure
+```
+boho/
+  __init__.py            # exports the Boho class
+  boho.py                # main orchestrator
+  lexer.py               # modal finite-automaton lexer
+  lexer_generator.py     # terminal descriptions -> lexer DFA
+  parser.py              # LR(1) shift-reduce parser
+  parser_generator.py    # grammar -> LR(1) parse tables
+  grammar_interpreter.py # interprets the Boho metalanguage
+  interpreter.py         # base Interpreter class (visitor pattern)
+  objects.py             # Token, Tree, LR1Item dataclasses
+  regex.py               # regex-to-DFA via greenery
+  grammars.py            # pre-compiled Boho grammar tables
+docs/                    # English documentation
+slo-dokumentacija/       # Slovenian documentation
+examples/                # usage examples
+tests/                   # test suite
+```
+## How it works
+1. Your grammar string is parsed by Boho's own (bootstrapped) parser.
+2. Terminal descriptions are compiled into merged DFAs for a modal lexer.
+3. Grammar rules are compiled into LR(1) parse tables.
+4. At runtime, input text is tokenized by the lexer and then parsed into a `Tree` of `Token` leaves.
+Boho is self-hosting -- its own metalanguage is specified in Boho (see `examples/boho_in_boho.py`).
+## API
+### `Boho(grammar, log=False)`
+Create a parser from a grammar string. Set `log=True` to print the generated lexer and parser tables.
+### `boho(text, log=False) -> Tree`
+Parse input text. Returns a `Tree` with `Token` leaves. Set `log=True` for step-by-step tracing.
+### `Tree`
+- `tree.name` -- nonterminal name
+- `tree.children` -- list of `Tree` / `Token` children
+- `tree.value` -- concatenated text of all descendant tokens
+- `tree.pretty()` -- indented string representation
+- Supports iteration and indexing (`tree[0]`, `for child in tree`)
+### `Token`
+- `token.name` -- terminal name
+- `token.value` -- matched text
+- `token.line`, `token.col` -- source location
+### `Interpreter`
+Base class for tree walkers. Subclass it and define methods named after your nonterminals. Unhandled nodes get the default behavior: a token returns its value, and a tree returns a list of its children's results.
+## Dependencies
+- [greenery](https://github.com/qntm/greenery) -- regex-to-FSM conversion
+- Python 3.10+ (uses `match` statements and `X | Y` type unions)
+## License
+MIT

boho-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,208 @@
+# Boho
+A self-hosting parser generator for Python. Define your grammar in a concise EBNF-based metalanguage and Boho will generate a modal lexer (DFA-based) and an LR(1) parser that produce a clean syntax tree.
+## Installation
+```bash
+pip install boho
+```
+## Quick start
+```python
+from boho import Boho
+from boho.interpreter import Interpreter
+grammar = '''
+start: sum
+sum: sum "+" prod
+   | prod
+prod: prod "*" INT
+    | INT
+INT: @INT
+%ignore " "
+'''
+b = Boho(grammar)
+tree = b("2 + 3 * 4")
+print(tree.pretty())
+```
+Output:
+```
+start:
+  sum:
+    sum:
+      prod:
+        'INT' '2'
+    prod:
+      prod:
+        'INT' '3'
+      'INT' '4'
+```
+### Writing an interpreter
+Subclass `Interpreter` and define methods matching your nonterminal names:
+```python
+class Calc(Interpreter):
+    def start(self, tree):
+        return self(tree[0])
+    def sum(self, tree):
+        return sum(self(c) for c in tree)
+    def prod(self, tree):
+        result = int(self(tree[0]))
+        for i in range(1, len(tree)):
+            result *= int(self(tree[i]))
+        return result
+calc = Calc()
+print(calc(tree))  # 14
+```
+## The Boho metalanguage
+### Terminal definitions
+Terminals are named in `UPPER_CASE` and can be described in three ways:
+```
+PLUS: "+"               // string literal
+NUMBER: /\d+(\.\d+)?/   // regular expression
+STRING: @STR             // built-in description (@INT, @FLOAT, @STR)
+```
+Terminal descriptions can also be used directly (unnamed) in grammar rules -- they will be pruned from the syntax tree.
+Prefixing a terminal name with `_` (e.g. `_WHITESPACE`) prunes it from the tree despite being named.
+### Grammar rules
+Nonterminals use `lower_case` names. Alternatives are separated with `|`:
+```
+value: NAME | NUMBER
+assignment: NAME "=" value
+```
+EBNF extensions:
+```
+items: item+              // one or more
+list: (item ",")*  item   // zero or more (with grouping)
+optional: modifier?       // optional
+```
+Inline aliases with `->`:
+```
+expr: term "+" term -> addition
+    | term "-" term -> subtraction
+```
+### Fake terminals
+A name like `COMMENT_` (uppercase, ending with `_`) defines a fake terminal: it is described like a nonterminal, but its match is collapsed into a single token. This is useful for structures that regular expressions cannot describe (e.g. nested block comments).
+### Lexer modes
+Following ANTLR's approach, a modal lexer is supported. Terminals before the first `#mode` belong to all modes.
+```
+LBRACE: "{" -> +inner    // push mode
+RBRACE: "}" -> -         // pop mode
+#inner
+CONTENT: /[^{}]+/
+```
+| Syntax     | Effect               |
+|------------|----------------------|
+| `-> +mode` | push mode onto stack |
+| `-> -`     | pop one mode         |
+| `-> -N`    | pop N modes          |
+| `-> --`    | clear the stack      |
+| `-> mode`  | replace top of stack |
+### Ignoring tokens
+```
+%ignore " "
+%ignore /\/\/[^\n]*/    // ignore line comments
+```
+## Project structure
+```
+boho/
+  __init__.py            # exports the Boho class
+  boho.py                # main orchestrator
+  lexer.py               # modal finite-automaton lexer
+  lexer_generator.py     # terminal descriptions -> lexer DFA
+  parser.py              # LR(1) shift-reduce parser
+  parser_generator.py    # grammar -> LR(1) parse tables
+  grammar_interpreter.py # interprets the Boho metalanguage
+  interpreter.py         # base Interpreter class (visitor pattern)
+  objects.py             # Token, Tree, LR1Item dataclasses
+  regex.py               # regex-to-DFA via greenery
+  grammars.py            # pre-compiled Boho grammar tables
+docs/                    # English documentation
+slo-dokumentacija/       # Slovenian documentation
+examples/                # usage examples
+tests/                   # test suite
+```
+## How it works
+1. Your grammar string is parsed by Boho's own (bootstrapped) parser.
+2. Terminal descriptions are compiled into merged DFAs for a modal lexer.
+3. Grammar rules are compiled into LR(1) parse tables.
+4. At runtime, input text is tokenized by the lexer and then parsed into a `Tree` of `Token` leaves.
+Boho is self-hosting -- its own metalanguage is specified in Boho (see `examples/boho_in_boho.py`).
+## API
+### `Boho(grammar, log=False)`
+Create a parser from a grammar string. Set `log=True` to print the generated lexer and parser tables.
+### `boho(text, log=False) -> Tree`
+Parse input text. Returns a `Tree` with `Token` leaves. Set `log=True` for step-by-step tracing.
+### `Tree`
+- `tree.name` -- nonterminal name
+- `tree.children` -- list of `Tree` / `Token` children
+- `tree.value` -- concatenated text of all descendant tokens
+- `tree.pretty()` -- indented string representation
+- Supports iteration and indexing (`tree[0]`, `for child in tree`)
+### `Token`
+- `token.name` -- terminal name
+- `token.value` -- matched text
+- `token.line`, `token.col` -- source location
+### `Interpreter`
+Base class for tree walkers. Subclass it and define methods named after your nonterminals. Unhandled nodes get the default behavior: a token returns its value, and a tree returns a list of its children's results.
+## Dependencies
+- [greenery](https://github.com/qntm/greenery) -- regex-to-FSM conversion
+- Python 3.10+ (uses `match` statements and `X | Y` type unions)
+## License
+MIT

boho-0.1.0/boho/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .boho import Boho

boho-0.1.0/boho/boho.py ADDED Viewed

@@ -0,0 +1,38 @@
+from .grammar_interpreter import interpret
+from .lexer import Lexer
+from .parser import Parser
+from .lexer_generator import generate as prepare_lex
+from .parser_generator import generate as prepare_pars
class Boho:
    """Parser built from a grammar written in the Boho metalanguage.

    Construction interprets the grammar, generates the lexer and parser
    tables, registers the ``%ignore`` descriptions in those tables and then
    creates the runtime ``Lexer`` and ``Parser``. Calling the instance lexes
    and parses a text.
    """

    def __init__(self, grammar: str, log: bool = False):
        """Build the lexer and the parser for ``grammar``.

        Args:
            grammar: Grammar source text.
            log: Forwarded to both table generators.
        """
        (
            self.tokens,
            self.grammar,
            self.ignore_dict
        ) = interpret(grammar)
        self.lex_table = prepare_lex(self.tokens, log=log)
        self.pars_table = prepare_pars(self.grammar, log=log)
        self._register_ignored()
        self.lexer = Lexer(self.lex_table)
        self.parser = Parser(self.pars_table)

    @staticmethod
    def _is_single_char_literal(description: str) -> bool:
        """Tell whether ``description`` is a quoted one-character literal.

        That is a quoted description of at most four characters; a
        four-character one only counts when the character after the opening
        quote is a backslash (an escape such as ``"\\n"``).
        """
        if len(description) > 4 or description[0] not in '\'"':
            return False
        return len(description) != 4 or description[1] == '\\'

    def _register_ignored(self) -> None:
        """Store every ignored description under the ``''`` key of one table.

        Quoted one-character literals go, without their quotes, into the
        lexer table of their mode. Every other description goes into the
        parser table and is kept out of the lexer table. Previously only the
        first such description was kept out; later ones were additionally
        appended to the lexer table with their first and last character cut
        off.
        """
        for mode, descriptions in self.ignore_dict.items():
            for description in descriptions:
                if self._is_single_char_literal(description):
                    # The mode's table is only looked up when it is needed,
                    # so modes without simple literals are never indexed.
                    ignored = self.lex_table[mode].setdefault('', [])
                    ignored.append(description[1:-1])
                else:
                    self.pars_table.setdefault('', []).append(description)

    def __call__(self, text: str, log: bool = False):
        """Lex ``text`` and parse the resulting tokens.

        Args:
            text: Input to parse.
            log: Forwarded to both the lexer and the parser.

        Returns:
            Whatever the parser returns for the token stream.
        """
        return self.parser(self.lexer(text, log), log)

boho-0.1.0/boho/grammar_interpreter.py ADDED Viewed

@@ -0,0 +1,167 @@
+from .grammars import boho_grammar
+from .lexer import Lexer
+from .parser import Parser
+from .interpreter import Interpreter
+from .objects import (
+    TerminalList,
+    TerminalDescription as Terminals,
+    Grammar,
+    Tree, unnamed
+)
+from typing import Dict, List
+alphabet = 'abcdefghijklmnopqrstuvwxyz'
+class GrammarInterpreter(Interpreter):
+    def __init__(self):
+        self.terminal_list: TerminalList = []
+        self.terminals: Terminals = {}
+        self.grammar: Grammar = {}
+        self.ignore_dict: Dict[str, List[str]] = {}
+        self.current_mode: str = ''
+        self.i: int = 0
+    def start(self, tree):
+        self.terminal_list = []
+        self.terminals = {}
+        self.grammar = {}
+        self.ignore_dict = {'': []}
+        self.current_mode = ''
+        self.i = 0
+        for statement in tree:
+            self(statement)
+        if self.terminals:
+            for mode in self.terminals:
+                for terminal in self.terminal_list:
+                    self.terminals[mode].append(terminal)
+        else:
+            self.terminals[''] = self.terminal_list
+        return self.terminals, self.grammar, self.ignore_dict
+    def terminal(self, tree):
+        desc = self(tree[1])
+        action = [self(tree[0])]
+        if len(tree) > 2:
+            b = self(tree[2])
+            action += b
+        self.add_terminal(desc, action)
+    def add_terminal(self, *new):
+        if self.current_mode:
+            if new not in self.terminals[self.current_mode]:
+                self.terminals[self.current_mode].append(new)
+        elif new not in self.terminal_list:
+            self.terminal_list.append(new)
+    def description(self, token):
+        return token.value
+    def operations(self, tree):
+        list = []
+        for token in tree:
+            list += self(token)
+        return list
+    @staticmethod
+    def push_mode(token):
+        return [token.value[1:]]
+    @staticmethod
+    def pop_mode(token):
+        if token.children:
+            return [int(token.value)]
+        else:
+            return [1]
+    @staticmethod
+    def reset_mode(*args):
+        return [0]
+    @staticmethod
+    def change_mode(token):
+        return [1, token.value]
+    def nonterminal(self, tree):
+        name = self(tree[0])
+        options = self(tree[1])
+        self.grammar[name] = options
+    def option(self, tree):
+        units = self(tree[0])
+        if len(tree) > 2:
+            alias = self(tree[1])
+            self.grammar[alias] = units
+            return (alias,)
+        return tuple(units)
+    def unit(self, tree):
+        atom = self(tree[0])
+        if atom[0] in unnamed:
+            self.add_terminal(atom, [atom])
+        if len(tree) > 1:
+            quantifier = tree[1][0].value
+            name = self.random_name()
+            match quantifier:
+                case '?':
+                    self.grammar[name] = [(atom,), ()]
+                case '*':
+                    self.grammar[name] = [(name, atom), ()]
+                case '+':
+                    self.grammar[name] = [(name, atom), (atom,)]
+            return name
+        return atom
+    def atom(self, tree):
+        desc = self(tree[0])
+        if isinstance(desc, str):
+            return desc
+        name = self.random_name()
+        self.grammar[name] = desc
+        return name
+    def random_name(self):
+        while (name := self.next_name()) in self.grammar:
+            self.i += 1
+        return name
+    def next_name(self):
+        result = []
+        n = self.i
+        while n > 0:
+            remainder = n % len(alphabet)
+            result.append(alphabet[remainder])
+            n = n // len(alphabet)
+        return '_' + ''.join(result or 'a')
+    def mode(self, token):
+        self.current_mode = token.value
+        self.terminals[self.current_mode] = []
+        self.ignore_dict[self.current_mode] = []
+    def ignore(self, node):
+        terminal = self(node[0])
+        if isinstance(node, Tree):
+            if (
+                    (len(terminal) > 4) or
+                    (terminal[0] not in '\'"') or
+                    (len(terminal) == 4 and terminal[1] != '\\')
+            ):
+                self.add_terminal(terminal, [terminal])
+        self.ignore_dict[self.current_mode].append(terminal)
# Shared pipeline for reading grammars, created once at import time:
# ``boho_grammar`` supplies the lexer table and the parser table.
l, p = boho_grammar
lexer, parser = Lexer(l), Parser(p)
interpreter = GrammarInterpreter()
def interpret(grammar: str):
    """Lex, parse and interpret a grammar text.

    The text passes through the module-level ``lexer``, ``parser`` and
    ``interpreter`` in that order.

    Args:
        grammar: Grammar source text.

    Returns:
        The result of ``interpreter`` for the parsed tree.
    """
    return interpreter(parser(lexer(grammar)))