hoozuki 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Hoozuki::Parser
4
+ rule
5
+ target: choice
6
+
7
+ choice:
8
+ concatenation
9
+ | choice PIPE concatenation {
10
+ children = val[0].is_a?(Hoozuki::Node::Choice) ? val[0].children.dup : [val[0]]
11
+ children << val[2]
12
+ result = Hoozuki::Node::Choice.new(children)
13
+ }
14
+
15
+ concatenation:
16
+ repetition
17
+ | EPSILON { result = Hoozuki::Node::Epsilon.new }
18
+ | concatenation repetition {
19
+ if val[0].is_a?(Hoozuki::Node::Epsilon)
20
+ result = val[1]
21
+ else
22
+ children = val[0].is_a?(Hoozuki::Node::Concatenation) ? val[0].children.dup : [val[0]]
23
+ children << val[1]
24
+ result = Hoozuki::Node::Concatenation.new(children)
25
+ end
26
+ }
27
+
28
+ repetition:
29
+ group
30
+ | group STAR { result = Hoozuki::Node::Repetition.new(val[0], :zero_or_more) }
31
+ | group PLUS { result = Hoozuki::Node::Repetition.new(val[0], :one_or_more) }
32
+ | group QUESTION { result = Hoozuki::Node::Repetition.new(val[0], :optional) }
33
+
34
+ group:
35
+ LPAREN choice RPAREN { result = val[1] }
36
+ | literal
37
+
38
+ literal:
39
+ CHAR { result = Hoozuki::Node::Literal.new(val[0]) }
40
+
41
+ end
42
+
43
+ ---- header
44
+ require_relative 'node'
45
+
46
+ ---- inner
47
# Sets @yydebug when the DEBUG environment variable is present; otherwise the
# instance variable is left untouched.
# NOTE(review): @yydebug is presumably racc's trace switch - confirm against racc.
def initialize
  return unless ENV['DEBUG']

  @yydebug = true
end

# Parses +pattern+ and returns whatever do_parse produces for it.
#
# The tokenizer state (@pattern, @offset, @tokens) is reset on every call, the
# whole pattern is tokenized up front, and parsing then runs via do_parse.
def parse(pattern)
  @pattern, @offset, @tokens = pattern, 0, []
  tokenize
  do_parse
end
58
+
59
+ private
60
+
61
# Characters that may follow a backslash; the escaped character is emitted as CHAR.
ESCAPABLE_CHARS = ['(', ')', '|', '*', '+', '?', '\\'].freeze

# Metacharacters and the terminal symbol each one is tokenized as.
SPECIAL_TOKENS = {
  '(' => :LPAREN,
  ')' => :RPAREN,
  '|' => :PIPE,
  '*' => :STAR,
  '+' => :PLUS,
  '?' => :QUESTION
}.freeze

# Converts @pattern (starting at @offset) into [type, value] pairs appended to
# @tokens, finishing with the [false, false] end-of-input pair.
#
# Every empty alternative is made explicit with an EPSILON token, because the
# grammar has no empty production. That covers an empty pattern, a pattern that
# starts with '|', '(' followed by '|' or ')', and '|' followed by '|', ')' or
# the end of the pattern.
#
# Raises RuntimeError for a dangling backslash or an unsupported escape.
def tokenize
  # The first alternative is empty when the pattern is empty or opens with '|'.
  @tokens << [:EPSILON, nil] if next_char.nil? || next_char == '|'

  while @offset < @pattern.length
    char = @pattern[@offset]

    if char == '\\'
      handle_escape_sequence
    elsif SPECIAL_TOKENS.key?(char)
      handle_special_char(char)
    else
      add_token(:CHAR, char)
    end
  end

  @tokens << [false, false]
end

# Consumes a backslash plus the character after it and emits that character
# as a literal CHAR token.
#
# Raises RuntimeError when the backslash is the last character of the pattern
# or when the escaped character is not listed in ESCAPABLE_CHARS.
def handle_escape_sequence
  @offset += 1 # step over the backslash itself
  raise 'Unexpected end of pattern' if @offset >= @pattern.length

  escaped = @pattern[@offset]
  raise "Invalid escape sequence: \\#{escaped}" unless ESCAPABLE_CHARS.include?(escaped)

  add_token(:CHAR, escaped)
end

# Emits the token for metacharacter +char+ and, when +char+ opens an
# alternative that turns out to be empty, an EPSILON token right after it.
def handle_special_char(char)
  add_token(SPECIAL_TOKENS[char], char)

  # add_token has advanced @offset, so next_char is the character after +char+.
  case char
  when '('
    insert_epsilon_after_lparen if ['|', ')'].include?(next_char)
  when '|'
    insert_epsilon_after_pipe if should_insert_epsilon_after_pipe?
  end
end

# True when the alternative started by the '|' just consumed is empty, i.e. the
# pattern ends here or the next character is ')' or another '|'.
def should_insert_epsilon_after_pipe?
  next_char.nil? || [')', '|'].include?(next_char)
end

# Marks the empty first alternative of a group.
def insert_epsilon_after_lparen
  @tokens << [:EPSILON, nil]
end

# Marks the empty alternative that follows a '|'.
def insert_epsilon_after_pipe
  @tokens << [:EPSILON, nil]
end

# Appends a [type, value] token and advances past the character it came from.
def add_token(type, value)
  @tokens << [type, value]
  @offset += 1
end

# Character at the current offset, or nil once the pattern is exhausted.
def next_char
  @offset < @pattern.length ? @pattern[@offset] : nil
end

# Removes and returns the next queued token (nil when the queue is empty).
# NOTE(review): presumably the hook racc's do_parse calls to pull tokens - confirm.
def next_token
  @tokens.shift
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Hoozuki
4
- VERSION = '0.1.0'
3
+ module Hoozuki
4
+ VERSION = '1.0.0'
5
5
  end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
module Hoozuki
  module VM
    # Translates a pattern AST (Hoozuki::Node::*) into a flat array of
    # Hoozuki::Instruction objects addressed by index ("pc").
    #
    # Instructions emitted: Char (one literal character), Split (two branch
    # targets, left and right), Jmp (unconditional jump) and a final Match.
    class Compiler
      # The program produced by the most recent #compile call.
      attr_reader :instructions

      def initialize
        reset
      end

      # Compiles +ast+ into a program terminated by a Match instruction.
      #
      # State is reset first, so one compiler can be reused for several
      # patterns without programs being appended to one another.
      #
      # @param ast a Hoozuki::Node tree
      # @return [Array] the instruction list (also available via #instructions)
      # @raise [ArgumentError] if the tree contains an unsupported node
      def compile(ast)
        reset
        compile_node(ast)
        emit_match
      end

      private

      # Starts a fresh, empty program.
      def reset
        @pc = 0
        @instructions = []
      end

      # Dispatches on the node class; an Epsilon node emits nothing.
      def compile_node(ast)
        case ast
        when Hoozuki::Node::Literal
          compile_literal(ast)
        when Hoozuki::Node::Epsilon
          nil # matches the empty string: no instruction needed
        when Hoozuki::Node::Repetition
          compile_repetition(ast)
        when Hoozuki::Node::Choice
          compile_choice(ast)
        when Hoozuki::Node::Concatenation
          compile_concatenation(ast)
        else
          # Silently skipping an unknown node would yield a program for a
          # different pattern than the one requested.
          raise ArgumentError, "Unknown node: #{ast.class}"
        end
      end

      def compile_literal(node)
        emit(Hoozuki::Instruction::Char.new(node.value))
      end

      # Picks the layout matching the node's quantifier predicate.
      def compile_repetition(node)
        if node.zero_or_more?
          compile_zero_or_more(node.child)
        elsif node.one_or_more?
          compile_one_or_more(node.child)
        elsif node.optional?
          compile_optional(node.child)
        else
          raise ArgumentError, "Unknown repetition: #{node.inspect}"
        end
      end

      # Layout:
      #   L0: Split L1, L2
      #   L1: <child>
      #       Jmp L0
      #   L2:
      def compile_zero_or_more(child)
        split = @pc
        # The right target is unknown until the child is compiled; patched below.
        emit(Hoozuki::Instruction::Split.new(@pc + 1, 0))
        compile_node(child)
        emit(Hoozuki::Instruction::Jmp.new(split))
        patch(split, Hoozuki::Instruction::Split.new(split + 1, @pc))
      end

      # Layout:
      #   L0: <child>
      #       Split L0, L1
      #   L1:
      def compile_one_or_more(child)
        start = @pc
        compile_node(child)
        emit(Hoozuki::Instruction::Split.new(start, @pc + 1))
      end

      # Layout:
      #   Split L1, L2
      #   L1: <child>
      #   L2:
      def compile_optional(child)
        split = @pc
        emit(Hoozuki::Instruction::Split.new(0, 0)) # placeholder, patched below
        start = @pc
        compile_node(child)
        patch(split, Hoozuki::Instruction::Split.new(start, @pc))
      end

      # Compiles an alternation of any arity by peeling off the first branch
      # and treating the remaining branches as a nested alternation.
      #
      # @raise [ArgumentError] if the node has no branches at all
      def compile_choice(node)
        alternatives = node.children
        case alternatives.length
        when 0
          raise ArgumentError, 'Choice requires at least one alternative'
        when 1
          # A single branch needs no Split/Jmp scaffolding.
          compile_node(alternatives.first)
        when 2
          compile_binary_choice(alternatives[0], alternatives[1])
        else
          rest = Hoozuki::Node::Choice.new(alternatives.drop(1))
          compile_binary_choice(alternatives.first, rest)
        end
      end

      # Layout:
      #       Split L1, L2
      #   L1: <left>
      #       Jmp L3
      #   L2: <right>
      #   L3:
      def compile_binary_choice(left, right)
        split = @pc
        emit(Hoozuki::Instruction::Split.new(@pc + 1, 0))
        compile_node(left)
        jump = @pc
        emit(Hoozuki::Instruction::Jmp.new(0))
        # Back-patch both forward references now that the addresses are known.
        validate_split_instruction(split)
        @instructions[split].right = @pc
        compile_node(right)
        validate_jmp_instruction(jump)
        @instructions[jump].target = @pc
      end

      def compile_concatenation(node)
        node.children.each { |child| compile_node(child) }
      end

      # Appends the terminating Match and returns the finished program.
      def emit_match
        emit(Hoozuki::Instruction::Match.new)
        @instructions
      end

      # Guards a back-patch: the slot at +pc+ must hold a Split.
      def validate_split_instruction(pc)
        return if @instructions[pc].is_a?(Hoozuki::Instruction::Split)

        raise "Instruction at pc #{pc} is not a Split"
      end

      # Guards a back-patch: the slot at +pc+ must hold a Jmp.
      def validate_jmp_instruction(pc)
        return if @instructions[pc].is_a?(Hoozuki::Instruction::Jmp)

        raise "Instruction at pc #{pc} is not a Jmp"
      end

      # Appends +instruction+ and advances the program counter.
      def emit(instruction)
        @instructions << instruction
        @pc += 1
      end

      # Overwrites the instruction stored at +pc+.
      def patch(pc, instruction)
        @instructions[pc] = instruction
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Hoozuki
  module VM
    # Backtracking interpreter for programs made of Hoozuki::Instruction
    # objects (Char, Jmp, Split, Match).
    class Evaluator
      class << self
        # Runs +instructions+ against +input+.
        #
        # @param instructions [Array] program indexed by program counter
        # @param input [String] text to match
        # @param input_pos [Integer] index in +input+ to start matching at
        # @param pc [Integer] index of the first instruction to execute
        # @return [Boolean] true when some execution path reaches Match with
        #   the whole input consumed
        # @raise [RuntimeError] if an unsupported instruction is reached
        def evaluate(instructions, input, input_pos = 0, pc = 0)
          new._evaluate(instructions, input, input_pos, pc)
        end
      end

      # Implementation of .evaluate; parameters and result are the same.
      #
      # Alternatives are explored depth-first, the left branch of a Split
      # before its right branch. Branches still to be tried are kept on an
      # explicit stack rather than the Ruby call stack, so long inputs cannot
      # overflow it.
      #
      # Each (pc, input_pos) pair is visited at most once. A pair seen before
      # either already failed (a success would have returned immediately) or
      # is still being explored further up the current path, which means the
      # program looped without consuming input. In both cases retrying cannot
      # produce a match. This is what makes patterns such as (a*)* terminate.
      def _evaluate(instructions, input, input_pos, pc)
        pending = [[pc, input_pos]]
        seen = {}

        until pending.empty?
          pc, input_pos = pending.pop

          loop do
            # Running off the end of the program fails this path.
            break if pc >= instructions.size

            state = [pc, input_pos]
            break if seen[state]

            seen[state] = true

            inst = instructions[pc]
            case inst
            when Hoozuki::Instruction::Char
              break if input_pos >= input.size || input[input_pos] != inst.char

              input_pos += 1
              pc += 1
            when Hoozuki::Instruction::Jmp
              pc = inst.target
            when Hoozuki::Instruction::Split
              # Remember the right branch, continue with the left one.
              pending.push([inst.right, input_pos])
              pc = inst.left
            when Hoozuki::Instruction::Match
              # Only a match that consumed the entire input counts.
              return true if input_pos == input.size

              break
            else
              raise "Unknown instruction: #{inst.class}"
            end
          end
        end

        false
      end
    end
  end
end
data/lib/hoozuki/vm.rb ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'vm/compiler'
4
+ require_relative 'vm/evaluator'
data/lib/hoozuki.rb CHANGED
@@ -1,35 +1,42 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'hoozuki/automaton'
4
+ require_relative 'hoozuki/instruction'
4
5
  require_relative 'hoozuki/node'
5
6
  require_relative 'hoozuki/parser'
6
7
  require_relative 'hoozuki/version'
8
+ require_relative 'hoozuki/vm'
7
9
 
8
- class Hoozuki
9
- def initialize(input, method: :dfa)
10
- @input = input
11
- @method = method
10
+ module Hoozuki
11
+ module_function
12
12
 
13
- ast = Hoozuki::Parser.new(input).parse
14
- case method
13
+ def compile(input, engine: :dfa)
14
+ ast = Parser.new.parse(input)
15
+ case engine
15
16
  when :dfa
16
- nfa = Automaton::NFA.new_from_node(ast, Automaton::StateID.new(0))
17
- @dfa = Automaton::DFA.from_nfa(nfa, use_cache?(input))
17
+ nfa = Automaton::NFA.from_node(ast, Automaton::StateID.new(0))
18
+ Automaton::DFA.from_nfa(nfa, use_cache?(input))
19
+ when :vm
20
+ compiler = VM::Compiler.new
21
+ compiler.compile(ast)
22
+ compiler.instructions
23
+ else
24
+ raise ArgumentError, "Unknown engine: #{engine}"
18
25
  end
19
26
  end
20
27
 
21
- def match?(input)
22
- case @method
28
+ def match?(pattern, input, engine: :dfa)
29
+ compiled = compile(pattern, engine: engine)
30
+ case engine
23
31
  when :dfa
24
- @dfa.match?(input, use_cache?(input))
25
- else
26
- raise ArgumentError, "Unknown method: #{@method}"
32
+ compiled.match?(input, use_cache?(input))
33
+ when :vm
34
+ VM::Evaluator.evaluate(compiled, input, 0, 0)
27
35
  end
28
36
  end
29
37
 
30
- private
31
-
32
38
  def use_cache?(input)
33
39
  input.length > 1000
34
40
  end
41
+ private_class_method :use_cache?
35
42
  end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
RSpec.describe Hoozuki::Automaton::DFA::Builder do
  describe '#call' do
    let(:state) { Hoozuki::Automaton::StateID.new(0) }

    # Parses +pattern+ into an AST using a fresh parser instance.
    def ast_for(pattern)
      Hoozuki::Parser.new.parse(pattern)
    end

    # Derives an NFA from +node+ (numbered from +state+) and returns the result
    # of running the builder under test on it. The builder always receives
    # +false+ as its second argument, as in every example of this file.
    def build_from(node)
      nfa = Hoozuki::Automaton::NFA.from_node(node, state)
      described_class.new(nfa, false).call
    end

    context 'with simple NFA' do
      it 'builds a DFA' do
        dfa = build_from(Hoozuki::Node::Literal.new('a'))

        expect(dfa).to be_a(Hoozuki::Automaton::DFA)
        expect(dfa.start).to be_a(Integer)
        expect(dfa.accept).to be_a(Set)
        expect(dfa.transitions).to be_a(Set)
      end
    end

    context 'with NFA containing epsilon transitions' do
      it 'eliminates epsilon transitions' do
        dfa = build_from(ast_for('a?'))

        # A transition whose label is nil is treated as an epsilon move here.
        unlabeled = dfa.transitions.select { |_, label, _| label.nil? }
        expect(unlabeled).to be_empty
      end
    end

    context 'with alternation' do
      it 'creates correct number of states' do
        dfa = build_from(ast_for('a|b'))

        expect(dfa.transitions.size).to be >= 2
      end
    end

    context 'with repetition' do
      it 'handles loops correctly' do
        dfa = build_from(ast_for('a*'))

        # The start state is expected to be among the accepting states.
        expect(dfa.accept).to include(dfa.start)
      end
    end

    context 'with concatenation' do
      it 'builds sequential transitions' do
        dfa = build_from(ast_for('abc'))

        expect(dfa.transitions.size).to be >= 3
      end
    end

    context 'with complex pattern' do
      it 'builds correct DFA structure' do
        dfa = build_from(ast_for('(a|b)*c'))

        expect(dfa).to be_a(Hoozuki::Automaton::DFA)
        expect(dfa.accept).not_to be_empty
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
RSpec.describe Hoozuki::Automaton::DFA do
  # Every example numbers its NFA states starting from this id.
  let(:state) { Hoozuki::Automaton::StateID.new(0) }

  # Shorthand for a single-character Literal node.
  def literal(char)
    Hoozuki::Node::Literal.new(char)
  end

  # Parses +pattern+ into an AST using a fresh parser instance.
  def ast_for(pattern)
    Hoozuki::Parser.new.parse(pattern)
  end

  # Builds the DFA for +node+ by way of its NFA; +false+ is passed as the
  # second argument of from_nfa, as in every example of this file.
  def dfa_from(node)
    nfa = Hoozuki::Automaton::NFA.from_node(node, state)
    described_class.from_nfa(nfa, false)
  end

  # Asserts, in the given order, that matching each input yields the paired outcome.
  def expect_matches(dfa, outcomes)
    outcomes.each do |input, outcome|
      expect(dfa.match?(input, false)).to be outcome
    end
  end

  describe '.from_nfa' do
    it 'converts simple NFA to DFA' do
      dfa = dfa_from(literal('a'))

      expect(dfa).to be_a(described_class)
      expect(dfa.start).to be_a(Integer)
      expect(dfa.accept).to be_a(Set)
      expect(dfa.transitions).not_to be_empty
    end

    it 'converts choice NFA to DFA' do
      dfa = dfa_from(Hoozuki::Node::Choice.new([literal('a'), literal('b')]))
      labels = dfa.transitions.map { |_, label, _| label }

      expect(dfa.start).to be_a(Integer)
      expect(labels).to include('a', 'b')
    end

    it 'converts concatenation NFA to DFA' do
      dfa = dfa_from(Hoozuki::Node::Concatenation.new([literal('a'), literal('b')]))

      expect(dfa.start).to be_a(Integer)
      expect(dfa.accept).not_to be_empty
    end

    it 'handles alternation patterns' do
      dfa = dfa_from(ast_for('a|b'))

      expect(dfa.transitions.size).to be >= 2
    end

    it 'handles repetition patterns' do
      dfa = dfa_from(ast_for('a*'))

      expect(dfa.accept).to include(dfa.start)
    end
  end

  describe '#match?' do
    it 'matches using DFA for single literal' do
      dfa = dfa_from(literal('a'))

      expect_matches(dfa, 'a' => true, 'b' => false)
    end

    it 'matches choice pattern using DFA' do
      dfa = dfa_from(Hoozuki::Node::Choice.new([literal('a'), literal('b')]))

      expect_matches(dfa, 'a' => true, 'b' => true, 'c' => false)
    end

    it 'matches concatenation pattern using DFA' do
      dfa = dfa_from(Hoozuki::Node::Concatenation.new([literal('a'), literal('b')]))

      expect_matches(dfa, 'ab' => true, 'a' => false, 'abc' => false)
    end

    context 'with simple literal' do
      it 'matches exact string' do
        dfa = dfa_from(ast_for('abc'))

        expect(dfa.match?('abc', false)).to be true
      end

      it 'does not match different string' do
        dfa = dfa_from(ast_for('abc'))

        expect(dfa.match?('abd', false)).to be false
      end
    end

    context 'with alternation' do
      it 'matches either branch' do
        dfa = dfa_from(ast_for('a|b'))

        expect_matches(dfa, 'a' => true, 'b' => true, 'c' => false)
      end
    end
  end

  describe '#next_transition' do
    it 'finds correct next state' do
      dfa = dfa_from(ast_for('a'))

      expect(dfa.next_transition(dfa.start, 'a', false)).not_to be_nil
    end

    it 'returns nil for invalid transition' do
      dfa = dfa_from(ast_for('a'))

      expect(dfa.next_transition(dfa.start, 'b', false)).to be_nil
    end
  end
end