hoozuki 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Hoozuki::Parser
4
+ rule
5
+ target: choice
6
+
7
+ choice:
8
+ concatenation
9
+ | choice PIPE concatenation {
10
+ children = val[0].is_a?(Hoozuki::Node::Choice) ? val[0].children.dup : [val[0]]
11
+ children << val[2]
12
+ result = Hoozuki::Node::Choice.new(children)
13
+ }
14
+
15
+ concatenation:
16
+ repetition
17
+ | EPSILON { result = Hoozuki::Node::Epsilon.new }
18
+ | concatenation repetition {
19
+ if val[0].is_a?(Hoozuki::Node::Epsilon)
20
+ result = val[1]
21
+ else
22
+ children = val[0].is_a?(Hoozuki::Node::Concatenation) ? val[0].children.dup : [val[0]]
23
+ children << val[1]
24
+ result = Hoozuki::Node::Concatenation.new(children)
25
+ end
26
+ }
27
+
28
+ repetition:
29
+ group
30
+ | group STAR { result = Hoozuki::Node::Repetition.new(val[0], :zero_or_more) }
31
+ | group PLUS { result = Hoozuki::Node::Repetition.new(val[0], :one_or_more) }
32
+ | group QUESTION { result = Hoozuki::Node::Repetition.new(val[0], :optional) }
33
+
34
+ group:
35
+ LPAREN choice RPAREN { result = val[1] }
36
+ | literal
37
+
38
+ literal:
39
+ CHAR { result = Hoozuki::Node::Literal.new(val[0]) }
40
+
41
+ end
42
+
43
+ ---- header
44
+ require_relative 'node'
45
+
46
+ ---- inner
47
# Creates a parser. When the DEBUG environment variable is set (to any value)
# @yydebug is switched on — presumably enabling racc's trace output; confirm
# against the generated parser.
def initialize
  @yydebug = true if ENV.key?('DEBUG')
end
50
+
51
# Parses +pattern+ and returns whatever do_parse produces for it.
#
# Resets the scanner state (token buffer, read offset, source pattern) so the
# same parser instance can be reused, tokenizes the whole pattern up front and
# then hands control to do_parse.
def parse(pattern)
  @tokens = []
  @offset = 0
  @pattern = pattern

  tokenize
  do_parse
end
58
+
59
+ private
60
+
61
# Operator characters and the token type emitted for each of them.
SPECIAL_TOKENS = {
  '(' => :LPAREN,
  ')' => :RPAREN,
  '|' => :PIPE,
  '*' => :STAR,
  '+' => :PLUS,
  '?' => :QUESTION
}.freeze

# Characters that may follow a backslash: every operator plus the backslash
# itself. Any other escape is rejected by handle_escape_sequence.
ESCAPABLE_CHARS = [*SPECIAL_TOKENS.keys, '\\'].freeze
70
+
71
# Converts @pattern into the token list consumed by next_token.
#
# Each iteration dispatches on the character at @offset; the handlers are
# responsible for advancing @offset (via add_token), so the loop itself never
# increments it.
#
# An empty pattern is tokenized as a single EPSILON so that it parses to an
# Epsilon node instead of failing in the grammar, which has no rule for an
# empty token stream.
#
# Returns @tokens, terminated by the [false, false] end-of-input pair.
def tokenize
  while @offset < @pattern.length
    char = @pattern[@offset]

    if char == '\\'
      handle_escape_sequence
    elsif SPECIAL_TOKENS.key?(char)
      handle_special_char(char)
    else
      add_token(:CHAR, char)
    end
  end

  # Same convention as the empty branches of `a|` and `(|a)`.
  @tokens << [:EPSILON, nil] if @tokens.empty?

  @tokens << [false, false]
end
86
+
87
# Consumes a backslash escape starting at @offset and emits the escaped
# character as an ordinary CHAR token.
#
# Raises RuntimeError when the backslash is the last character of the pattern
# or when the escaped character is not listed in ESCAPABLE_CHARS.
def handle_escape_sequence
  @offset += 1 # step over the backslash itself

  escaped = @pattern[@offset]
  raise 'Unexpected end of pattern' if escaped.nil?

  return add_token(:CHAR, escaped) if ESCAPABLE_CHARS.include?(escaped)

  raise "Invalid escape sequence: \\#{escaped}"
end
96
+
97
# Emits the operator token for +char+ and inserts EPSILON tokens wherever an
# alternative is empty, so the grammar always sees an operand on both sides of
# a PIPE.
#
# Empty alternatives are recognised in these positions:
# * before a `|` that starts the pattern (`|a`)
# * between `(` and `|` (`(|a)`)
# * after a `|` that is followed by `)`, another `|`, or the end of the
#   pattern (`a|`, `a||b`, `(a|)`)
#
# char - one of the keys of SPECIAL_TOKENS.
def handle_special_char(char)
  # A pattern-leading pipe has nothing on its left; without this it was the one
  # empty-alternative position that still produced a parse error.
  @tokens << [:EPSILON, nil] if char == '|' && @tokens.empty?

  add_token(SPECIAL_TOKENS[char], char)

  # add_token has already advanced @offset, so next_char is the character
  # following the operator.
  insert_epsilon_after_lparen if char == '(' && next_char == '|'
  insert_epsilon_after_pipe if char == '|' && should_insert_epsilon_after_pipe?
end
104
+
105
# True when the pipe that was just consumed has an empty right-hand side,
# i.e. it is followed by the end of the pattern (nil), `)` or another `|`.
def should_insert_epsilon_after_pipe?
  [nil, ')', '|'].include?(next_char)
end
108
+
109
# Appends an EPSILON token representing the empty alternative in `(|`.
def insert_epsilon_after_lparen
  @tokens.push([:EPSILON, nil])
end
112
+
113
# Appends an EPSILON token representing the empty alternative after a `|`.
def insert_epsilon_after_pipe
  @tokens.push([:EPSILON, nil])
end
116
+
117
# Records a [type, value] token and advances the read position by one
# character. Returns the new offset.
def add_token(type, value)
  @tokens.push([type, value])
  @offset = @offset.succ
end
121
+
122
# Returns the character at the current offset without consuming it, or nil
# once the offset has reached the end of the pattern.
def next_char
  return nil unless @offset < @pattern.length

  @pattern[@offset]
end
125
+
126
# Removes and returns the oldest pending token; nil once the list is empty.
# Presumably the hook racc's do_parse calls to pull tokens — confirm against
# the generated parser.
def next_token
  @tokens.delete_at(0)
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Hoozuki
4
- VERSION = '0.2.0'
3
+ module Hoozuki
4
+ VERSION = '1.0.0'
5
5
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Hoozuki
3
+ module Hoozuki
4
4
  module VM
5
5
  class Compiler
6
6
  attr_reader :instructions
@@ -11,66 +11,111 @@ class Hoozuki
11
11
  end
12
12
 
13
13
  def compile(ast)
14
- _compile(ast)
15
- @pc += 1
16
- @instructions << Instruction::Match.new
14
+ compile_node(ast)
15
+ emit_match
17
16
  end
18
17
 
19
18
  private
20
19
 
21
- def _compile(ast)
20
+ def compile_node(ast)
22
21
  case ast
23
22
  when Hoozuki::Node::Literal
24
- emit(Hoozuki::Instruction::Char.new(ast.value))
23
+ compile_literal(ast)
25
24
  when Hoozuki::Node::Epsilon
26
25
  # Do nothing for epsilon
27
26
  when Node::Repetition
28
- if ast.zero_or_more?
29
- split = @pc
30
- emit(Hoozuki::Instruction::Split.new(@pc + 1, 0))
31
- _compile(ast.child)
32
- emit(Hoozuki::Instruction::Jmp.new(split))
33
- patch(split, Hoozuki::Instruction::Split.new(split + 1, @pc))
34
- elsif ast.one_or_more?
35
- start = @pc
36
- _compile(ast.child)
37
- emit(Hoozuki::Instruction::Split.new(start, @pc + 1))
38
- elsif ast.optional?
39
- split = @pc
40
- emit(Hoozuki::Instruction::Split.new(0, 0))
41
- start = @pc
42
- _compile(ast.child)
43
- last = @pc
44
- patch(split, Hoozuki::Instruction::Split.new(start, last))
45
- end
27
+ compile_repetition(ast)
46
28
  when Node::Choice
47
- split = @pc
48
- @pc += 1
49
- @instructions << Hoozuki::Instruction::Split.new(@pc, 0)
50
- _compile(ast.children.first)
51
- jump = @pc
52
- emit(Hoozuki::Instruction::Jmp.new(0))
53
-
54
- if @instructions[split].is_a?(Hoozuki::Instruction::Split)
55
- @instructions[split].right = @pc
56
- else
57
- raise "Instruction at pc #{split} is not a Split"
58
- end
59
-
60
- _compile(ast.children.last)
61
-
62
- if @instructions[jump].is_a?(Hoozuki::Instruction::Jmp)
63
- @instructions[jump].target = @pc
64
- else
65
- raise "Instruction at pc #{jump} is not a Jmp"
66
- end
29
+ compile_choice(ast)
67
30
  when Node::Concatenation
68
- ast.children.each do |child|
69
- _compile(child)
70
- end
31
+ compile_concatenation(ast)
32
+ end
33
+ end
34
+
35
# Emits one Char instruction that carries the literal's value.
def compile_literal(node)
  char_instruction = Hoozuki::Instruction::Char.new(node.value)
  emit(char_instruction)
end
38
+
39
# Dispatches a Repetition node to the emitter for its quantifier kind.
# The predicates are tried in the order zero_or_more?, one_or_more?,
# optional?; a node answering false to all three emits nothing.
def compile_repetition(node)
  return compile_zero_or_more(node.child) if node.zero_or_more?
  return compile_one_or_more(node.child) if node.one_or_more?

  compile_optional(node.child) if node.optional?
end
48
+
49
# Emits the loop for `child*`:
#
#   head: Split head+1, exit
#         <child>
#         Jmp head
#   exit:
#
# The Split is first emitted with a placeholder second target because the exit
# address is only known after the body and the back-jump have been emitted; it
# is then replaced through patch.
def compile_zero_or_more(child)
  loop_head = @pc
  emit(Hoozuki::Instruction::Split.new(loop_head + 1, 0))

  compile_node(child)
  emit(Hoozuki::Instruction::Jmp.new(loop_head))

  exit_pc = @pc
  patch(loop_head, Hoozuki::Instruction::Split.new(loop_head + 1, exit_pc))
end
56
+
57
# Emits `child+`: the body once, followed by a Split whose first target loops
# back to the start of the body and whose second target is the instruction
# right after the Split.
def compile_one_or_more(child)
  body_start = @pc
  compile_node(child)

  after_split = @pc + 1
  emit(Hoozuki::Instruction::Split.new(body_start, after_split))
end
62
+
63
# Emits `child?`: a Split that either enters the body or skips past it.
# Both targets are unknown when the Split is emitted, so a (0, 0) placeholder
# is written first and replaced through patch once the body has been compiled.
def compile_optional(child)
  split_pc = @pc
  emit(Hoozuki::Instruction::Split.new(0, 0))

  body_start = @pc
  compile_node(child)
  body_end = @pc

  patch(split_pc, Hoozuki::Instruction::Split.new(body_start, body_end))
end
71
+
72
# Compiles an alternation of any arity by reducing it to nested two-way
# choices: `a|b|c` is emitted as `a|(b|c)`.
#
# A Choice with a single alternative is compiled as that alternative alone.
# Previously such a node recursed into empty Choice nodes and failed with a
# NoMethodError on nil.
#
# Raises ArgumentError when the node has no alternatives at all.
def compile_choice(node)
  alternatives = node.children

  case alternatives.length
  when 0
    raise ArgumentError, 'Choice node must have at least one alternative'
  when 1
    compile_node(alternatives.first)
  when 2
    compile_binary_choice(alternatives[0], alternatives[1])
  else
    # Fold the tail into its own Choice; compile_node routes it back here.
    remainder = Node::Choice.new(alternatives[1..])
    compile_binary_choice(alternatives.first, remainder)
  end
end
73
81
 
82
# Emits `left|right`:
#
#   split: Split split+1, alt
#          <left>
#   jump:  Jmp done
#   alt:   <right>
#   done:
#
# The Split's second target and the Jmp's target are filled in after the
# corresponding branch has been emitted; both instructions are type-checked
# before being mutated in place.
def compile_binary_choice(left, right)
  split_pc = @pc
  @instructions << Hoozuki::Instruction::Split.new(split_pc + 1, 0)
  @pc = split_pc + 1

  compile_node(left)
  jump_pc = @pc
  emit(Hoozuki::Instruction::Jmp.new(0))

  # @pc now addresses the first instruction of the right-hand branch.
  validate_split_instruction(split_pc)
  @instructions[split_pc].right = @pc

  compile_node(right)

  # @pc now addresses the instruction following the whole choice.
  validate_jmp_instruction(jump_pc)
  @instructions[jump_pc].target = @pc
end
95
+
96
# Compiles the children of a Concatenation one after another, in order, so
# their instructions are laid out back to back.
def compile_concatenation(node)
  node.children.each { |part| compile_node(part) }
end
101
+
102
# Appends the terminating Match instruction and bumps the program counter.
# Returns the instruction list.
def emit_match
  @pc = @pc.succ
  @instructions.push(Instruction::Match.new)
end
106
+
107
# Guards back-patching: raises RuntimeError unless the instruction stored at
# +pc+ is a Split. Returns nil otherwise.
def validate_split_instruction(pc)
  raise "Instruction at pc #{pc} is not a Split" unless @instructions[pc].is_a?(Hoozuki::Instruction::Split)
end
112
+
113
# Guards back-patching: raises RuntimeError unless the instruction stored at
# +pc+ is a Jmp. Returns nil otherwise.
def validate_jmp_instruction(pc)
  raise "Instruction at pc #{pc} is not a Jmp" unless @instructions[pc].is_a?(Hoozuki::Instruction::Jmp)
end
118
+
74
119
  def emit(instruction)
75
120
  @instructions << instruction
76
121
  @pc += 1
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Hoozuki
3
+ module Hoozuki
4
4
  module VM
5
5
  class Evaluator
6
6
  class << self
@@ -17,16 +17,16 @@ class Hoozuki
17
17
  case inst
18
18
  when Hoozuki::Instruction::Char
19
19
  return false if input_pos >= input.size || input[input_pos] != inst.char
20
+
20
21
  input_pos += 1
21
22
  pc += 1
22
23
  when Hoozuki::Instruction::Jmp
23
24
  pc = inst.target
24
25
  when Hoozuki::Instruction::Split
25
- if _evaluate(instructions, input, input_pos, inst.left)
26
- return true
27
- else
28
- pc = inst.right
29
- end
26
+ return true if _evaluate(instructions, input, input_pos, inst.left)
27
+
28
+ pc = inst.right
29
+
30
30
  when Hoozuki::Instruction::Match
31
31
  return input_pos == input.length
32
32
  else
data/lib/hoozuki.rb CHANGED
@@ -7,37 +7,36 @@ require_relative 'hoozuki/parser'
7
7
  require_relative 'hoozuki/version'
8
8
  require_relative 'hoozuki/vm'
9
9
 
10
- class Hoozuki
11
- def initialize(input, engine: :dfa)
12
- @input = input
13
- @engine = engine
10
+ module Hoozuki
11
+ module_function
14
12
 
15
- ast = Hoozuki::Parser.new(input).parse
13
+ def compile(input, engine: :dfa)
14
+ ast = Parser.new.parse(input)
16
15
  case engine
17
16
  when :dfa
18
- nfa = Automaton::NFA.new_from_node(ast, Automaton::StateID.new(0))
19
- @dfa = Automaton::DFA.from_nfa(nfa, use_cache?(input))
17
+ nfa = Automaton::NFA.from_node(ast, Automaton::StateID.new(0))
18
+ Automaton::DFA.from_nfa(nfa, use_cache?(input))
20
19
  when :vm
21
20
  compiler = VM::Compiler.new
22
21
  compiler.compile(ast)
23
- @bytecode = compiler.instructions
22
+ compiler.instructions
23
+ else
24
+ raise ArgumentError, "Unknown engine: #{engine}"
24
25
  end
25
26
  end
26
27
 
27
- def match?(input)
28
- case @engine
28
+ def match?(pattern, input, engine: :dfa)
29
+ compiled = compile(pattern, engine: engine)
30
+ case engine
29
31
  when :dfa
30
- @dfa.match?(input, use_cache?(input))
32
+ compiled.match?(input, use_cache?(input))
31
33
  when :vm
32
- VM::Evaluator.evaluate(@bytecode, input, 0, 0)
33
- else
34
- raise ArgumentError, "Unknown engine: #{@engine}"
34
+ VM::Evaluator.evaluate(compiled, input, 0, 0)
35
35
  end
36
36
  end
37
37
 
38
- private
39
-
40
38
  def use_cache?(input)
41
39
  input.length > 1000
42
40
  end
41
+ private_class_method :use_cache?
43
42
  end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
RSpec.describe Hoozuki::Automaton::DFA::Builder do
  describe '#call' do
    let(:state) { Hoozuki::Automaton::StateID.new(0) }

    # Converts +node+ to an NFA and runs the builder over it, passing false as
    # the builder's second argument exactly as every example here does.
    def build_dfa(node)
      nfa = Hoozuki::Automaton::NFA.from_node(node, state)
      described_class.new(nfa, false).call
    end

    # Parses +pattern+ with the real parser and builds the DFA for it.
    def dfa_for(pattern)
      build_dfa(Hoozuki::Parser.new.parse(pattern))
    end

    context 'with simple NFA' do
      it 'builds a DFA' do
        dfa = build_dfa(Hoozuki::Node::Literal.new('a'))

        expect(dfa).to be_a(Hoozuki::Automaton::DFA)
        expect(dfa.start).to be_a(Integer)
        expect(dfa.accept).to be_a(Set)
        expect(dfa.transitions).to be_a(Set)
      end
    end

    context 'with NFA containing epsilon transitions' do
      it 'eliminates epsilon transitions' do
        dfa = dfa_for('a?')

        # A nil label marks an epsilon edge; none may survive the conversion.
        unlabeled = dfa.transitions.select { |_from, label, _to| label.nil? }
        expect(unlabeled).to be_empty
      end
    end

    context 'with alternation' do
      it 'creates correct number of states' do
        expect(dfa_for('a|b').transitions.size).to be >= 2
      end
    end

    context 'with repetition' do
      it 'handles loops correctly' do
        dfa = dfa_for('a*')

        # `a*` accepts the empty string, so the start state must be accepting.
        expect(dfa.accept).to include(dfa.start)
      end
    end

    context 'with concatenation' do
      it 'builds sequential transitions' do
        expect(dfa_for('abc').transitions.size).to be >= 3
      end
    end

    context 'with complex pattern' do
      it 'builds correct DFA structure' do
        dfa = dfa_for('(a|b)*c')

        expect(dfa).to be_a(Hoozuki::Automaton::DFA)
        expect(dfa.accept).not_to be_empty
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
RSpec.describe Hoozuki::Automaton::DFA do
  let(:state) { Hoozuki::Automaton::StateID.new(0) }

  # Builds a Literal node for a single character.
  def literal(char)
    Hoozuki::Node::Literal.new(char)
  end

  # Converts +node+ to an NFA and then to a DFA, passing false as the second
  # argument of from_nfa exactly as every example here does.
  def dfa_from(node)
    nfa = Hoozuki::Automaton::NFA.from_node(node, state)
    described_class.from_nfa(nfa, false)
  end

  # Parses +pattern+ with the real parser and converts the result to a DFA.
  def dfa_for(pattern)
    dfa_from(Hoozuki::Parser.new.parse(pattern))
  end

  describe '.from_nfa' do
    it 'converts simple NFA to DFA' do
      dfa = dfa_from(literal('a'))

      expect(dfa).to be_a(described_class)
      expect(dfa.start).to be_a(Integer)
      expect(dfa.accept).to be_a(Set)
      expect(dfa.transitions).not_to be_empty
    end

    it 'converts choice NFA to DFA' do
      dfa = dfa_from(Hoozuki::Node::Choice.new([literal('a'), literal('b')]))

      expect(dfa.start).to be_a(Integer)
      labels = dfa.transitions.map { |_from, label, _to| label }
      expect(labels).to include('a', 'b')
    end

    it 'converts concatenation NFA to DFA' do
      dfa = dfa_from(Hoozuki::Node::Concatenation.new([literal('a'), literal('b')]))

      expect(dfa.start).to be_a(Integer)
      expect(dfa.accept).not_to be_empty
    end

    it 'handles alternation patterns' do
      expect(dfa_for('a|b').transitions.size).to be >= 2
    end

    it 'handles repetition patterns' do
      dfa = dfa_for('a*')

      # `a*` accepts the empty string, so the start state must be accepting.
      expect(dfa.accept).to include(dfa.start)
    end
  end

  describe '#match?' do
    it 'matches using DFA for single literal' do
      dfa = dfa_from(literal('a'))

      expect(dfa.match?('a', false)).to be true
      expect(dfa.match?('b', false)).to be false
    end

    it 'matches choice pattern using DFA' do
      dfa = dfa_from(Hoozuki::Node::Choice.new([literal('a'), literal('b')]))

      expect(dfa.match?('a', false)).to be true
      expect(dfa.match?('b', false)).to be true
      expect(dfa.match?('c', false)).to be false
    end

    it 'matches concatenation pattern using DFA' do
      dfa = dfa_from(Hoozuki::Node::Concatenation.new([literal('a'), literal('b')]))

      expect(dfa.match?('ab', false)).to be true
      expect(dfa.match?('a', false)).to be false
      expect(dfa.match?('abc', false)).to be false
    end

    context 'with simple literal' do
      it 'matches exact string' do
        expect(dfa_for('abc').match?('abc', false)).to be true
      end

      it 'does not match different string' do
        expect(dfa_for('abc').match?('abd', false)).to be false
      end
    end

    context 'with alternation' do
      it 'matches either branch' do
        dfa = dfa_for('a|b')

        expect(dfa.match?('a', false)).to be true
        expect(dfa.match?('b', false)).to be true
        expect(dfa.match?('c', false)).to be false
      end
    end
  end

  describe '#next_transition' do
    it 'finds correct next state' do
      dfa = dfa_for('a')

      expect(dfa.next_transition(dfa.start, 'a', false)).not_to be_nil
    end

    it 'returns nil for invalid transition' do
      dfa = dfa_for('a')

      expect(dfa.next_transition(dfa.start, 'b', false)).to be_nil
    end
  end
end