hoozuki 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/README.md +18 -5
- data/lib/hoozuki/instruction/char.rb +13 -0
- data/lib/hoozuki/instruction/jmp.rb +13 -0
- data/lib/hoozuki/instruction/match.rb +8 -0
- data/lib/hoozuki/instruction/split.rb +14 -0
- data/lib/hoozuki/instruction.rb +6 -0
- data/lib/hoozuki/version.rb +1 -1
- data/lib/hoozuki/vm/compiler.rb +84 -0
- data/lib/hoozuki/vm/evaluator.rb +39 -0
- data/lib/hoozuki/vm.rb +4 -0
- data/lib/hoozuki.rb +13 -5
- data/spec/hoozuki_spec.rb +12 -2
- metadata +9 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ff80e01946d63faf2012a01c6f7d2c04f0330b47aa63ba2a0f540c9e6cf290fc
|
|
4
|
+
data.tar.gz: 515b27f972f50c5f4c35cf6d33422637f93cdedc7db54ffdbd7a3f363a10c18f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d26025dabd381db4ec2dc42ee4baa626e5389fd4ce7ade66f6e5a6d42879c929e2529e65413c919b8119e0127027986243a3be8a5bc0c702cbc52c57c276fee3
|
|
7
|
+
data.tar.gz: edff1a212c89c9f322517649ef6b8daf27bbaa41682e312313b064e97002d3a0c395e2bc3c9a82c657ea8e38670e91f31616923f5d026776a5d81bac0892a3ec
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
-
# Hoozuki (鬼灯)
|
|
1
|
+
# Hoozuki (鬼灯) [Gem Version](https://badge.fury.io/rb/hoozuki) [CI](https://github.com/ydah/hoozuki/actions/workflows/ci.yml)
|
|
2
2
|
|
|
3
3
|
A hobby regex engine written in Ruby. Designed to be simple and efficient for educational purposes.
|
|
4
|
+
Currently supports 2 engines:
|
|
5
|
+
- DFA Based Engine (built from an NFA; the default, selected with `engine: :dfa`)
|
|
6
|
+
- VM Based Engine
|
|
4
7
|
|
|
5
8
|
## Installation
|
|
6
9
|
|
|
@@ -12,10 +15,20 @@ gem install hoozuki
|
|
|
12
15
|
|
|
13
16
|
```ruby
|
|
14
17
|
require 'hoozuki'
|
|
15
|
-
regex = Hoozuki
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
regex = Hoozuki.new('a(bc|de)*f') # Or Hoozuki.new('a(bc|de)*f', engine: :dfa) to select the default automaton-based engine explicitly
|
|
19
|
+
regex.match?('abcdef') # => true
|
|
20
|
+
regex.match?('adef') # => true
|
|
21
|
+
regex.match?('xyz') # => false
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
If you want to use the VM based engine:
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
require 'hoozuki'
|
|
28
|
+
regex = Hoozuki.new('a(bc|de)*f', engine: :vm)
|
|
29
|
+
regex.match?('abcdef') # => true
|
|
30
|
+
regex.match?('adef') # => true
|
|
31
|
+
regex.match?('xyz') # => false
|
|
19
32
|
```
|
|
20
33
|
|
|
21
34
|
## License
|
data/lib/hoozuki/version.rb
CHANGED
data/lib/hoozuki/vm/compiler.rb
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class Hoozuki
|
|
4
|
+
module VM
|
|
5
|
+
class Compiler
|
|
6
|
+
attr_reader :instructions
|
|
7
|
+
|
|
8
|
+
def initialize
|
|
9
|
+
@pc = 0
|
|
10
|
+
@instructions = []
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def compile(ast)
|
|
14
|
+
_compile(ast)
|
|
15
|
+
@pc += 1
|
|
16
|
+
@instructions << Instruction::Match.new
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def _compile(ast)
|
|
22
|
+
case ast
|
|
23
|
+
when Hoozuki::Node::Literal
|
|
24
|
+
emit(Hoozuki::Instruction::Char.new(ast.value))
|
|
25
|
+
when Hoozuki::Node::Epsilon
|
|
26
|
+
# Do nothing for epsilon
|
|
27
|
+
when Node::Repetition
|
|
28
|
+
if ast.zero_or_more?
|
|
29
|
+
split = @pc
|
|
30
|
+
emit(Hoozuki::Instruction::Split.new(@pc + 1, 0))
|
|
31
|
+
_compile(ast.child)
|
|
32
|
+
emit(Hoozuki::Instruction::Jmp.new(split))
|
|
33
|
+
patch(split, Hoozuki::Instruction::Split.new(split + 1, @pc))
|
|
34
|
+
elsif ast.one_or_more?
|
|
35
|
+
start = @pc
|
|
36
|
+
_compile(ast.child)
|
|
37
|
+
emit(Hoozuki::Instruction::Split.new(start, @pc + 1))
|
|
38
|
+
elsif ast.optional?
|
|
39
|
+
split = @pc
|
|
40
|
+
emit(Hoozuki::Instruction::Split.new(0, 0))
|
|
41
|
+
start = @pc
|
|
42
|
+
_compile(ast.child)
|
|
43
|
+
last = @pc
|
|
44
|
+
patch(split, Hoozuki::Instruction::Split.new(start, last))
|
|
45
|
+
end
|
|
46
|
+
when Node::Choice
|
|
47
|
+
split = @pc
|
|
48
|
+
@pc += 1
|
|
49
|
+
@instructions << Hoozuki::Instruction::Split.new(@pc, 0)
|
|
50
|
+
_compile(ast.children.first)
|
|
51
|
+
jump = @pc
|
|
52
|
+
emit(Hoozuki::Instruction::Jmp.new(0))
|
|
53
|
+
|
|
54
|
+
if @instructions[split].is_a?(Hoozuki::Instruction::Split)
|
|
55
|
+
@instructions[split].right = @pc
|
|
56
|
+
else
|
|
57
|
+
raise "Instruction at pc #{split} is not a Split"
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
_compile(ast.children.last)
|
|
61
|
+
|
|
62
|
+
if @instructions[jump].is_a?(Hoozuki::Instruction::Jmp)
|
|
63
|
+
@instructions[jump].target = @pc
|
|
64
|
+
else
|
|
65
|
+
raise "Instruction at pc #{jump} is not a Jmp"
|
|
66
|
+
end
|
|
67
|
+
when Node::Concatenation
|
|
68
|
+
ast.children.each do |child|
|
|
69
|
+
_compile(child)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def emit(instruction)
|
|
75
|
+
@instructions << instruction
|
|
76
|
+
@pc += 1
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def patch(pc, instruction)
|
|
80
|
+
@instructions[pc] = instruction
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
data/lib/hoozuki/vm/evaluator.rb
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class Hoozuki
|
|
4
|
+
module VM
|
|
5
|
+
class Evaluator
|
|
6
|
+
class << self
|
|
7
|
+
def evaluate(instructions, input, input_pos = 0, pc = 0)
|
|
8
|
+
new._evaluate(instructions, input, input_pos, pc)
|
|
9
|
+
end
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def _evaluate(instructions, input, input_pos, pc)
|
|
13
|
+
loop do
|
|
14
|
+
return false if pc >= instructions.size
|
|
15
|
+
|
|
16
|
+
inst = instructions[pc]
|
|
17
|
+
case inst
|
|
18
|
+
when Hoozuki::Instruction::Char
|
|
19
|
+
return false if input_pos >= input.size || input[input_pos] != inst.char
|
|
20
|
+
input_pos += 1
|
|
21
|
+
pc += 1
|
|
22
|
+
when Hoozuki::Instruction::Jmp
|
|
23
|
+
pc = inst.target
|
|
24
|
+
when Hoozuki::Instruction::Split
|
|
25
|
+
if _evaluate(instructions, input, input_pos, inst.left)
|
|
26
|
+
return true
|
|
27
|
+
else
|
|
28
|
+
pc = inst.right
|
|
29
|
+
end
|
|
30
|
+
when Hoozuki::Instruction::Match
|
|
31
|
+
return input_pos == input.length
|
|
32
|
+
else
|
|
33
|
+
raise "Unknown instruction: #{inst.class}"
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
data/lib/hoozuki/vm.rb
ADDED
data/lib/hoozuki.rb
CHANGED
|
@@ -1,29 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'hoozuki/automaton'
|
|
4
|
+
require_relative 'hoozuki/instruction'
|
|
4
5
|
require_relative 'hoozuki/node'
|
|
5
6
|
require_relative 'hoozuki/parser'
|
|
6
7
|
require_relative 'hoozuki/version'
|
|
8
|
+
require_relative 'hoozuki/vm'
|
|
7
9
|
|
|
8
10
|
class Hoozuki
|
|
9
|
-
def initialize(input,
|
|
11
|
+
def initialize(input, engine: :dfa)
|
|
10
12
|
@input = input
|
|
11
|
-
@
|
|
13
|
+
@engine = engine
|
|
12
14
|
|
|
13
15
|
ast = Hoozuki::Parser.new(input).parse
|
|
14
|
-
case
|
|
16
|
+
case engine
|
|
15
17
|
when :dfa
|
|
16
18
|
nfa = Automaton::NFA.new_from_node(ast, Automaton::StateID.new(0))
|
|
17
19
|
@dfa = Automaton::DFA.from_nfa(nfa, use_cache?(input))
|
|
20
|
+
when :vm
|
|
21
|
+
compiler = VM::Compiler.new
|
|
22
|
+
compiler.compile(ast)
|
|
23
|
+
@bytecode = compiler.instructions
|
|
18
24
|
end
|
|
19
25
|
end
|
|
20
26
|
|
|
21
27
|
def match?(input)
|
|
22
|
-
case @
|
|
28
|
+
case @engine
|
|
23
29
|
when :dfa
|
|
24
30
|
@dfa.match?(input, use_cache?(input))
|
|
31
|
+
when :vm
|
|
32
|
+
VM::Evaluator.evaluate(@bytecode, input, 0, 0)
|
|
25
33
|
else
|
|
26
|
-
raise ArgumentError, "Unknown
|
|
34
|
+
raise ArgumentError, "Unknown engine: #{@engine}"
|
|
27
35
|
end
|
|
28
36
|
end
|
|
29
37
|
|
data/spec/hoozuki_spec.rb
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
RSpec.describe Hoozuki do
|
|
4
|
-
|
|
5
|
-
subject { described_class.new(pattern).match?(value) }
|
|
4
|
+
shared_examples 'regex matching behavior' do |mode|
|
|
5
|
+
subject { described_class.new(pattern, engine: mode).match?(value) }
|
|
6
6
|
|
|
7
7
|
context 'with basic concatenation' do
|
|
8
8
|
let(:pattern) { 'abc' }
|
|
@@ -208,4 +208,14 @@ RSpec.describe Hoozuki do
|
|
|
208
208
|
end
|
|
209
209
|
end
|
|
210
210
|
end
|
|
211
|
+
|
|
212
|
+
describe '#match?' do
|
|
213
|
+
context 'with :vm mode' do
|
|
214
|
+
include_examples 'regex matching behavior', :vm
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
context 'with :dfa mode' do
|
|
218
|
+
include_examples 'regex matching behavior', :dfa
|
|
219
|
+
end
|
|
220
|
+
end
|
|
211
221
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: hoozuki
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yudai Takada
|
|
@@ -28,6 +28,11 @@ files:
|
|
|
28
28
|
- lib/hoozuki/automaton/dfa.rb
|
|
29
29
|
- lib/hoozuki/automaton/nfa.rb
|
|
30
30
|
- lib/hoozuki/automaton/state_id.rb
|
|
31
|
+
- lib/hoozuki/instruction.rb
|
|
32
|
+
- lib/hoozuki/instruction/char.rb
|
|
33
|
+
- lib/hoozuki/instruction/jmp.rb
|
|
34
|
+
- lib/hoozuki/instruction/match.rb
|
|
35
|
+
- lib/hoozuki/instruction/split.rb
|
|
31
36
|
- lib/hoozuki/node.rb
|
|
32
37
|
- lib/hoozuki/node/choice.rb
|
|
33
38
|
- lib/hoozuki/node/concatenation.rb
|
|
@@ -36,6 +41,9 @@ files:
|
|
|
36
41
|
- lib/hoozuki/node/repetition.rb
|
|
37
42
|
- lib/hoozuki/parser.rb
|
|
38
43
|
- lib/hoozuki/version.rb
|
|
44
|
+
- lib/hoozuki/vm.rb
|
|
45
|
+
- lib/hoozuki/vm/compiler.rb
|
|
46
|
+
- lib/hoozuki/vm/evaluator.rb
|
|
39
47
|
- spec/hoozuki_spec.rb
|
|
40
48
|
- spec/spec_helper.rb
|
|
41
49
|
homepage: https://github.com/ydah/hoozuki
|