rbahocorasick 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rbahocorasick.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Changli Gao
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # RBAhoCorasick
2
+
3
+ A Ruby implementation of [the Aho-Corasick string matching algorithm](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'rbahocorasick'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install rbahocorasick
18
+
19
+ ## Usage
20
+
21
+ ````ruby
22
+ require 'rubygems'
23
+ require 'rbahocorasick'
24
+
25
+ nfa = RBAhoCorasick::NFA.new
26
+ %w{he she his hers}.each{|key| nfa.add(key)}
27
+ nfa.finalize
28
+ nfa.match('he and she are friends').each{|m| puts m.key}
29
+ ````
30
+
31
+ You can also use `RBAhoCorasick::DFA` in place of `RBAhoCorasick::NFA` for better performance. See `test/tc_nfa.rb`
32
+ for more examples.
33
+
34
+ ## Contributing
35
+
36
+ 1. Fork it
37
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
38
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
39
+ 4. Push to the branch (`git push origin my-new-feature`)
40
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+
2
+ require "bundler/gem_tasks"
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << 'test'
7
+ t.test_files = FileList['test/**/*.rb']
8
+ end
9
+
10
+ task :default => :build
@@ -0,0 +1,5 @@
1
+
2
+ require 'rbahocorasick/dfa'
3
+ require 'rbahocorasick/nfa'
4
+ require 'rbahocorasick/state'
5
+ require 'rbahocorasick/version'
@@ -0,0 +1,26 @@
1
+
2
+ require 'rbahocorasick/nfa'
3
+
4
module RBAhoCorasick
  # Variant of NFA in which, once finalized, every state reached from the
  # initial state carries an explicit transition for each of the 256 byte
  # values, so matching never has to walk the `default` (failure) links.
  class DFA < RBAhoCorasick::NFA
    # Build the NFA first (goto transitions plus `default` links), then fill
    # in every missing transition of each state with the transition its
    # `default` state has for the same byte.
    #
    # States are visited breadth-first starting from the children of the
    # initial state, so the `default` state that is consulted has already
    # been completed (or is the initial state, which `super` completes).
    def finalize
      super

      # Children of the initial state; self-loops back to it are skipped.
      pending = (0..255).map { |byte| @initial_state[byte] }
                        .reject { |child| child == @initial_state }

      until pending.empty?
        current = pending.shift
        (0..255).each do |byte|
          child = current[byte]
          if child
            pending << child
          else
            current[byte] = current.default[byte]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+
2
+ require 'rbahocorasick/state'
3
+
4
module RBAhoCorasick
  # One match reported by NFA#match: the matched key, the value that was
  # registered with it, and the byte offset at which the key starts in the
  # scanned data.
  MatchData = Struct.new(:key, :value, :offset)

  # Aho-Corasick automaton with goto transitions and failure (`default`)
  # links. Keys are processed byte by byte.
  class NFA
    attr_reader :initial_state, :states_by_state_id

    def initialize
      @finalized = false
      @least_unused_state_id = 0
      @states_by_state_id = []
      @initial_state = create_state
    end

    # Add a pair of key and value.
    #
    # @param key [String] pattern to search for; walked with `each_byte`
    # @param value [Object, nil] payload reported back with each match
    # @raise [RuntimeError] if the automaton has already been finalized
    def add(key, value = nil)
      raise 'finalized' if @finalized

      state = @initial_state
      key.each_byte do |byte|
        # Follow the existing transition, or grow the trie by one state.
        state = state[byte] || (state[byte] = create_state)
      end
      state.add(key, value)
    end

    # Finalize the state machine: make the initial state total over all 256
    # byte values and compute the `default` (failure) link and merged output
    # of every other state, breadth-first.
    #
    # @raise [RuntimeError] if called more than once
    def finalize
      raise 'finalize twice' if @finalized

      @finalized = true
      queue = []
      (0..255).each do |byte|
        child = @initial_state[byte]
        if child
          child.default = @initial_state
          queue << child
        else
          # Unknown bytes loop back to the initial state.
          @initial_state[byte] = @initial_state
        end
      end

      until queue.empty?
        parent = queue.shift
        (0..255).each do |byte|
          child = parent[byte]
          next unless child

          queue << child
          # Walk the failure chain until some state can consume `byte`; the
          # initial state always can, so this terminates.
          fallback = parent.default
          fallback = fallback.default until fallback[byte]
          child.default = fallback[byte]
          # A state also reports everything its failure state reports.
          child.data.concat(child.default.data)
        end
      end
    end

    # Match the data, finalizing the automaton first if that has not been
    # done yet.
    #
    # The data is scanned with `each_byte`, so both `length` and the
    # reported offsets are measured in bytes, not characters.
    #
    # @param data [String] text to scan
    # @param length [Integer, nil] maximum number of bytes of `data` to
    #   scan; defaults to the whole string
    # @yield [match_data] each MatchData as soon as it is found
    # @return [Array<MatchData>, nil] all matches when no block is given,
    #   otherwise nil
    def match(data, length = nil)
      finalize unless @finalized
      # bytesize, not length: the loop below counts bytes, and a character
      # count would cut the scan short on multibyte input.
      length ||= data.bytesize
      matches = [] unless block_given?
      offset = 0
      state = @initial_state
      data.each_byte do |byte|
        break if offset >= length

        offset += 1
        state = state.default until state[byte]
        state = state[byte]
        state.data.each do |state_data|
          # `offset` is the byte position just past the match, so subtract
          # the key's size in bytes to get its start.
          match_data = RBAhoCorasick::MatchData.new(
            state_data.key, state_data.value, offset - state_data.key.bytesize
          )
          if block_given?
            yield match_data
          else
            matches << match_data
          end
        end
      end
      matches
    end

    private

    # Allocate a new state with the next free id and register it in
    # `states_by_state_id`.
    def create_state
      state = RBAhoCorasick::State.new(@least_unused_state_id)
      @least_unused_state_id += 1
      @states_by_state_id[state.state_id] = state
      state
    end
  end
end
@@ -0,0 +1,32 @@
1
+
2
module RBAhoCorasick
  # A key/value pair attached to the state at which the key ends.
  StateData = Struct.new(:key, :value)

  # One state of the automaton: a transition table indexed by input, the
  # list of StateData reported at this state, and a `default` state link.
  class State
    attr_accessor :data, :default
    attr_reader :state_id

    # @param state_id [Integer] identifier used for equality comparison
    def initialize(state_id)
      @transition = []
      @data = []
      @state_id = state_id
    end

    # Record that `key` (with its `value`) is reported at this state.
    def add(key, value)
      @data << RBAhoCorasick::StateData.new(key, value)
    end

    # Two states are equal when their ids are equal. Objects that do not
    # expose `state_id` (nil, strings, ...) simply compare unequal instead
    # of raising NoMethodError.
    def ==(other)
      other.respond_to?(:state_id) && @state_id == other.state_id
    end

    # @return [State, nil] the state reached on `input`, or nil if there is
    #   no transition for it
    def [](input)
      @transition[input]
    end

    # Set the transition for `input`.
    #
    # @raise [RuntimeError] if a transition for `input` already exists
    def []=(input, next_state)
      raise 'transition exists' if @transition[input]

      @transition[input] = next_state
    end
  end
end
@@ -0,0 +1,4 @@
1
+
2
module RBAhoCorasick
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'rbahocorasick/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "rbahocorasick"
8
+ gem.version = RBAhoCorasick::VERSION
9
+ gem.authors = ["Changli Gao"]
10
+ gem.email = ["xiaosuo@gmail.com"]
11
+ gem.description = %q{A Ruby implementation of the Aho-Corasick string matching algorithm}
12
+ gem.summary = %q{A Ruby implementation of the Aho-Corasick string matching algorithm}
13
+ gem.homepage = "https://github.com/xiaosuo/rbahocorasick"
14
+ gem.files = `git ls-files`.split($/)
15
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
+ gem.require_paths = ["lib"]
18
+ gem.license = 'MIT'
19
+ gem.add_development_dependency 'rake'
20
+ end
data/test/tc_dfa.rb ADDED
@@ -0,0 +1,31 @@
1
+
2
+ require 'test/unit'
3
+ require 'rbahocorasick'
4
+
5
+ class TC_DFA < Test::Unit::TestCase
6
+ # verify the state machine with the original paper
7
+ def test_paper
8
+ dfa = RBAhoCorasick::DFA.new
9
+ %w{he she his hers}.each{|key| dfa.add(key)}
10
+ dfa.finalize
11
+ delta = {
12
+ 0 => {?h.ord => 1, ?s.ord => 3},
13
+ 1 => {?e.ord => 2, ?i.ord => 6, ?h.ord => 1, ?s.ord => 3},
14
+ 3 => {?h.ord => 4, ?s.ord => 3},
15
+ 7 => {?h.ord => 4, ?s.ord => 3},
16
+ 9 => {?h.ord => 4, ?s.ord => 3},
17
+ 5 => {?r.ord => 8, ?h.ord => 1, ?s.ord => 3},
18
+ 2 => {?r.ord => 8, ?h.ord => 1, ?s.ord => 3},
19
+ 6 => {?s.ord => 7, ?h.ord => 1},
20
+ 4 => {?e.ord => 5, ?i.ord => 6, ?h.ord=> 1, ?s.ord => 3},
21
+ 8 => {?s.ord => 9, ?h.ord => 1}
22
+ }
23
+ (0..9).each do |i|
24
+ state = dfa.states_by_state_id[i]
25
+ (0..255).each do |byte|
26
+ expect = (delta[i][byte] || 0)
27
+ assert_equal(expect, state[byte].state_id)
28
+ end
29
+ end
30
+ end
31
+ end
data/test/tc_nfa.rb ADDED
@@ -0,0 +1,103 @@
1
+
2
+ require 'test/unit'
3
+ require 'rbahocorasick'
4
+
5
+ class TC_NFA < Test::Unit::TestCase
6
+ def setup
7
+ @nfa = RBAhoCorasick::NFA.new
8
+ %w{he she his hers}.each{|key| @nfa.add(key)}
9
+ end
10
+
11
+ # verify the state machine with the original paper
12
+ def test_paper
13
+ @nfa.finalize
14
+ assert_equal(10, @nfa.states_by_state_id.length)
15
+ assert_equal(0, @nfa.initial_state.state_id)
16
+ goto = [
17
+ {?h.ord => 1, ?s.ord => 3},
18
+ {?e.ord => 2, ?i.ord => 6},
19
+ {?r.ord => 8},
20
+ {?h.ord => 4},
21
+ {?e.ord => 5},
22
+ {},
23
+ {?s.ord => 7},
24
+ {},
25
+ {?s.ord => 9},
26
+ {}
27
+ ]
28
+ (0...10).each{|i| assert_equal(i, @nfa.states_by_state_id[i].state_id)}
29
+ f = [0, 0, 0, 0, 1, 2, 0, 3, 0, 3]
30
+ output = {2 => ['he'], 5 => ['she', 'he'], 7 => ['his'], 9 => ['hers']}
31
+ (0..9).each do |i|
32
+ state = @nfa.states_by_state_id[i]
33
+ # goto function
34
+ (0..255).each do |byte|
35
+ if goto[i][byte]
36
+ assert_equal(goto[i][byte], state[byte].state_id)
37
+ elsif i == 0
38
+ assert_equal(0, state[byte].state_id)
39
+ else
40
+ assert_equal(nil, state[byte])
41
+ end
42
+ end
43
+ # failure function
44
+ if i == 0
45
+ assert_equal(nil, state.default)
46
+ else
47
+ assert_equal(f[i], state.default.state_id)
48
+ end
49
+ # output function
50
+ if output[i]
51
+ assert_equal(output[i].length, state.data.length)
52
+ keys = state.data.map{|data| data.key}
53
+ output[i].each{|o| assert(keys.include?(o))}
54
+ else
55
+ assert_equal(0, @nfa.states_by_state_id[i].data.length)
56
+ end
57
+ end
58
+ end
59
+
60
+ # finalize should be optional
61
+ def test_finalize
62
+ matches = @nfa.match('he and she are friends')
63
+ assert_equal(3, matches.length)
64
+ assert_equal('he', matches[0].key)
65
+ assert_equal(0, matches[0].offset)
66
+ assert_equal('she', matches[1].key)
67
+ assert_equal(7, matches[1].offset)
68
+ assert_equal('he', matches[2].key)
69
+ assert_equal(8, matches[2].offset)
70
+ assert_raise(RuntimeError){@nfa.finalize}
71
+ assert_raise(RuntimeError){@nfa.add('test')}
72
+ end
73
+
74
+ def test_match
75
+ str = 'he and she are friends'
76
+ # with length
77
+ matches = @nfa.match(str, 9)
78
+ assert_equal(1, matches.length)
79
+ assert_equal('he', matches[0].key)
80
+ assert_equal(0, matches[0].offset)
81
+ # with block
82
+ matches = []
83
+ @nfa.match(str) do |match_data|
84
+ matches << match_data.key
85
+ end
86
+ assert_equal(%w{he she he}, matches)
87
+ # block return
88
+ matches = []
89
+ @nfa.match(str) do |match_data|
90
+ matches << match_data.key
91
+ break if match_data.key == 'she'
92
+ end
93
+ assert_equal(%w{he she}, matches)
94
+ end
95
+
96
+ def test_value_retrieve
97
+ @nfa.add('key', 'value')
98
+ matches = @nfa.match('she has his key')
99
+ assert_equal(%w{she he his key}, matches.map{|m| m.key})
100
+ (0...3).each{|i| assert_equal(nil, matches[i].value)}
101
+ assert_equal('value', matches[3].value)
102
+ end
103
+ end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbahocorasick
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Changli Gao
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-12-14 00:00:00 +08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rake
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: A Ruby implementation of the Aho-Corasick string matching algorithm
36
+ email:
37
+ - xiaosuo@gmail.com
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - Gemfile
47
+ - LICENSE.txt
48
+ - README.md
49
+ - Rakefile
50
+ - lib/rbahocorasick.rb
51
+ - lib/rbahocorasick/dfa.rb
52
+ - lib/rbahocorasick/nfa.rb
53
+ - lib/rbahocorasick/state.rb
54
+ - lib/rbahocorasick/version.rb
55
+ - rbahocorasick.gemspec
56
+ - test/tc_dfa.rb
57
+ - test/tc_nfa.rb
58
+ has_rdoc: true
59
+ homepage: https://github.com/xiaosuo/rbahocorasick
60
+ licenses:
61
+ - MIT
62
+ post_install_message:
63
+ rdoc_options: []
64
+
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ hash: 3
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ hash: 3
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ requirements: []
86
+
87
+ rubyforge_project:
88
+ rubygems_version: 1.3.7
89
+ signing_key:
90
+ specification_version: 3
91
+ summary: A Ruby implementation of the Aho-Corasick string matching algorithm
92
+ test_files:
93
+ - test/tc_dfa.rb
94
+ - test/tc_nfa.rb