aho_corasick 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -0
- data/Gemfile.lock +14 -0
- data/README.md +24 -0
- data/Rakefile +79 -0
- data/lib/aho_corasick.rb +67 -0
- data/spec/aho_corasick_spec.rb +51 -0
- data/spec/spec_helper.rb +8 -0
- metadata +68 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/README.md
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Aho-Corasick string matching
|
2
|
+
============================
|
3
|
+
|
4
|
+
The [Aho-Corasick string matching algorithm](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) finds all occurrences of a set of terms within a string in time linear in the length of the string (plus the number of matches), regardless of how many terms are in the set. It does this by pre-building an NFA-like structure (a trie augmented with suffix links), which it uses to scan the string once for all the terms in the set to be matched against.
|
5
|
+
|
6
|
+
Our implementation is written in pure Ruby, and is used like so:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
require 'aho_corasick'
|
10
|
+
matcher = AhoCorasick.new("woodchuck", "chuck", "could")
|
11
|
+
matcher.match("How much wood would a woodchuck chuck if a woodchuck could chuck wood.")
|
12
|
+
#=> ["woodchuck", "chuck", "woodchuck", "could", "chuck"]
|
13
|
+
```
|
14
|
+
|
15
|
+
You can insert additional terms into the matcher after instantiation; however, each call to insert takes time linear in the total number of terms to be matched against. To mitigate this, pass multiple terms in a single call to insert:
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
matcher.insert("would", "wood")
|
19
|
+
matcher.match("How much wood would a woodchuck chuck if a woodchuck could chuck wood.")
|
20
|
+
#=> ["wood", "would", "wood", "woodchuck", "chuck", "wood", "woodchuck", "could", "chuck", "wood"]
|
21
|
+
```
|
22
|
+
|
23
|
+
To install: ``gem install aho_corasick`` or add ``gem "aho_corasick"`` to your Gemfile. We've tested it with Ruby 1.9.2, but there's nothing in there that should stop it working with any other Ruby. If you find any bugs, let us know. Feature requests and pull requests are also welcome!
|
24
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'bundler/setup'
require "rubygems/package_task"
require "rdoc/task"
require 'rake/testtask'

# `rake`, `rake test` and `rake spec` all run the spec suite.
task :default => :test
task :spec => :test

Rake::TestTask.new do |t|
  t.pattern = "spec/*_spec.rb"
end

# This builds the actual gem. For details of what all these options
# mean, and other ones you can add, check the documentation here:
#
#   http://rubygems.org/read/chapter/20
#
spec = Gem::Specification.new do |s|

  # Change these as appropriate
  s.name              = "aho_corasick"
  s.version           = "0.0.1"
  s.summary           = "The Aho-Corasick string-matching algorithm"
  s.author            = "Tim Cowlishaw"
  s.email             = "tim@timcowlishaw.co.uk"
  s.homepage          = "http://github.com/likely/aho_corasick"

  # NOTE: `s.has_rdoc = true` was dropped here. RubyGems deprecated the
  # setter as a no-op, so it only produced a deprecation warning.
  s.extra_rdoc_files  = %w(README.md)
  s.rdoc_options      = %w(--main README.md)

  # Add any extra files to include in the gem. Each top-level file is
  # listed exactly once.
  s.files             = %w(Gemfile Gemfile.lock Rakefile README.md) + Dir.glob("{spec,lib}/**/*")
  s.require_paths     = ["lib"]

  # If you want to depend on other gems, add them here, along with any
  # relevant versions
  #s.add_dependency("eventmachine")

  # If your tests use any gems, include them here
  # NOTE(review): the specs appear to use minitest-style expectations
  # (must_include, bench_performance_linear) -- confirm rspec is still needed.
  s.add_development_dependency("rspec")
end

# This task actually builds the gem. We also regenerate a static
# .gemspec file, which is useful if something (i.e. GitHub) will
# be automatically building a gem for this project. If you're not
# using GitHub, edit as appropriate.
#
# To publish your gem online, install the 'gemcutter' gem; Read more
# about that here: http://gemcutter.org/pages/gem_docs
Gem::PackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

desc "Build the gemspec file #{spec.name}.gemspec"
task :gemspec do
  file = File.join(File.dirname(__FILE__), "#{spec.name}.gemspec")
  File.open(file, "w") {|f| f << spec.to_ruby }
end

# If you don't want to generate the .gemspec file, just remove this line. Reasons
# why you might want to generate a gemspec:
#  - using bundler with a git source
#  - building the gem without rake (i.e. gem build blah.gemspec)
#  - maybe others?
task :package => :gemspec

# Generate documentation. The README shipped with the gem is README.md,
# so that is the file RDoc must be pointed at.
RDoc::Task.new do |rd|
  rd.main = "README.md"
  rd.rdoc_files.include("README.md", "lib/**/*.rb")
  rd.rdoc_dir = "rdoc"
end

desc 'Clear out RDoc and generated packages'
task :clean => [:clobber_rdoc, :clobber_package] do
  # rm_f so that cleaning does not fail when the gemspec was never generated.
  rm_f "#{spec.name}.gemspec"
end
|
79
|
+
|
data/lib/aho_corasick.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# Multi-term string matcher based on the Aho-Corasick algorithm.
#
# The terms are stored in a trie whose nodes carry "suffix links": each
# node points at the node spelling the longest proper suffix of its own
# path that is also a path in the trie. Matching walks the trie once over
# the input, following suffix links whenever the current path cannot be
# extended.
#
#   matcher = AhoCorasick.new("ab", "bc")
#   matcher.match("abc") #=> ["ab", "bc"]
class AhoCorasick
  # Builds a matcher for the given terms. Terms may be passed as separate
  # arguments or as (nested) arrays; empty terms are ignored.
  def initialize(*terms)
    @root = TreeNode.new
    unsafe_insert(terms)
    create_suffix_links
  end

  # Returns an Array with every occurrence of every term found in +string+,
  # ordered by the position at which the occurrence ends. When several terms
  # end at the same position the longest is listed first. A term occurring
  # more than once is reported once per occurrence.
  def match(string)
    matches = []
    node = @root
    string.each_char do |char|
      # find follows suffix links itself; nil means no path at all can be
      # extended with this character, so matching restarts from the root.
      node = node.find(char) || @root
      matches.concat(node.output)
    end
    matches
  end

  # Adds further terms to the matcher and returns the matcher. All suffix
  # links are rebuilt on every call, so prefer one call with many terms over
  # many calls with a single term.
  def insert(*terms)
    unsafe_insert(terms)
    create_suffix_links
    self
  end

  private

  # Adds the terms to the trie WITHOUT refreshing the suffix links; callers
  # must invoke create_suffix_links afterwards.
  def unsafe_insert(terms)
    terms.flatten.each do |term|
      # An empty term would be registered on the root and "match" everywhere.
      next if term.empty?
      leaf = term.each_char.inject(@root) { |node, char| node.child_for(char) }
      leaf.add_match(term)
    end
  end

  # (Re)computes the suffix link and the output list of every node by
  # walking the trie breadth-first. Breadth-first order guarantees that a
  # node's suffix target (which is always shallower) is complete before the
  # node itself is processed.
  def create_suffix_links
    queue = []
    @root.children.each_value do |node|
      # The only proper suffix of a single character is the empty string.
      node.suffix = @root
      node.output = node.matches.dup
      queue << node
    end

    until queue.empty?
      node = queue.shift
      node.children.each do |char, child|
        # Longest proper suffix of child's path: extend the parent's suffix
        # by +char+, falling back along the suffix chain as far as needed.
        child.suffix = node.suffix.find(char) || @root
        # Every term ending at the suffix node also ends here.
        child.output = child.matches + child.suffix.output
        queue << child
      end
    end
  end

  # A single trie node.
  #
  # matches:: terms spelled exactly by the path from the root to this node
  # output::  matches plus the terms of all nodes reachable via suffix links
  # suffix::  node for the longest proper suffix of this path (nil on root)
  class TreeNode
    def initialize(parent = nil)
      @parent = parent
      @suffix = nil
      @matches = []
      @output = []
      @children = {}
    end

    attr_reader :matches, :children, :parent
    attr_accessor :suffix, :output

    # Returns the node reached by consuming +char+ from this node, following
    # suffix links when there is no direct child, or nil when not even the
    # root has a child for +char+. Iterative so that long suffix chains
    # cannot exhaust the stack.
    def find(char)
      node = self
      while node
        child = node.children[char]
        return child if child
        node = node.suffix
      end
      nil
    end

    # Registers +str+ as a term ending at this node (at most once).
    def add_match(str)
      @matches << str unless @matches.include?(str)
    end

    # Returns the child for +char+, creating it if necessary.
    def child_for(char)
      @children[char] ||= TreeNode.new(self)
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
$: << File.dirname(__FILE__)
require 'set' # the expectations below rely on Enumerable#to_set
require 'spec_helper'

describe "AhoCorasick" do
  it "returns matched substrings" do
    a = AhoCorasick.new("ab")
    a.match("abcde").must_include("ab")
  end

  it "returns multiple matched substrings" do
    a = AhoCorasick.new("ab", "cd")
    a.match("cd123ab").to_set.must_equal ["ab", "cd"].to_set
  end

  it "returns overlapping matched substrings" do
    a = AhoCorasick.new("ab", "bc")
    a.match("abc").to_set.must_equal ["ab", "bc"].to_set
  end

  it "does not return unmatched substrings" do
    a = AhoCorasick.new("ab")
    a.match("abc").wont_include("bc")
  end

  it "matches adjacent terms" do
    a = AhoCorasick.new("ab", "cd")
    a.match("abcd").to_set.must_equal ["ab", "cd"].to_set
  end

  it "returns terms added to the matcher after instantiation" do
    a = AhoCorasick.new("ab")
    a.insert("cd", "ef")
    a.match("ab12cd12ef").to_set.must_equal ["ab", "cd", "ef"].to_set
  end

  describe "benchmarks" do

    before do
      # 1000 random upper-case terms of 1 to 5 letters. The length starts at
      # 1 because an empty term is not a meaningful search term.
      words = Array.new(1000) { Array.new(1 + rand(5)) { (65 + rand(26)).chr }.join }
      @matcher = AhoCorasick.new(*words)
    end

    bench_performance_linear "string matching" do |n|
      10.times do
        # Build the haystack with join rather than repeated String#+, so the
        # setup inside the timed block stays linear in n.
        string = Array.new(n) { (65 + rand(26)).chr }.join
        @matcher.match(string)
      end
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aho_corasick
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tim Cowlishaw
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-10-25 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &13072560 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *13072560
|
25
|
+
description:
|
26
|
+
email: tim@timcowlishaw.co.uk
|
27
|
+
executables: []
|
28
|
+
extensions: []
|
29
|
+
extra_rdoc_files:
|
30
|
+
- README.md
|
31
|
+
files:
|
32
|
+
- Gemfile
|
33
|
+
- Gemfile.lock
|
34
|
+
- Rakefile
|
35
|
+
- README.md
|
36
|
+
- spec/aho_corasick_spec.rb
|
37
|
+
- spec/spec_helper.rb
|
38
|
+
- lib/aho_corasick.rb
|
39
|
+
homepage: http://github.com/likely/aho_corasick
|
40
|
+
licenses: []
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options:
|
43
|
+
- --main
|
44
|
+
- README.md
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
hash: -4104333102682146250
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.8.10
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: The Aho-Corasick string-matching algorithm
|
68
|
+
test_files: []
|