aho_corasick 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# Fetch gems over TLS; a plain-http source lets gem downloads be tampered
# with in transit.
source "https://rubygems.org"

# Tooling needed to build, run and test the gem locally.
group :development, :test do
  gem "gem-this"
  gem "rake"
  gem "minitest"
  #gem "rspec"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,14 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ gem-this (0.3.7)
5
+ minitest (2.6.2)
6
+ rake (0.9.2)
7
+
8
+ PLATFORMS
9
+ ruby
10
+
11
+ DEPENDENCIES
12
+ gem-this
13
+ minitest
14
+ rake
data/README.md ADDED
@@ -0,0 +1,24 @@
1
+ Aho-Corasick string matching
2
+ ============================
3
+
4
+ The [Aho-Corasick string matching algorithm](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) finds every occurrence of each term in a set of terms within a given string, in time linear in the length of the string (plus the number of matches reported). It does this by pre-building an NFA-like structure (a trie of the terms with suffix links) which it uses to efficiently scan the string for all the terms in the set in a single pass.
5
+
6
+ Our implementation is written in pure Ruby, and is used like so:
7
+
8
+ ```ruby
9
+ require 'aho_corasick'
10
+ matcher = AhoCorasick.new("woodchuck", "chuck", "could")
11
+ matcher.match("How much wood would a woodchuck chuck if a woodchuck could chuck wood.")
12
+ #=> ["woodchuck", "chuck", "woodchuck", "could", "chuck"]
13
+ ```
14
+
15
+ You can insert additional terms into the matcher after instantiation; however, each call to insert takes linear time with respect to the total number of terms to be matched against. To mitigate this, pass multiple terms to a single call to insert:
16
+
17
+ ```ruby
18
+ matcher.insert("would", "wood")
19
+ matcher.match("How much wood would a woodchuck chuck if a woodchuck could chuck wood.")
20
+ #=> ["wood", "would", "wood", "woodchuck", "chuck", "wood", "woodchuck", "could", "chuck", "wood"]
21
+ ```
22
+
23
+ To install: ``gem install aho_corasick`` or add ``gem "aho_corasick"`` to your Gemfile. We've tested it with Ruby 1.9.2, but there's nothing in there that should stop it working with any other Ruby. If you find any bugs, let us know. Feature requests and pull requests are also welcome!
24
+
data/Rakefile ADDED
@@ -0,0 +1,79 @@
1
+ require 'bundler/setup'
2
+ require "rubygems/package_task"
3
+ require "rdoc/task"
4
+ require 'rake/testtask'
5
+
6
+ task :default => :test
7
+ task :spec => :test
8
+
9
+ Rake::TestTask.new do |t|
10
+ t.pattern = "spec/*_spec.rb"
11
+ end
12
+
13
+ # This builds the actual gem. For details of what all these options
14
+ # mean, and other ones you can add, check the documentation here:
15
+ #
16
+ # http://rubygems.org/read/chapter/20
17
+ #
18
# Specification used both to package the gem and to generate the static
# .gemspec file.
spec = Gem::Specification.new do |s|

  # Change these as appropriate
  s.name     = "aho_corasick"
  s.version  = "0.0.1"
  s.summary  = "The Aho-Corasick string-matching algorithm"
  s.author   = "Tim Cowlishaw"
  s.email    = "tim@timcowlishaw.co.uk"
  s.homepage = "http://github.com/likely/aho_corasick"

  # NOTE: `s.has_rdoc = true` was dropped; RubyGems deprecates the setter
  # and ignores its value.
  s.extra_rdoc_files = %w(README.md)
  s.rdoc_options     = %w(--main README.md)

  # Add any extra files to include in the gem (each listed exactly once)
  s.files         = %w(Gemfile Gemfile.lock Rakefile README.md) + Dir.glob("{spec,lib}/**/*")
  s.require_paths = ["lib"]

  # If you want to depend on other gems, add them here, along with any
  # relevant versions
  #s.add_dependency("eventmachine")

  # If your tests use any gems, include them here. The specs are written
  # with minitest/spec, not rspec.
  s.add_development_dependency("minitest")
end
43
+
44
+ # This task actually builds the gem. We also regenerate a static
45
+ # .gemspec file, which is useful if something (i.e. GitHub) will
46
+ # be automatically building a gem for this project. If you're not
47
+ # using GitHub, edit as appropriate.
48
+ #
49
+ # To publish your gem online, install the 'gemcutter' gem; Read more
50
+ # about that here: http://gemcutter.org/pages/gem_docs
51
+ Gem::PackageTask.new(spec) do |pkg|
52
+ pkg.gem_spec = spec
53
+ end
54
+
55
desc "Build the gemspec file #{spec.name}.gemspec"
task :gemspec do
  # Write the specification, serialised as Ruby source, next to this Rakefile.
  gemspec_path = File.join(File.dirname(__FILE__), "#{spec.name}.gemspec")
  File.open(gemspec_path, "w") { |io| io << spec.to_ruby }
end
60
+
61
+ # If you don't want to generate the .gemspec file, just remove this line. Reasons
62
+ # why you might want to generate a gemspec:
63
+ # - using bundler with a git source
64
+ # - building the gem without rake (i.e. gem build blah.gemspec)
65
+ # - maybe others?
66
+ task :package => :gemspec
67
+
68
+ # Generate documentation
69
RDoc::Task.new do |rd|
  # The README shipped with the gem is README.md (see the gem specification's
  # extra_rdoc_files / rdoc_options), so use the same file here.
  rd.main = "README.md"
  rd.rdoc_files.include("README.md", "lib/**/*.rb")
  rd.rdoc_dir = "rdoc"
end
74
+
75
desc 'Clear out RDoc and generated packages'
task :clean => [:clobber_rdoc, :clobber_package] do
  # rm_f rather than rm: the .gemspec only exists once it has been
  # generated, and cleaning an already-clean tree should not raise.
  rm_f "#{spec.name}.gemspec"
end
79
+
@@ -0,0 +1,67 @@
1
# Multi-term substring matcher based on the Aho-Corasick algorithm.
#
# Terms are stored in a trie. Each trie node carries two extra links, both
# rebuilt by #create_suffix_links:
#
# * +suffix+ - the node for the longest proper suffix of this node's path
#   that is also a path in the trie (the root when there is none). It is
#   followed when the next input character has no child edge.
# * +dictionary_suffix+ - the nearest node on the suffix chain that holds
#   at least one term, or nil. It lets #match report terms that end inside
#   a longer path (e.g. "chuck" inside "woodchuck").
class AhoCorasick
  # Builds a matcher for the given terms.
  #
  # @param terms [Array<String>] terms to search for
  def initialize(*terms)
    @root = TreeNode.new
    unsafe_insert(terms)
    create_suffix_links
  end

  # Scans +string+ once and reports every occurrence of every term,
  # including overlapping occurrences and terms contained in other terms.
  #
  # @param string [String] the text to search
  # @return [Array<String>] one entry per occurrence, ordered by the position
  #   at which the occurrence ends; occurrences ending at the same position
  #   are listed longest first
  def match(string)
    # An empty term is stored on the root node; it is reported once per call.
    matches = @root.matches.dup
    current = @root
    string.each_char do |char|
      # TreeNode#find walks the suffix chain; nil means no node in the chain
      # has an edge for this character, so scanning restarts at the root.
      current = current.find(char) || @root
      collect_matches(current, matches)
    end
    matches
  end

  # Adds terms to an existing matcher. All links are rebuilt afterwards, so
  # prefer one call with many terms over many calls with one term.
  #
  # @param terms [Array<String>] additional terms to search for
  def insert(*terms)
    unsafe_insert(terms)
    create_suffix_links
  end

  private

  # Adds the terms to the trie without refreshing the links; callers must
  # run #create_suffix_links before matching again.
  def unsafe_insert(terms)
    terms.each do |term|
      leaf = term.each_char.inject(@root) { |node, char| node.child_for(char) }
      leaf.add_match(term)
    end
  end

  # Appends to +matches+ every term that ends at +node+: the node's own
  # terms first, then those reachable through dictionary-suffix links.
  def collect_matches(node, matches)
    return if node.equal?(@root)
    while node
      matches.concat(node.matches)
      node = node.dictionary_suffix
    end
  end

  # Recomputes the suffix and dictionary-suffix link of every node.
  #
  # Nodes are visited breadth-first, so the links of all shallower nodes
  # (the only ones a node's own links can depend on) are already final.
  def create_suffix_links
    queue = @root.children.to_a
    until queue.empty?
      char, node = queue.shift
      parent = node.parent

      # Follow the parent's suffix chain until some node has an edge for
      # +char+; fall back to the root when none does.
      node.suffix =
        if parent.equal?(@root)
          @root
        else
          parent.suffix.find(char) || @root
        end

      link = node.suffix
      node.dictionary_suffix =
        if link.equal?(@root) || link.matches.empty?
          link.dictionary_suffix
        else
          link
        end

      queue.concat(node.children.to_a)
    end
  end

  # A single trie node.
  class TreeNode
    # @param parent [TreeNode, nil] the parent node; nil for the root
    def initialize(parent=nil)
      @parent = parent
      @suffix = nil
      @dictionary_suffix = nil
      @matches = []
      @children = {}
    end

    attr_reader :matches, :children, :parent
    attr_accessor :suffix, :dictionary_suffix

    # Returns the node reached by consuming +char+ from this node, trying
    # this node's children first and then each node along the suffix chain.
    # Returns nil when no node in the chain has an edge for +char+.
    def find(char)
      @children[char] || (suffix && suffix.find(char))
    end

    # Records +str+ as a term that ends at this node.
    def add_match(str)
      @matches << str
    end

    # Returns the child for +char+, creating it if it does not exist yet.
    def child_for(char)
      @children[char] ||= TreeNode.new(self)
    end

  end
end
@@ -0,0 +1,51 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'spec_helper'
3
+
4
+ describe "AhoCorasick" do
5
+ it "returns matched substrings" do
6
+ a = AhoCorasick.new("ab")
7
+ a.match("abcde").must_include("ab")
8
+ end
9
+
10
+ it "returns multiple matched substrings" do
11
+ a = AhoCorasick.new("ab", "cd")
12
+ a.match("cd123ab").to_set.must_equal ["ab", "cd"].to_set
13
+ end
14
+
15
+ it "returns overlapping matched substrings" do
16
+ a = AhoCorasick.new("ab", "bc")
17
+ a.match("abc").to_set.must_equal ["ab", "bc"].to_set
18
+ end
19
+
20
+ it "does not return unmatched substrings" do
21
+ a = AhoCorasick.new("ab")
22
+ a.match("abc").wont_include("bc")
23
+ end
24
+
25
+ it "matches adjacent terms" do
26
+ a = AhoCorasick.new("ab", "cd")
27
+ a.match("abcd").to_set.must_equal ["ab", "cd"].to_set
28
+ end
29
+
30
+ it "returns terms added to the matcher after instantiation" do
31
+ a = AhoCorasick.new("ab")
32
+ a.insert("cd", "ef")
33
+ a.match("ab12cd12ef").to_set.must_equal ["ab", "cd", "ef"].to_set
34
+ end
35
+
36
+ describe "benchmarks" do
37
+
38
+ before do
39
+ words = 1000.times.map { rand(6).times.inject("") {|s,_| s + (65 + rand(26)).chr}}
40
+ @matcher = AhoCorasick.new(*words)
41
+ end
42
+
43
+ bench_performance_linear "string matching" do |n|
44
+ 10.times do
45
+ string = n.times.inject("") {|s, _| s + (65 + rand(26)).chr }
46
+ @matcher.match(string)
47
+ end
48
+ end
49
+ end
50
+
51
+ end
@@ -0,0 +1,8 @@
1
+ $: << File.join(File.dirname(__FILE__),"..")
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'lib/aho_corasick'
5
+ require 'minitest/spec'
6
+ require 'minitest/benchmark'
7
+ require 'minitest/autorun'
8
+
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aho_corasick
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tim Cowlishaw
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-10-25 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &13072560 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *13072560
25
+ description:
26
+ email: tim@timcowlishaw.co.uk
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files:
30
+ - README.md
31
+ files:
32
+ - Gemfile
33
+ - Gemfile.lock
34
+ - Rakefile
35
+ - README.md
36
+ - spec/aho_corasick_spec.rb
37
+ - spec/spec_helper.rb
38
+ - lib/aho_corasick.rb
39
+ homepage: http://github.com/likely/aho_corasick
40
+ licenses: []
41
+ post_install_message:
42
+ rdoc_options:
43
+ - --main
44
+ - README.md
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ segments:
54
+ - 0
55
+ hash: -4104333102682146250
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.10
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: The Aho-Corasick string-matching algorithm
68
+ test_files: []