stringtree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/.rspec +1 -0
- data/.travis.yml +2 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +78 -0
- data/LICENSE.txt +21 -0
- data/README.md +95 -0
- data/Rakefile +13 -0
- data/examples/demo.rb +82 -0
- data/examples/dictionary.txt +61217 -0
- data/examples/hamlet.txt +5590 -0
- data/examples/warandpeace.txt +64950 -0
- data/hamlet.tokens.txt +31262 -0
- data/lib/stringtree/item.rb +28 -0
- data/lib/stringtree/node.rb +216 -0
- data/lib/stringtree/tree.rb +98 -0
- data/lib/stringtree/version.rb +5 -0
- data/lib/stringtree.rb +6 -0
- data/spec/item_spec.rb +32 -0
- data/spec/node_spec.rb +187 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/tree_spec.rb +266 -0
- data/stringtree.gemspec +26 -0
- data/warandpeace.tokens.txt +572121 -0
- metadata +220 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
|
4
|
+
|
5
|
+
We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
|
6
|
+
|
7
|
+
Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.
|
8
|
+
|
9
|
+
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.
|
10
|
+
|
11
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
|
12
|
+
|
13
|
+
This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/)
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
stringtree (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
coveralls (0.8.2)
|
10
|
+
json (~> 1.8)
|
11
|
+
rest-client (>= 1.6.8, < 2)
|
12
|
+
simplecov (~> 0.10.0)
|
13
|
+
term-ansicolor (~> 1.3)
|
14
|
+
thor (~> 0.19.1)
|
15
|
+
diff-lcs (1.2.5)
|
16
|
+
docile (1.1.5)
|
17
|
+
domain_name (0.5.24)
|
18
|
+
unf (>= 0.0.5, < 1.0.0)
|
19
|
+
http-cookie (1.0.2)
|
20
|
+
domain_name (~> 0.5)
|
21
|
+
json (1.8.3)
|
22
|
+
mime-types (2.6.1)
|
23
|
+
netrc (0.10.3)
|
24
|
+
rake (10.4.2)
|
25
|
+
rdoc (4.2.0)
|
26
|
+
json (~> 1.4)
|
27
|
+
rest-client (1.8.0)
|
28
|
+
http-cookie (>= 1.0.2, < 2.0)
|
29
|
+
mime-types (>= 1.16, < 3.0)
|
30
|
+
netrc (~> 0.7)
|
31
|
+
rspec (3.3.0)
|
32
|
+
rspec-core (~> 3.3.0)
|
33
|
+
rspec-expectations (~> 3.3.0)
|
34
|
+
rspec-mocks (~> 3.3.0)
|
35
|
+
rspec-core (3.3.2)
|
36
|
+
rspec-support (~> 3.3.0)
|
37
|
+
rspec-expectations (3.3.1)
|
38
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
39
|
+
rspec-support (~> 3.3.0)
|
40
|
+
rspec-mocks (3.3.2)
|
41
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
42
|
+
rspec-support (~> 3.3.0)
|
43
|
+
rspec-support (3.3.0)
|
44
|
+
sdoc (0.4.1)
|
45
|
+
json (~> 1.7, >= 1.7.7)
|
46
|
+
rdoc (~> 4.0)
|
47
|
+
simplecov (0.10.0)
|
48
|
+
docile (~> 1.1.0)
|
49
|
+
json (~> 1.8)
|
50
|
+
simplecov-html (~> 0.10.0)
|
51
|
+
simplecov-html (0.10.0)
|
52
|
+
simplecov-rcov (0.2.3)
|
53
|
+
simplecov (>= 0.4.1)
|
54
|
+
term-ansicolor (1.3.2)
|
55
|
+
tins (~> 1.0)
|
56
|
+
thor (0.19.1)
|
57
|
+
tins (1.6.0)
|
58
|
+
unf (0.1.4)
|
59
|
+
unf_ext
|
60
|
+
unf_ext (0.0.7.1)
|
61
|
+
|
62
|
+
PLATFORMS
|
63
|
+
ruby
|
64
|
+
|
65
|
+
DEPENDENCIES
|
66
|
+
bundler (~> 1.10)
|
67
|
+
coveralls
|
68
|
+
rake (~> 10.0)
|
69
|
+
rdoc (~> 4.1)
|
70
|
+
rspec (~> 3.1)
|
71
|
+
rspec-mocks (~> 3.1)
|
72
|
+
sdoc (~> 0.4)
|
73
|
+
simplecov (~> 0.10.0)
|
74
|
+
simplecov-rcov (~> 0.2)
|
75
|
+
stringtree!
|
76
|
+
|
77
|
+
BUNDLED WITH
|
78
|
+
1.10.5
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Tom Cully
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
# stringtree-ruby
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/tomdionysus/stringtree-ruby.svg?branch=master)](https://travis-ci.org/tomdionysus/stringtree-ruby)
|
4
|
+
[![Coverage Status](https://coveralls.io/repos/tomdionysus/stringtree-ruby/badge.svg?branch=master&service=github)](https://coveralls.io/github/tomdionysus/stringtree-ruby?branch=master)
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/stringtree.svg)](http://badge.fury.io/rb/stringtree)
|
6
|
+
[![Dependency Status](https://gemnasium.com/tomdionysus/stringtree.svg)](https://gemnasium.com/tomdionysus/stringtree)
|
7
|
+
[![Gem Downloads](http://ruby-gem-downloads-badge.herokuapp.com/stringtree?color=brightgreen)](http://ruby-gem-downloads-badge.herokuapp.com/stringtree?color=brightgreen)
|
8
|
+
|
9
|
+
StringTree is a fast forward-only tokeniser and partial string matcher, that is, it can:
|
10
|
+
|
11
|
+
* Load a dictionary of arbitrary size and record count - e.g. an actual dictionary, an English word list - where each record is associated with a key - e.g. a numeric identifier for the word.
|
12
|
+
* Parse an arbitrary data string in a single pass, finding instances of each item in the dictionary and storing their offsets and associated keys.
|
13
|
+
* Host a set of strings in such a way as to efficiently match partial input strings against the dictionary
|
14
|
+
|
15
|
+
This has become my 'hello world' over the years with any new language. I use it to get to know a language, as implementing it correctly involves many of the usual concepts needed to get started coding from the hip (syntax, grammar, classes, public/private instance vars, statics, pass-by-value/pass-by-reference etc.), not to mention the usual code support skills like how to set up unit tests for this language and environment, etc.
|
16
|
+
|
17
|
+
## Installation
|
18
|
+
|
19
|
+
```bash
|
20
|
+
gem install stringtree
|
21
|
+
```
|
22
|
+
|
23
|
+
## Implementation
|
24
|
+
|
25
|
+
StringTree is based on multidimensional binary trees - Each node in the tree has its usual left/right references to its children, but also an optional 'down' reference, which would refer to the root of the binary tree representing the next character in the string from this point. This version also has an 'up' reference on the node which can be faster for iterating backward through a set of trees.
|
26
|
+
|
27
|
+
From a CS point of view, Tree is a specific implementation of an n-dimensional trie.
|
28
|
+
|
29
|
+
## Limitations
|
30
|
+
|
31
|
+
* Tree is not intended to be, and should not be used as, a key/value store. There are much faster algorithms for storing such data.
|
32
|
+
* Tree should not be used in place of regular expressions. Tree is good when the total number and/or size of the tokens to be found is either unknown or dynamic.
|
33
|
+
* Tree is essentially a byte tokeniser and is case-sensitive. It would be trivial to extend it to search in a case-insensitive manner, however.
|
34
|
+
|
35
|
+
## Applications
|
36
|
+
|
37
|
+
Tree has the following applications:
|
38
|
+
|
39
|
+
* Spellchecking
|
40
|
+
* Virus scanning
|
41
|
+
* Partial string matching, e.g. autocomplete
|
42
|
+
* Tokenizing, e.g. Matching Bank Transaction data to Businesses from reference/particular/code fields, etc.
|
43
|
+
|
44
|
+
## Environment
|
45
|
+
|
46
|
+
The demo is tested and runs under Ruby 1.9.3, and will probably work under earlier and later versions but this is not guaranteed. The unit tests require rspec also:
|
47
|
+
|
48
|
+
```bash
|
49
|
+
bundle install
|
50
|
+
```
|
51
|
+
|
52
|
+
## Specs
|
53
|
+
|
54
|
+
Specs are rspec, as follows:
|
55
|
+
|
56
|
+
```bash
|
57
|
+
rspec
|
58
|
+
```
|
59
|
+
|
60
|
+
## Documentation
|
61
|
+
|
62
|
+
Regenerate the Documentation with `rake`:
|
63
|
+
|
64
|
+
```bash
|
65
|
+
rake rdoc
|
66
|
+
```
|
67
|
+
|
68
|
+
## Demo
|
69
|
+
|
70
|
+
```bash
|
71
|
+
bundle exec ruby examples/demo.rb
|
72
|
+
```
|
73
|
+
|
74
|
+
This will generate two files:
|
75
|
+
|
76
|
+
* hamlet.tokens.txt
|
77
|
+
* warandpeace.tokens.txt
|
78
|
+
|
79
|
+
Each of which contains a set of lines such as:
|
80
|
+
|
81
|
+
28: platform 41003 (8)
|
82
|
+
|
83
|
+
Where 28 is the offset where the token was found, 'platform' is the token itself, 41003 is the id of the token (in this case the line number of the dictionary file), and (8) is the length of the token.
|
84
|
+
|
85
|
+
The demo code then goes into a console, which will do partial searches within the dictionary. Type a partial word and press enter, and the demo will show all words in the dictionary that start with the partial entered.
|
86
|
+
Type 'exit' to finish the demo.
|
87
|
+
|
88
|
+
## Code of Conduct
|
89
|
+
|
90
|
+
The StringTree project is committed to the [Contributor Covenant](http://contributor-covenant.org). Please read [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) before making any contributions or comments.
|
91
|
+
|
92
|
+
## References
|
93
|
+
|
94
|
+
* http://www.ruby-doc.org
|
95
|
+
* http://en.wikipedia.org/wiki/Trie
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
# Defines the rdoc documentation tasks (see `rake rdoc`).
# The main page has to name a file that is actually passed to rdoc; the only
# README included in rdoc_files below is README.md, so that is used as the main page.
RDoc::Task.new do |rd|
  rd.main = "README.md"
  rd.rdoc_files.include("README.md", "lib/**/*.rb")
  rd.rdoc_dir = "doc"
end
|
10
|
+
|
11
|
+
# Register the `spec` task that runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the specs.
task default: :spec
|
data/examples/demo.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'stringtree'
|
2
|
+
|
3
|
+
# Command-line demo for StringTree.
#
# Loads a word list into a StringTree::Tree, matches two sample texts against
# it (writing the matches to token files), then starts an interactive prompt
# that lists the dictionary entries returned for a typed partial string.
class Main
  # Runs the whole demo from start to finish.
  #
  # argc - unused; kept so existing callers of main(0) keep working.
  #
  # Ordinary errors are reported with a one-line message instead of a stack
  # trace. Only StandardError is rescued, so Ctrl-C (Interrupt) and exit
  # (SystemExit) still terminate the program as the user expects.
  def main(argc)
    puts "StringTree Ruby Demo"

    begin
      root = File.dirname(__FILE__)
      @st = StringTree::Tree.new
      puts "Loading Dictionary..."
      @count = 0
      File.open("#{root}/dictionary.txt", "r") do |infile|
        infile.each_line do |line|
          word = line.strip
          # Each word is added twice (as-is and upper-cased), each with its
          # own freshly incremented id.
          @st.add(word, @count += 1)
          @st.add(word.upcase, @count += 1)
        end
      end

      started = Time.now
      puts "Optimizing Dictionary..."
      @st.optimize
      puts "Optimized #{@count} entries in #{format('%.2f', (Time.now - started) * 1000)}ms"

      puts "-Loading Hamlet-------------------"
      # NOTE: this is Main#load defined below (it shadows Kernel#load).
      load("#{root}/hamlet.txt")
      do_match "hamlet.tokens.txt"

      puts "--Loading War And Peace-----------"
      load("#{root}/warandpeace.txt")
      do_match "warandpeace.tokens.txt"

      do_partial_matching
    rescue StandardError => e
      puts "StringTree demo has encountered an error (#{e.message}). Please run again."
    end
  end

  # Reads the whole file in binary mode into @contents and returns it.
  #
  # file_name - path of the file to read.
  #
  # File.binread opens, reads and closes the file in one call, so no file
  # handle is left open.
  def load(file_name)
    @contents = File.binread(file_name)
  end

  # Matches the loaded dictionary against @contents, prints timing
  # statistics, and optionally writes one line per match to a file.
  #
  # output_file_name - path of the file to write matches to, or nil to skip
  #                    writing.
  #
  # Each written line has the form "<offset>: <node> <value> (<length>)".
  def do_match(output_file_name = nil)
    started = Time.now
    puts "Matching #{@count} entries in #{@contents.length} bytes..."
    list = []
    @st.match_all(@contents) { |match| list << match }
    interval = (Time.now - started) * 1000 # elapsed milliseconds

    puts "#{list.length} entities matched in #{format('%.2f', interval)}ms " \
         "(#{format('%.2f', list.length / interval * 1000)} entities/s)"

    return if output_file_name.nil?

    puts "Writing File #{output_file_name}"
    File.open(output_file_name, 'wb') do |f|
      list.each do |item|
        f.write "#{item.offset}: #{item.node} #{item.node.value} (#{item.node.length})\n"
      end
    end
  end

  # Interactive prompt: reads a line, prints every entry returned by
  # @st.partials for it, and repeats until the user types 'exit' or the
  # input reaches end-of-file.
  def do_partial_matching
    puts "-Partial Matching -----------------"
    puts "-Type 'exit' to quit"
    loop do
      print "> "
      input = gets
      # gets returns nil at end of input (Ctrl-D or exhausted pipe); stop
      # quietly instead of failing on nil.chomp.
      return if input.nil?

      query = input.chomp
      return if query == 'exit'

      partials = @st.partials(query)
      if partials.nil?
        puts "No Matches Found for '#{query}'"
      else
        # Every entry is followed by a single space, including the last one.
        puts partials.map { |partial| "#{partial} " }.join
      end
    end
  end
end
|
80
|
+
|
81
|
+
# Start the demo only when this file is executed directly
# (e.g. `ruby examples/demo.rb`), not when it is required from other code.
Main.new.main(0) if __FILE__ == $PROGRAM_NAME
|