stringtree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ *.rbc
3
+ *.tokens.txt
4
+ .bundle
5
+ .config
6
+ coverage
7
+ doc/
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,2 @@
1
+ language: ruby
2
+ sudo: false
@@ -0,0 +1,13 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
4
+
5
+ We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
6
+
7
+ Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.
8
+
9
+ Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.
10
+
11
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
12
+
13
+ This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/)
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,78 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ stringtree (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ coveralls (0.8.2)
10
+ json (~> 1.8)
11
+ rest-client (>= 1.6.8, < 2)
12
+ simplecov (~> 0.10.0)
13
+ term-ansicolor (~> 1.3)
14
+ thor (~> 0.19.1)
15
+ diff-lcs (1.2.5)
16
+ docile (1.1.5)
17
+ domain_name (0.5.24)
18
+ unf (>= 0.0.5, < 1.0.0)
19
+ http-cookie (1.0.2)
20
+ domain_name (~> 0.5)
21
+ json (1.8.3)
22
+ mime-types (2.6.1)
23
+ netrc (0.10.3)
24
+ rake (10.4.2)
25
+ rdoc (4.2.0)
26
+ json (~> 1.4)
27
+ rest-client (1.8.0)
28
+ http-cookie (>= 1.0.2, < 2.0)
29
+ mime-types (>= 1.16, < 3.0)
30
+ netrc (~> 0.7)
31
+ rspec (3.3.0)
32
+ rspec-core (~> 3.3.0)
33
+ rspec-expectations (~> 3.3.0)
34
+ rspec-mocks (~> 3.3.0)
35
+ rspec-core (3.3.2)
36
+ rspec-support (~> 3.3.0)
37
+ rspec-expectations (3.3.1)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.3.0)
40
+ rspec-mocks (3.3.2)
41
+ diff-lcs (>= 1.2.0, < 2.0)
42
+ rspec-support (~> 3.3.0)
43
+ rspec-support (3.3.0)
44
+ sdoc (0.4.1)
45
+ json (~> 1.7, >= 1.7.7)
46
+ rdoc (~> 4.0)
47
+ simplecov (0.10.0)
48
+ docile (~> 1.1.0)
49
+ json (~> 1.8)
50
+ simplecov-html (~> 0.10.0)
51
+ simplecov-html (0.10.0)
52
+ simplecov-rcov (0.2.3)
53
+ simplecov (>= 0.4.1)
54
+ term-ansicolor (1.3.2)
55
+ tins (~> 1.0)
56
+ thor (0.19.1)
57
+ tins (1.6.0)
58
+ unf (0.1.4)
59
+ unf_ext
60
+ unf_ext (0.0.7.1)
61
+
62
+ PLATFORMS
63
+ ruby
64
+
65
+ DEPENDENCIES
66
+ bundler (~> 1.10)
67
+ coveralls
68
+ rake (~> 10.0)
69
+ rdoc (~> 4.1)
70
+ rspec (~> 3.1)
71
+ rspec-mocks (~> 3.1)
72
+ sdoc (~> 0.4)
73
+ simplecov (~> 0.10.0)
74
+ simplecov-rcov (~> 0.2)
75
+ stringtree!
76
+
77
+ BUNDLED WITH
78
+ 1.10.5
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Tom Cully
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,95 @@
1
+ # stringtree-ruby
2
+
3
+ [![Build Status](https://travis-ci.org/tomdionysus/stringtree-ruby.svg?branch=master)](https://travis-ci.org/tomdionysus/stringtree-ruby)
4
+ [![Coverage Status](https://coveralls.io/repos/tomdionysus/stringtree-ruby/badge.svg?branch=master&service=github)](https://coveralls.io/github/tomdionysus/stringtree-ruby?branch=master)
5
+ [![Gem Version](https://badge.fury.io/rb/stringtree.svg)](http://badge.fury.io/rb/stringtree)
6
+ [![Dependency Status](https://gemnasium.com/tomdionysus/stringtree.svg)](https://gemnasium.com/tomdionysus/stringtree)
7
+ [![Gem Downloads](http://ruby-gem-downloads-badge.herokuapp.com/stringtree?color=brightgreen)](http://ruby-gem-downloads-badge.herokuapp.com/stringtree?color=brightgreen)
8
+
9
+ StringTree is a fast forward-only tokeniser and partial string matcher, that is, it can:
10
+
11
+ * Load a dictionary of arbitrary size and record count - e.g. an actual dictionary, an English word list - where each record is associated with a key - e.g. a numeric identifier for the word.
12
+ * Parse an arbitrary data string in a single pass, finding instances of each item in the dictionary and storing their offsets and associated keys.
13
+ * Host a set of strings in such a way as to efficiently match partial input strings against the dictionary
14
+
15
+ This has become my 'hello world' over the years with any new language. I use it to get to know a language, as implementing it correctly involves many of the usual concepts needed to get started coding from the hip (syntax, grammar, classes, public/private instance vars, statics, pass-by-value/pass-by-reference etc.) not to mention usual code support skills like how to set up unit tests for this language and environment, etc.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ gem install stringtree
21
+ ```
22
+
23
+ ## Implementation
24
+
25
+ StringTree is based on multidimensional binary trees - Each node in the tree has its usual left/right references to its children, but also an optional 'down' reference, which would refer to the root of the binary tree representing the next character in the string from this point. This version also has an 'up' reference on the node which can be faster for iterating backward through a set of trees.
26
+
27
+ From a CS point of view, Tree is a specific implementation of an n-dimensional trie.
28
+
29
+ ## Limitations
30
+
31
+ * Tree is not intended and should not be used as a key/value store. There are much faster algorithms for storing such data.
32
+ * Tree should not be used in place of regular expressions. Tree is good when the total number and/or size of the tokens to be found is either unknown or dynamic.
33
+ * Tree is essentially a byte tokeniser and is case-sensitive. It would be trivial to extend it to search in a case-insensitive manner, however.
34
+
35
+ ## Applications
36
+
37
+ Tree has the following applications:
38
+
39
+ * Spellchecking
40
+ * Virus scanning
41
+ * Partial string matching, e.g. autocomplete
42
+ * Tokenizing, e.g. Matching Bank Transaction data to Businesses from reference/particular/code fields, etc.
43
+
44
+ ## Environment
45
+
46
+ The demo is tested and runs under Ruby 1.9.3, and will probably work under earlier and later versions but this is not guaranteed. The unit tests require rspec also:
47
+
48
+ ```bash
49
+ bundle install
50
+ ```
51
+
52
+ ## Specs
53
+
54
+ Specs are rspec, as follows:
55
+
56
+ ```bash
57
+ rspec
58
+ ```
59
+
60
+ ## Documentation
61
+
62
+ Regenerate the Documentation with `rake`:
63
+
64
+ ```bash
65
+ rake rdoc
66
+ ```
67
+
68
+ ## Demo
69
+
70
+ ```bash
71
+ bundle exec ruby examples/demo.rb
72
+ ```
73
+
74
+ This will generate two files:
75
+
76
+ * hamlet.tokens.txt
77
+ * warandpeace.tokens.txt
78
+
79
+ Each of which contains a set of lines such as:
80
+
81
+ 28: platform 41003 (8)
82
+
83
+ Where 28 is the offset where the token was found, 'platform' is the token itself, 41003 is the id of the token (in this case the line number of the dictionary file), and (8) is the length of the token.
84
+
85
+ The demo code then goes into a console, which will do partial searches within the dictionary. Type a partial word and press enter, and the demo will show all words in the dictionary that start with the partial entered.
86
+ Type 'exit' to finish the demo.
87
+
88
+ ## Code of Conduct
89
+
90
+ The StringTree project is committed to the [Contributor Covenant](http://contributor-covenant.org). Please read [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) before making any contributions or comments.
91
+
92
+ ## References
93
+
94
+ * http://www.ruby-doc.org
95
+ * http://en.wikipedia.org/wiki/Trie
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"
require 'rdoc/task'

# `rake rdoc` builds the API documentation into doc/ from the README and
# every Ruby file under lib/.
RDoc::Task.new do |rd|
  # The main page has to be one of the files handed to rdoc_files below.
  # The project ships README.md (there is no README.rdoc), so point at that.
  rd.main = "README.md"
  rd.rdoc_files.include("README.md", "lib/**/*.rb")
  rd.rdoc_dir = "doc"
end

# `rake spec` runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` runs the specs.
task :default => :spec
data/examples/demo.rb ADDED
@@ -0,0 +1,82 @@
1
+ require 'stringtree'
2
+
3
# Driver for the StringTree demo.
#
# Builds a StringTree::Tree from dictionary.txt, tokenises two sample texts
# with it (writing the matches to *.tokens.txt files in the current working
# directory) and then drops into an interactive prefix-search prompt.
class Main
  # Runs the complete demo: load dictionary, optimise, tokenise the sample
  # texts, then start the interactive prompt.
  #
  # argc - unused; kept so existing callers (`main(0)`) keep working.
  #
  # Any StandardError is reported on stdout instead of propagating.
  # Interrupt (Ctrl-C) and SystemExit are deliberately NOT rescued so the
  # user can still abort the demo.
  def main(argc)
    puts "StringTree Ruby Demo"

    root = File.dirname(__FILE__)
    @st = StringTree::Tree.new
    puts "Loading Dictionary..."
    @count = 0
    # Every dictionary line is added twice (as written and upper-cased),
    # each under its own running id.
    File.foreach("#{root}/dictionary.txt") do |line|
      word = line.strip
      @st.add(word, @count += 1)
      @st.add(word.upcase, @count += 1)
    end

    started = Time.now
    puts "Optimizing Dictionary..."
    @st.optimize
    puts "Optimized #{@count} entries in #{format('%.2f', (Time.now - started) * 1000)}ms"

    puts "-Loading Hamlet-------------------"
    load "#{root}/hamlet.txt"
    do_match "hamlet.tokens.txt"

    puts "--Loading War And Peace-----------"
    load "#{root}/warandpeace.txt"
    do_match "warandpeace.tokens.txt"

    do_partial_matching
  rescue StandardError => e
    puts "StringTree demo has encountered an error (#{e.message}). Please run again."
  end

  # Reads +file_name+ in binary mode into @contents and returns the data.
  #
  # NOTE: inside this class the method shadows Kernel#load; it reads a data
  # file and never evaluates Ruby source.
  def load(file_name)
    # File.binread closes the file before returning, so no handle is leaked.
    @contents = File.binread(file_name)
  end

  # Runs @st.match_all over @contents, prints timing statistics and, when
  # +output_file_name+ is given, writes one line per match to that file in
  # the form "<offset>: <token> <value> (<length>)".
  #
  # output_file_name - path of the report file, or nil to skip writing.
  def do_match(output_file_name=nil)
    started = Time.now
    puts "Matching #{@count} entries in #{@contents.length} bytes..."
    list = []
    @st.match_all(@contents) { |match| list << match }
    interval = (Time.now - started) * 1000 # milliseconds

    rate = format('%.2f', list.length / interval * 1000)
    puts "#{list.length} entities matched in #{format('%.2f', interval)}ms (#{rate} entities/s)"

    return if output_file_name.nil?

    puts "Writing File #{output_file_name}"
    File.open(output_file_name, 'wb') do |f|
      list.each do |item|
        f.write "#{item.offset}: #{item.node} #{item.node.value} (#{item.node.length})\n"
      end
    end
  end

  # Interactive prompt: reads a partial word per line and prints the result
  # of @st.partials for it, space separated. Ends when the user types
  # 'exit' or when input reaches end-of-file.
  def do_partial_matching
    puts "-Partial Matching -----------------"
    puts "-Type 'exit' to quit"
    loop do
      print "> "
      input = gets
      # gets returns nil at EOF (Ctrl-D / closed pipe); stop instead of
      # crashing on nil.chomp.
      return if input.nil?

      query = input.chomp
      return if query == 'exit'

      partials = @st.partials(query)
      if partials.nil?
        puts "No Matches Found for '#{query}'"
      else
        puts partials.map { |partial| "#{partial} " }.join
      end
    end
  end
end
80
+
81
# Script entry point: build the demo driver and run it. Main#main never reads its argument, so the 0 is a placeholder.
+ @main = Main.new
82
+ @main.main(0)