fizx-stringset 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kyle Maxwell
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,7 @@
1
+ = stringset
2
+
3
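+ StringSet holds a set of known words and multi-word phrases and reports which of them occur in a given piece of text, with optional stemming.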
4
+
5
+ == Copyright
6
+
7
+ Copyright (c) 2009 Kyle Maxwell. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,48 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks are provided by Jeweler. If it (or one of its
# dependencies) cannot be loaded, print an install hint and keep loading
# the rest of this Rakefile instead of aborting.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name     = "stringset"
    gemspec.summary  = %Q{TODO}
    gemspec.email    = "kyle@kylemaxwell.com"
    gemspec.homepage = "http://github.com/fizx/stringset"
    gemspec.authors  = ["Kyle Maxwell"]
    # gemspec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end
18
+
19
require 'spec/rake/spectask'

# `rake spec`: run every *_spec.rb file found under spec/.
Spec::Rake::SpecTask.new(:spec) do |t|
  %w[lib spec].each { |dir| t.libs << dir }
  t.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov`: run the same spec files with rcov switched on.
Spec::Rake::SpecTask.new(:rcov) do |t|
  %w[lib spec].each { |dir| t.libs << dir }
  t.pattern = 'spec/**/*_spec.rb'
  t.rcov = true
end

# A bare `rake` runs the spec task.
task :default => :spec
33
+
34
require 'rake/rdoctask'
# YAML.load is used below; require it explicitly rather than relying on
# another library having loaded it as a side effect.
require 'yaml'

# `rake rdoc`: generate API documentation into rdoc/ from the README and
# everything under lib/. The title carries the version read from
# VERSION.yml when that file exists, and no version otherwise.
Rake::RDocTask.new do |rdoc|
  version =
    if File.exist?('VERSION.yml')
      # VERSION.yml is a file shipped with this project; it is read with
      # symbol keys (:major, :minor, :patch).
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "stringset #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
48
+
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :major: 0
3
+ :minor: 2
4
+ :patch: 0
data/lib/stringset.rb ADDED
@@ -0,0 +1,50 @@
1
# A set of known strings (single words or multi-word phrases) that can be
# looked up inside free text.
#
# Text is split into word tokens with TOKENIZER, expanded into space-joined
# n-grams up to the word count of the longest entry in the set, and then
# intersected with the set. When the :stem option is truthy, entries and
# String input are passed through String#stem, which is expected to come
# from the lazily required "stemmer" library.
class StringSet
  # Raised when the input is neither an Array nor a String.
  class Error < ::RuntimeError; end

  # Any run of non-word characters separates two tokens.
  TOKENIZER = /\W+/

  # strings        - entries of the set (stemmed and space-joined when
  #                  stemming is enabled).
  # max_token_size - word count of the longest entry, 0 for an empty set.
  attr_reader :strings, :max_token_size

  # Returns true when the set was created with a truthy :stem option.
  def stemming?
    !!@stemming
  end

  # Builds the set.
  #
  # strings - an Array of entries, or a String that is split into
  #           single-word entries.
  # options - Hash; :stem enables stemming of entries and of String input.
  #
  # Raises StringSet::Error when strings is neither an Array nor a String.
  def initialize(strings = [], options = {})
    @stemming = options[:stem]

    # A String is split here instead of going through #tokenize: #tokenize
    # stems its tokens, and the normalisation below would stem them again.
    entries = strings.is_a?(String) ? strings.split(TOKENIZER) : tokenize(strings)
    word_lists = entries.map { |entry| entry.split(TOKENIZER) }

    # An empty set has no longest entry; use 0 so #ngramize never sees nil.
    @max_token_size = word_lists.map(&:length).max || 0

    # Always build a new array so the caller's array is never modified.
    @strings =
      if stemming?
        word_lists.map { |words| stem(words).join(" ") }
      else
        entries.dup
      end
  end

  # Returns the entries of the set that occur in the given input, in order
  # of first appearance among the input's tokens and n-grams.
  #
  # strings - a String of text, or an Array of ready-made tokens.
  def substrings_in(strings)
    tokenize(strings, true) & @strings
  end

  # Turns the input into a list of tokens.
  #
  # strings  - an Array (used as-is) or a String (split on TOKENIZER and
  #            stemmed when stemming is enabled).
  # ngramize - when truthy, the tokens are expanded with #ngramize.
  #
  # Raises StringSet::Error for any other kind of input.
  def tokenize(strings, ngramize = false)
    tokens =
      case strings
      when Array  then strings
      when String then stem(strings.split(TOKENIZER))
      else raise Error, "Could not tokenize"
      end

    # The parameter shadows the method name; self. makes the call explicit.
    ngramize ? self.ngramize(tokens) : tokens
  end

  # Returns the tokens followed by every run of 2..size consecutive tokens
  # joined with a single space (shorter runs first, left to right).
  #
  # tokens - Array of tokens.
  # size   - longest run to build; nil is treated as 0.
  def ngramize(tokens, size = @max_token_size)
    grams = []
    2.upto(size.to_i) do |n|
      tokens.each_cons(n) { |window| grams << window.join(" ") }
    end
    tokens + grams
  end

  # Returns the tokens unchanged when stemming is off; otherwise returns a
  # new array with String#stem applied to each token.
  def stem(tokens)
    return tokens unless stemming?

    # Loaded on first use so the library is only needed when stemming is on.
    require "stemmer"
    tokens.map { |token| token.stem }
  end
end