fizx-stringset 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +7 -0
- data/Rakefile +48 -0
- data/VERSION.yml +4 -0
- data/lib/stringset.rb +50 -0
- data/spec/hamlet.txt +7067 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/stringset_spec.rb +67 -0
- metadata +62 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Kyle Maxwell
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Build script for the stringset gem: packaging (via Jeweler), spec runs,
# coverage runs and RDoc generation.
require 'rubygems'
require 'rake'
# The RDoc task below calls YAML.load, so load YAML explicitly instead of
# relying on another library having required it first.
require 'yaml'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded a hint
# is printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "stringset"
    # TODO: replace the placeholder summary with a real description.
    gem.summary = %Q{TODO}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/stringset"
    gem.authors = ["Kyle Maxwell"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake spec`: run every *_spec.rb file found under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov`: same spec files, with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end


task :default => :spec

# `rake rdoc`: build API docs into rdoc/, titled with the version read from
# the :major, :minor and :patch keys of VERSION.yml when that file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "stringset #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
48
|
+
|
data/VERSION.yml
ADDED
data/lib/stringset.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# A set of known strings (single words or multi-word phrases) that can be
# looked up inside free text.
#
# Query text is split into word tokens, expanded into n-grams of up to
# +max_token_size+ words, and intersected with the stored strings.
#
#   set = StringSet.new(["hello world", "foo"])
#   set.substrings_in("well hello world, foo!")  # => ["foo", "hello world"]
class StringSet
  # Raised when the input is neither a String nor an Array.
  class Error < ::RuntimeError; end

  # Token separator: any run of non-word characters.
  TOKENIZER = /\W+/

  # strings::        the stored strings (stemmed when stemming is enabled).
  # max_token_size:: word count of the longest stored string; nil when the
  #                  set is empty.
  attr_reader :strings, :max_token_size

  # True when the set was created with a truthy <tt>:stem</tt> option.
  def stemming?
    !!@stemming
  end

  # Builds the set.
  #
  # strings:: an Array of words/phrases, or a String that is split into
  #           single-word entries.
  # options:: <tt>:stem</tt> - when truthy, stored strings and query tokens
  #           are stemmed (needs the "stemmer" library, loaded lazily).
  #
  # Raises StringSet::Error if +strings+ is neither an Array nor a String.
  def initialize(strings = [], options = {})
    @stemming = options[:stem]
    phrases = tokenize(strings)
    @max_token_size = phrases.map { |phrase| words_in(phrase).length }.max
    @strings =
      if stemming? && strings.is_a?(Array)
        # Normalise each phrase to its stemmed words joined by single spaces
        # so it lines up with the n-grams produced for query text.
        phrases.map { |phrase| stem(words_in(phrase)).join(" ") }
      else
        # String input was already stemmed (once) by #tokenize; stemming it a
        # second time could change tokens if the stemmer is not idempotent.
        # Copy so the caller's array is never aliased or mutated.
        phrases.dup
      end
  end

  # Returns the stored strings that occur in +strings+ (a String of text or
  # an Array of tokens), without duplicates, in order of first appearance
  # among the query's tokens followed by its n-grams.
  def substrings_in(strings)
    tokenize(strings, true) & @strings
  end

  # Turns the input into a token list.
  #
  # strings::  an Array (used as-is) or a String (split on TOKENIZER, empty
  #            tokens dropped, stemmed when stemming is enabled).
  # ngramize:: when true, the n-grams from #ngramize are appended.
  #
  # Raises StringSet::Error for any other input type.
  def tokenize(strings, ngramize = false)
    tokens =
      case strings
      when Array then strings
      when String then stem(words_in(strings))
      else raise Error, "Could not tokenize"
      end
    ngramize ? ngramize(tokens) : tokens
  end

  # Returns +tokens+ followed by every run of 2..+size+ consecutive tokens
  # joined with a single space, shortest runs first. A nil +size+ (empty set)
  # is treated as 0, so only the tokens themselves are returned.
  def ngramize(tokens, size = @max_token_size)
    grams = []
    2.upto(size.to_i) do |n|
      tokens.each_cons(n) { |window| grams << window.join(" ") }
    end
    tokens + grams
  end

  # Stems every token when stemming is enabled; otherwise returns +tokens+
  # unchanged.
  def stem(tokens)
    return tokens unless stemming?
    # Deliberately lazy: the stemmer is only needed when stemming is on.
    require "stemmer"
    tokens.map { |token| token.stem }
  end

  private

  # Splits text on TOKENIZER and drops the empty token that String#split
  # yields when the text starts with a separator.
  def words_in(text)
    text.split(TOKENIZER).reject { |word| word.empty? }
  end
end
|