simhilarity 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/tests.rb ADDED
@@ -0,0 +1,125 @@
1
+ require "awesome_print"
2
+ require "benchmark"
3
+ require "csv"
4
+ require "ostruct"
5
+ require "simhilarity"
6
+ require "test/unit"
7
+
8
# Unit tests for the simhilarity gem. Covers the Matcher hooks (reader,
# normalizer, ngrammer), Simhilarity::Single scoring, Simhilarity::Bulk
# matching accuracy against sample.csv, and the bin/simhilarity script.
class Tests < Test::Unit::TestCase
  # Tolerance used when comparing floating point scores and percentages.
  DELTA = 0.001

  #
  # helpers
  #

  # Switch into this file's directory so the relative paths used below
  # (sample.csv, identity.txt, ../bin/simhilarity) resolve, then build a
  # fresh default matcher for each test.
  def setup
    Dir.chdir(File.expand_path("../", __FILE__))
    @matcher = Simhilarity::Matcher.new
  end

  # Read the sample.csv test data file. needle is a list of needle
  # strings, haystack is a list of haystack strings, and matches is a
  # hash mapping from needle to haystack for known good matches.
  #
  # Rows may leave either column empty; such rows contribute to only one
  # of the lists and never to matches.
  def sample
    data = OpenStruct.new(needle: [], haystack: [], matches: {})
    CSV.foreach("sample.csv") do |n, h|
      data.needle << n if n
      data.haystack << h if h
      data.matches[n] = h if n && h
    end
    data
  end

  # Run a bulk match over the sample data using the given candidates
  # strategy (printing a benchmark line for it), then assert that the
  # fraction of needles paired with their known good haystack entry is
  # within DELTA of percent.
  def assert_bulk_candidates(candidates, percent)
    data = sample
    # guard against 0 / 0.0 => NaN below
    assert(!data.needle.empty?, "sample.csv contains no needles")

    # match, with benchmark
    output = nil
    Benchmark.bm(10) do |bm|
      bm.report(candidates.to_s) do
        matcher = Simhilarity::Bulk.new(candidates: candidates)
        output = matcher.matches(data.needle, data.haystack)
      end
    end

    # what percent of matches are correct?
    hits = output.count { |n, h, _score| data.matches[n] == h }
    correct = hits.to_f / data.needle.length

    # for debugging
    # printf("%% correct: %.3f\n", correct)
    # output.each do |n, h, score|
    #   good = data.matches[n] == h
    #   printf("%2s %4.2f %-35s %-35s\n", good ? "" : "xx", score || 0, n, h)
    # end

    assert_in_delta(percent, correct, DELTA, "percent #{correct} != #{percent}")
  end

  # Run cmd through the shell with its output discarded and assert that
  # it exited successfully. Kernel#system returns true only for a zero
  # exit status (false for non-zero, nil if the command could not run).
  def assert_system(cmd)
    succeeded = system("#{cmd} > /dev/null 2>&1")
    assert(succeeded, "#{cmd} failed")
  end

  #
  # tests
  #

  # NOTE: the custom procs below are written as literal lambda blocks.
  # Kernel#lambda requires a literal block on recent Rubies
  # (lambda(&:key) raises ArgumentError since Ruby 3.3).

  def test_read
    # default
    assert_equal "gub", @matcher.read("gub")

    # not a string
    assert_raise(RuntimeError) { @matcher.read(123) }

    # custom
    @matcher.reader = lambda { |item| item.key }
    assert_equal "gub", @matcher.read(OpenStruct.new(key: "gub"))
  end

  def test_normalizer
    # default
    assert_equal "hello world", @matcher.normalize(" HELLO,\tWORLD! ")

    # custom
    @matcher.normalizer = lambda { |str| str.upcase }
    assert_equal "GUB", @matcher.normalize("gub")
  end

  def test_ngrams
    # default
    assert_equal ["hi", "i ", " 4", "42"], @matcher.ngrams("hi 42")

    # custom
    @matcher.ngrammer = lambda { |str| str.split }
    assert_equal ["hi", "42"], @matcher.ngrams("hi 42")
  end

  # The same hooks can be supplied as constructor options.
  def test_proc_options
    matcher = Simhilarity::Matcher.new(
      reader: lambda { |item| item.key },
      normalizer: lambda { |str| str.upcase },
      ngrammer: lambda { |str| str.split }
    )
    assert_equal "gub", matcher.read(OpenStruct.new(key: "gub"))
    assert_equal "GUB", matcher.normalize("gub")
    assert_equal ["hi", "42"], matcher.ngrams("hi 42")
  end

  def test_single
    score = Simhilarity::Single.new.score("hello world", "hi worlds")
    assert_in_delta(0.556, score, DELTA, "test_single percent was wrong!")
  end

  # Expected accuracy on sample.csv for each candidates strategy.
  def test_bulk
    assert_bulk_candidates(:all, 0.974)
    assert_bulk_candidates(:ngrams, 0.974)
    assert_bulk_candidates(:simhash, 0.949)
  end

  # Smoke test the command line script with each supported option set,
  # matching identity.txt against itself.
  def test_bin
    bin = "../bin/simhilarity"
    [
      [],
      %w[-v],
      %w[--candidates simhash],
      %w[--candidates simhash=3],
      %w[--candidates ngrams],
      %w[--candidates ngrams=3],
      %w[--candidates all]
    ].each do |options|
      assert_system([bin, *options, "identity.txt", "identity.txt"].join(" "))
    end
  end
end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhilarity
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Doppelt
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-18 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bk
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: progressbar
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: awesome_print
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rdoc
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Measure text similarity using frequency weighted ngrams.
95
+ email:
96
+ - amd@gurge.com
97
+ executables:
98
+ - simhilarity
99
+ extensions: []
100
+ extra_rdoc_files: []
101
+ files:
102
+ - .gitignore
103
+ - Gemfile
104
+ - LICENSE
105
+ - README.md
106
+ - Rakefile
107
+ - bin/simhilarity
108
+ - lib/simhilarity.rb
109
+ - lib/simhilarity/bits.rb
110
+ - lib/simhilarity/bulk.rb
111
+ - lib/simhilarity/candidate.rb
112
+ - lib/simhilarity/element.rb
113
+ - lib/simhilarity/matcher.rb
114
+ - lib/simhilarity/single.rb
115
+ - lib/simhilarity/version.rb
116
+ - simhilarity.gemspec
117
+ - test/harness
118
+ - test/identity.txt
119
+ - test/large_haystack.txt
120
+ - test/large_needles.txt
121
+ - test/sample.csv
122
+ - test/tests.rb
123
+ homepage: http://github.com/gurgeous/simhilarity
124
+ licenses: []
125
+ post_install_message:
126
+ rdoc_options: []
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ! '>='
133
+ - !ruby/object:Gem::Version
134
+ version: 1.9.0
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ! '>='
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ segments:
142
+ - 0
143
+ hash: -1497809244519171705
144
+ requirements: []
145
+ rubyforge_project: simhilarity
146
+ rubygems_version: 1.8.24
147
+ signing_key:
148
+ specification_version: 3
149
+ summary: Simhilarity - measure text similarity using frequency weighted ngrams.
150
+ test_files:
151
+ - test/harness
152
+ - test/identity.txt
153
+ - test/large_haystack.txt
154
+ - test/large_needles.txt
155
+ - test/sample.csv
156
+ - test/tests.rb