simhilarity 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/test/tests.rb ADDED
@@ -0,0 +1,125 @@
1
+ require "awesome_print"
2
+ require "benchmark"
3
+ require "csv"
4
+ require "ostruct"
5
+ require "simhilarity"
6
+ require "test/unit"
7
+
8
+ class Tests < Test::Unit::TestCase
9
+ #
10
+ # helpers
11
+ #
12
+
13
# Per-test fixture: change into the directory containing this file, so the
# relative paths the tests use ("sample.csv", "identity.txt",
# "../bin/simhilarity") resolve, then build a matcher with default options.
def setup
  test_dir = File.expand_path("../", __FILE__)
  Dir.chdir(test_dir)
  @matcher = Simhilarity::Matcher.new
end
17
+
18
# Read the sample.csv test data file from the current directory.
#
# Each row supplies up to two columns, a needle and a haystack; a nil cell is
# skipped. Returns an OpenStruct with:
#   needle   - list of needle strings, in file order
#   haystack - list of haystack strings, in file order
#   matches  - hash mapping needle to haystack for known good matches, i.e.
#              rows where both columns are present
def sample
  data = OpenStruct.new(needle: [], haystack: [], matches: {})
  # Stream row by row instead of loading the whole file with CSV.read; the
  # block parameters destructure each row into its first two columns.
  CSV.foreach("sample.csv") do |needle, haystack|
    data.needle << needle if needle
    data.haystack << haystack if haystack
    data.matches[needle] = haystack if needle && haystack
  end
  data
end
31
+
32
# Match the sample needles against the sample haystack using a
# Simhilarity::Bulk built with the given candidates option, then assert that
# the fraction of needles paired with their known good haystack is within
# 0.001 of percent.
#
# candidates - passed through as Simhilarity::Bulk's :candidates option and
#              used (via to_s) as the benchmark label
# percent    - expected fraction of correct matches (e.g. 0.974)
#
# Side effect: Benchmark.bm prints timing for the match.
def assert_bulk_candidates(candidates, percent)
  data = sample

  # match, with benchmark
  output = nil
  Benchmark.bm(10) do |bm|
    bm.report(candidates.to_s) do
      bulk = Simhilarity::Bulk.new(candidates: candidates)
      output = bulk.matches(data.needle, data.haystack)
    end
  end

  # what fraction of matches are correct? each output row destructures as
  # (needle, haystack, score); the score is irrelevant to correctness.
  hits = output.count { |n, h, _score| data.matches[n] == h }
  correct = hits.to_f / data.needle.length

  # for debugging
  # printf("%% correct: %.3f\n", correct)
  # output.each do |n, h, score|
  #   good = data.matches[n] == h
  #   printf("%2s %4.2f %-35s %-35s\n", good ? "" : "xx", score || 0, n, h)
  # end

  # assert_in_delta reports expected/actual/delta itself on failure.
  assert_in_delta(percent, correct, 0.001, "percent #{correct} != #{percent}")
end
57
+
58
# Run cmd through the shell with stdout and stderr discarded, and fail the
# current test with "<cmd> failed" unless it exits with status 0.
def assert_system(cmd)
  # Kernel#system returns true only for a zero exit status (false for a
  # non-zero status, nil if the command could not be executed), so its result
  # is asserted directly rather than inspecting the $? global afterwards.
  assert(system("#{cmd} > /dev/null 2>&1"), "#{cmd} failed")
end
62
+
63
+ #
64
+ # tests
65
+ #
66
+
67
# Checks Matcher#read: a string is returned as-is by default, a non-string
# raises RuntimeError, and a custom reader proc is applied once assigned.
#
# assert_equal takes (expected, actual); the literal goes first so failure
# messages label the two values correctly.
def test_read
  # default
  assert_equal "gub", @matcher.read("gub")

  # not a string
  assert_raise(RuntimeError) { @matcher.read(123) }

  # custom
  @matcher.reader = lambda(&:key)
  assert_equal "gub", @matcher.read(OpenStruct.new(key: "gub"))
end
78
+
79
# Checks Matcher#normalize: the default turns " HELLO,\tWORLD! " into
# "hello world", and a custom normalizer proc replaces it once assigned.
#
# assert_equal takes (expected, actual); the literal goes first so failure
# messages label the two values correctly.
def test_normalizer
  # default
  assert_equal "hello world", @matcher.normalize(" HELLO,\tWORLD! ")

  # custom
  @matcher.normalizer = lambda(&:upcase)
  assert_equal "GUB", @matcher.normalize("gub")
end
87
+
88
# Checks Matcher#ngrams: the default splits "hi 42" into overlapping
# two-character strings, and a custom ngrammer proc replaces it once assigned.
#
# assert_equal takes (expected, actual); the literal goes first so failure
# messages label the two values correctly.
def test_ngrams
  # default
  assert_equal ["hi", "i ", " 4", "42"], @matcher.ngrams("hi 42")

  # custom
  @matcher.ngrammer = lambda(&:split)
  assert_equal ["hi", "42"], @matcher.ngrams("hi 42")
end
96
+
97
# Checks that the reader, normalizer and ngrammer procs can be supplied as
# constructor options to Simhilarity::Matcher and are then used by read,
# normalize and ngrams.
#
# assert_equal takes (expected, actual); the literal goes first so failure
# messages label the two values correctly.
def test_proc_options
  matcher = Simhilarity::Matcher.new(
    reader: lambda(&:key),
    normalizer: lambda(&:upcase),
    ngrammer: lambda(&:split)
  )
  assert_equal "gub", matcher.read(OpenStruct.new(key: "gub"))
  assert_equal "GUB", matcher.normalize("gub")
  assert_equal ["hi", "42"], matcher.ngrams("hi 42")
end
103
+
104
# Checks that Simhilarity::Single scores "hello world" against "hi worlds"
# at 0.556, within a tolerance of 0.001.
def test_single
  score = Simhilarity::Single.new.score("hello world", "hi worlds")
  # assert_in_delta avoids the ambiguous `assert (expr) ...` call form and
  # reports the expected value, actual value and delta on failure.
  assert_in_delta 0.556, score, 0.001, "test_single percent was wrong!"
end
108
+
109
# Checks the expected fraction of correct sample matches for each candidates
# setting, in the order :all, :ngrams, :simhash.
def test_bulk
  expectations = [[:all, 0.974], [:ngrams, 0.974], [:simhash, 0.949]]
  expectations.each do |strategy, accuracy|
    assert_bulk_candidates(strategy, accuracy)
  end
end
114
+
115
# Smoke-tests the command line executable: each option set below must exit
# successfully when identity.txt is given as both input files.
def test_bin
  executable = "../bin/simhilarity"
  inputs = ["identity.txt", "identity.txt"]
  option_sets = [
    [],
    ["-v"],
    ["--candidates", "simhash"],
    ["--candidates", "simhash=3"],
    ["--candidates", "ngrams"],
    ["--candidates", "ngrams=3"],
    ["--candidates", "all"]
  ]

  option_sets.each do |options|
    # Joining the words with single spaces reproduces the exact command line,
    # including the option-less case.
    assert_system([executable, *options, *inputs].join(" "))
  end
end
125
+ end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simhilarity
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Doppelt
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-18 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bk
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: progressbar
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: awesome_print
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rdoc
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Measure text similarity using frequency weighted ngrams.
95
+ email:
96
+ - amd@gurge.com
97
+ executables:
98
+ - simhilarity
99
+ extensions: []
100
+ extra_rdoc_files: []
101
+ files:
102
+ - .gitignore
103
+ - Gemfile
104
+ - LICENSE
105
+ - README.md
106
+ - Rakefile
107
+ - bin/simhilarity
108
+ - lib/simhilarity.rb
109
+ - lib/simhilarity/bits.rb
110
+ - lib/simhilarity/bulk.rb
111
+ - lib/simhilarity/candidate.rb
112
+ - lib/simhilarity/element.rb
113
+ - lib/simhilarity/matcher.rb
114
+ - lib/simhilarity/single.rb
115
+ - lib/simhilarity/version.rb
116
+ - simhilarity.gemspec
117
+ - test/harness
118
+ - test/identity.txt
119
+ - test/large_haystack.txt
120
+ - test/large_needles.txt
121
+ - test/sample.csv
122
+ - test/tests.rb
123
+ homepage: http://github.com/gurgeous/simhilarity
124
+ licenses: []
125
+ post_install_message:
126
+ rdoc_options: []
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ! '>='
133
+ - !ruby/object:Gem::Version
134
+ version: 1.9.0
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ! '>='
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ segments:
142
+ - 0
143
+ hash: -1497809244519171705
144
+ requirements: []
145
+ rubyforge_project: simhilarity
146
+ rubygems_version: 1.8.24
147
+ signing_key:
148
+ specification_version: 3
149
+ summary: Simhilarity - measure text similarity using frequency weighted ngrams.
150
+ test_files:
151
+ - test/harness
152
+ - test/identity.txt
153
+ - test/large_haystack.txt
154
+ - test/large_needles.txt
155
+ - test/sample.csv
156
+ - test/tests.rb