simhilarity 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +165 -0
- data/Rakefile +18 -0
- data/bin/simhilarity +84 -0
- data/lib/simhilarity/bits.rb +62 -0
- data/lib/simhilarity/bulk.rb +163 -0
- data/lib/simhilarity/candidate.rb +46 -0
- data/lib/simhilarity/element.rb +50 -0
- data/lib/simhilarity/matcher.rb +164 -0
- data/lib/simhilarity/single.rb +18 -0
- data/lib/simhilarity/version.rb +4 -0
- data/lib/simhilarity.rb +8 -0
- data/simhilarity.gemspec +27 -0
- data/test/harness +138 -0
- data/test/identity.txt +1 -0
- data/test/large_haystack.txt +10000 -0
- data/test/large_needles.txt +500 -0
- data/test/sample.csv +2669 -0
- data/test/tests.rb +125 -0
- metadata +156 -0
data/test/tests.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require "awesome_print"
|
2
|
+
require "benchmark"
|
3
|
+
require "csv"
|
4
|
+
require "ostruct"
|
5
|
+
require "simhilarity"
|
6
|
+
require "test/unit"
|
7
|
+
|
8
|
+
# Test suite for the simhilarity gem: exercises the Matcher hooks (reader,
# normalizer, ngrammer), single-pair scoring, bulk matching against the
# sample.csv fixture, and the command-line executable.
class Tests < Test::Unit::TestCase
  #
  # helpers
  #

  # Runs before each test. Changes into the directory containing this file so
  # that the relative fixture paths used below (sample.csv, identity.txt,
  # ../bin/simhilarity) resolve, then builds a Matcher with default options.
  def setup
    Dir.chdir(File.expand_path("../", __FILE__))
    @matcher = Simhilarity::Matcher.new
  end

  # Read the sample.csv test data file. Returns an OpenStruct where needle is
  # a list of needle strings, haystack is a list of haystack strings, and
  # matches is a hash mapping from needle to haystack for known good matches.
  # The first CSV column is the needle and the second is the haystack; a row
  # only contributes to matches when both columns are present.
  def sample
    data = OpenStruct.new(needle: [], haystack: [], matches: {})
    CSV.foreach("sample.csv") do |n, h|
      data.needle << n if n
      data.haystack << h if h
      data.matches[n] = h if n && h
    end
    data
  end

  # Run a bulk match over the sample data using the given candidates strategy
  # and assert that the fraction of needles matched to their known good
  # haystack equals percent (within 0.001). Timing for the match is printed
  # via Benchmark.
  def assert_bulk_candidates(candidates, percent)
    data = sample

    # match, with benchmark
    output = nil
    Benchmark.bm(10) do |bm|
      bm.report(candidates.to_s) do
        matcher = Simhilarity::Bulk.new(candidates: candidates)
        output = matcher.matches(data.needle, data.haystack)
      end
    end

    # what percent of matches are correct?
    hits = output.count { |n, h, _score| data.matches[n] == h }
    correct = hits.to_f / data.needle.length

    # for debugging
    # printf("%% correct: %.3f\n", correct)
    # output.each do |n, h, score|
    #   good = data.matches[n] == h
    #   printf("%2s %4.2f %-35s %-35s\n", good ? "" : "xx", score || 0, n, h)
    # end

    assert_in_delta(percent, correct, 0.001, "percent #{correct} != #{percent}")
  end

  # Run cmd through the shell with its output discarded and assert that it
  # exited successfully. system returns true only for a zero exit status
  # (false for non-zero, nil if the command could not be run).
  def assert_system(cmd)
    assert(system("#{cmd} > /dev/null 2>&1"), "#{cmd} failed")
  end

  #
  # tests
  #

  # Matcher#read: strings pass through with the default reader, a non-string
  # raises RuntimeError, and a custom reader proc is honored.
  def test_read
    # default
    assert_equal "gub", @matcher.read("gub")

    # not a string
    assert_raise(RuntimeError) { @matcher.read(123) }

    # custom
    @matcher.reader = lambda(&:key)
    assert_equal "gub", @matcher.read(OpenStruct.new(key: "gub"))
  end

  # Matcher#normalize: checks the default normalizer's output for a sample
  # string and that a custom normalizer proc is honored.
  def test_normalizer
    # default
    assert_equal "hello world", @matcher.normalize(" HELLO,\tWORLD! ")

    # custom
    @matcher.normalizer = lambda(&:upcase)
    assert_equal "GUB", @matcher.normalize("gub")
  end

  # Matcher#ngrams: checks the default ngrammer's output for a sample string
  # and that a custom ngrammer proc is honored.
  def test_ngrams
    # default
    assert_equal ["hi", "i ", " 4", "42"], @matcher.ngrams("hi 42")

    # custom
    @matcher.ngrammer = lambda(&:split)
    assert_equal ["hi", "42"], @matcher.ngrams("hi 42")
  end

  # The reader/normalizer/ngrammer procs can also be supplied as options to
  # Matcher.new instead of through the setters.
  def test_proc_options
    matcher = Simhilarity::Matcher.new(reader: lambda(&:key), normalizer: lambda(&:upcase), ngrammer: lambda(&:split))
    assert_equal "gub", matcher.read(OpenStruct.new(key: "gub"))
    assert_equal "GUB", matcher.normalize("gub")
    assert_equal ["hi", "42"], matcher.ngrams("hi 42")
  end

  # Single#score for one pair of strings, compared against a known value.
  def test_single
    score = Simhilarity::Single.new.score("hello world", "hi worlds")
    assert_in_delta(0.556, score, 0.001, "test_single percent was wrong!")
  end

  # Bulk matching accuracy on the sample data for each candidates strategy.
  def test_bulk
    assert_bulk_candidates(:all, 0.974)
    assert_bulk_candidates(:ngrams, 0.974)
    assert_bulk_candidates(:simhash, 0.949)
  end

  # Smoke test the executable: each supported flag combination must exit
  # successfully when matching identity.txt against itself.
  def test_bin
    bin = "../bin/simhilarity"
    assert_system("#{bin} identity.txt identity.txt")
    assert_system("#{bin} -v identity.txt identity.txt")
    assert_system("#{bin} --candidates simhash identity.txt identity.txt")
    assert_system("#{bin} --candidates simhash=3 identity.txt identity.txt")
    assert_system("#{bin} --candidates ngrams identity.txt identity.txt")
    assert_system("#{bin} --candidates ngrams=3 identity.txt identity.txt")
    assert_system("#{bin} --candidates all identity.txt identity.txt")
  end
end
|
metadata
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simhilarity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Adam Doppelt
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-18 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bk
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: progressbar
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: awesome_print
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rake
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rdoc
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: Measure text similarity using frequency weighted ngrams.
|
95
|
+
email:
|
96
|
+
- amd@gurge.com
|
97
|
+
executables:
|
98
|
+
- simhilarity
|
99
|
+
extensions: []
|
100
|
+
extra_rdoc_files: []
|
101
|
+
files:
|
102
|
+
- .gitignore
|
103
|
+
- Gemfile
|
104
|
+
- LICENSE
|
105
|
+
- README.md
|
106
|
+
- Rakefile
|
107
|
+
- bin/simhilarity
|
108
|
+
- lib/simhilarity.rb
|
109
|
+
- lib/simhilarity/bits.rb
|
110
|
+
- lib/simhilarity/bulk.rb
|
111
|
+
- lib/simhilarity/candidate.rb
|
112
|
+
- lib/simhilarity/element.rb
|
113
|
+
- lib/simhilarity/matcher.rb
|
114
|
+
- lib/simhilarity/single.rb
|
115
|
+
- lib/simhilarity/version.rb
|
116
|
+
- simhilarity.gemspec
|
117
|
+
- test/harness
|
118
|
+
- test/identity.txt
|
119
|
+
- test/large_haystack.txt
|
120
|
+
- test/large_needles.txt
|
121
|
+
- test/sample.csv
|
122
|
+
- test/tests.rb
|
123
|
+
homepage: http://github.com/gurgeous/simhilarity
|
124
|
+
licenses: []
|
125
|
+
post_install_message:
|
126
|
+
rdoc_options: []
|
127
|
+
require_paths:
|
128
|
+
- lib
|
129
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ! '>='
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: 1.9.0
|
135
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
136
|
+
none: false
|
137
|
+
requirements:
|
138
|
+
- - ! '>='
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
segments:
|
142
|
+
- 0
|
143
|
+
hash: -1497809244519171705
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project: simhilarity
|
146
|
+
rubygems_version: 1.8.24
|
147
|
+
signing_key:
|
148
|
+
specification_version: 3
|
149
|
+
summary: Simhilarity - measure text similarity using frequency weighted ngrams.
|
150
|
+
test_files:
|
151
|
+
- test/harness
|
152
|
+
- test/identity.txt
|
153
|
+
- test/large_haystack.txt
|
154
|
+
- test/large_needles.txt
|
155
|
+
- test/sample.csv
|
156
|
+
- test/tests.rb
|