simhilarity 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +165 -0
- data/Rakefile +18 -0
- data/bin/simhilarity +84 -0
- data/lib/simhilarity/bits.rb +62 -0
- data/lib/simhilarity/bulk.rb +163 -0
- data/lib/simhilarity/candidate.rb +46 -0
- data/lib/simhilarity/element.rb +50 -0
- data/lib/simhilarity/matcher.rb +164 -0
- data/lib/simhilarity/single.rb +18 -0
- data/lib/simhilarity/version.rb +4 -0
- data/lib/simhilarity.rb +8 -0
- data/simhilarity.gemspec +27 -0
- data/test/harness +138 -0
- data/test/identity.txt +1 -0
- data/test/large_haystack.txt +10000 -0
- data/test/large_needles.txt +500 -0
- data/test/sample.csv +2669 -0
- data/test/tests.rb +125 -0
- metadata +156 -0
data/test/tests.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require "awesome_print"
|
2
|
+
require "benchmark"
|
3
|
+
require "csv"
|
4
|
+
require "ostruct"
|
5
|
+
require "simhilarity"
|
6
|
+
require "test/unit"
|
7
|
+
|
8
|
+
# Test suite for the Simhilarity gem: covers the Matcher hooks (reader,
# normalizer, ngrammer), Single scoring, Bulk matching accuracy against
# sample.csv, and a smoke test of the command line executable.
class Tests < Test::Unit::TestCase
  # Tolerance used whenever a floating point score or ratio is compared.
  DELTA = 0.001

  #
  # helpers
  #

  # Runs before every test. Switches into the directory holding this file
  # so the fixtures (sample.csv, identity.txt) and ../bin/simhilarity can be
  # referenced with relative paths, and builds a fresh default Matcher.
  def setup
    Dir.chdir(File.expand_path("../", __FILE__))
    @matcher = Simhilarity::Matcher.new
  end

  # Read the sample.csv test data file.
  #
  # Returns an OpenStruct where +needle+ is a list of needle strings,
  # +haystack+ is a list of haystack strings, and +matches+ is a hash
  # mapping from needle to haystack for known good matches (rows where both
  # columns are present).
  def sample
    data = OpenStruct.new(needle: [], haystack: [], matches: {})
    # Stream the file row by row; the first two columns are needle/haystack.
    CSV.foreach("sample.csv") do |n, h|
      data.needle << n if n
      data.haystack << h if h
      data.matches[n] = h if n && h
    end
    data
  end

  # Run a Bulk match over the sample data using the given candidate
  # selection strategy, print a benchmark line, and assert that the fraction
  # of needles matched to their known-good haystack equals +percent+
  # (within DELTA).
  #
  # candidates - value passed as the :candidates option of Simhilarity::Bulk.
  # percent    - expected fraction (0..1) of needles matched correctly.
  def assert_bulk_candidates(candidates, percent)
    data = sample

    # match, with benchmark
    output = nil
    Benchmark.bm(10) do |bm|
      bm.report(candidates.to_s) do
        matcher = Simhilarity::Bulk.new(candidates: candidates)
        output = matcher.matches(data.needle, data.haystack)
      end
    end

    # what percent of matches are correct?
    hits = output.count { |n, h, _score| data.matches[n] == h }
    accuracy = hits.to_f / data.needle.length

    # for debugging
    # printf("%% correct: %.3f\n", accuracy)
    # output.each do |n, h, score|
    #   good = data.matches[n] == h
    #   printf("%2s %4.2f %-35s %-35s\n", good ? "" : "xx", score || 0, n, h)
    # end

    assert_in_delta(percent, accuracy, DELTA, "percent #{accuracy} != #{percent}")
  end

  # Run +cmd+ through the shell with all output discarded and assert that it
  # exited successfully. Kernel#system returns true only for a zero exit
  # status (false for non-zero, nil if the command could not be run).
  def assert_system(cmd)
    assert(system("#{cmd} > /dev/null 2>&1"), "#{cmd} failed")
  end

  #
  # tests
  #

  # Matcher#read returns strings untouched by default, raises for
  # non-strings, and delegates to a custom reader when one is assigned.
  def test_read
    # default
    assert_equal "gub", @matcher.read("gub")

    # not a string
    assert_raise(RuntimeError) { @matcher.read(123) }

    # custom
    # NOTE: a literal lambda is used instead of lambda(&:key) because
    # Kernel#lambda without a literal block is deprecated since Ruby 3.0 and
    # raises ArgumentError on Ruby 3.3+.
    @matcher.reader = ->(obj) { obj.key }
    assert_equal "gub", @matcher.read(OpenStruct.new(key: "gub"))
  end

  # Matcher#normalize with the default normalizer and with a custom one.
  def test_normalizer
    # default
    assert_equal "hello world", @matcher.normalize(" HELLO,\tWORLD! ")

    # custom
    @matcher.normalizer = ->(str) { str.upcase }
    assert_equal "GUB", @matcher.normalize("gub")
  end

  # Matcher#ngrams with the default ngrammer and with a custom one.
  def test_ngrams
    # default
    assert_equal ["hi", "i ", " 4", "42"], @matcher.ngrams("hi 42")

    # custom
    @matcher.ngrammer = ->(str) { str.split }
    assert_equal ["hi", "42"], @matcher.ngrams("hi 42")
  end

  # The reader/normalizer/ngrammer procs can also be supplied as
  # constructor options.
  def test_proc_options
    matcher = Simhilarity::Matcher.new(
      reader: ->(obj) { obj.key },
      normalizer: ->(str) { str.upcase },
      ngrammer: ->(str) { str.split }
    )
    assert_equal "gub", matcher.read(OpenStruct.new(key: "gub"))
    assert_equal "GUB", matcher.normalize("gub")
    assert_equal ["hi", "42"], matcher.ngrams("hi 42")
  end

  # Simhilarity::Single#score for one known pair of strings.
  def test_single
    score = Simhilarity::Single.new.score("hello world", "hi worlds")
    assert_in_delta(0.556, score, DELTA, "test_single percent was wrong!")
  end

  # Expected accuracy on sample.csv for each candidate selection strategy.
  def test_bulk
    assert_bulk_candidates(:all, 0.974)
    assert_bulk_candidates(:ngrams, 0.974)
    assert_bulk_candidates(:simhash, 0.949)
  end

  # Smoke test: the executable exits successfully for each supported
  # combination of command line flags.
  def test_bin
    bin = "../bin/simhilarity"
    [
      "identity.txt identity.txt",
      "-v identity.txt identity.txt",
      "--candidates simhash identity.txt identity.txt",
      "--candidates simhash=3 identity.txt identity.txt",
      "--candidates ngrams identity.txt identity.txt",
      "--candidates ngrams=3 identity.txt identity.txt",
      "--candidates all identity.txt identity.txt"
    ].each do |args|
      assert_system("#{bin} #{args}")
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simhilarity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Adam Doppelt
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-18 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bk
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: progressbar
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: awesome_print
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rake
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rdoc
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: Measure text similarity using frequency weighted ngrams.
|
95
|
+
email:
|
96
|
+
- amd@gurge.com
|
97
|
+
executables:
|
98
|
+
- simhilarity
|
99
|
+
extensions: []
|
100
|
+
extra_rdoc_files: []
|
101
|
+
files:
|
102
|
+
- .gitignore
|
103
|
+
- Gemfile
|
104
|
+
- LICENSE
|
105
|
+
- README.md
|
106
|
+
- Rakefile
|
107
|
+
- bin/simhilarity
|
108
|
+
- lib/simhilarity.rb
|
109
|
+
- lib/simhilarity/bits.rb
|
110
|
+
- lib/simhilarity/bulk.rb
|
111
|
+
- lib/simhilarity/candidate.rb
|
112
|
+
- lib/simhilarity/element.rb
|
113
|
+
- lib/simhilarity/matcher.rb
|
114
|
+
- lib/simhilarity/single.rb
|
115
|
+
- lib/simhilarity/version.rb
|
116
|
+
- simhilarity.gemspec
|
117
|
+
- test/harness
|
118
|
+
- test/identity.txt
|
119
|
+
- test/large_haystack.txt
|
120
|
+
- test/large_needles.txt
|
121
|
+
- test/sample.csv
|
122
|
+
- test/tests.rb
|
123
|
+
homepage: http://github.com/gurgeous/simhilarity
|
124
|
+
licenses: []
|
125
|
+
post_install_message:
|
126
|
+
rdoc_options: []
|
127
|
+
require_paths:
|
128
|
+
- lib
|
129
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ! '>='
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: 1.9.0
|
135
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
136
|
+
none: false
|
137
|
+
requirements:
|
138
|
+
- - ! '>='
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
segments:
|
142
|
+
- 0
|
143
|
+
hash: -1497809244519171705
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project: simhilarity
|
146
|
+
rubygems_version: 1.8.24
|
147
|
+
signing_key:
|
148
|
+
specification_version: 3
|
149
|
+
summary: Simhilarity - measure text similarity using frequency weighted ngrams.
|
150
|
+
test_files:
|
151
|
+
- test/harness
|
152
|
+
- test/identity.txt
|
153
|
+
- test/large_haystack.txt
|
154
|
+
- test/large_needles.txt
|
155
|
+
- test/sample.csv
|
156
|
+
- test/tests.rb
|