string_score 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.mdown +5 -0
- data/Rakefile +7 -0
- data/lib/string_score/version.rb +3 -0
- data/lib/string_score.rb +150 -0
- data/spec/string_score_spec.rb +119 -0
- data/string_score.gemspec +25 -0
- metadata +74 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.mdown
ADDED
data/Rakefile
ADDED
data/lib/string_score.rb
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# Fuzzy string scoring.
#
# Mix this module into String (or any object responding to #to_s) to get a
# #score method, or build a reusable scorer with StringScore.new(base).
# Scores are 1 for an exact match, 0 for no match, and a Float in between
# for partial matches.
module StringScore

  # Raised when scoring fails for a reason other than bad arguments.
  class InternalError < RuntimeError; end

  # Raised when an argument cannot be coerced to a string.
  class ArgumentError < RuntimeError; end

  NON_STRING_MSG = "Supply a string or an object which can be coerced to a string."

  # Matches the NoMethodError text produced when #to_s is missing. Both the
  # backtick-quoted and the straight-quoted method name are accepted.
  NON_STRING_ERROR_MESSAGE = /undefined method [`']to_s'/

  # Scores +target_string+ against the receiver.
  #
  # target_string - the abbreviation / query to look for in the receiver.
  # fuzziness     - 0.0 aborts on the first unmatched character; values
  #                 above 0.0 tolerate mismatches at a score penalty.
  #
  # Returns 1 on an exact match, 0 when nothing matches, otherwise a Float.
  # Raises StringScore::ArgumentError or StringScore::InternalError.
  def score(target_string, fuzziness = 0.0)
    # A frozen receiver cannot hold the cached scorer, so score without caching.
    return StringScore.new(self, fuzziness).score(target_string) if frozen?

    # Drop the cached scorer when it no longer describes the receiver.
    @string_scorer = nil if @string_scorer && @string_scorer.base_string != self
    @string_scorer ||= StringScore.new(self, fuzziness)
    # Forward the fuzziness on every call; the cached scorer only remembers
    # the value it was first built with.
    @string_scorer.score(target_string, fuzziness)
  end

  # Proxy to Scorer to simplify the calling API. Coercion of +base_string+
  # happens inside Scorer#initialize so failures are reported as
  # StringScore::ArgumentError.
  def self.new(base_string, fuzziness = 0.0)
    StringScore::Scorer.new(base_string, fuzziness)
  end

  # Scores candidate strings against a fixed base string.
  class Scorer

    attr_accessor :base_string, :default_fuziness

    # Correctly spelled aliases; the original accessor names are kept for
    # backward compatibility.
    alias_method :default_fuzziness, :default_fuziness
    alias_method :default_fuzziness=, :default_fuziness=

    # string    - the base string (coerced with #to_s).
    # fuzziness - default fuzziness used when #score is called without one.
    def initialize(string, fuzziness = 0.0)
      with_error_handling do
        @default_fuziness = fuzziness
        @base_string = string.to_s
      end
    end

    # Scores +target_string+ (coerced with #to_s) against the base string.
    #
    # Returns 1 on an exact match, otherwise the result of #partial_score.
    # Raises StringScore::ArgumentError when +target_string+ has no #to_s,
    # StringScore::InternalError for any other failure.
    def score(target_string, fuzziness = nil)
      with_error_handling do
        target = target_string.to_s
        if base_string == target
          1
        else
          partial_score(target, fuzziness || default_fuziness)
        end
      end
    end

    # Walks +target+ character by character, locating each one (in either
    # case) in the not-yet-matched tail of the base string and accumulating
    # a per-character score plus bonuses.
    #
    # Returns 0 when a character is missing and fuzziness is not above 0.0,
    # or when the base string is empty; otherwise a Float.
    # Raises StringScore::InternalError if the result leaves the 0.0..1.0 range.
    def partial_score(target, fuzziness)
      target = target.to_s

      # Nothing can match an empty base, and dividing by its length below
      # would produce NaN.
      return 0 if base_string.empty?

      base_length   = base_string.length.to_f
      target_length = target.length.to_f

      offset                = 0     # where the unmatched tail of base_string starts
      cumulative_score      = 0.0
      fuzzy_penalty         = 1.0   # divisor that grows with every tolerated mismatch
      start_of_string_bonus = false

      target.each_char.with_index do |c, target_index|
        position = [
          base_string.index(c.downcase, offset),
          base_string.index(c.upcase, offset)
        ].compact.min

        unless position
          return 0 unless fuzziness > 0.0 # abort on any mismatch
          fuzzy_penalty += (1 - fuzziness)
          next
        end

        # exact case match scores higher than a case-insensitive one
        character_score = base_string[position] == c ? 0.2 : 0.1

        # consecutive bonus: the match sits right at the start of the tail
        if position == offset
          character_score += 0.6
          start_of_string_bonus = true if target_index == 0
        end

        # acronym bonus: the match follows a space. Position 0 has no
        # preceding character (index -1 would wrap to the end of the string).
        if position > 0 && base_string[position - 1] == ' '
          character_score += 0.8
        end

        cumulative_score += character_score
        offset = position + 1
      end

      matched_score = cumulative_score / base_length

      # average the raw score with one weighted by how much of the base was typed
      with_long_string_bonus = ((matched_score * (target_length / base_length)) + matched_score) / 2

      final_score = with_long_string_bonus / fuzzy_penalty
      final_score = [final_score + 0.15, 1.0].min if start_of_string_bonus

      if final_score > 1.0 || final_score < 0.0
        raise StringScore::InternalError, "Out of range score: '#{final_score}'"
      end

      final_score
    end

    # Runs the given block, translating failures into this library's errors:
    # a missing #to_s becomes StringScore::ArgumentError, every other
    # StandardError becomes StringScore::InternalError.
    def with_error_handling
      yield
    rescue StringScore::InternalError, StringScore::ArgumentError
      raise # allow nesting of #with_error_handling
    rescue NoMethodError => e
      if e.message =~ NON_STRING_ERROR_MESSAGE
        raise StringScore::ArgumentError, NON_STRING_MSG
      else
        raise StringScore::InternalError, "#{e.class}: #{e.message}"
      end
    rescue => e
      raise StringScore::InternalError, "#{e.class}: #{e.message}"
    end

  end

end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
require 'string_score'
|
|
2
|
+
|
|
3
|
+
String.send(:include, StringScore)
|
|
4
|
+
|
|
5
|
+
RSpec::Matchers.define :be_greater_than do |expected|
|
|
6
|
+
match do |actual|
|
|
7
|
+
expected < actual
|
|
8
|
+
end
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
RSpec::Matchers.define :be_less_than do |expected|
|
|
12
|
+
match do |actual|
|
|
13
|
+
expected > actual
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
RSpec::Matchers.define :be_about do |expected|
|
|
18
|
+
match do |actual|
|
|
19
|
+
expected.to_f.round(5) == actual.to_f.round(5) # compare after rounding to 5 decimal places
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
describe StringScore do
|
|
24
|
+
|
|
25
|
+
subject { StringScore.new('Hello World') }
|
|
26
|
+
|
|
27
|
+
it "provides a method directly on a string instance" do
|
|
28
|
+
"foobar".score('foo').should == StringScore.new("foobar").score('foo')
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it "scores at 1 for exact match" do
|
|
32
|
+
subject.score('Hello World').should == 1
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# probably rare but need to handle just in case
|
|
36
|
+
it "allows updates of string to match against" do
|
|
37
|
+
string_to_match = "Hello World"
|
|
38
|
+
string_to_match.score('Hello World').should == 1
|
|
39
|
+
string_to_match.gsub!(/\w/, 'X')
|
|
40
|
+
string_to_match.score('Hello World').should_not == 1
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "scores 0 for non-matches (character not in string)" do
|
|
44
|
+
subject.score("hellx").should == 0
|
|
45
|
+
subject.score("hello_world").should == 0
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "matches sequentially" do
|
|
49
|
+
subject.score('WH').should == 0
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it "prefers same-case matches" do
|
|
53
|
+
subject.score('hello').should be_less_than(subject.score('Hello'))
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it "scores higher on closers matchs" do
|
|
57
|
+
subject.score('H').should be_less_than(subject.score('He'))
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it "will match despite wrong case" do
|
|
61
|
+
subject.score("hello").should be_greater_than(0)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it "scores progressively higher weighting on more matches" do
|
|
65
|
+
subject.score("e").should be_less_than(subject.score("h"))
|
|
66
|
+
subject.score("h").should be_less_than(subject.score("he"))
|
|
67
|
+
subject.score("hel").should be_less_than(subject.score("hell"))
|
|
68
|
+
subject.score("hell").should be_less_than(subject.score("hello"))
|
|
69
|
+
subject.score("hello").should be_less_than(subject.score("helloworld"))
|
|
70
|
+
subject.score("helloworl").should be_less_than(subject.score("hello worl"))
|
|
71
|
+
subject.score("hello worl").should be_less_than(subject.score("hello world"))
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
it "provides a consecutive letter bonus" do
|
|
75
|
+
subject.score('Hel').should be_greater_than(subject.score('Hld'))
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
it "gives an acronym bonus" do
|
|
79
|
+
subject.score('HW').should be_greater_than(subject.score('Ho'))
|
|
80
|
+
'yet another Hello World'.score('yaHW').should be_greater_than('Hello World'.score('yet another'))
|
|
81
|
+
"Hillsdale Michigan".score("HiMi").should be_greater_than("Hillsdale Michigan".score("Hill"))
|
|
82
|
+
|
|
83
|
+
# I think these pass in error in the js version, will check
|
|
84
|
+
# "Hillsdale Michigan".score("HiMi").should be_greater_than("Hillsdale Michigan".score("hills"))
|
|
85
|
+
# "Hillsdale Michigan".score("HiMi").should be_greater_than("Hillsdale Michigan".score("hillsd"))
|
|
86
|
+
# "Hillsdale Michigan".score("HiMi").should be_greater_than("Hillsdale Michigan".score("illsda"))
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it "gives a bonus for matching the start of the string" do
|
|
90
|
+
"Hillsdale".score("hi").should be_greater_than("Chippewa".score("hi"))
|
|
91
|
+
"hello world".score("h").should be_greater_than("hello world".score("w"))
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
it "gives proper string weights" do
|
|
95
|
+
"Research Resources North".score('res').should be_greater_than("Mary Conces".score('res'))
|
|
96
|
+
"Research Resources North".score('res').should be_greater_than("Bonnie Strathern - Southwest Michigan Title Search".score('res'))
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it "gives start of string bonuses" do
|
|
100
|
+
"Mary Large".score('mar').should be_greater_than("Large Mary".score('mar'))
|
|
101
|
+
"Silly Mary Large".score('mar').should be_about("Silly Large Mary".score('mar'))
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
it "can fuzzily match strings" do
|
|
106
|
+
subject.score('Hz').should == 0
|
|
107
|
+
subject.score('Hz', 0.5).should be_less_than(subject.score('H', 0.5))
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
it "should be tuned well" do
|
|
111
|
+
"hello world".score("hello worl", 0.5).should be_greater_than("hello world".score("hello wor1", 0.5))
|
|
112
|
+
'Hello World'.score('jello',0.5).should be_greater_than(0)
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
it "should have varying degrees of fuziness" do
|
|
116
|
+
subject.score('Hz', 0.9).should be_greater_than(subject.score('Hz', 0.5))
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
|
3
|
+
require "string_score/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |s|
|
|
6
|
+
s.name = "string_score"
|
|
7
|
+
s.version = StringScore::VERSION
|
|
8
|
+
s.platform = Gem::Platform::RUBY
|
|
9
|
+
s.authors = ["Jim Lindley"]
|
|
10
|
+
s.email = ["jim@jimlindley.com"]
|
|
11
|
+
s.homepage = ""
|
|
12
|
+
s.summary = %q{Score how close a string is to another string.}
|
|
13
|
+
s.description = %q{Port of https://github.com/joshaven/string_score from js to ruby.}
|
|
14
|
+
|
|
15
|
+
s.rubyforge_project = "string_score"
|
|
16
|
+
|
|
17
|
+
s.files = `git ls-files`.split("\n")
|
|
18
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
|
19
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
|
20
|
+
s.require_paths = ["lib"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
s.add_development_dependency "rspec", '~>2.5.0'
|
|
24
|
+
|
|
25
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: string_score
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 0.0.1
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- Jim Lindley
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
|
|
13
|
+
date: 2011-03-15 00:00:00 -07:00
|
|
14
|
+
default_executable:
|
|
15
|
+
dependencies:
|
|
16
|
+
- !ruby/object:Gem::Dependency
|
|
17
|
+
name: rspec
|
|
18
|
+
prerelease: false
|
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
20
|
+
none: false
|
|
21
|
+
requirements:
|
|
22
|
+
- - ~>
|
|
23
|
+
- !ruby/object:Gem::Version
|
|
24
|
+
version: 2.5.0
|
|
25
|
+
type: :development
|
|
26
|
+
version_requirements: *id001
|
|
27
|
+
description: Port of https://github.com/joshaven/string_score from js to ruby.
|
|
28
|
+
email:
|
|
29
|
+
- jim@jimlindley.com
|
|
30
|
+
executables: []
|
|
31
|
+
|
|
32
|
+
extensions: []
|
|
33
|
+
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
|
|
36
|
+
files:
|
|
37
|
+
- .gitignore
|
|
38
|
+
- Gemfile
|
|
39
|
+
- README.mdown
|
|
40
|
+
- Rakefile
|
|
41
|
+
- lib/string_score.rb
|
|
42
|
+
- lib/string_score/version.rb
|
|
43
|
+
- spec/string_score_spec.rb
|
|
44
|
+
- string_score.gemspec
|
|
45
|
+
has_rdoc: true
|
|
46
|
+
homepage: ""
|
|
47
|
+
licenses: []
|
|
48
|
+
|
|
49
|
+
post_install_message:
|
|
50
|
+
rdoc_options: []
|
|
51
|
+
|
|
52
|
+
require_paths:
|
|
53
|
+
- lib
|
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
55
|
+
none: false
|
|
56
|
+
requirements:
|
|
57
|
+
- - ">="
|
|
58
|
+
- !ruby/object:Gem::Version
|
|
59
|
+
version: "0"
|
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
61
|
+
none: false
|
|
62
|
+
requirements:
|
|
63
|
+
- - ">="
|
|
64
|
+
- !ruby/object:Gem::Version
|
|
65
|
+
version: "0"
|
|
66
|
+
requirements: []
|
|
67
|
+
|
|
68
|
+
rubyforge_project: string_score
|
|
69
|
+
rubygems_version: 1.6.0
|
|
70
|
+
signing_key:
|
|
71
|
+
specification_version: 3
|
|
72
|
+
summary: Score how close a string is to another string.
|
|
73
|
+
test_files:
|
|
74
|
+
- spec/string_score_spec.rb
|