measurable 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/History.txt +3 -0
- data/README.md +28 -32
- data/lib/measurable.rb +1 -2
- data/lib/measurable/chebyshev.rb +23 -19
- data/lib/measurable/cosine.rb +65 -45
- data/lib/measurable/euclidean.rb +59 -68
- data/lib/measurable/hamming.rb +28 -24
- data/lib/measurable/haversine.rb +52 -47
- data/lib/measurable/jaccard.rb +58 -55
- data/lib/measurable/kullback_leibler.rb +39 -0
- data/lib/measurable/maxmin.rb +33 -28
- data/lib/measurable/minkowski.rb +39 -22
- data/lib/measurable/tanimoto.rb +47 -27
- data/lib/measurable/version.rb +1 -1
- data/spec/chebyshev_spec.rb +20 -1
- data/spec/cosine_spec.rb +16 -0
- data/spec/euclidean_spec.rb +17 -1
- data/spec/hamming_spec.rb +17 -1
- data/spec/haversine_spec.rb +21 -1
- data/spec/jaccard_spec.rb +21 -0
- data/spec/kullback_leibler_spec.rb +46 -0
- data/spec/levenshtein_spec.rb +16 -0
- data/spec/maxmin_spec.rb +20 -1
- data/spec/minkowski_spec.rb +17 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/tanimoto_spec.rb +20 -0
- metadata +6 -2
data/lib/measurable/haversine.rb
CHANGED
@@ -15,57 +15,62 @@ module Measurable
|
|
15
15
|
:meters => EARTH_RADIUS_IN_KILOMETERS * 1000
|
16
16
|
}
|
17
17
|
|
18
|
-
|
19
|
-
# haversine(u, v) -> Float
|
20
|
-
#
|
21
|
-
# Compute accurate distances between two points given their latitudes and
|
22
|
-
# longitudes, even for short distances. This isn't a distance measure in the
|
23
|
-
# same sense as the other methods in +Measurable+.
|
24
|
-
#
|
25
|
-
# The distance returned is the great circle (or orthodromic) distance between
|
26
|
-
# +u+ and +v+, which is the shortest distance between them on the surface of
|
27
|
-
# a sphere. Thus, this implementation considers the Earth to be a sphere.
|
28
|
-
#
|
29
|
-
# Reminding that the input vectors are of the form [latitude, longitude] in
|
30
|
-
# degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
|
31
|
-
# Paulo), the corresponding vector is [-23.53333, -46.61667].
|
32
|
-
#
|
33
|
-
# References:
|
34
|
-
# - http://www.movable-type.co.uk/scripts/latlong.html
|
35
|
-
# - http://en.wikipedia.org/wiki/Haversine_formula
|
36
|
-
# - http://en.wikipedia.org/wiki/Great-circle_distance
|
37
|
-
#
|
38
|
-
# * *Arguments* :
|
39
|
-
# - +u+ -> An array of Numeric objects.
|
40
|
-
# - +v+ -> An array of Numeric objects.
|
41
|
-
# - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
|
42
|
-
# options are +:miles+, +:feet+, +:km+ and +:meters+.
|
43
|
-
# * *Returns* :
|
44
|
-
# - The great circle distance between +u+ and +v+.
|
45
|
-
# * *Raises* :
|
46
|
-
# - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
|
47
|
-
# - +ArgumentError+ -> +unit+ must be a Symbol.
|
48
|
-
#
|
49
|
-
def haversine(u, v, unit = :meters)
|
50
|
-
# TODO: Create better exceptions.
|
51
|
-
raise ArgumentError if u.size != 2 || v.size != 2
|
52
|
-
raise ArgumentError if unit.class != Symbol
|
18
|
+
module Haversine
|
53
19
|
|
54
|
-
|
55
|
-
|
20
|
+
# call-seq:
|
21
|
+
# haversine(u, v, unit = :meters) -> Float
|
22
|
+
#
|
23
|
+
# Compute accurate distances between two points given their latitudes and
|
24
|
+
# longitudes, even for short distances. This isn't a distance measure in the
|
25
|
+
# same sense as the other methods in +Measurable+.
|
26
|
+
#
|
27
|
+
# The distance returned is the great circle (or orthodromic) distance between
|
28
|
+
# +u+ and +v+, which is the shortest distance between them on the surface of
|
29
|
+
# a sphere. Thus, this implementation considers the Earth to be a sphere.
|
30
|
+
#
|
31
|
+
# Remember that the input vectors are of the form [latitude, longitude] in
|
32
|
+
# degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
|
33
|
+
# Paulo), the corresponding vector is [-23.53333, -46.61667].
|
34
|
+
#
|
35
|
+
# References:
|
36
|
+
# - http://www.movable-type.co.uk/scripts/latlong.html
|
37
|
+
# - http://en.wikipedia.org/wiki/Haversine_formula
|
38
|
+
# - http://en.wikipedia.org/wiki/Great-circle_distance
|
39
|
+
#
|
40
|
+
# * *Arguments* :
|
41
|
+
# - +u+ -> An array of Numeric objects.
|
42
|
+
# - +v+ -> An array of Numeric objects.
|
43
|
+
# - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
|
44
|
+
# options are +:miles+, +:feet+, +:km+ and +:meters+.
|
45
|
+
# * *Returns* :
|
46
|
+
# - The great circle distance between +u+ and +v+.
|
47
|
+
# * *Raises* :
|
48
|
+
# - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
|
49
|
+
# - +ArgumentError+ -> +unit+ must be a Symbol.
|
50
|
+
#
|
51
|
+
def haversine(u, v, unit = :meters)
|
52
|
+
# TODO: Create better exceptions.
|
53
|
+
raise ArgumentError if u.size != 2 || v.size != 2
|
54
|
+
raise ArgumentError if unit.class != Symbol
|
56
55
|
|
57
|
-
|
58
|
-
|
56
|
+
dlat = u[0] - v[0]
|
57
|
+
dlon = u[1] - v[1]
|
59
58
|
|
60
|
-
|
61
|
-
|
59
|
+
dlon_rad = dlon * RAD_PER_DEG
|
60
|
+
dlat_rad = dlat * RAD_PER_DEG
|
62
61
|
|
63
|
-
|
64
|
-
|
62
|
+
lat1_rad = v[0] * RAD_PER_DEG
|
63
|
+
lon1_rad = v[1] * RAD_PER_DEG
|
65
64
|
|
66
|
-
|
67
|
-
|
65
|
+
lat2_rad = u[0] * RAD_PER_DEG
|
66
|
+
lon2_rad = u[1] * RAD_PER_DEG
|
68
67
|
|
69
|
-
|
68
|
+
a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
|
69
|
+
c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
|
70
|
+
|
71
|
+
EARTH_RADIUS[unit] * c
|
72
|
+
end
|
70
73
|
end
|
71
|
-
|
74
|
+
|
75
|
+
extend Measurable::Haversine
|
76
|
+
end
|
data/lib/measurable/jaccard.rb
CHANGED
@@ -1,62 +1,65 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Jaccard
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
4
|
+
# call-seq:
|
5
|
+
# jaccard_index(u, v) -> Float
|
6
|
+
#
|
7
|
+
# Give the similarity between two binary vectors +u+ and +v+. Calculated as:
|
8
|
+
# jaccard_index = |intersection| / |union|
|
9
|
+
#
|
10
|
+
# In which intersection and union refer to +u+ and +v+ and |x| is the
|
11
|
+
# cardinality of set x.
|
12
|
+
#
|
13
|
+
# For example:
|
14
|
+
# jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
|
15
|
+
#
|
16
|
+
# Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
|
17
|
+
#
|
18
|
+
# See: http://en.wikipedia.org/wiki/Jaccard_coefficient
|
19
|
+
#
|
20
|
+
# * *Arguments* :
|
21
|
+
# - +u+ -> Array.
|
22
|
+
# - +v+ -> Array.
|
23
|
+
# * *Returns* :
|
24
|
+
# - Float value representing the Jaccard similarity coefficient between
|
25
|
+
# +u+ and +v+.
|
26
|
+
# * *Raises* :
|
27
|
+
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
28
|
+
#
|
29
|
+
def jaccard_index(u, v)
|
30
|
+
# TODO: Change this to a more specific, custom-made exception.
|
31
|
+
raise ArgumentError if u.size != v.size
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
intersection = u & v
|
34
|
+
union = u | v
|
35
|
+
intersection.length.to_f / union.length
|
36
|
+
end
|
34
37
|
|
35
|
-
|
36
|
-
|
38
|
+
# call-seq:
|
39
|
+
# jaccard(u, v) -> Float
|
40
|
+
#
|
41
|
+
# The jaccard distance is a measure of dissimilarity between two sets. It is
|
42
|
+
# calculated as:
|
43
|
+
# jaccard_distance = 1 - jaccard_index
|
44
|
+
#
|
45
|
+
# This is a proper metric, i.e. the following conditions hold:
|
46
|
+
# - Symmetry: jaccard(u, v) == jaccard(v, u)
|
47
|
+
# - Non-negative: jaccard(u, v) >= 0
|
48
|
+
# - Coincidence axiom: jaccard(u, v) == 0 if u == v
|
49
|
+
# - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
|
50
|
+
#
|
51
|
+
# * *Arguments* :
|
52
|
+
# - +u+ -> Array.
|
53
|
+
# - +v+ -> Array.
|
54
|
+
# * *Returns* :
|
55
|
+
# - Float value representing the dissimilarity between +u+ and +v+.
|
56
|
+
# * *Raises* :
|
57
|
+
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
58
|
+
#
|
59
|
+
def jaccard(u, v)
|
60
|
+
1 - jaccard_index(u, v)
|
61
|
+
end
|
37
62
|
|
38
|
-
|
39
|
-
# jaccard(u, v) -> Float
|
40
|
-
#
|
41
|
-
# The jaccard distance is a measure of dissimilarity between two sets. It is
|
42
|
-
# calculated as:
|
43
|
-
# jaccard_distance = 1 - jaccard_index
|
44
|
-
#
|
45
|
-
# This is a proper metric, i.e. the following conditions hold:
|
46
|
-
# - Symmetry: jaccard(u, v) == jaccard(v, u)
|
47
|
-
# - Non-negative: jaccard(u, v) >= 0
|
48
|
-
# - Coincidence axiom: jaccard(u, v) == 0 if u == v
|
49
|
-
# - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
|
50
|
-
#
|
51
|
-
# * *Arguments* :
|
52
|
-
# - +u+ -> Array.
|
53
|
-
# - +v+ -> Array.
|
54
|
-
# * *Returns* :
|
55
|
-
# - Float value representing the dissimilarity between +u+ and +v+.
|
56
|
-
# * *Raises* :
|
57
|
-
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
58
|
-
#
|
59
|
-
def jaccard(u, v)
|
60
|
-
1 - jaccard_index(u, v)
|
63
|
+
extend Measurable::Jaccard
|
61
64
|
end
|
62
65
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Measurable
|
2
|
+
module KullbackLeibler
|
3
|
+
|
4
|
+
# call-seq:
|
5
|
+
# kullback_leibler(p, q) -> Float
|
6
|
+
#
|
7
|
+
# The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
|
8
|
+
# a measure of their dissimilarity. However, it doesn't obey the triangular
|
9
|
+
# inequality and isn't symmetric, thus it isn't a metric.
|
10
|
+
#
|
11
|
+
# It is calculated as follows:
|
12
|
+
#
|
13
|
+
# KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
|
14
|
+
#
|
15
|
+
# With distributions +p+ and +q+ represented as vectors of N elements
|
16
|
+
# summing to 1.0.
|
17
|
+
#
|
18
|
+
# References:
|
19
|
+
# - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
|
20
|
+
# - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
|
21
|
+
# Natural Language Processing.
|
22
|
+
#
|
23
|
+
# * *Arguments*:
|
24
|
+
# - +p+ -> A probability distribution represented by a n-element Array.
|
25
|
+
# - +q+ -> A probability distribution represented by a n-element Array.
|
26
|
+
# * *Returns*:
|
27
|
+
# - A measure of the difference between the probability distributions +p+ and +q+.
|
28
|
+
def kullback_leibler(p, q)
|
29
|
+
# TODO: Change this to a more specific, custom-made exception.
|
30
|
+
raise ArgumentError if p.size != q.size
|
31
|
+
|
32
|
+
p.zip(q).reduce(0.0) do |acc, probs|
|
33
|
+
acc += probs[0] * Math.log(probs[0] / probs[1])
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
extend Measurable::KullbackLeibler
|
39
|
+
end
|
data/lib/measurable/maxmin.rb
CHANGED
@@ -1,34 +1,39 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Maxmin
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
4
|
+
# call-seq:
|
5
|
+
# maxmin(u, v) -> Float
|
6
|
+
#
|
7
|
+
# The "Max-min distance" is used to measure similarity between two vectors.
|
8
|
+
#
|
9
|
+
# When used in k-means clustering, this similarity measure can give better
|
10
|
+
# results in some datasets, as pointed out in the paper "K-means clustering
|
11
|
+
# using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
|
12
|
+
#
|
13
|
+
# See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
|
14
|
+
#
|
15
|
+
# * *Arguments* :
|
16
|
+
# - +u+ -> An array of Numeric objects.
|
17
|
+
# - +v+ -> An array of Numeric objects.
|
18
|
+
# * *Returns* :
|
19
|
+
# - Similarity between +u+ and +v+.
|
20
|
+
# * *Raises* :
|
21
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
22
|
+
#
|
23
|
+
def maxmin(u, v)
|
24
|
+
# TODO: Change this to a more specific, custom-made exception.
|
25
|
+
raise ArgumentError if u.size != v.size
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
|
28
|
+
acc[0] += attributes.min
|
29
|
+
acc[1] += attributes.max
|
30
|
+
acc
|
31
|
+
end
|
32
|
+
|
33
|
+
sum_min / sum_max
|
30
34
|
end
|
31
35
|
|
32
|
-
sum_min / sum_max
|
33
36
|
end
|
34
|
-
|
37
|
+
|
38
|
+
extend Measurable::Maxmin
|
39
|
+
end
|
data/lib/measurable/minkowski.rb
CHANGED
@@ -1,28 +1,45 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Minkowski
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
4
|
+
# call-seq:
|
5
|
+
# minkowski(u, v) -> Numeric
|
6
|
+
#
|
7
|
+
# Calculate the sum of the absolute value of the differences between each
|
8
|
+
# coordinate of +u+ and +v+.
|
9
|
+
#
|
10
|
+
# * *Arguments* :
|
11
|
+
# - +u+ -> An array of Numeric objects.
|
12
|
+
# - +v+ -> An array of Numeric objects.
|
13
|
+
# * *Returns* :
|
14
|
+
# - The Minkowski distance of order 1 (the L1, Manhattan or cityblock distance) between +u+ and +v+.
|
15
|
+
# * *Raises* :
|
16
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
17
|
+
#
|
18
|
+
def minkowski(u, v)
|
19
|
+
# TODO: Change this to a more specific, custom-made exception.
|
20
|
+
raise ArgumentError if u.size != v.size
|
20
21
|
|
21
|
-
|
22
|
-
|
22
|
+
u.zip(v).reduce(0) do |acc, elem|
|
23
|
+
acc += (elem[0] - elem[1]).abs
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.extended(base) # :nodoc:
|
28
|
+
base.instance_eval do
|
29
|
+
alias :cityblock :minkowski
|
30
|
+
alias :manhattan :minkowski
|
31
|
+
end
|
32
|
+
super
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.included(base) # :nodoc:
|
36
|
+
base.class_eval do
|
37
|
+
alias :cityblock :minkowski
|
38
|
+
alias :manhattan :minkowski
|
39
|
+
end
|
40
|
+
super
|
23
41
|
end
|
24
42
|
end
|
25
43
|
|
26
|
-
|
27
|
-
|
28
|
-
end
|
44
|
+
extend Measurable::Minkowski
|
45
|
+
end
|
data/lib/measurable/tanimoto.rb
CHANGED
@@ -1,32 +1,52 @@
|
|
1
|
+
require 'measurable/jaccard'
|
2
|
+
|
1
3
|
module Measurable
|
4
|
+
module Tanimoto
|
5
|
+
|
6
|
+
# call-seq:
|
7
|
+
# tanimoto(u, v) -> Float
|
8
|
+
#
|
9
|
+
# Tanimoto distance is a coefficient explicitly chosen so as to allow for
|
10
|
+
# two dissimilar specimens to be similar to a third one. This breaks the
|
11
|
+
# triangle inequality, thus this isn't a metric.
|
12
|
+
#
|
13
|
+
# More information and references on this are needed. It's left here mostly
|
14
|
+
# as a piece of curiosity.
|
15
|
+
#
|
16
|
+
# See: http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
|
17
|
+
#
|
18
|
+
# * *Arguments* :
|
19
|
+
# - +u+ -> An array of Numeric objects.
|
20
|
+
# - +v+ -> An array of Numeric objects.
|
21
|
+
# * *Returns* :
|
22
|
+
# - A measure of the similarity between +u+ and +v+.
|
23
|
+
# * *Raises* :
|
24
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
25
|
+
#
|
26
|
+
def tanimoto(u, v)
|
27
|
+
# TODO: Change this to a more specific, custom-made exception.
|
28
|
+
raise ArgumentError if u.size != v.size
|
2
29
|
|
3
|
-
|
4
|
-
|
30
|
+
-Math.log2(jaccard_index(u, v))
|
31
|
+
end
|
5
32
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
# as a piece of curiosity.
|
15
|
-
#
|
16
|
-
# See: # http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
|
17
|
-
#
|
18
|
-
# * *Arguments* :
|
19
|
-
# - +u+ -> An array of Numeric objects.
|
20
|
-
# - +v+ -> An array of Numeric objects.
|
21
|
-
# * *Returns* :
|
22
|
-
# - A measure of the similarity between +u+ and +v+.
|
23
|
-
# * *Raises* :
|
24
|
-
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
25
|
-
#
|
26
|
-
def tanimoto(u, v)
|
27
|
-
# TODO: Change this to a more specific, custom-made exception.
|
28
|
-
raise ArgumentError if u.size != v.size
|
33
|
+
def self.extended(base) # :nodoc:
|
34
|
+
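# Tanimoto similarity is the same as Jaccard similarity (+jaccard_index+).
# NOTE(review): the alias below targets +jaccard+, which is the Jaccard
# distance (1 - jaccard_index), not the similarity -- confirm the intended target.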
|
35
|
+
base.instance_eval do
|
36
|
+
extend Measurable::Jaccard
|
37
|
+
alias :tanimoto_similarity :jaccard
|
38
|
+
end
|
39
|
+
super
|
40
|
+
end
|
29
41
|
|
30
|
-
|
42
|
+
def self.included(base) # :nodoc:
|
43
|
+
base.class_eval do
|
44
|
+
include Measurable::Jaccard
|
45
|
+
alias :tanimoto_similarity :jaccard
|
46
|
+
end
|
47
|
+
super
|
48
|
+
end
|
31
49
|
end
|
32
|
-
|
50
|
+
|
51
|
+
extend Measurable::Tanimoto
|
52
|
+
end
|