measurable 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/History.txt +3 -0
- data/README.md +28 -32
- data/lib/measurable.rb +1 -2
- data/lib/measurable/chebyshev.rb +23 -19
- data/lib/measurable/cosine.rb +65 -45
- data/lib/measurable/euclidean.rb +59 -68
- data/lib/measurable/hamming.rb +28 -24
- data/lib/measurable/haversine.rb +52 -47
- data/lib/measurable/jaccard.rb +58 -55
- data/lib/measurable/kullback_leibler.rb +39 -0
- data/lib/measurable/maxmin.rb +33 -28
- data/lib/measurable/minkowski.rb +39 -22
- data/lib/measurable/tanimoto.rb +47 -27
- data/lib/measurable/version.rb +1 -1
- data/spec/chebyshev_spec.rb +20 -1
- data/spec/cosine_spec.rb +16 -0
- data/spec/euclidean_spec.rb +17 -1
- data/spec/hamming_spec.rb +17 -1
- data/spec/haversine_spec.rb +21 -1
- data/spec/jaccard_spec.rb +21 -0
- data/spec/kullback_leibler_spec.rb +46 -0
- data/spec/levenshtein_spec.rb +16 -0
- data/spec/maxmin_spec.rb +20 -1
- data/spec/minkowski_spec.rb +17 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/tanimoto_spec.rb +20 -0
- metadata +6 -2
data/lib/measurable/haversine.rb
CHANGED
@@ -15,57 +15,62 @@ module Measurable
|
|
15
15
|
:meters => EARTH_RADIUS_IN_KILOMETERS * 1000
|
16
16
|
}
|
17
17
|
|
18
|
-
|
19
|
-
# haversine(u, v) -> Float
|
20
|
-
#
|
21
|
-
# Compute accurate distances between two points given their latitudes and
|
22
|
-
# longitudes, even for short distances. This isn't a distance measure in the
|
23
|
-
# same sense as the other methods in +Measurable+.
|
24
|
-
#
|
25
|
-
# The distance returned is the great circle (or orthodromic) distance between
|
26
|
-
# +u+ and +v+, which is the shortest distance between them on the surface of
|
27
|
-
# a sphere. Thus, this implementation considers the Earth to be a sphere.
|
28
|
-
#
|
29
|
-
# Reminding that the input vectors are of the form [latitude, longitude] in
|
30
|
-
# degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
|
31
|
-
# Paulo), the corresponding vector is [-23.53333, -46.61667].
|
32
|
-
#
|
33
|
-
# References:
|
34
|
-
# - http://www.movable-type.co.uk/scripts/latlong.html
|
35
|
-
# - http://en.wikipedia.org/wiki/Haversine_formula
|
36
|
-
# - http://en.wikipedia.org/wiki/Great-circle_distance
|
37
|
-
#
|
38
|
-
# * *Arguments* :
|
39
|
-
# - +u+ -> An array of Numeric objects.
|
40
|
-
# - +v+ -> An array of Numeric objects.
|
41
|
-
# - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
|
42
|
-
# options are +:miles+, +:feet+, +:km+ and +:meters+.
|
43
|
-
# * *Returns* :
|
44
|
-
# - The great circle distance between +u+ and +v+.
|
45
|
-
# * *Raises* :
|
46
|
-
# - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
|
47
|
-
# - +ArgumentError+ -> +unit+ must be a Symbol.
|
48
|
-
#
|
49
|
-
def haversine(u, v, unit = :meters)
|
50
|
-
# TODO: Create better exceptions.
|
51
|
-
raise ArgumentError if u.size != 2 || v.size != 2
|
52
|
-
raise ArgumentError if unit.class != Symbol
|
18
|
+
module Haversine
|
53
19
|
|
54
|
-
|
55
|
-
|
20
|
+
# call-seq:
|
21
|
+
# haversine(u, v, unit = :meters) -> Float
|
22
|
+
#
|
23
|
+
# Compute accurate distances between two points given their latitudes and
|
24
|
+
# longitudes, even for short distances. This isn't a distance measure in the
|
25
|
+
# same sense as the other methods in +Measurable+.
|
26
|
+
#
|
27
|
+
# The distance returned is the great circle (or orthodromic) distance between
|
28
|
+
# +u+ and +v+, which is the shortest distance between them on the surface of
|
29
|
+
# a sphere. Thus, this implementation considers the Earth to be a sphere.
|
30
|
+
#
|
31
|
+
# Remember that the input vectors are of the form [latitude, longitude] in
|
32
|
+
# degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
|
33
|
+
# Paulo), the corresponding vector is [-23.53333, -46.61667].
|
34
|
+
#
|
35
|
+
# References:
|
36
|
+
# - http://www.movable-type.co.uk/scripts/latlong.html
|
37
|
+
# - http://en.wikipedia.org/wiki/Haversine_formula
|
38
|
+
# - http://en.wikipedia.org/wiki/Great-circle_distance
|
39
|
+
#
|
40
|
+
# * *Arguments* :
|
41
|
+
# - +u+ -> An array of Numeric objects.
|
42
|
+
# - +v+ -> An array of Numeric objects.
|
43
|
+
# - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
|
44
|
+
# options are +:miles+, +:feet+, +:km+ and +:meters+.
|
45
|
+
# * *Returns* :
|
46
|
+
# - The great circle distance between +u+ and +v+.
|
47
|
+
# * *Raises* :
|
48
|
+
# - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
|
49
|
+
# - +ArgumentError+ -> +unit+ must be a Symbol.
|
50
|
+
#
|
51
|
+
def haversine(u, v, unit = :meters)
|
52
|
+
# TODO: Create better exceptions.
|
53
|
+
raise ArgumentError if u.size != 2 || v.size != 2
|
54
|
+
raise ArgumentError if unit.class != Symbol
|
56
55
|
|
57
|
-
|
58
|
-
|
56
|
+
dlat = u[0] - v[0]
|
57
|
+
dlon = u[1] - v[1]
|
59
58
|
|
60
|
-
|
61
|
-
|
59
|
+
dlon_rad = dlon * RAD_PER_DEG
|
60
|
+
dlat_rad = dlat * RAD_PER_DEG
|
62
61
|
|
63
|
-
|
64
|
-
|
62
|
+
lat1_rad = v[0] * RAD_PER_DEG
|
63
|
+
lon1_rad = v[1] * RAD_PER_DEG
|
65
64
|
|
66
|
-
|
67
|
-
|
65
|
+
lat2_rad = u[0] * RAD_PER_DEG
|
66
|
+
lon2_rad = u[1] * RAD_PER_DEG
|
68
67
|
|
69
|
-
|
68
|
+
a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
|
69
|
+
c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
|
70
|
+
|
71
|
+
EARTH_RADIUS[unit] * c
|
72
|
+
end
|
70
73
|
end
|
71
|
-
|
74
|
+
|
75
|
+
extend Measurable::Haversine
|
76
|
+
end
|
data/lib/measurable/jaccard.rb
CHANGED
@@ -1,62 +1,65 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Jaccard
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
4
|
+
# call-seq:
|
5
|
+
# jaccard_index(u, v) -> Float
|
6
|
+
#
|
7
|
+
# Give the similarity between two binary vectors +u+ and +v+. Calculated as:
|
8
|
+
# jaccard_index = |intersection| / |union|
|
9
|
+
#
|
10
|
+
# In which intersection and union refer to +u+ and +v+ and |x| is the
|
11
|
+
# cardinality of set x.
|
12
|
+
#
|
13
|
+
# For example:
|
14
|
+
# jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
|
15
|
+
#
|
16
|
+
# Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
|
17
|
+
#
|
18
|
+
# See: http://en.wikipedia.org/wiki/Jaccard_coefficient
|
19
|
+
#
|
20
|
+
# * *Arguments* :
|
21
|
+
# - +u+ -> Array.
|
22
|
+
# - +v+ -> Array.
|
23
|
+
# * *Returns* :
|
24
|
+
# - Float value representing the Jaccard similarity coefficient between
|
25
|
+
# +u+ and +v+.
|
26
|
+
# * *Raises* :
|
27
|
+
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
28
|
+
#
|
29
|
+
def jaccard_index(u, v)
|
30
|
+
# TODO: Change this to a more specific, custom-made exception.
|
31
|
+
raise ArgumentError if u.size != v.size
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
intersection = u & v
|
34
|
+
union = u | v
|
35
|
+
intersection.length.to_f / union.length
|
36
|
+
end
|
34
37
|
|
35
|
-
|
36
|
-
|
38
|
+
# call-seq:
|
39
|
+
# jaccard(u, v) -> Float
|
40
|
+
#
|
41
|
+
# The jaccard distance is a measure of dissimilarity between two sets. It is
|
42
|
+
# calculated as:
|
43
|
+
# jaccard_distance = 1 - jaccard_index
|
44
|
+
#
|
45
|
+
# This is a proper metric, i.e. the following conditions hold:
|
46
|
+
# - Symmetry: jaccard(u, v) == jaccard(v, u)
|
47
|
+
# - Non-negative: jaccard(u, v) >= 0
|
48
|
+
# - Coincidence axiom: jaccard(u, v) == 0 if u == v
|
49
|
+
# - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
|
50
|
+
#
|
51
|
+
# * *Arguments* :
|
52
|
+
# - +u+ -> Array.
|
53
|
+
# - +v+ -> Array.
|
54
|
+
# * *Returns* :
|
55
|
+
# - Float value representing the dissimilarity between +u+ and +v+.
|
56
|
+
# * *Raises* :
|
57
|
+
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
58
|
+
#
|
59
|
+
def jaccard(u, v)
|
60
|
+
1 - jaccard_index(u, v)
|
61
|
+
end
|
37
62
|
|
38
|
-
|
39
|
-
# jaccard(u, v) -> Float
|
40
|
-
#
|
41
|
-
# The jaccard distance is a measure of dissimilarity between two sets. It is
|
42
|
-
# calculated as:
|
43
|
-
# jaccard_distance = 1 - jaccard_index
|
44
|
-
#
|
45
|
-
# This is a proper metric, i.e. the following conditions hold:
|
46
|
-
# - Symmetry: jaccard(u, v) == jaccard(v, u)
|
47
|
-
# - Non-negative: jaccard(u, v) >= 0
|
48
|
-
# - Coincidence axiom: jaccard(u, v) == 0 if u == v
|
49
|
-
# - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
|
50
|
-
#
|
51
|
-
# * *Arguments* :
|
52
|
-
# - +u+ -> Array.
|
53
|
-
# - +v+ -> Array.
|
54
|
-
# * *Returns* :
|
55
|
-
# - Float value representing the dissimilarity between +u+ and +v+.
|
56
|
-
# * *Raises* :
|
57
|
-
# - +ArgumentError+ -> The size of the input arrays doesn't match.
|
58
|
-
#
|
59
|
-
def jaccard(u, v)
|
60
|
-
1 - jaccard_index(u, v)
|
63
|
+
extend Measurable::Jaccard
|
61
64
|
end
|
62
65
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Measurable
|
2
|
+
module KullbackLeibler
|
3
|
+
|
4
|
+
# call-seq:
|
5
|
+
# kullback_leibler(p, q) -> Float
|
6
|
+
#
|
7
|
+
# The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
|
8
|
+
# a measure of their dissimilarity. However, it doesn't obey the triangular
|
9
|
+
# inequality and isn't symmetric, thus it isn't a metric.
|
10
|
+
#
|
11
|
+
# It is calculated as follows:
|
12
|
+
#
|
13
|
+
# KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
|
14
|
+
#
|
15
|
+
# With distributions +p+ and +q+ represented as vectors of N elements
|
16
|
+
# summing to 1.0.
|
17
|
+
#
|
18
|
+
# References:
|
19
|
+
# - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
|
20
|
+
# - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
|
21
|
+
# Natural Language Processing.
|
22
|
+
#
|
23
|
+
# * *Arguments*:
|
24
|
+
# - +p+ -> A probability distribution represented by a n-element Array.
|
25
|
+
# - +q+ -> A probability distribution represented by a n-element Array.
|
26
|
+
# * *Returns*:
|
27
|
+
# - A measure of the difference between the probability distributions +p+ and +q+.
|
28
|
+
def kullback_leibler(p, q)
|
29
|
+
# TODO: Change this to a more specific, custom-made exception.
|
30
|
+
raise ArgumentError if p.size != q.size
|
31
|
+
|
32
|
+
p.zip(q).reduce(0.0) do |acc, probs|
|
33
|
+
acc += probs[0] * Math.log(probs[0] / probs[1])
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
extend Measurable::KullbackLeibler
|
39
|
+
end
|
data/lib/measurable/maxmin.rb
CHANGED
@@ -1,34 +1,39 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Maxmin
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
4
|
+
# call-seq:
|
5
|
+
# maxmin(u, v) -> Float
|
6
|
+
#
|
7
|
+
# The "Max-min distance" is used to measure similarity between two vectors.
|
8
|
+
#
|
9
|
+
# When used in k-means clustering, this similarity measure can give better
|
10
|
+
# results in some datasets, as pointed out in the paper "K-means clustering
|
11
|
+
# using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
|
12
|
+
#
|
13
|
+
# See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
|
14
|
+
#
|
15
|
+
# * *Arguments* :
|
16
|
+
# - +u+ -> An array of Numeric objects.
|
17
|
+
# - +v+ -> An array of Numeric objects.
|
18
|
+
# * *Returns* :
|
19
|
+
# - Similarity between +u+ and +v+.
|
20
|
+
# * *Raises* :
|
21
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
22
|
+
#
|
23
|
+
def maxmin(u, v)
|
24
|
+
# TODO: Change this to a more specific, custom-made exception.
|
25
|
+
raise ArgumentError if u.size != v.size
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
|
28
|
+
acc[0] += attributes.min
|
29
|
+
acc[1] += attributes.max
|
30
|
+
acc
|
31
|
+
end
|
32
|
+
|
33
|
+
sum_min / sum_max
|
30
34
|
end
|
31
35
|
|
32
|
-
sum_min / sum_max
|
33
36
|
end
|
34
|
-
|
37
|
+
|
38
|
+
extend Measurable::Maxmin
|
39
|
+
end
|
data/lib/measurable/minkowski.rb
CHANGED
@@ -1,28 +1,45 @@
|
|
1
1
|
module Measurable
|
2
|
+
module Minkowski
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
4
|
+
# call-seq:
|
5
|
+
# minkowski(u, v) -> Numeric
|
6
|
+
#
|
7
|
+
# Calculate the sum of the absolute value of the differences between each
|
8
|
+
# coordinate of +u+ and +v+.
|
9
|
+
#
|
10
|
+
# * *Arguments* :
|
11
|
+
# - +u+ -> An array of Numeric objects.
|
12
|
+
# - +v+ -> An array of Numeric objects.
|
13
|
+
# * *Returns* :
|
14
|
+
# - The Minkowski distance of order 1 (the L1, or Manhattan, distance) between +u+ and +v+.
|
15
|
+
# * *Raises* :
|
16
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
17
|
+
#
|
18
|
+
def minkowski(u, v)
|
19
|
+
# TODO: Change this to a more specific, custom-made exception.
|
20
|
+
raise ArgumentError if u.size != v.size
|
20
21
|
|
21
|
-
|
22
|
-
|
22
|
+
u.zip(v).reduce(0) do |acc, elem|
|
23
|
+
acc += (elem[0] - elem[1]).abs
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.extended(base) # :nodoc:
|
28
|
+
base.instance_eval do
|
29
|
+
alias :cityblock :minkowski
|
30
|
+
alias :manhattan :minkowski
|
31
|
+
end
|
32
|
+
super
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.included(base) # :nodoc:
|
36
|
+
base.class_eval do
|
37
|
+
alias :cityblock :minkowski
|
38
|
+
alias :manhattan :minkowski
|
39
|
+
end
|
40
|
+
super
|
23
41
|
end
|
24
42
|
end
|
25
43
|
|
26
|
-
|
27
|
-
|
28
|
-
end
|
44
|
+
extend Measurable::Minkowski
|
45
|
+
end
|
data/lib/measurable/tanimoto.rb
CHANGED
@@ -1,32 +1,52 @@
|
|
1
|
+
require 'measurable/jaccard'
|
2
|
+
|
1
3
|
module Measurable
|
4
|
+
module Tanimoto
|
5
|
+
|
6
|
+
# call-seq:
|
7
|
+
# tanimoto(u, v) -> Float
|
8
|
+
#
|
9
|
+
# Tanimoto distance is a coefficient explicitly chosen such as to allow for
|
10
|
+
# two dissimilar specimens to be similar to a third one. This breaks the
|
11
|
+
# triangle inequality, thus this isn't a metric.
|
12
|
+
#
|
13
|
+
# More information and references on this are needed. It's left here mostly
|
14
|
+
# as a piece of curiosity.
|
15
|
+
#
|
16
|
+
# See: http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
|
17
|
+
#
|
18
|
+
# * *Arguments* :
|
19
|
+
# - +u+ -> An array of Numeric objects.
|
20
|
+
# - +v+ -> An array of Numeric objects.
|
21
|
+
# * *Returns* :
|
22
|
+
# - A measure of the similarity between +u+ and +v+.
|
23
|
+
# * *Raises* :
|
24
|
+
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
25
|
+
#
|
26
|
+
def tanimoto(u, v)
|
27
|
+
# TODO: Change this to a more specific, custom-made exception.
|
28
|
+
raise ArgumentError if u.size != v.size
|
2
29
|
|
3
|
-
|
4
|
-
|
30
|
+
-Math.log2(jaccard_index(u, v))
|
31
|
+
end
|
5
32
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
# as a piece of curiosity.
|
15
|
-
#
|
16
|
-
# See: # http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
|
17
|
-
#
|
18
|
-
# * *Arguments* :
|
19
|
-
# - +u+ -> An array of Numeric objects.
|
20
|
-
# - +v+ -> An array of Numeric objects.
|
21
|
-
# * *Returns* :
|
22
|
-
# - A measure of the similarity between +u+ and +v+.
|
23
|
-
# * *Raises* :
|
24
|
-
# - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
|
25
|
-
#
|
26
|
-
def tanimoto(u, v)
|
27
|
-
# TODO: Change this to a more specific, custom-made exception.
|
28
|
-
raise ArgumentError if u.size != v.size
|
33
|
+
def self.extended(base) # :nodoc:
|
34
|
+
# Tanimoto similarity is the same as Jaccard similarity.
|
35
|
+
base.instance_eval do
|
36
|
+
extend Measurable::Jaccard
|
37
|
+
alias :tanimoto_similarity :jaccard
|
38
|
+
end
|
39
|
+
super
|
40
|
+
end
|
29
41
|
|
30
|
-
|
42
|
+
def self.included(base) # :nodoc:
|
43
|
+
base.class_eval do
|
44
|
+
include Measurable::Jaccard
|
45
|
+
alias :tanimoto_similarity :jaccard
|
46
|
+
end
|
47
|
+
super
|
48
|
+
end
|
31
49
|
end
|
32
|
-
|
50
|
+
|
51
|
+
extend Measurable::Tanimoto
|
52
|
+
end
|