measurable 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -15,57 +15,62 @@ module Measurable
15
15
  :meters => EARTH_RADIUS_IN_KILOMETERS * 1000
16
16
  }
17
17
 
18
- # call-seq:
19
- # haversine(u, v) -> Float
20
- #
21
- # Compute accurate distances between two points given their latitudes and
22
- # longitudes, even for short distances. This isn't a distance measure in the
23
- # same sense as the other methods in +Measurable+.
24
- #
25
- # The distance returned is the great circle (or orthodromic) distance between
26
- # +u+ and +v+, which is the shortest distance between them on the surface of
27
- # a sphere. Thus, this implementation considers the Earth to be a sphere.
28
- #
29
- # Reminding that the input vectors are of the form [latitude, longitude] in
30
- # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
31
- # Paulo), the corresponding vector is [-23.53333, -46.61667].
32
- #
33
- # References:
34
- # - http://www.movable-type.co.uk/scripts/latlong.html
35
- # - http://en.wikipedia.org/wiki/Haversine_formula
36
- # - http://en.wikipedia.org/wiki/Great-circle_distance
37
- #
38
- # * *Arguments* :
39
- # - +u+ -> An array of Numeric objects.
40
- # - +v+ -> An array of Numeric objects.
41
- # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
42
- # options are +:miles+, +:feet+, +:km+ and +:meters+.
43
- # * *Returns* :
44
- # - The great circle distance between +u+ and +v+.
45
- # * *Raises* :
46
- # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
47
- # - +ArgumentError+ -> +unit+ must be a Symbol.
48
- #
49
- def haversine(u, v, unit = :meters)
50
- # TODO: Create better exceptions.
51
- raise ArgumentError if u.size != 2 || v.size != 2
52
- raise ArgumentError if unit.class != Symbol
18
+ module Haversine
53
19
 
54
- dlat = u[0] - v[0]
55
- dlon = u[1] - v[1]
20
+ # call-seq:
21
+ # haversine(u, v, unit = :meters) -> Float
22
+ #
23
+ # Compute accurate distances between two points given their latitudes and
24
+ # longitudes, even for short distances. This isn't a distance measure in the
25
+ # same sense as the other methods in +Measurable+.
26
+ #
27
+ # The distance returned is the great circle (or orthodromic) distance between
28
+ # +u+ and +v+, which is the shortest distance between them on the surface of
29
+ # a sphere. Thus, this implementation considers the Earth to be a sphere.
30
+ #
31
+ # Note that the input vectors are of the form [latitude, longitude] in
32
+ # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
33
+ # Paulo), the corresponding vector is [-23.53333, -46.61667].
34
+ #
35
+ # References:
36
+ # - http://www.movable-type.co.uk/scripts/latlong.html
37
+ # - http://en.wikipedia.org/wiki/Haversine_formula
38
+ # - http://en.wikipedia.org/wiki/Great-circle_distance
39
+ #
40
+ # * *Arguments* :
41
+ # - +u+ -> An array of Numeric objects.
42
+ # - +v+ -> An array of Numeric objects.
43
+ # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
44
+ # options are +:miles+, +:feet+, +:km+ and +:meters+.
45
+ # * *Returns* :
46
+ # - The great circle distance between +u+ and +v+.
47
+ # * *Raises* :
48
+ # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
49
+ # - +ArgumentError+ -> +unit+ must be a Symbol.
50
+ #
51
+ def haversine(u, v, unit = :meters)
52
+ # TODO: Create better exceptions.
53
+ raise ArgumentError if u.size != 2 || v.size != 2
54
+ raise ArgumentError if unit.class != Symbol
56
55
 
57
- dlon_rad = dlon * RAD_PER_DEG
58
- dlat_rad = dlat * RAD_PER_DEG
56
+ dlat = u[0] - v[0]
57
+ dlon = u[1] - v[1]
59
58
 
60
- lat1_rad = v[0] * RAD_PER_DEG
61
- lon1_rad = v[1] * RAD_PER_DEG
59
+ dlon_rad = dlon * RAD_PER_DEG
60
+ dlat_rad = dlat * RAD_PER_DEG
62
61
 
63
- lat2_rad = u[0] * RAD_PER_DEG
64
- lon2_rad = u[1] * RAD_PER_DEG
62
+ lat1_rad = v[0] * RAD_PER_DEG
63
+ lon1_rad = v[1] * RAD_PER_DEG
65
64
 
66
- a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
67
- c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
65
+ lat2_rad = u[0] * RAD_PER_DEG
66
+ lon2_rad = u[1] * RAD_PER_DEG
68
67
 
69
- EARTH_RADIUS[unit] * c
68
+ a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
69
+ c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
70
+
71
+ EARTH_RADIUS[unit] * c
72
+ end
70
73
  end
71
- end
74
+
75
+ extend Measurable::Haversine
76
+ end
@@ -1,62 +1,65 @@
1
1
  module Measurable
2
+ module Jaccard
2
3
 
3
- # call-seq:
4
- # jaccard_index(u, v) -> Float
5
- #
6
- # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
7
- # jaccard_index = |intersection| / |union|
8
- #
9
- # In which intersection and union refer to +u+ and +v+ and |x| is the
10
- # cardinality of set x.
11
- #
12
- # For example:
13
- # jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
14
- #
15
- # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
16
- #
17
- # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
18
- #
19
- # * *Arguments* :
20
- # - +u+ -> Array.
21
- # - +v+ -> Array.
22
- # * *Returns* :
23
- # - Float value representing the Jaccard similarity coefficient between
24
- # +u+ and +v+.
25
- # * *Raises* :
26
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
27
- #
28
- def jaccard_index(u, v)
29
- # TODO: Change this to a more specific, custom-made exception.
30
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # jaccard_index(u, v) -> Float
6
+ #
7
+ # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
8
+ # jaccard_index = |intersection| / |union|
9
+ #
10
+ # In which intersection and union refer to +u+ and +v+ and |x| is the
11
+ # cardinality of set x.
12
+ #
13
+ # For example:
14
+ # jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
15
+ #
16
+ # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
17
+ #
18
+ # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
19
+ #
20
+ # * *Arguments* :
21
+ # - +u+ -> Array.
22
+ # - +v+ -> Array.
23
+ # * *Returns* :
24
+ # - Float value representing the Jaccard similarity coefficient between
25
+ # +u+ and +v+.
26
+ # * *Raises* :
27
+ # - +ArgumentError+ -> The size of the input arrays doesn't match.
28
+ #
29
+ def jaccard_index(u, v)
30
+ # TODO: Change this to a more specific, custom-made exception.
31
+ raise ArgumentError if u.size != v.size
31
32
 
32
- intersection = u & v
33
- union = u | v
33
+ intersection = u & v
34
+ union = u | v
35
+ intersection.length.to_f / union.length
36
+ end
34
37
 
35
- intersection.length.to_f / union.length
36
- end
38
+ # call-seq:
39
+ # jaccard(u, v) -> Float
40
+ #
41
+ # The jaccard distance is a measure of dissimilarity between two sets. It is
42
+ # calculated as:
43
+ # jaccard_distance = 1 - jaccard_index
44
+ #
45
+ # This is a proper metric, i.e. the following conditions hold:
46
+ # - Symmetry: jaccard(u, v) == jaccard(v, u)
47
+ # - Non-negative: jaccard(u, v) >= 0
48
+ # - Coincidence axiom: jaccard(u, v) == 0 if u == v
49
+ # - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
50
+ #
51
+ # * *Arguments* :
52
+ # - +u+ -> Array.
53
+ # - +v+ -> Array.
54
+ # * *Returns* :
55
+ # - Float value representing the dissimilarity between +u+ and +v+.
56
+ # * *Raises* :
57
+ # - +ArgumentError+ -> The size of the input arrays doesn't match.
58
+ #
59
+ def jaccard(u, v)
60
+ 1 - jaccard_index(u, v)
61
+ end
37
62
 
38
- # call-seq:
39
- # jaccard(u, v) -> Float
40
- #
41
- # The jaccard distance is a measure of dissimilarity between two sets. It is
42
- # calculated as:
43
- # jaccard_distance = 1 - jaccard_index
44
- #
45
- # This is a proper metric, i.e. the following conditions hold:
46
- # - Symmetry: jaccard(u, v) == jaccard(v, u)
47
- # - Non-negative: jaccard(u, v) >= 0
48
- # - Coincidence axiom: jaccard(u, v) == 0 if u == v
49
- # - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
50
- #
51
- # * *Arguments* :
52
- # - +u+ -> Array.
53
- # - +v+ -> Array.
54
- # * *Returns* :
55
- # - Float value representing the dissimilarity between +u+ and +v+.
56
- # * *Raises* :
57
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
58
- #
59
- def jaccard(u, v)
60
- 1 - jaccard_index(u, v)
63
+ extend Measurable::Jaccard
61
64
  end
62
65
  end
@@ -0,0 +1,39 @@
1
+ module Measurable
2
+ module KullbackLeibler
3
+
4
+ # call-seq:
5
+ # kullback_leibler(p, q) -> Float
6
+ #
7
+ # The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
8
+ # a measure of their dissimilarity. However, it doesn't obey the triangular
9
+ # inequality and isn't symmetric, thus it isn't a metric.
10
+ #
11
+ # It is calculated as follows:
12
+ #
13
+ # KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
14
+ #
15
+ # With distributions +p+ and +q+ represented as vectors of N elements
16
+ # summing to 1.0.
17
+ #
18
+ # References:
19
+ # - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
20
+ # - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
21
+ # Natural Language Processing.
22
+ #
23
+ # * *Arguments*:
24
+ # - +p+ -> A probability distribution represented by an n-element Array.
25
+ # - +q+ -> A probability distribution represented by an n-element Array.
26
+ # * *Returns*:
27
+ # - A measure of the difference between the probability distributions +p+ and +q+.
28
+ def kullback_leibler(p, q)
29
+ # TODO: Change this to a more specific, custom-made exception.
30
+ raise ArgumentError if p.size != q.size
31
+
32
+ p.zip(q).reduce(0.0) do |acc, probs|
33
+ acc += probs[0] * Math.log(probs[0] / probs[1])
34
+ end
35
+ end
36
+ end
37
+
38
+ extend Measurable::KullbackLeibler
39
+ end
@@ -1,34 +1,39 @@
1
1
  module Measurable
2
+ module Maxmin
2
3
 
3
- # call-seq:
4
- # maxmin(u, v) -> Float
5
- #
6
- # The "Max-min distance" is used to measure similarity between two vectors.
7
- #
8
- # When used in k-means clustering, this similarity measure can give better
9
- # results in some datasets, as pointed out in the paper "K-means clustering
10
- # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
11
- #
12
- # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
13
- #
14
- # * *Arguments* :
15
- # - +u+ -> An array of Numeric objects.
16
- # - +v+ -> An array of Numeric objects.
17
- # * *Returns* :
18
- # - Similarity between +u+ and +v+.
19
- # * *Raises* :
20
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
21
- #
22
- def maxmin(u, v)
23
- # TODO: Change this to a more specific, custom-made exception.
24
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # maxmin(u, v) -> Float
6
+ #
7
+ # The "Max-min distance" is used to measure similarity between two vectors.
8
+ #
9
+ # When used in k-means clustering, this similarity measure can give better
10
+ # results in some datasets, as pointed out in the paper "K-means clustering
11
+ # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
12
+ #
13
+ # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
14
+ #
15
+ # * *Arguments* :
16
+ # - +u+ -> An array of Numeric objects.
17
+ # - +v+ -> An array of Numeric objects.
18
+ # * *Returns* :
19
+ # - Similarity between +u+ and +v+.
20
+ # * *Raises* :
21
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
22
+ #
23
+ def maxmin(u, v)
24
+ # TODO: Change this to a more specific, custom-made exception.
25
+ raise ArgumentError if u.size != v.size
25
26
 
26
- sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
27
- acc[0] += attributes.min
28
- acc[1] += attributes.max
29
- acc
27
+ sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
28
+ acc[0] += attributes.min
29
+ acc[1] += attributes.max
30
+ acc
31
+ end
32
+
33
+ sum_min / sum_max
30
34
  end
31
35
 
32
- sum_min / sum_max
33
36
  end
34
- end
37
+
38
+ extend Measurable::Maxmin
39
+ end
@@ -1,28 +1,45 @@
1
1
  module Measurable
2
+ module Minkowski
2
3
 
3
- # call-seq:
4
- # minkowski(u, v) -> Numeric
5
- #
6
- # Calculate the sum of the absolute value of the differences between each
7
- # coordinate of +u+ and +v+.
8
- #
9
- # * *Arguments* :
10
- # - +u+ -> An array of Numeric objects.
11
- # - +v+ -> An array of Numeric objects.
12
- # * *Returns* :
13
- # - The Minkowski (or L1) distance between +u+ and +v+.
14
- # * *Raises* :
15
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
16
- #
17
- def minkowski(u, v)
18
- # TODO: Change this to a more specific, custom-made exception.
19
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # minkowski(u, v) -> Numeric
6
+ #
7
+ # Calculate the sum of the absolute value of the differences between each
8
+ # coordinate of +u+ and +v+.
9
+ #
10
+ # * *Arguments* :
11
+ # - +u+ -> An array of Numeric objects.
12
+ # - +v+ -> An array of Numeric objects.
13
+ # * *Returns* :
14
+ # - The Minkowski distance of order 1 (the L1, Manhattan or city-block distance) between +u+ and +v+.
15
+ # * *Raises* :
16
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
17
+ #
18
+ def minkowski(u, v)
19
+ # TODO: Change this to a more specific, custom-made exception.
20
+ raise ArgumentError if u.size != v.size
20
21
 
21
- u.zip(v).reduce(0) do |acc, elem|
22
- acc += (elem[0] - elem[1]).abs
22
+ u.zip(v).reduce(0) do |acc, elem|
23
+ acc += (elem[0] - elem[1]).abs
24
+ end
25
+ end
26
+
27
+ def self.extended(base) # :nodoc:
28
+ base.instance_eval do
29
+ alias :cityblock :minkowski
30
+ alias :manhattan :minkowski
31
+ end
32
+ super
33
+ end
34
+
35
+ def self.included(base) # :nodoc:
36
+ base.class_eval do
37
+ alias :cityblock :minkowski
38
+ alias :manhattan :minkowski
39
+ end
40
+ super
23
41
  end
24
42
  end
25
43
 
26
- alias :cityblock :minkowski
27
- alias :manhattan :minkowski
28
- end
44
+ extend Measurable::Minkowski
45
+ end
@@ -1,32 +1,52 @@
1
+ require 'measurable/jaccard'
2
+
1
3
  module Measurable
4
+ module Tanimoto
5
+
6
+ # call-seq:
7
+ # tanimoto(u, v) -> Float
8
+ #
9
+ # Tanimoto distance is a coefficient explicitly chosen so as to allow for
10
+ # two dissimilar specimens to be similar to a third one. This breaks the
11
+ # triangle inequality, thus this isn't a metric.
12
+ #
13
+ # More information and references on this are needed. It's left here mostly
14
+ # as a piece of curiosity.
15
+ #
16
+ # See: http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
17
+ #
18
+ # * *Arguments* :
19
+ # - +u+ -> An array of Numeric objects.
20
+ # - +v+ -> An array of Numeric objects.
21
+ # * *Returns* :
22
+ # - A measure of the dissimilarity (distance) between +u+ and +v+.
23
+ # * *Raises* :
24
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
25
+ #
26
+ def tanimoto(u, v)
27
+ # TODO: Change this to a more specific, custom-made exception.
28
+ raise ArgumentError if u.size != v.size
2
29
 
3
- # Tanimoto similarity is the same as Jaccard similarity.
4
- alias :tanimoto_similarity :jaccard
30
+ -Math.log2(jaccard_index(u, v))
31
+ end
5
32
 
6
- # call-seq:
7
- # tanimoto(u, v) -> Float
8
- #
9
- # Tanimoto distance is a coefficient explicitly chosen such as to allow for
10
- # two dissimilar specimens to be similar to a third one. This breaks the
11
- # triangle inequality, thus this isn't a metric.
12
- #
13
- # More information and references on this are needed. It's left here mostly
14
- # as a piece of curiosity.
15
- #
16
- # See: # http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
17
- #
18
- # * *Arguments* :
19
- # - +u+ -> An array of Numeric objects.
20
- # - +v+ -> An array of Numeric objects.
21
- # * *Returns* :
22
- # - A measure of the similarity between +u+ and +v+.
23
- # * *Raises* :
24
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
25
- #
26
- def tanimoto(u, v)
27
- # TODO: Change this to a more specific, custom-made exception.
28
- raise ArgumentError if u.size != v.size
33
+ def self.extended(base) # :nodoc:
34
+ # Tanimoto similarity is the same as Jaccard similarity.
35
+ base.instance_eval do
36
+ extend Measurable::Jaccard
37
+ alias :tanimoto_similarity :jaccard
38
+ end
39
+ super
40
+ end
29
41
 
30
- -Math.log2(jaccard_index(u, v))
42
+ def self.included(base) # :nodoc:
43
+ base.class_eval do
44
+ include Measurable::Jaccard
45
+ alias :tanimoto_similarity :jaccard
46
+ end
47
+ super
48
+ end
31
49
  end
32
- end
50
+
51
+ extend Measurable::Tanimoto
52
+ end