measurable 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,57 +15,62 @@ module Measurable
15
15
  :meters => EARTH_RADIUS_IN_KILOMETERS * 1000
16
16
  }
17
17
 
18
- # call-seq:
19
- # haversine(u, v) -> Float
20
- #
21
- # Compute accurate distances between two points given their latitudes and
22
- # longitudes, even for short distances. This isn't a distance measure in the
23
- # same sense as the other methods in +Measurable+.
24
- #
25
- # The distance returned is the great circle (or orthodromic) distance between
26
- # +u+ and +v+, which is the shortest distance between them on the surface of
27
- # a sphere. Thus, this implementation considers the Earth to be a sphere.
28
- #
29
- # Reminding that the input vectors are of the form [latitude, longitude] in
30
- # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
31
- # Paulo), the corresponding vector is [-23.53333, -46.61667].
32
- #
33
- # References:
34
- # - http://www.movable-type.co.uk/scripts/latlong.html
35
- # - http://en.wikipedia.org/wiki/Haversine_formula
36
- # - http://en.wikipedia.org/wiki/Great-circle_distance
37
- #
38
- # * *Arguments* :
39
- # - +u+ -> An array of Numeric objects.
40
- # - +v+ -> An array of Numeric objects.
41
- # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
42
- # options are +:miles+, +:feet+, +:km+ and +:meters+.
43
- # * *Returns* :
44
- # - The great circle distance between +u+ and +v+.
45
- # * *Raises* :
46
- # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
47
- # - +ArgumentError+ -> +unit+ must be a Symbol.
48
- #
49
- def haversine(u, v, unit = :meters)
50
- # TODO: Create better exceptions.
51
- raise ArgumentError if u.size != 2 || v.size != 2
52
- raise ArgumentError if unit.class != Symbol
18
+ module Haversine
53
19
 
54
- dlat = u[0] - v[0]
55
- dlon = u[1] - v[1]
20
+ # call-seq:
21
+ # haversine(u, v, unit = :meters) -> Float
22
+ #
23
+ # Compute accurate distances between two points given their latitudes and
24
+ # longitudes, even for short distances. This isn't a distance measure in the
25
+ # same sense as the other methods in +Measurable+.
26
+ #
27
+ # The distance returned is the great circle (or orthodromic) distance between
28
+ # +u+ and +v+, which is the shortest distance between them on the surface of
29
+ # a sphere. Thus, this implementation considers the Earth to be a sphere.
30
+ #
31
+ # Note that the input vectors are of the form [latitude, longitude] in
32
+ # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
33
+ # Paulo), the corresponding vector is [-23.53333, -46.61667].
34
+ #
35
+ # References:
36
+ # - http://www.movable-type.co.uk/scripts/latlong.html
37
+ # - http://en.wikipedia.org/wiki/Haversine_formula
38
+ # - http://en.wikipedia.org/wiki/Great-circle_distance
39
+ #
40
+ # * *Arguments* :
41
+ # - +u+ -> An array of Numeric objects.
42
+ # - +v+ -> An array of Numeric objects.
43
+ # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
44
+ # options are +:miles+, +:feet+, +:km+ and +:meters+.
45
+ # * *Returns* :
46
+ # - The great circle distance between +u+ and +v+.
47
+ # * *Raises* :
48
+ # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
49
+ # - +ArgumentError+ -> +unit+ must be a Symbol.
50
+ #
51
+ def haversine(u, v, unit = :meters)
52
+ # TODO: Create better exceptions.
53
+ raise ArgumentError if u.size != 2 || v.size != 2
54
+ raise ArgumentError if unit.class != Symbol
56
55
 
57
- dlon_rad = dlon * RAD_PER_DEG
58
- dlat_rad = dlat * RAD_PER_DEG
56
+ dlat = u[0] - v[0]
57
+ dlon = u[1] - v[1]
59
58
 
60
- lat1_rad = v[0] * RAD_PER_DEG
61
- lon1_rad = v[1] * RAD_PER_DEG
59
+ dlon_rad = dlon * RAD_PER_DEG
60
+ dlat_rad = dlat * RAD_PER_DEG
62
61
 
63
- lat2_rad = u[0] * RAD_PER_DEG
64
- lon2_rad = u[1] * RAD_PER_DEG
62
+ lat1_rad = v[0] * RAD_PER_DEG
63
+ lon1_rad = v[1] * RAD_PER_DEG
65
64
 
66
- a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
67
- c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
65
+ lat2_rad = u[0] * RAD_PER_DEG
66
+ lon2_rad = u[1] * RAD_PER_DEG
68
67
 
69
- EARTH_RADIUS[unit] * c
68
+ a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
69
+ c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
70
+
71
+ EARTH_RADIUS[unit] * c
72
+ end
70
73
  end
71
- end
74
+
75
+ extend Measurable::Haversine
76
+ end
@@ -1,62 +1,65 @@
1
1
  module Measurable
2
+ module Jaccard
2
3
 
3
- # call-seq:
4
- # jaccard_index(u, v) -> Float
5
- #
6
- # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
7
- # jaccard_index = |intersection| / |union|
8
- #
9
- # In which intersection and union refer to +u+ and +v+ and |x| is the
10
- # cardinality of set x.
11
- #
12
- # For example:
13
- # jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
14
- #
15
- # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
16
- #
17
- # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
18
- #
19
- # * *Arguments* :
20
- # - +u+ -> Array.
21
- # - +v+ -> Array.
22
- # * *Returns* :
23
- # - Float value representing the Jaccard similarity coefficient between
24
- # +u+ and +v+.
25
- # * *Raises* :
26
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
27
- #
28
- def jaccard_index(u, v)
29
- # TODO: Change this to a more specific, custom-made exception.
30
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # jaccard_index(u, v) -> Float
6
+ #
7
+ # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
8
+ # jaccard_index = |intersection| / |union|
9
+ #
10
+ # In which intersection and union refer to +u+ and +v+ and |x| is the
11
+ # cardinality of set x.
12
+ #
13
+ # For example:
14
+ # jaccard_index([1, 0, 1], [1, 1, 1]) == 0.5
15
+ #
16
+ # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
17
+ #
18
+ # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
19
+ #
20
+ # * *Arguments* :
21
+ # - +u+ -> Array.
22
+ # - +v+ -> Array.
23
+ # * *Returns* :
24
+ # - Float value representing the Jaccard similarity coefficient between
25
+ # +u+ and +v+.
26
+ # * *Raises* :
27
+ # - +ArgumentError+ -> The size of the input arrays doesn't match.
28
+ #
29
+ def jaccard_index(u, v)
30
+ # TODO: Change this to a more specific, custom-made exception.
31
+ raise ArgumentError if u.size != v.size
31
32
 
32
- intersection = u & v
33
- union = u | v
33
+ intersection = u & v
34
+ union = u | v
35
+ intersection.length.to_f / union.length
36
+ end
34
37
 
35
- intersection.length.to_f / union.length
36
- end
38
+ # call-seq:
39
+ # jaccard(u, v) -> Float
40
+ #
41
+ # The jaccard distance is a measure of dissimilarity between two sets. It is
42
+ # calculated as:
43
+ # jaccard_distance = 1 - jaccard_index
44
+ #
45
+ # This is a proper metric, i.e. the following conditions hold:
46
+ # - Symmetry: jaccard(u, v) == jaccard(v, u)
47
+ # - Non-negative: jaccard(u, v) >= 0
48
+ # - Coincidence axiom: jaccard(u, v) == 0 if u == v
49
+ # - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
50
+ #
51
+ # * *Arguments* :
52
+ # - +u+ -> Array.
53
+ # - +v+ -> Array.
54
+ # * *Returns* :
55
+ # - Float value representing the dissimilarity between +u+ and +v+.
56
+ # * *Raises* :
57
+ # - +ArgumentError+ -> The size of the input arrays doesn't match.
58
+ #
59
+ def jaccard(u, v)
60
+ 1 - jaccard_index(u, v)
61
+ end
37
62
 
38
- # call-seq:
39
- # jaccard(u, v) -> Float
40
- #
41
- # The jaccard distance is a measure of dissimilarity between two sets. It is
42
- # calculated as:
43
- # jaccard_distance = 1 - jaccard_index
44
- #
45
- # This is a proper metric, i.e. the following conditions hold:
46
- # - Symmetry: jaccard(u, v) == jaccard(v, u)
47
- # - Non-negative: jaccard(u, v) >= 0
48
- # - Coincidence axiom: jaccard(u, v) == 0 if u == v
49
- # - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
50
- #
51
- # * *Arguments* :
52
- # - +u+ -> Array.
53
- # - +v+ -> Array.
54
- # * *Returns* :
55
- # - Float value representing the dissimilarity between +u+ and +v+.
56
- # * *Raises* :
57
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
58
- #
59
- def jaccard(u, v)
60
- 1 - jaccard_index(u, v)
63
+ extend Measurable::Jaccard
61
64
  end
62
65
  end
@@ -0,0 +1,39 @@
1
+ module Measurable
2
+ module KullbackLeibler
3
+
4
+ # call-seq:
5
+ # kullback_leibler(p, q) -> Float
6
+ #
7
+ # The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
8
+ # a measure of their dissimilarity. However, it doesn't obey the triangular
9
+ # inequality and isn't symmetric, thus it isn't a metric.
10
+ #
11
+ # It is calculated as follows:
12
+ #
13
+ # KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
14
+ #
15
+ # With distributions +p+ and +q+ represented as vectors of N elements
16
+ # summing to 1.0.
17
+ #
18
+ # References:
19
+ # - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
20
+ # - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
21
+ # Natural Language Processing.
22
+ #
23
+ # * *Arguments*:
24
+ # - +p+ -> A probability distribution represented by an n-element Array.
25
+ # - +q+ -> A probability distribution represented by an n-element Array.
26
+ # * *Returns*:
27
+ # A measure of the difference between the probability distributions p and q.
28
+ def kullback_leibler(p, q)
29
+ # TODO: Change this to a more specific, custom-made exception.
30
+ raise ArgumentError if p.size != q.size
31
+
32
+ p.zip(q).reduce(0.0) do |acc, probs|
33
+ acc += probs[0] * Math.log(probs[0] / probs[1])
34
+ end
35
+ end
36
+ end
37
+
38
+ extend Measurable::KullbackLeibler
39
+ end
@@ -1,34 +1,39 @@
1
1
  module Measurable
2
+ module Maxmin
2
3
 
3
- # call-seq:
4
- # maxmin(u, v) -> Float
5
- #
6
- # The "Max-min distance" is used to measure similarity between two vectors.
7
- #
8
- # When used in k-means clustering, this similarity measure can give better
9
- # results in some datasets, as pointed out in the paper "K-means clustering
10
- # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
11
- #
12
- # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
13
- #
14
- # * *Arguments* :
15
- # - +u+ -> An array of Numeric objects.
16
- # - +v+ -> An array of Numeric objects.
17
- # * *Returns* :
18
- # - Similarity between +u+ and +v+.
19
- # * *Raises* :
20
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
21
- #
22
- def maxmin(u, v)
23
- # TODO: Change this to a more specific, custom-made exception.
24
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # maxmin(u, v) -> Float
6
+ #
7
+ # The "Max-min distance" is used to measure similarity between two vectors.
8
+ #
9
+ # When used in k-means clustering, this similarity measure can give better
10
+ # results in some datasets, as pointed out in the paper "K-means clustering
11
+ # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
12
+ #
13
+ # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
14
+ #
15
+ # * *Arguments* :
16
+ # - +u+ -> An array of Numeric objects.
17
+ # - +v+ -> An array of Numeric objects.
18
+ # * *Returns* :
19
+ # - Similarity between +u+ and +v+.
20
+ # * *Raises* :
21
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
22
+ #
23
+ def maxmin(u, v)
24
+ # TODO: Change this to a more specific, custom-made exception.
25
+ raise ArgumentError if u.size != v.size
25
26
 
26
- sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
27
- acc[0] += attributes.min
28
- acc[1] += attributes.max
29
- acc
27
+ sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
28
+ acc[0] += attributes.min
29
+ acc[1] += attributes.max
30
+ acc
31
+ end
32
+
33
+ sum_min / sum_max
30
34
  end
31
35
 
32
- sum_min / sum_max
33
36
  end
34
- end
37
+
38
+ extend Measurable::Maxmin
39
+ end
@@ -1,28 +1,45 @@
1
1
  module Measurable
2
+ module Minkowski
2
3
 
3
- # call-seq:
4
- # minkowski(u, v) -> Numeric
5
- #
6
- # Calculate the sum of the absolute value of the differences between each
7
- # coordinate of +u+ and +v+.
8
- #
9
- # * *Arguments* :
10
- # - +u+ -> An array of Numeric objects.
11
- # - +v+ -> An array of Numeric objects.
12
- # * *Returns* :
13
- # - The Minkowski (or L1) distance between +u+ and +v+.
14
- # * *Raises* :
15
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
16
- #
17
- def minkowski(u, v)
18
- # TODO: Change this to a more specific, custom-made exception.
19
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # minkowski(u, v) -> Numeric
6
+ #
7
+ # Calculate the sum of the absolute value of the differences between each
8
+ # coordinate of +u+ and +v+.
9
+ #
10
+ # * *Arguments* :
11
+ # - +u+ -> An array of Numeric objects.
12
+ # - +v+ -> An array of Numeric objects.
13
+ # * *Returns* :
14
+ # - The Minkowski (or L1) distance between +u+ and +v+.
15
+ # * *Raises* :
16
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
17
+ #
18
+ def minkowski(u, v)
19
+ # TODO: Change this to a more specific, custom-made exception.
20
+ raise ArgumentError if u.size != v.size
20
21
 
21
- u.zip(v).reduce(0) do |acc, elem|
22
- acc += (elem[0] - elem[1]).abs
22
+ u.zip(v).reduce(0) do |acc, elem|
23
+ acc += (elem[0] - elem[1]).abs
24
+ end
25
+ end
26
+
27
+ def self.extended(base) # :nodoc:
28
+ base.instance_eval do
29
+ alias :cityblock :minkowski
30
+ alias :manhattan :minkowski
31
+ end
32
+ super
33
+ end
34
+
35
+ def self.included(base) # :nodoc:
36
+ base.class_eval do
37
+ alias :cityblock :minkowski
38
+ alias :manhattan :minkowski
39
+ end
40
+ super
23
41
  end
24
42
  end
25
43
 
26
- alias :cityblock :minkowski
27
- alias :manhattan :minkowski
28
- end
44
+ extend Measurable::Minkowski
45
+ end
@@ -1,32 +1,52 @@
1
+ require 'measurable/jaccard'
2
+
1
3
  module Measurable
4
+ module Tanimoto
5
+
6
+ # call-seq:
7
+ # tanimoto(u, v) -> Float
8
+ #
9
+ # Tanimoto distance is a coefficient explicitly chosen so as to allow for
10
+ # two dissimilar specimens to be similar to a third one. This breaks the
11
+ # triangle inequality, thus this isn't a metric.
12
+ #
13
+ # More information and references on this are needed. It's left here mostly
14
+ # as a piece of curiosity.
15
+ #
16
+ # See: http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
17
+ #
18
+ # * *Arguments* :
19
+ # - +u+ -> An array of Numeric objects.
20
+ # - +v+ -> An array of Numeric objects.
21
+ # * *Returns* :
22
+ # - A measure of the similarity between +u+ and +v+.
23
+ # * *Raises* :
24
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
25
+ #
26
+ def tanimoto(u, v)
27
+ # TODO: Change this to a more specific, custom-made exception.
28
+ raise ArgumentError if u.size != v.size
2
29
 
3
- # Tanimoto similarity is the same as Jaccard similarity.
4
- alias :tanimoto_similarity :jaccard
30
+ -Math.log2(jaccard_index(u, v))
31
+ end
5
32
 
6
- # call-seq:
7
- # tanimoto(u, v) -> Float
8
- #
9
- # Tanimoto distance is a coefficient explicitly chosen such as to allow for
10
- # two dissimilar specimens to be similar to a third one. This breaks the
11
- # triangle inequality, thus this isn't a metric.
12
- #
13
- # More information and references on this are needed. It's left here mostly
14
- # as a piece of curiosity.
15
- #
16
- # See: # http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto.27s_Definitions_of_Similarity_and_Distance
17
- #
18
- # * *Arguments* :
19
- # - +u+ -> An array of Numeric objects.
20
- # - +v+ -> An array of Numeric objects.
21
- # * *Returns* :
22
- # - A measure of the similarity between +u+ and +v+.
23
- # * *Raises* :
24
- # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
25
- #
26
- def tanimoto(u, v)
27
- # TODO: Change this to a more specific, custom-made exception.
28
- raise ArgumentError if u.size != v.size
33
+ def self.extended(base) # :nodoc:
34
+ # Tanimoto similarity is the same as Jaccard similarity.
35
+ base.instance_eval do
36
+ extend Measurable::Jaccard
37
+ alias :tanimoto_similarity :jaccard
38
+ end
39
+ super
40
+ end
29
41
 
30
- -Math.log2(jaccard_index(u, v))
42
+ def self.included(base) # :nodoc:
43
+ base.class_eval do
44
+ include Measurable::Jaccard
45
+ alias :tanimoto_similarity :jaccard
46
+ end
47
+ super
48
+ end
31
49
  end
32
- end
50
+
51
+ extend Measurable::Tanimoto
52
+ end