measurable 0.0.5 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
module Measurable
  module Hamming

    # call-seq:
    #     hamming(s1, s2) -> Integer
    #
    # Count the number of positions at which the strings +s1+ and +s2+ hold
    # different characters, that is, how many substitutions are necessary to
    # change +s1+ into +s2+ and vice-versa.
    #
    # See: http://en.wikipedia.org/wiki/Hamming_distance
    #
    # Arguments:
    # - +s1+ -> A String.
    # - +s2+ -> A String with the same size as +s1+.
    # Returns:
    # - The number of characters in which +s1+ and +s2+ differ.
    # Raises:
    # - +ArgumentError+ -> The sizes of +s1+ and +s2+ don't match.
    def hamming(s1, s2)
      # TODO: Change this to a more specific, custom-made exception.
      if s1.size != s2.size
        raise ArgumentError,
              "strings must have the same size (got #{s1.size} and #{s2.size})"
      end

      # Pair the characters position by position and count the mismatches.
      s1.chars.zip(s2.chars).count { |a, b| a != b }
    end
  end

  extend Measurable::Hamming
end
@@ -15,57 +15,61 @@ module Measurable
15
15
  :meters => EARTH_RADIUS_IN_KILOMETERS * 1000
16
16
  }
17
17
 
18
- # call-seq:
19
- # haversine(u, v) -> Float
20
- #
21
- # Compute accurate distances between two points given their latitudes and
22
- # longitudes, even for short distances. This isn't a distance measure in the
23
- # same sense as the other methods in +Measurable+.
24
- #
25
- # The distance returned is the great circle (or orthodromic) distance between
26
- # +u+ and +v+, which is the shortest distance between them on the surface of
27
- # a sphere. Thus, this implementation considers the Earth to be a sphere.
28
- #
29
- # Reminding that the input vectors are of the form [latitude, longitude] in
30
- # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
31
- # Paulo), the corresponding vector is [-23.53333, -46.61667].
32
- #
33
- # References:
34
- # - http://www.movable-type.co.uk/scripts/latlong.html
35
- # - http://en.wikipedia.org/wiki/Haversine_formula
36
- # - http://en.wikipedia.org/wiki/Great-circle_distance
37
- #
38
- # * *Arguments* :
39
- # - +u+ -> An array of Numeric objects.
40
- # - +v+ -> An array of Numeric objects.
41
- # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
42
- # options are +:miles+, +:feet+, +:km+ and +:meters+.
43
- # * *Returns* :
44
- # - The great circle distance between +u+ and +v+.
45
- # * *Raises* :
46
- # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
47
- # - +ArgumentError+ -> +unit+ must be a Symbol.
48
- #
49
- def haversine(u, v, unit = :meters)
50
- # TODO: Create better exceptions.
51
- raise ArgumentError if u.size != 2 || v.size != 2
52
- raise ArgumentError if unit.class != Symbol
18
+ module Haversine
53
19
 
54
- dlat = u[0] - v[0]
55
- dlon = u[1] - v[1]
20
+ # call-seq:
21
+ # haversine(u, v) -> Float
22
+ #
23
+ # Compute accurate distances between two points given their latitudes and
24
+ # longitudes, even for short distances. This isn't a distance measure in the
25
+ # same sense as the other methods in +Measurable+.
26
+ #
27
+ # The distance returned is the great circle (or orthodromic) distance between
28
+ # +u+ and +v+, which is the shortest distance between them on the surface of
29
+ # a sphere. Thus, this implementation considers the Earth to be a sphere.
30
+ #
31
+ # Reminding that the input vectors are of the form [latitude, longitude] in
32
+ # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
33
+ # Paulo), the corresponding vector is [-23.53333, -46.61667].
34
+ #
35
+ # References:
36
+ # - http://www.movable-type.co.uk/scripts/latlong.html
37
+ # - http://en.wikipedia.org/wiki/Haversine_formula
38
+ # - http://en.wikipedia.org/wiki/Great-circle_distance
39
+ #
40
+ # Arguments:
41
+ # - +u+ -> An array of Numeric objects.
42
+ # - +v+ -> An array of Numeric objects.
43
+ # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
44
+ # options are +:miles+, +:feet+, +:km+ and +:meters+.
45
+ # Returns:
46
+ # - The great circle distance between +u+ and +v+.
47
+ # Raises:
48
+ # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
49
+ # - +ArgumentError+ -> +unit+ must be a Symbol.
50
+ def haversine(u, v, unit = :meters)
51
+ # TODO: Create better exceptions.
52
+ raise ArgumentError if u.size != 2 || v.size != 2
53
+ raise ArgumentError if unit.class != Symbol
56
54
 
57
- dlon_rad = dlon * RAD_PER_DEG
58
- dlat_rad = dlat * RAD_PER_DEG
55
+ dlat = u[0] - v[0]
56
+ dlon = u[1] - v[1]
59
57
 
60
- lat1_rad = v[0] * RAD_PER_DEG
61
- lon1_rad = v[1] * RAD_PER_DEG
58
+ dlon_rad = dlon * RAD_PER_DEG
59
+ dlat_rad = dlat * RAD_PER_DEG
62
60
 
63
- lat2_rad = u[0] * RAD_PER_DEG
64
- lon2_rad = u[1] * RAD_PER_DEG
61
+ lat1_rad = v[0] * RAD_PER_DEG
62
+ lon1_rad = v[1] * RAD_PER_DEG
65
63
 
66
- a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
67
- c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
64
+ lat2_rad = u[0] * RAD_PER_DEG
65
+ lon2_rad = u[1] * RAD_PER_DEG
68
66
 
69
- EARTH_RADIUS[unit] * c
67
+ a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
68
+ c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
69
+
70
+ EARTH_RADIUS[unit] * c
71
+ end
70
72
  end
71
- end
73
+
74
+ extend Measurable::Haversine
75
+ end
@@ -1,69 +1,62 @@
1
1
module Measurable
  module Jaccard

    # call-seq:
    #     jaccard_index(u, v) -> Float
    #     jaccard(u, v) -> Float
    #
    # Give the similarity between two sets +u+ and +v+, given as Arrays.
    # Calculated as:
    #     jaccard_index = |intersection| / |union|
    #
    # In which intersection and union refer to +u+ and +v+ and |x| is the
    # cardinality of set x.
    #
    # For example:
    #     jaccard_index([1, 0], [1]) == 0.5
    #
    # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
    #
    # When both +u+ and +v+ are empty the union is empty too; two empty sets are
    # identical, so the index is defined as 1.0 instead of 0.0 / 0 (NaN).
    #
    # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
    #
    # Arguments:
    # - +u+ -> Array.
    # - +v+ -> Array.
    # Returns:
    # - Float value representing the Jaccard similarity coefficient between
    #   +u+ and +v+.
    def jaccard_index(u, v)
      union = u | v
      return 1.0 if union.empty?

      (u & v).length.to_f / union.length
    end

    alias_method :jaccard, :jaccard_index

    # call-seq:
    #     jaccard_dissimilarity(u, v) -> Float
    #     jaccard_distance(u, v) -> Float
    #
    # The jaccard distance is a measure of dissimilarity between two sets. It is
    # calculated as:
    #     jaccard_distance = 1 - jaccard_index
    #
    # This is a proper metric, i.e. the following conditions hold (writing +d+
    # for +jaccard_dissimilarity+):
    # - Symmetry: d(u, v) == d(v, u)
    # - Non-negative: d(u, v) >= 0
    # - Coincidence axiom: d(u, v) == 0 if u == v
    # - Triangular inequality: d(u, v) <= d(u, w) + d(w, v)
    #
    # Arguments:
    # - +u+ -> Array.
    # - +v+ -> Array.
    # Returns:
    # - Float value representing the dissimilarity between +u+ and +v+.
    def jaccard_dissimilarity(u, v)
      1 - jaccard_index(u, v)
    end

    alias_method :jaccard_distance, :jaccard_dissimilarity

    # The module used to extend itself here; kept so that calls such as
    # Measurable::Jaccard.jaccard_index(u, v) continue to work.
    extend self
  end

  # Expose the methods on Measurable itself, as the other measures do.
  extend Measurable::Jaccard
end
@@ -0,0 +1,39 @@
1
module Measurable
  module KullbackLeibler

    # call-seq:
    #     kullback_leibler(p, q) -> Float
    #
    # The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
    # a measure of their dissimilarity. However, it doesn't obey the triangular
    # inequality and isn't symmetric, thus it isn't a metric.
    #
    # It is calculated as follows:
    #
    #     KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
    #
    # With distributions +p+ and +q+ represented as vectors of N elements
    # summing to 1.0. Terms with p[i] == 0 contribute nothing to the sum (the
    # usual 0 * log(0) = 0 convention).
    #
    # References:
    # - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
    # - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
    #   Natural Language Processing.
    #
    # Arguments:
    # - +p+ -> A probability distribution represented by a n-element Array.
    # - +q+ -> A probability distribution represented by a n-element Array.
    # Returns:
    # - A measure of the difference between the probability distributions p and q.
    # Raises:
    # - +ArgumentError+ -> The sizes of +p+ and +q+ don't match.
    def kullback_leibler(p, q)
      # TODO: Change this to a more specific, custom-made exception.
      if p.size != q.size
        raise ArgumentError,
              "distributions must have the same size (got #{p.size} and #{q.size})"
      end

      p.zip(q).reduce(0.0) do |acc, (pi, qi)|
        # 0 * Math.log(0) would evaluate to NaN; skip zero-probability terms.
        next acc if pi == 0

        # fdiv keeps the ratio a Float even when both entries are Integers.
        acc + pi * Math.log(pi.fdiv(qi))
      end
    end
  end

  extend Measurable::KullbackLeibler
end
@@ -0,0 +1,57 @@
1
module Measurable
  module Levenshtein

    # call-seq:
    #     levenshtein(u, v) -> Integer
    #
    # Give the edit distance between two sequences +u+ and +v+ where each edit
    # (insertion, deletion, substitution) required to change one into the other
    # increments the total distance.
    #
    # For example:
    #     levenshtein('kitten', 'sitting') == 3
    #
    # Because
    # 1. kitten -> sitten (substitution "s" for "k")
    # 2. sitten -> sittin (substitution "i" for "e")
    # 3. sittin -> sitting (insertion of "g" at the end)
    #
    # See: http://en.wikipedia.org/wiki/Levenshtein_distance
    #
    # Arguments:
    # - +u+ -> Array or String.
    # - +v+ -> Array or String.
    # Returns:
    # - Integer value representing the Levenshtein distance between +u+ and +v+.
    #
    def levenshtein(u, v)
      return 0 if u == v
      return u.size if v.size == 0
      return v.size if u.size == 0

      # matrix[i][j] is the distance between the first i elements of +u+ and
      # the first j elements of +v+. Row 0 and column 0 are the distances to
      # the empty prefix: j insertions and i deletions respectively.
      matrix = Array.new(u.size + 1) { |i| [i] + Array.new(v.size, 0) }
      (0..v.size).each { |j| matrix[0][j] = j }

      (1..u.size).each do |i|
        (1..v.size).each do |j|
          # Matrix indices are 1-based prefixes lengths, sequences are 0-based.
          matrix[i][j] =
            if u[i - 1] == v[j - 1]
              matrix[i - 1][j - 1]
            else
              [
                matrix[i - 1][j] + 1,     # deletion
                matrix[i][j - 1] + 1,     # insertion
                matrix[i - 1][j - 1] + 1, # substitution
              ].min
            end
        end
      end

      matrix[u.size][v.size]
    end
  end

  extend Measurable::Levenshtein
end
@@ -1,34 +1,38 @@
1
1
module Measurable
  module Maxmin

    # call-seq:
    #     maxmin(u, v) -> Float
    #
    # The "Max-min distance" is used to measure similarity between two vectors.
    # It is the sum of the element-wise minima of +u+ and +v+ divided by the sum
    # of their element-wise maxima.
    #
    # When used in k-means clustering, this similarity measure can give better
    # results in some datasets, as pointed out in the paper "K-means clustering
    # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
    #
    # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
    #
    # Arguments:
    # - +u+ -> An array of Numeric objects.
    # - +v+ -> An array of Numeric objects.
    # Returns:
    # - Similarity between +u+ and +v+.
    # Raises:
    # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
    def maxmin(u, v)
      # TODO: Change this to a more specific, custom-made exception.
      if u.size != v.size
        raise ArgumentError,
              "vectors must have the same size (got #{u.size} and #{v.size})"
      end

      # Float accumulators keep the final division from truncating on
      # Integer input.
      sum_min = 0.0
      sum_max = 0.0
      u.zip(v).each do |pair|
        sum_min += pair.min
        sum_max += pair.max
      end

      sum_min / sum_max
    end

  end

  extend Measurable::Maxmin
end
@@ -0,0 +1,44 @@
1
module Measurable
  module Minkowski

    # call-seq:
    #     minkowski(u, v) -> Numeric
    #     cityblock(u, v) -> Numeric
    #     manhattan(u, v) -> Numeric
    #
    # Calculate the sum of the absolute value of the differences between each
    # coordinate of +u+ and +v+.
    #
    # Arguments:
    # - +u+ -> An array of Numeric objects.
    # - +v+ -> An array of Numeric objects.
    # Returns:
    # - The Minkowski (or L1) distance between +u+ and +v+.
    # Raises:
    # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
    def minkowski(u, v)
      # TODO: Change this to a more specific, custom-made exception.
      if u.size != v.size
        raise ArgumentError,
              "vectors must have the same size (got #{u.size} and #{v.size})"
      end

      u.zip(v).reduce(0) { |acc, (a, b)| acc + (a - b).abs }
    end

    # Defining the aliases on the module makes them available both when it is
    # extended and when it is included, without separate hooks for each case.
    alias_method :cityblock, :minkowski
    alias_method :manhattan, :minkowski
  end

  extend Measurable::Minkowski
end