measurable 0.0.5 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,32 @@
1
module Measurable
  module Hamming

    # call-seq:
    #     hamming(s1, s2) -> Integer
    #
    # Count the positions at which the strings +s1+ and +s2+ hold different
    # characters, that is, how many substitutions are necessary to change
    # +s1+ into +s2+ and vice-versa.
    #
    # See: http://en.wikipedia.org/wiki/Hamming_distance
    #
    # Arguments:
    # - +s1+ -> A String.
    # - +s2+ -> A String with the same size as +s1+.
    # Returns:
    # - The number of characters in which +s1+ and +s2+ differ.
    # Raises:
    # - +ArgumentError+ -> The sizes of +s1+ and +s2+ don't match.
    def hamming(s1, s2)
      # TODO: Change this to a more specific, custom-made exception.
      if s1.size != s2.size
        raise ArgumentError, "s1 and s2 must have the same size"
      end

      # Pair up the characters position by position and count the mismatches.
      s1.chars.zip(s2.chars).count { |a, b| a != b }
    end
  end

  extend Measurable::Hamming
end
@@ -15,57 +15,61 @@ module Measurable
15
15
  :meters => EARTH_RADIUS_IN_KILOMETERS * 1000
16
16
  }
17
17
 
18
- # call-seq:
19
- # haversine(u, v) -> Float
20
- #
21
- # Compute accurate distances between two points given their latitudes and
22
- # longitudes, even for short distances. This isn't a distance measure in the
23
- # same sense as the other methods in +Measurable+.
24
- #
25
- # The distance returned is the great circle (or orthodromic) distance between
26
- # +u+ and +v+, which is the shortest distance between them on the surface of
27
- # a sphere. Thus, this implementation considers the Earth to be a sphere.
28
- #
29
- # Reminding that the input vectors are of the form [latitude, longitude] in
30
- # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
31
- # Paulo), the corresponding vector is [-23.53333, -46.61667].
32
- #
33
- # References:
34
- # - http://www.movable-type.co.uk/scripts/latlong.html
35
- # - http://en.wikipedia.org/wiki/Haversine_formula
36
- # - http://en.wikipedia.org/wiki/Great-circle_distance
37
- #
38
- # * *Arguments* :
39
- # - +u+ -> An array of Numeric objects.
40
- # - +v+ -> An array of Numeric objects.
41
- # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
42
- # options are +:miles+, +:feet+, +:km+ and +:meters+.
43
- # * *Returns* :
44
- # - The great circle distance between +u+ and +v+.
45
- # * *Raises* :
46
- # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
47
- # - +ArgumentError+ -> +unit+ must be a Symbol.
48
- #
49
- def haversine(u, v, unit = :meters)
50
- # TODO: Create better exceptions.
51
- raise ArgumentError if u.size != 2 || v.size != 2
52
- raise ArgumentError if unit.class != Symbol
18
+ module Haversine
53
19
 
54
- dlat = u[0] - v[0]
55
- dlon = u[1] - v[1]
20
+ # call-seq:
21
+ # haversine(u, v, unit = :meters) -> Float
22
+ #
23
+ # Compute accurate distances between two points given their latitudes and
24
+ # longitudes, even for short distances. This isn't a distance measure in the
25
+ # same sense as the other methods in +Measurable+.
26
+ #
27
+ # The distance returned is the great circle (or orthodromic) distance between
28
+ # +u+ and +v+, which is the shortest distance between them on the surface of
29
+ # a sphere. Thus, this implementation considers the Earth to be a sphere.
30
+ #
31
+ # Reminding that the input vectors are of the form [latitude, longitude] in
32
+ # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
33
+ # Paulo), the corresponding vector is [-23.53333, -46.61667].
34
+ #
35
+ # References:
36
+ # - http://www.movable-type.co.uk/scripts/latlong.html
37
+ # - http://en.wikipedia.org/wiki/Haversine_formula
38
+ # - http://en.wikipedia.org/wiki/Great-circle_distance
39
+ #
40
+ # Arguments:
41
+ # - +u+ -> An array of Numeric objects.
42
+ # - +v+ -> An array of Numeric objects.
43
+ # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
44
+ # options are +:miles+, +:feet+, +:km+ and +:meters+.
45
+ # Returns:
46
+ # - The great circle distance between +u+ and +v+.
47
+ # Raises:
48
+ # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
49
+ # - +ArgumentError+ -> +unit+ must be a Symbol.
50
def haversine(u, v, unit = :meters)
  # Great-circle distance between the [latitude, longitude] pairs +u+ and +v+
  # (in degrees), expressed in +unit+. Raises ArgumentError when either point
  # is not a pair, when +unit+ is not a Symbol, or when EARTH_RADIUS has no
  # entry for +unit+.
  # TODO: Create better exceptions.
  if u.size != 2 || v.size != 2
    raise ArgumentError, "u and v must be [latitude, longitude] pairs"
  end
  raise ArgumentError, "unit must be a Symbol" unless unit.is_a?(Symbol)

  # An unknown unit used to surface as NoMethodError on nil; report it as the
  # documented ArgumentError instead.
  radius = EARTH_RADIUS[unit]
  raise ArgumentError, "unknown unit: #{unit.inspect}" if radius.nil?

  lat1_rad = v[0] * RAD_PER_DEG
  lat2_rad = u[0] * RAD_PER_DEG
  dlat_rad = (u[0] - v[0]) * RAD_PER_DEG
  dlon_rad = (u[1] - v[1]) * RAD_PER_DEG

  a = Math.sin(dlat_rad / 2) ** 2 +
      Math.cos(lat1_rad) * Math.cos(lat2_rad) * Math.sin(dlon_rad / 2) ** 2
  # Floating-point rounding can push +a+ marginally above 1 for (nearly)
  # antipodal points, which would make Math.sqrt(1 - a) raise.
  a = 1.0 if a > 1.0
  c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))

  radius * c
end
70
72
  end
71
- end
73
+
74
+ extend Measurable::Haversine
75
+ end
@@ -1,69 +1,62 @@
1
1
  module Measurable
2
+ module Jaccard
2
3
 
3
- # call-seq:
4
- # jaccard_index(u, v) -> Float
5
- #
6
- # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
7
- # jaccard_index = |intersection| / |union|
8
- #
9
- # In which intersection and union refer to +u+ and +v+ and |x| is the
10
- # cardinality of set x.
11
- #
12
- # For example:
13
- # jaccard_index([1, 0, 1], [1, 1, 1]) == 0.666...
14
- #
15
- # Because |intersection| = |(1, 0, 1)| = 2 and |union| = |(1, 1, 1)| = 3.
16
- #
17
- # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
18
- #
19
- # * *Arguments* :
20
- # - +u+ -> Array of 1s and 0s.
21
- # - +v+ -> Array of 1s and 0s.
22
- # * *Returns* :
23
- # - Float value representing the Jaccard similarity coefficient between
24
- # +u+ and +v+.
25
- # * *Raises* :
26
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
27
- #
28
- def jaccard_index(u, v)
29
- # TODO: Change this to a more specific, custom-made exception.
30
- raise ArgumentError if u.size != v.size
31
-
32
- intersection = u.zip(v).reduce(0) do |acc, elem|
33
- # Both u and v must have this element.
34
- elem[0] + elem[1] == 2 ? (acc + 1) : acc
4
+ # call-seq:
5
+ # jaccard_index(u, v) -> Float
6
+ #
7
+ # Give the similarity between two sets +u+ and +v+, given as Arrays. Calculated as:
8
+ # jaccard_index = |intersection| / |union|
9
+ #
10
+ # In which intersection and union refer to +u+ and +v+ and |x| is the
11
+ # cardinality of set x.
12
+ #
13
+ # For example:
14
+ # jaccard_index([1, 0], [1]) == 0.5
15
+ #
16
+ # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
17
+ #
18
+ # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
19
+ #
20
+ # Arguments:
21
+ # - +u+ -> Array.
22
+ # - +v+ -> Array.
23
+ # Returns:
24
+ # - Float value representing the Jaccard similarity coefficient between
25
+ # +u+ and +v+.
26
def jaccard_index(u, v)
  # Jaccard similarity |u & v| / |u | v|, treating the arrays as sets.
  union = u | v

  # Two empty sets are identical, so their similarity is defined as 1.0;
  # dividing by the empty union would yield NaN instead.
  return 1.0 if union.empty?

  intersection = u & v
  intersection.length.to_f / union.length
end
36
31
 
37
- union = u.zip(v).reduce(0) do |acc, elem|
38
- # One of u and v must have this element.
39
- elem[0] + elem[1] >= 1 ? (acc + 1) : acc
32
+ alias_method :jaccard, :jaccard_index
33
+
34
+ # call-seq:
35
+ # jaccard_dissimilarity(u, v) -> Float
36
+ #
37
+ # The jaccard distance is a measure of dissimilarity between two sets. It is
38
+ # calculated as:
39
+ # jaccard_distance = 1 - jaccard_index
40
+ #
41
+ # This is a proper metric, i.e. the following conditions hold:
42
+ # - Symmetry: jaccard_dissimilarity(u, v) == jaccard_dissimilarity(v, u)
43
+ # - Non-negative: jaccard_dissimilarity(u, v) >= 0
44
+ # - Coincidence axiom: jaccard_dissimilarity(u, v) == 0 if u == v
45
+ # - Triangular inequality: jaccard_dissimilarity(u, v) <= jaccard_dissimilarity(u, w) + jaccard_dissimilarity(w, v)
46
+ #
47
+ # Arguments:
48
+ # - +u+ -> Array.
49
+ # - +v+ -> Array.
50
+ # Returns:
51
+ # - Float value representing the dissimilarity between +u+ and +v+.
52
+ # Raises:
53
+ # - +ArgumentError+ -> The size of the input arrays doesn't match.
54
def jaccard_dissimilarity(u, v)
  # The distance is the complement of the Jaccard similarity coefficient.
  similarity = jaccard_index(u, v)
  1 - similarity
end
41
57
 
42
- intersection.to_f / union
43
- end
58
+ alias_method :jaccard_distance, :jaccard_dissimilarity
44
59
 
45
- # call-seq:
46
- # jaccard(u, v) -> Float
47
- #
48
- # The jaccard distance is a measure of dissimilarity between two sets. It is
49
- # calculated as:
50
- # jaccard_distance = 1 - jaccard_index
51
- #
52
- # This is a proper metric, i.e. the following conditions hold:
53
- # - Symmetry: jaccard(u, v) == jaccard(v, u)
54
- # - Non-negative: jaccard(u, v) >= 0
55
- # - Coincidence axiom: jaccard(u, v) == 0 if u == v
56
- # - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
57
- #
58
- # * *Arguments* :
59
- # - +u+ -> Array of 1s and 0s.
60
- # - +v+ -> Array of 1s and 0s.
61
- # * *Returns* :
62
- # - Float value representing the dissimilarity between +u+ and +v+.
63
- # * *Raises* :
64
- # - +ArgumentError+ -> The size of the input arrays doesn't match.
65
- #
66
- def jaccard(u, v)
67
- 1 - jaccard_index(u, v)
60
+ extend Measurable::Jaccard
68
61
  end
69
- end
62
+ end
@@ -0,0 +1,39 @@
1
module Measurable
  module KullbackLeibler

    # call-seq:
    #     kullback_leibler(p, q) -> Float
    #
    # The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
    # a measure of their dissimilarity. However, it doesn't obey the triangular
    # inequality and isn't symmetric, thus it isn't a metric.
    #
    # It is calculated as follows:
    #
    #     KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
    #
    # With distributions +p+ and +q+ represented as vectors of N elements
    # summing to 1.0. Terms with p[i] == 0 contribute nothing to the sum
    # (the usual 0 * log(0) = 0 convention).
    #
    # References:
    # - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
    # - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
    #   Natural Language Processing.
    #
    # Arguments:
    # - +p+ -> A probability distribution represented by a n-element Array.
    # - +q+ -> A probability distribution represented by a n-element Array.
    # Returns:
    # - A measure of the difference between the probability distributions p and q.
    # Raises:
    # - +ArgumentError+ -> The sizes of +p+ and +q+ don't match.
    def kullback_leibler(p, q)
      # TODO: Change this to a more specific, custom-made exception.
      raise ArgumentError, "p and q must have the same size" if p.size != q.size

      p.zip(q).reduce(0.0) do |acc, (p_i, q_i)|
        # Without this guard 0 * log(0) evaluates to NaN and poisons the sum.
        next acc if p_i == 0

        # to_f prevents integer division when the caller passes Integers.
        acc + p_i * Math.log(p_i.to_f / q_i)
      end
    end
  end

  extend Measurable::KullbackLeibler
end
@@ -0,0 +1,57 @@
1
module Measurable
  module Levenshtein

    # call-seq:
    #     levenshtein(u, v) -> Integer
    #
    # Give the edit distance between two sequences +u+ and +v+ where each
    # edit (insertion, deletion, substitution) required to change one into the
    # other increments the total distance.
    #
    # For example:
    #     levenshtein('kitten', 'sitting') == 3
    #
    # Because
    # 1. kitten -> sitten (substitution "s" for "k")
    # 2. sitten -> sittin (substitution "i" for "e")
    # 3. sittin -> sitting (insertion of "g" at the end)
    #
    # See: http://en.wikipedia.org/wiki/Levenshtein_distance
    #
    # Arguments:
    # - +u+ -> Array or String.
    # - +v+ -> Array or String.
    # Returns:
    # - Integer value representing the Levenshtein distance between +u+ and +v+.
    #
    def levenshtein(u, v)
      return 0 if u == v
      return u.size if v.size == 0
      return v.size if u.size == 0

      # The distance is symmetric, so keep the shorter sequence in +v+: only
      # one row of v.size + 1 entries is held at a time.
      u, v = v, u if u.size < v.size

      # previous[j] is the distance between the first i - 1 items of +u+ and
      # the first j items of +v+; row 0 is the cost of j insertions.
      previous = (0..v.size).to_a

      (1..u.size).each do |i|
        # Column 0: turning the first i items of +u+ into nothing costs i deletions.
        current = [i]

        (1..v.size).each do |j|
          # Sequences are 0-indexed while the table is 1-indexed.
          cost = u[i - 1] == v[j - 1] ? 0 : 1
          current << [
            previous[j] + 1,        # deletion
            current[j - 1] + 1,     # insertion
            previous[j - 1] + cost, # substitution (free when items match)
          ].min
        end

        previous = current
      end

      previous.last
    end
  end

  extend Measurable::Levenshtein
end
@@ -1,34 +1,38 @@
1
1
  module Measurable
2
+ module Maxmin
2
3
 
3
- # call-seq:
4
- # maxmin(u, v) -> Float
5
- #
6
- # The "Max-min distance" is used to measure similarity between two vectors.
7
- #
8
- # When used in k-means clustering, this similarity measure can give better
9
- # results in some datasets, as pointed out in the paper "K-means clustering
10
- # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
11
- #
12
- # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
13
- #
14
- # * *Arguments* :
15
- # - +u+ -> An array of Numeric objects.
16
- # - +v+ -> An array of Numeric objects.
17
- # * *Returns* :
18
- # - Similarity between +u+ and +v+.
19
- # * *Raises* :
20
- # - +ArgumentError+ -> The sizes of +u+ and +v+ doesn't match.
21
- #
22
- def maxmin(u, v)
23
- # TODO: Change this to a more specific, custom-made exception.
24
- raise ArgumentError if u.size != v.size
4
+ # call-seq:
5
+ # maxmin(u, v) -> Float
6
+ #
7
+ # The "Max-min distance" is used to measure similarity between two vectors.
8
+ #
9
+ # When used in k-means clustering, this similarity measure can give better
10
+ # results in some datasets, as pointed out in the paper "K-means clustering
11
+ # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
12
+ #
13
+ # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
14
+ #
15
+ # Arguments:
16
+ # - +u+ -> An array of Numeric objects.
17
+ # - +v+ -> An array of Numeric objects.
18
+ # Returns:
19
+ # - Similarity between +u+ and +v+.
20
+ # Raises:
21
+ # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
22
def maxmin(u, v)
  # Ratio between the sum of the pairwise minima and the sum of the pairwise
  # maxima of the coordinates of +u+ and +v+.
  # TODO: Change this to a more specific, custom-made exception.
  raise ArgumentError unless u.size == v.size

  pairs = u.zip(v)
  total_min = pairs.reduce(0.0) { |total, pair| total + pair.min }
  total_max = pairs.reduce(0.0) { |total, pair| total + pair.max }

  total_min / total_max
end
31
34
 
32
- sum_min / sum_max
33
35
  end
34
- end
36
+
37
+ extend Measurable::Maxmin
38
+ end
@@ -0,0 +1,44 @@
1
module Measurable
  module Minkowski

    # call-seq:
    #     minkowski(u, v) -> Numeric
    #
    # Calculate the sum of the absolute value of the differences between each
    # coordinate of +u+ and +v+. This is the Minkowski distance of order 1,
    # also known as the L1, city-block or Manhattan distance.
    #
    # Arguments:
    # - +u+ -> An array of Numeric objects.
    # - +v+ -> An array of Numeric objects.
    # Returns:
    # - The L1 distance between +u+ and +v+.
    # Raises:
    # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
    def minkowski(u, v)
      # TODO: Change this to a more specific, custom-made exception.
      raise ArgumentError, "u and v must have the same size" if u.size != v.size

      u.zip(v).reduce(0) { |acc, (a, b)| acc + (a - b).abs }
    end

    # Defining the aliases on the module itself makes them available to every
    # object that extends it and every class that includes it, so no
    # extended/included hooks are needed.
    alias_method :cityblock, :minkowski
    alias_method :manhattan, :minkowski
  end

  extend Measurable::Minkowski
end