RubyGems - measurable - Versions diffs - 0.0.5 → 0.0.11 - Mend

measurable 0.0.5 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +5 -5
data/.gitignore +3 -1
data/.travis.yml +7 -0
data/Gemfile +2 -1
data/History.txt +11 -0
data/README.md +29 -39
data/Rakefile +18 -12
data/lib/measurable.rb +6 -3
data/lib/measurable/chebyshev.rb +24 -0
data/lib/measurable/cosine.rb +66 -24
data/lib/measurable/euclidean.rb +56 -68
data/lib/measurable/hamming.rb +32 -0
data/lib/measurable/haversine.rb +51 -47
data/lib/measurable/jaccard.rb +54 -61
data/lib/measurable/kullback_leibler.rb +39 -0
data/lib/measurable/levenshtein.rb +57 -0
data/lib/measurable/maxmin.rb +32 -28
data/lib/measurable/minkowski.rb +44 -0
data/lib/measurable/tanimoto.rb +46 -27
data/lib/measurable/version.rb +2 -2
data/measurable.gemspec +6 -4
data/spec/chebyshev_spec.rb +48 -0
data/spec/cosine_spec.rb +72 -24
data/spec/euclidean_spec.rb +30 -14
data/spec/hamming_spec.rb +46 -0
data/spec/haversine_spec.rb +22 -2
data/spec/jaccard_spec.rb +35 -14
data/spec/kullback_leibler_spec.rb +46 -0
data/spec/levenshtein_spec.rb +71 -0
data/spec/maxmin_spec.rb +21 -2
data/spec/minkowski_spec.rb +44 -0
data/spec/spec_helper.rb +1 -1
data/spec/tanimoto_spec.rb +23 -3
metadata +53 -23
data/Gemfile.lock +0 -27

data/lib/measurable/hamming.rb ADDED

@@ -0,0 +1,32 @@
+module Measurable
+  module Hamming
+    # call-seq:
+    #     hamming(s1, s2) -> Integer
+    #
+    # Count the number of different characters between strings +s1+ and +s2+,
+    # that is, how many substitutions are necessary to change +s1+ into +s2+ and
+    # vice-versa.
+    #
+    # See: http://en.wikipedia.org/wiki/Hamming_distance
+    #
+    # Arguments:
+    # - +s1+ -> A String.
+    # - +s2+ -> A String with the same size as +s1+.
+    # Returns:
+    # - The number of characters in which +s1+ and +s2+ differ.
+    # Raises:
+    # - +ArgumentError+ -> The sizes of +s1+ and +s2+ don't match.
+    def hamming(s1, s2)
+      # TODO: Change this to a more specific, custom-made exception.
+      raise ArgumentError if s1.size != s2.size
+      s1.chars.zip(s2.chars).reduce(0) do |acc, c|
+        acc += 1 if c[0] != c[1]
+        acc
+      end
+    end
+  end
+  extend Measurable::Hamming
+end

data/lib/measurable/haversine.rb CHANGED

@@ -15,57 +15,61 @@ module Measurable
     :meters => EARTH_RADIUS_IN_KILOMETERS * 1000
   }
-  # call-seq:
-  #     haversine(u, v) -> Float
-  #
-  # Compute accurate distances between two points given their latitudes and
-  # longitudes, even for short distances. This isn't a distance measure in the
-  # same sense as the other methods in +Measurable+.
-  #
-  # The distance returned is the great circle (or orthodromic) distance between
-  # +u+ and +v+, which is the shortest distance between them on the surface of
-  # a sphere. Thus, this implementation considers the Earth to be a sphere.
-  #
-  # Reminding that the input vectors are of the form [latitude, longitude] in
-  # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
-  # Paulo), the corresponding vector is [-23.53333, -46.61667].
-  #
-  # References:
-  # - http://www.movable-type.co.uk/scripts/latlong.html
-  # - http://en.wikipedia.org/wiki/Haversine_formula
-  # - http://en.wikipedia.org/wiki/Great-circle_distance
-  #
-  # * *Arguments* :
-  #   - +u+ -> An array of Numeric objects.
-  #   - +v+ -> An array of Numeric objects.
-  #   - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
-  #               options are +:miles+, +:feet+, +:km+ and +:meters+.
-  # * *Returns* :
-  #   - The great circle distance between +u+ and +v+.
-  # * *Raises* :
-  #   - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
-  #   - +ArgumentError+ -> +unit+ must be a Symbol.
-  #
-  def haversine(u, v, unit = :meters)
-    # TODO: Create better exceptions.
-    raise ArgumentError if u.size != 2 || v.size != 2
-    raise ArgumentError if unit.class != Symbol
+  module Haversine
-    dlat = u[0] - v[0]
-    dlon = u[1] - v[1]
+    # call-seq:
+    #     haversine(u, v, unit = :meters) -> Float
+    #
+    # Compute accurate distances between two points given their latitudes and
+    # longitudes, even for short distances. This isn't a distance measure in the
+    # same sense as the other methods in +Measurable+.
+    #
+    # The distance returned is the great circle (or orthodromic) distance between
+    # +u+ and +v+, which is the shortest distance between them on the surface of
+    # a sphere. Thus, this implementation considers the Earth to be a sphere.
+    #
+    # Remember that the input vectors are of the form [latitude, longitude] in
+    # degrees, so if you have the coordinates [23 32' S, 46 37' W] (from São
+    # Paulo), the corresponding vector is [-23.53333, -46.61667].
+    #
+    # References:
+    # - http://www.movable-type.co.uk/scripts/latlong.html
+    # - http://en.wikipedia.org/wiki/Haversine_formula
+    # - http://en.wikipedia.org/wiki/Great-circle_distance
+    #
+    # Arguments:
+    # - +u+ -> An array of Numeric objects.
+    # - +v+ -> An array of Numeric objects.
+    # - +unit+ -> (Optional) A Symbol representing the unit of measure. Available
+    #             options are +:miles+, +:feet+, +:km+ and +:meters+.
+    # Returns:
+    # - The great circle distance between +u+ and +v+.
+    # Raises:
+    # - +ArgumentError+ -> The size of +u+ and +v+ must be 2.
+    # - +ArgumentError+ -> +unit+ must be a Symbol.
+    def haversine(u, v, unit = :meters)
+      # TODO: Create better exceptions.
+      raise ArgumentError if u.size != 2 || v.size != 2
+      raise ArgumentError if unit.class != Symbol
-    dlon_rad = dlon * RAD_PER_DEG
-    dlat_rad = dlat * RAD_PER_DEG
+      dlat = u[0] - v[0]
+      dlon = u[1] - v[1]
-    lat1_rad = v[0] * RAD_PER_DEG
-    lon1_rad = v[1] * RAD_PER_DEG
+      dlon_rad = dlon * RAD_PER_DEG
+      dlat_rad = dlat * RAD_PER_DEG
-    lat2_rad = u[0] * RAD_PER_DEG
-    lon2_rad = u[1] * RAD_PER_DEG
+      lat1_rad = v[0] * RAD_PER_DEG
+      lon1_rad = v[1] * RAD_PER_DEG
-    a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
-    c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
+      lat2_rad = u[0] * RAD_PER_DEG
+      lon2_rad = u[1] * RAD_PER_DEG
-    EARTH_RADIUS[unit] * c
+      a = (Math.sin(dlat_rad / 2)) ** 2 + Math.cos(lat1_rad) * Math.cos(lat2_rad) * (Math.sin(dlon_rad / 2)) ** 2
+      c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
+      EARTH_RADIUS[unit] * c
+    end
   end
-end
+  extend Measurable::Haversine
+end

data/lib/measurable/jaccard.rb CHANGED

@@ -1,69 +1,62 @@
 module Measurable
+  module Jaccard
-  # call-seq:
-  #     jaccard_index(u, v) -> Float
-  #
-  # Give the similarity between two binary vectors +u+ and +v+. Calculated as:
-  #  jaccard_index = |intersection| / |union|
-  #
-  # In which intersection and union refer to +u+ and +v+ and |x| is the
-  # cardinality of set x.
-  #
-  # For example:
-  #   jaccard_index([1, 0, 1], [1, 1, 1]) == 0.666...
-  #
-  # Because |intersection| = |(1, 0, 1)| = 2 and |union| = |(1, 1, 1)| = 3.
-  #
-  # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
-  #
-  # * *Arguments* :
-  #   - +u+ -> Array of 1s and 0s.
-  #   - +v+ -> Array of 1s and 0s.
-  # * *Returns* :
-  #   - Float value representing the Jaccard similarity coefficient between
-  #     +u+ and +v+.
-  # * *Raises* :
-  #   - +ArgumentError+ -> The size of the input arrays doesn't match.
-  #
-  def jaccard_index(u, v)
-    # TODO: Change this to a more specific, custom-made exception.
-    raise ArgumentError if u.size != v.size
-    intersection = u.zip(v).reduce(0) do |acc, elem|
-      # Both u and v must have this element.
-      elem[0] + elem[1] == 2 ? (acc + 1) : acc
+    # call-seq:
+    #     jaccard_index(u, v) -> Float
+    #
+    # Give the similarity between two sets +u+ and +v+, given as Arrays. Calculated as:
+    #  jaccard_index = |intersection| / |union|
+    #
+    # In which intersection and union refer to +u+ and +v+ and |x| is the
+    # cardinality of set x.
+    #
+    # For example:
+    #   jaccard_index([1, 0], [1]) == 0.5
+    #
+    # Because |intersection| = |(1)| = 1 and |union| = |(0, 1)| = 2.
+    #
+    # See: http://en.wikipedia.org/wiki/Jaccard_coefficient
+    #
+    # Arguments:
+    # - +u+ -> Array.
+    # - +v+ -> Array.
+    # Returns:
+    # - Float value representing the Jaccard similarity coefficient between
+    #   +u+ and +v+.
+    def jaccard_index(u, v)
+      intersection = u & v
+      union = u | v
+      intersection.length.to_f / union.length
     end
-    union = u.zip(v).reduce(0) do |acc, elem|
-      # One of u and v must have this element.
-      elem[0] + elem[1] >= 1 ? (acc + 1) : acc
+    alias_method :jaccard, :jaccard_index
+    # call-seq:
+    #     jaccard_dissimilarity(u, v) -> Float
+    #
+    # The jaccard distance is a measure of dissimilarity between two sets. It is
+    # calculated as:
+    #   jaccard_distance = 1 - jaccard_index
+    #
+    # This is a proper metric, i.e. the following conditions hold:
+    #   - Symmetry:              jaccard_dissimilarity(u, v) == jaccard_dissimilarity(v, u)
+    #   - Non-negative:          jaccard_dissimilarity(u, v) >= 0
+    #   - Coincidence axiom:     jaccard_dissimilarity(u, v) == 0 if u == v
+    #   - Triangular inequality: jaccard_dissimilarity(u, v) <= jaccard_dissimilarity(u, w) + jaccard_dissimilarity(w, v)
+    #
+    # Arguments:
+    # - +u+ -> Array.
+    # - +v+ -> Array.
+    # Returns:
+    # - Float value representing the dissimilarity between +u+ and +v+.
+    # Note:
+    # - +u+ and +v+ are compared as sets, so their sizes don't have to match.
+    def jaccard_dissimilarity(u, v)
+      1 - jaccard_index(u, v)
     end
-    intersection.to_f / union
-  end
+    alias_method :jaccard_distance, :jaccard_dissimilarity
-  # call-seq:
-  #     jaccard(u, v) -> Float
-  #
-  # The jaccard distance is a measure of dissimilarity between two sets. It is
-  # calculated as:
-  #   jaccard_distance = 1 - jaccard_index
-  #
-  # This is a proper metric, i.e. the following conditions hold:
-  #   - Symmetry:              jaccard(u, v) == jaccard(v, u)
-  #   - Non-negative:          jaccard(u, v) >= 0
-  #   - Coincidence axiom:     jaccard(u, v) == 0 if u == v
-  #   - Triangular inequality: jaccard(u, v) <= jaccard(u, w) + jaccard(w, v)
-  #
-  # * *Arguments* :
-  #   - +u+ -> Array of 1s and 0s.
-  #   - +v+ -> Array of 1s and 0s.
-  # * *Returns* :
-  #   - Float value representing the dissimilarity between +u+ and +v+.
-  # * *Raises* :
-  #   - +ArgumentError+ -> The size of the input arrays doesn't match.
-  #
-  def jaccard(u, v)
-    1 - jaccard_index(u, v)
+    extend Measurable::Jaccard
   end
-end
+end

data/lib/measurable/kullback_leibler.rb ADDED

@@ -0,0 +1,39 @@
+module Measurable
+  module KullbackLeibler
+    # call-seq:
+    #     kullback_leibler(p, q) -> Float
+    #
+    # The Kullback-Leibler Divergence between the distributions +p+ and +q+ is
+    # a measure of their dissimilarity. However, it doesn't obey the triangular
+    # inequality and isn't symmetric, thus it isn't a metric.
+    #
+    # It is calculated as follows:
+    #
+    #   KL(p, q) = \sum_{i = 1}^{N} p[i] * log(p[i] / q[i])
+    #
+    # With distributions +p+ and +q+ represented as vectors of N elements
+    # summing to 1.0.
+    #
+    # References:
+    # - http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+    # - Christopher D. Manning and Hinrich Schütze. Foundations of Statistical
+    #   Natural Language Processing.
+    #
+    # Arguments:
+    # - +p+ -> A probability distribution represented by an n-element Array.
+    # - +q+ -> A probability distribution represented by an n-element Array.
+    # Returns:
+    # - A measure of the difference between the probability distributions p and q.
+    def kullback_leibler(p, q)
+      # TODO: Change this to a more specific, custom-made exception.
+      raise ArgumentError if p.size != q.size
+      p.zip(q).reduce(0.0) do |acc, probs|
+        acc += probs[0] * Math.log(probs[0] / probs[1])
+      end
+    end
+  end
+  extend Measurable::KullbackLeibler
+end

data/lib/measurable/levenshtein.rb ADDED

@@ -0,0 +1,57 @@
+module Measurable
+  module Levenshtein
+    # call-seq:
+    #     levenshtein(u, v) -> Integer
+    #
+    # Give the edit distance between two sequences +u+ and +v+ where each
+    # edit (insertion, deletion, substitution) required to change one into the
+    # other increments the total distance.
+    #
+    # For example:
+    #   levenshtein('kitten', 'sitting') == 3
+    #
+    # Because
+    # 1. kitten -> sitten (substitution "s" for "k")
+    # 2. sitten -> sittin (substitution "i" for "e")
+    # 3. sittin -> sitting (insertion of "g" at the end)
+    #
+    # See: http://en.wikipedia.org/wiki/Levenshtein_distance
+    #
+    # Arguments:
+    # - +u+ -> Array or String.
+    # - +v+ -> Array or String.
+    # Returns:
+    # - Integer value representing the Levenshtein distance between +u+ and +v+.
+    #
+    def levenshtein(u, v)
+      return 0 if u == v
+      return u.size if v.size == 0
+      return v.size if u.size == 0
+      matrix = Array.new(u.size+1) { (0..v.size).to_a }
+      if v.size < u.size
+        u, v = v, u
+      end
+      (1..u.size).each do |i|
+        (1..v.size).each do |j|
+          if u[i] == v[j]
+            matrix[i][j] = matrix[i-1][j-1]
+          else
+            matrix[i][j] = [
+              matrix[i-1][j] + 1,   # deletion
+              matrix[i][j-1] + 1,   # insertion
+              matrix[i-1][j-1] + 1, # substitution
+            ].min
+          end
+        end
+      end
+      matrix[u.size][v.size]
+    end
+  end
+  extend Measurable::Levenshtein
+end

data/lib/measurable/maxmin.rb CHANGED

@@ -1,34 +1,38 @@
 module Measurable
+  module Maxmin
-  # call-seq:
-  #     maxmin(u, v) -> Float
-  #
-  # The "Max-min distance" is used to measure similarity between two vectors.
-  #
-  # When used in k-means clustering, this similarity measure can give better
-  # results in some datasets, as pointed out in the paper "K-means clustering
-  # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
-  #
-  # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
-  #
-  # * *Arguments* :
-  #   - +u+ -> An array of Numeric objects.
-  #   - +v+ -> An array of Numeric objects.
-  # * *Returns* :
-  #   - Similarity between +u+ and +v+.
-  # * *Raises* :
-  #   - +ArgumentError+ -> The sizes of +u+ and +v+ doesn't match.
-  #
-  def maxmin(u, v)
-    # TODO: Change this to a more specific, custom-made exception.
-    raise ArgumentError if u.size != v.size
+    # call-seq:
+    #     maxmin(u, v) -> Float
+    #
+    # The "Max-min distance" is used to measure similarity between two vectors.
+    #
+    # When used in k-means clustering, this similarity measure can give better
+    # results in some datasets, as pointed out in the paper "K-means clustering
+    # using Max-min distance measure" --- Visalakshi, N. K.; Suguna, J.
+    #
+    # See: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=05156398
+    #
+    # Arguments:
+    # - +u+ -> An array of Numeric objects.
+    # - +v+ -> An array of Numeric objects.
+    # Returns:
+    # - Similarity between +u+ and +v+.
+    # Raises:
+    # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
+    def maxmin(u, v)
+      # TODO: Change this to a more specific, custom-made exception.
+      raise ArgumentError if u.size != v.size
-    sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
-      acc[0] += attributes.min
-      acc[1] += attributes.max
-      acc
+      sum_min, sum_max = u.zip(v).reduce([0.0, 0.0]) do |acc, attributes|
+        acc[0] += attributes.min
+        acc[1] += attributes.max
+        acc
+      end
+      sum_min / sum_max
     end
-    sum_min / sum_max
   end
-end
+  extend Measurable::Maxmin
+end

data/lib/measurable/minkowski.rb ADDED

@@ -0,0 +1,44 @@
+module Measurable
+  module Minkowski
+    # call-seq:
+    #     minkowski(u, v) -> Numeric
+    #
+    # Calculate the sum of the absolute value of the differences between each
+    # coordinate of +u+ and +v+.
+    #
+    # Arguments:
+    # - +u+ -> An array of Numeric objects.
+    # - +v+ -> An array of Numeric objects.
+    # Returns:
+    # - The Minkowski distance of order 1 (also known as the L1, Manhattan or
+    #   cityblock distance) between +u+ and +v+.
+    # Raises:
+    # - +ArgumentError+ -> The sizes of +u+ and +v+ don't match.
+    def minkowski(u, v)
+      # TODO: Change this to a more specific, custom-made exception.
+      raise ArgumentError if u.size != v.size
+      u.zip(v).reduce(0) do |acc, elem|
+        acc += (elem[0] - elem[1]).abs
+      end
+    end
+    def self.extended(base) # :nodoc:
+      base.instance_eval do
+        alias :cityblock :minkowski
+        alias :manhattan :minkowski
+      end
+      super
+    end
+    def self.included(base) # :nodoc:
+      base.class_eval do
+        alias :cityblock :minkowski
+        alias :manhattan :minkowski
+      end
+      super
+    end
+  end
+  extend Measurable::Minkowski
+end