RubyGems - spatial_stats - Versions diffs - 0.1.1 → 0.2.1 - Mend

spatial_stats 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/README.md +185 -9
data/lib/spatial_stats.rb +7 -4
data/lib/spatial_stats/enumerable_ext.rb +29 -0
data/lib/spatial_stats/global.rb +15 -0
data/lib/spatial_stats/global/bivariate_moran.rb +48 -4
data/lib/spatial_stats/global/moran.rb +69 -19
data/lib/spatial_stats/global/stat.rb +29 -17
data/lib/spatial_stats/local.rb +16 -1
data/lib/spatial_stats/local/bivariate_moran.rb +45 -4
data/lib/spatial_stats/local/geary.rb +34 -47
data/lib/spatial_stats/local/getis_ord.rb +109 -0
data/lib/spatial_stats/local/moran.rb +55 -22
data/lib/spatial_stats/local/multivariate_geary.rb +77 -22
data/lib/spatial_stats/local/stat.rb +160 -88
data/lib/spatial_stats/narray_ext.rb +27 -0
data/lib/spatial_stats/queries.rb +6 -0
data/lib/spatial_stats/queries/variables.rb +16 -3
data/lib/spatial_stats/queries/weights.rb +91 -9
data/lib/spatial_stats/utils.rb +7 -0
data/lib/spatial_stats/utils/lag.rb +34 -2
data/lib/spatial_stats/version.rb +1 -1
data/lib/spatial_stats/weights.rb +9 -0
data/lib/spatial_stats/weights/contiguous.rb +18 -0
data/lib/spatial_stats/weights/distant.rb +41 -4
data/lib/spatial_stats/weights/weights_matrix.rb +25 -0
metadata +5 -4
data/lib/spatial_stats/local/g.rb +0 -75

data/lib/spatial_stats/local/multivariate_geary.rb CHANGED

@@ -2,7 +2,24 @@
 module SpatialStats
   module Local
+    ##
+    # MultivariateGeary works like univariate Geary, except that it takes
+    # an array of data fields, rather than one data field. It measures how
+    # the average distance in attribute space between a value and its
+    # neighbors compares to what it would be under spatial
+    # randomness.
+    #
+    # Functionally, C is computed by averaging the C values for each attribute
+    # at a certain location, under a univariate context.
     class MultivariateGeary < Stat
+      ##
+      # A new instance of MultivariateGeary
+      #
+      # @param [ActiveRecord::Relation] scope
+      # @param [Array<Symbol, String>] fields to query from scope
+      # @param [WeightsMatrix] weights to define relationship between observations in scope
+      #
+      # @return [MultivariateGeary]
       def initialize(scope, fields, weights)
         @scope = scope
         @fields = fields
@@ -10,14 +27,33 @@ module SpatialStats
       end
       attr_accessor :scope, :fields, :weights
-      def i
+      ##
+      # Computes the stat for MultivariateGeary.
+      #
+      # @see https://geodacenter.github.io/workbook/6b_local_adv/lab6b.html#concept-5
+      #
+      # @return [Array] of C values for each observation.
+      def stat
         m = fields.size
         gearys = fields.map do |field|
-          Geary.new(scope, field, weights).i
+          Geary.new(scope, field, weights).stat
         end
         gearys.transpose.map { |x| x.reduce(:+) / m }
       end
+      alias c stat
+      ##
+      # Permutation test to determine pseudo p-values of the +#stat+ method.
+      # Shuffles all tuples, recomputes +#stat+ for each variation, then compares
+      # to the computed one. The ratio of more extreme values to
+      # permutations is returned for each observation.
+      #
+      # @see https://geodacenter.github.io/glossary.html#perm
+      #
+      # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
+      # @param [Integer] seed used in random number generator for shuffles.
+      #
+      # @return [Array] of p-values
       def mc(permutations = 99, seed = nil)
         # in this case, one tuple of vals is held constant, then
         # the rest are shuffled, so for crand we will pass in an arr
@@ -28,26 +64,26 @@ module SpatialStats
         indices = (0..(n - 1)).to_a
         shuffles = crand(indices, permutations, rng)
-        i_orig = i
-        rs = [0] * i_orig.size
-        shuffles.each_with_index do |perms, idx|
-          ii_orig = i_orig[idx]
-          perms.each do |perm|
-            # essentially reimplement i here, but only use i_i
-            m = fields.size
-            gearys = fields.each_with_index.map do |field, field_idx|
-              geary = Geary.new(scope, field, weights)
-              geary.x = field_data[field_idx].values_at(*perm)
-              geary.i_i(idx)
-            end
-            ii_new = gearys.sum { |x| x / m }
-            if ii_orig.positive?
-              rs[idx] += 1 if ii_new >= ii_orig
-            else
-              rs[idx] += 1 if ii_new <= ii_orig
-            end
-          end
+        stat_orig = stat
+        rs = [0] * n
+        ws = neighbor_weights
+        idx = 0
+        while idx < n
+          stat_i_orig = stat_orig[idx]
+          wi = Numo::DFloat.cast(ws[idx])
+          # for each field, compute the C value at that index.
+          stat_i_new = mc_i(wi, shuffles[idx], idx)
+          rs[idx] = if stat_i_orig.positive?
+                      (stat_i_new >= stat_i_orig).count
+                    else
+                      (stat_i_new <= stat_i_orig).count
+                    end
+          idx += 1
         end
         rs.map do |ri|
@@ -57,12 +93,31 @@ module SpatialStats
       private
+      def mc_i(wi, perms, idx)
+        m = fields.size
+        permutations = perms.shape[0]
+        cs = Numo::DFloat.zeros(m, permutations)
+        (0..m - 1).each do |mi|
+          z = field_data[mi]
+          zs = matrix_field_data[mi, true][perms]
+          c = (z[idx] - zs)**2
+          cs[mi, true] = (wi * c).sum(1)
+        end
+        cs.mean(0)
+      end
       def field_data
         @field_data ||= fields.map do |field|
           SpatialStats::Queries::Variables.query_field(@scope, field)
                                           .standardize
         end
       end
+      def matrix_field_data
+        @matrix_field_data ||= Numo::DFloat.cast(field_data)
+      end
     end
   end
 end

data/lib/spatial_stats/local/stat.rb CHANGED

@@ -2,6 +2,11 @@
 module SpatialStats
   module Local
+    ##
+    # Stat is the abstract base class for local stats.
+    # It defines the methods that are common between all classes
+    # and will raise a NotImplementedError on those that are specific
+    # for each type of statistic.
     class Stat
       # Base class for local stats
       def initialize(scope, field, weights)
@@ -11,12 +16,8 @@ module SpatialStats
       end
       attr_accessor :scope, :field, :weights
-      def i
-        raise NotImplementedError, 'method i not defined'
-      end
-      def i_i(_idx)
-        raise NotImplementedError, 'method i_i not defined'
+      def stat
+        raise NotImplementedError, 'method stat not defined'
       end
       def expectation
@@ -27,47 +28,83 @@ module SpatialStats
         raise NotImplementedError, 'method variance not implemented'
       end
+      ##
+      # Z-score for each observation of the statistic.
+      #
+      # @return [Array] of the number of deviations from the mean
       def z_score
-        numerators = i.map { |v| v - expectation }
+        numerators = stat.map { |v| v - expectation }
         denominators = variance.map { |v| Math.sqrt(v) }
         numerators.each_with_index.map do |numerator, idx|
           numerator / denominators[idx]
         end
       end
+      ##
+      # Conditional randomization algorithm used in permutation testing.
+      # Outputs an array of length n of Numo::DFloat matrices of
+      # size m x num_neighbors, where m is the number of permutations and
+      # num_neighbors is the number of neighbors for that observation.
+      #
+      # The values are randomly permuted values from arr that will act
+      # as its neighbors for that permutation.
+      #
+      # This is super important because most weight matrices are very
+      # sparse so the amount of shuffling/multiplication that is done
+      # is reduced drastically.
+      #
+      # @see https://github.com/pysal/esda/blob/master/esda/moran.py#L893
+      #
+      # @return [Array] of Numo::NArray matrices
+      #
       def crand(arr, permutations, rng)
-        # conditional randomization method
-        # will generate an n x permutations array of arrays.
-        # For each n, i will be held the same and the values around it will
-        # be permutated.
-        arr.each_with_index.map do |xi, idx|
-          tmp_arr = arr.dup
-          tmp_arr.delete_at(idx)
-          permutations.times.map do
-            perm = tmp_arr.shuffle(random: rng)
-            perm.insert(idx, xi)
-          end
+        # basing this off the ESDA method
+        # need to get k for max_neighbors
+        # and wc for cardinalities of each item
+        # this returns an array of length n with
+        # (permutations x neighbors) Numo Arrays.
+        # This helps reduce computation time because
+        # we are only dealing with neighbors for each
+        # entry not the entire list of permutations for each entry.
+        n_1 = weights.n - 1
+        # weight counts
+        wc = [0] * weights.n
+        k = 0
+        (0..n_1).each do |idx|
+          wc[idx] = (w[idx, true] > 0).count
         end
-      end
-      # def crandi(arr, permutations, rng)
-      #   n = @weights.n
-      #   lisas = Numo::DFloat.zeros([n, permutations])
+        k = wc.max + 1
+        prange = (0..permutations - 1).to_a
+        arr = Numo::DFloat.cast(arr)
-      #   ids = (0..n - 1).to_a
-      #   rids = permutations.times.map do
-      #     ids.shuffle(random: rng)
-      #   end
-      #   p rids
+        ids = (0..n_1).to_a
+        ids_perm = (0..n_1 - 1).to_a
+        rids = Numo::Int32.cast(prange.map { ids_perm.sample(k, random: rng) })
-      #   (0..n - 1).each do |idx|
-      #     idsi = ids.dup
-      #     idsi.delete_at(idx)
-      #     ids.shuffle!(random: rng)
-      #     tmp = arr[idsi[rids[]]]
-      #   end
-      # end
+        (0..n_1).map do |idx|
+          idsi = ids.dup
+          idsi.delete_at(idx)
+          idsi.shuffle!(random: rng)
+          idsi = Numo::Int32.cast(idsi)
+          arr[idsi[rids[true, 0..wc[idx] - 1]]]
+        end
+      end
+      ##
+      # Permutation test to determine pseudo p-values of the +#stat+ method.
+      # Shuffles x values, recomputes +#stat+ for each variation, then compares
+      # to the computed one. The ratio of more extreme values to
+      # permutations is returned for each observation.
+      #
+      # @see https://geodacenter.github.io/glossary.html#perm
+      #
+      # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
+      # @param [Integer] seed used in random number generator for shuffles.
+      #
+      # @return [Array] of p-values
       def mc(permutations = 99, seed = nil)
         # For local tests, we need to shuffle the values
         # but for each item, hold its value in place and shuffle
@@ -75,44 +112,27 @@ module SpatialStats
         # of the entire set. This will be done for each item.
         rng = gen_rng(seed)
         shuffles = crand(x, permutations, rng)
+        n = weights.n
         # r is the number of equal to or more extreme samples
-        i_orig = i
-        rs = [0] * i_orig.size
-        # For each shuffle, we only need the spatially lagged variable
-        # at one index, but it needs to be an array of length n.
-        # Store a zeros array that can be mutated or duplicated and the
-        # lagged variable at idx will only be set there.
-        lagged = [0] * i_orig.size
-        shuffles.each_with_index do |perms, idx|
-          ii_orig = i_orig[idx]
-          wi = w[idx, true] # current weight row
-          perms.each do |perm|
-            stat = self.class.new(scope, field, weights)
-            stat.x = perm
-            # avoids computing lag for entire data set
-            # when we only care about one entry
-            lagged_var = wi.dot(perm)
-            z_lag = lagged.dup
-            z_lag[idx] = lagged_var
-            stat.z_lag = z_lag
-            ii_new = stat.i_i(idx)
-            # https://geodacenter.github.io/glossary.html#ppvalue
-            # NOTE: this is inconsistent with the output from GeoDa
-            # for local permutation tests, they seem to use greater than
-            # not greater than or equal to. I'm going to go by the definition
-            # in the glossary for now.
-            if ii_orig.positive?
-              rs[idx] += 1 if ii_new >= ii_orig
-            else
-              rs[idx] += 1 if ii_new <= ii_orig
-            end
-          end
+        stat_orig = stat
+        rs = [0] * n
+        ws = neighbor_weights
+        idx = 0
+        while idx < n
+          stat_i_orig = stat_orig[idx]
+          wi = Numo::DFloat.cast(ws[idx])
+          stat_i_new = mc_i(wi, shuffles[idx], idx)
+          rs[idx] = if stat_i_orig.positive?
+                      (stat_i_new >= stat_i_orig).count
+                    else
+                      (stat_i_new <= stat_i_orig).count
+                    end
+          idx += 1
         end
         rs.map do |ri|
@@ -120,27 +140,41 @@ module SpatialStats
         end
       end
+      ##
+      # Permutation test to determine pseudo p-values of the +#stat+ method.
+      # Shuffles y values, holds x values, recomputes +#stat+ for each variation,
+      # then compares to the computed one. The ratio of more extreme values to
+      # permutations is returned for each observation.
+      #
+      # @see https://geodacenter.github.io/glossary.html#perm
+      #
+      # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
+      # @param [Integer] seed used in random number generator for shuffles.
+      #
+      # @return [Array] of p-values
       def mc_bv(permutations, seed)
         rng = gen_rng(seed)
         shuffles = crand(y, permutations, rng)
+        n = weights.n
-        # r is the number of equal to or more extreme samples
-        i_orig = i
-        rs = [0] * i_orig.size
-        shuffles.each_with_index do |perms, idx|
-          ii_orig = i_orig[idx]
-          perms.each do |perm|
-            stat = self.class.new(@scope, @x_field, @y_field, @weights)
-            stat.x = x
-            stat.y = perm
-            ii_new = stat.i_i(idx)
-            if ii_orig.positive?
-              rs[idx] += 1 if ii_new >= ii_orig
-            else
-              rs[idx] += 1 if ii_new <= ii_orig
-            end
-          end
+        stat_orig = stat
+        rs = [0] * n
+        ws = neighbor_weights
+        idx = 0
+        while idx < n
+          stat_i_orig = stat_orig[idx]
+          wi = Numo::DFloat.cast(ws[idx])
+          stat_i_new = mc_i(wi, shuffles[idx], idx)
+          rs[idx] = if stat_i_orig.positive?
+                      (stat_i_new >= stat_i_orig).count
+                    else
+                      (stat_i_new <= stat_i_orig).count
+                    end
+          idx += 1
         end
         rs.map do |ri|
@@ -148,6 +182,22 @@ module SpatialStats
         end
       end
+      ##
+      # Determines what quadrant an observation is in, based on its value
+      # compared to its neighbors. This does not work for all stats, since
+      # it requires that values can be negative.
+      #
+      # In a standardized array of z, high values are values greater than 0.
+      # An observation's neighbors are classified by its spatial lag: if the
+      # lag is positive the neighbors are high, otherwise they are low.
+      #
+      # Quadrants are:
+      # [HH] a high value surrounded by other high values
+      # [LH] a low value surrounded by high values
+      # [LL] a low value surrounded by low values
+      # [HL] a high value surrounded by low values
+      #
+      # @return [Array] of labels
       def quads
         # https://github.com/pysal/esda/blob/master/esda/moran.py#L925
         w = @weights.full
@@ -173,6 +223,14 @@ module SpatialStats
       private
+      def stat_i
+        raise NotImplementedError, 'method stat_i not defined'
+      end
+      def mc_i
+        raise NotImplementedError, 'method mc_i not defined'
+      end
       def w
         weights.standardized
       end
@@ -184,6 +242,20 @@ module SpatialStats
           Random.new
         end
       end
+      def neighbor_weights
+        # record the non-zero weights in variable length arrays for each
+        # row in the weights table
+        ws = [[]] * weights.n
+        (0..weights.n - 1).each do |idx|
+          neighbors = []
+          w[idx, true].each do |wij|
+            neighbors << wij if wij != 0
+          end
+          ws[idx] = neighbors
+        end
+        ws
+      end
     end
   end
 end