spatial_stats 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,24 @@
2
2
 
3
3
  module SpatialStats
4
4
  module Local
5
+ ##
6
+ # MultivariateGeary works like univariate Geary, except that it takes
7
+ # an array of data fields, rather than one data field. It measures the
8
+ # extent to which the average distance in attribute space between
9
+ # a value and its neighbors differs from what it would be under spatial
10
+ # randomness.
11
+ #
12
+ # Functionally, C is computed by averaging the C values for each attribute
13
+ # at a certain location, under a univariate context.
5
14
  class MultivariateGeary < Stat
15
+ ##
16
+ # A new instance of MultivariateGeary
17
+ #
18
+ # @param [ActiveRecord::Relation] scope
19
+ # @param [Array<Symbol, String>] fields to query from scope
20
+ # @param [WeightsMatrix] weights to define relationship between observations in scope
21
+ #
22
+ # @return [MultivariateGeary]
6
23
  def initialize(scope, fields, weights)
7
24
  @scope = scope
8
25
  @fields = fields
@@ -10,14 +27,33 @@ module SpatialStats
10
27
  end
11
28
  attr_accessor :scope, :fields, :weights
12
29
 
13
- def i
30
+ ##
31
+ # Computes the stat for MultivariateGeary.
32
+ #
33
+ # @see https://geodacenter.github.io/workbook/6b_local_adv/lab6b.html#concept-5
34
+ #
35
+ # @return [Array] of C values for each observation.
36
+ def stat
14
37
  m = fields.size
15
38
  gearys = fields.map do |field|
16
- Geary.new(scope, field, weights).i
39
+ Geary.new(scope, field, weights).stat
17
40
  end
18
41
  gearys.transpose.map { |x| x.reduce(:+) / m }
19
42
  end
43
+ alias c stat
20
44
 
45
+ ##
46
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
47
+ # Shuffles all tuples, recomputes +#stat+ for each variation, then compares
48
+ # to the computed one. The ratio of more extreme values to
49
+ # permutations is returned for each observation.
50
+ #
51
+ # @see https://geodacenter.github.io/glossary.html#perm
52
+ #
53
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
54
+ # @param [Integer] seed used in random number generator for shuffles.
55
+ #
56
+ # @return [Array] of p-values
21
57
  def mc(permutations = 99, seed = nil)
22
58
  # in this case, one tuple of vals is held constant, then
23
59
  # the rest are shuffled, so for crand we will pass in an arr
@@ -28,26 +64,26 @@ module SpatialStats
28
64
  indices = (0..(n - 1)).to_a
29
65
  shuffles = crand(indices, permutations, rng)
30
66
 
31
- i_orig = i
32
- rs = [0] * i_orig.size
33
- shuffles.each_with_index do |perms, idx|
34
- ii_orig = i_orig[idx]
35
- perms.each do |perm|
36
- # essentially reimplement i here, but only use i_i
37
- m = fields.size
38
- gearys = fields.each_with_index.map do |field, field_idx|
39
- geary = Geary.new(scope, field, weights)
40
- geary.x = field_data[field_idx].values_at(*perm)
41
- geary.i_i(idx)
42
- end
43
- ii_new = gearys.sum { |x| x / m }
44
-
45
- if ii_orig.positive?
46
- rs[idx] += 1 if ii_new >= ii_orig
47
- else
48
- rs[idx] += 1 if ii_new <= ii_orig
49
- end
50
- end
67
+ stat_orig = stat
68
+ rs = [0] * n
69
+
70
+ ws = neighbor_weights
71
+
72
+ idx = 0
73
+ while idx < n
74
+ stat_i_orig = stat_orig[idx]
75
+ wi = Numo::DFloat.cast(ws[idx])
76
+
77
+ # for each field, compute the C value at that index.
78
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
79
+
80
+ rs[idx] = if stat_i_orig.positive?
81
+ (stat_i_new >= stat_i_orig).count
82
+ else
83
+ (stat_i_new <= stat_i_orig).count
84
+ end
85
+
86
+ idx += 1
51
87
  end
52
88
 
53
89
  rs.map do |ri|
@@ -57,12 +93,31 @@ module SpatialStats
57
93
 
58
94
  private
59
95
 
96
+ def mc_i(wi, perms, idx)
97
+ m = fields.size
98
+ permutations = perms.shape[0]
99
+
100
+ cs = Numo::DFloat.zeros(m, permutations)
101
+ (0..m - 1).each do |mi|
102
+ z = field_data[mi]
103
+ zs = matrix_field_data[mi, true][perms]
104
+ c = (z[idx] - zs)**2
105
+
106
+ cs[mi, true] = (wi * c).sum(1)
107
+ end
108
+ cs.mean(0)
109
+ end
110
+
60
111
  def field_data
61
112
  @field_data ||= fields.map do |field|
62
113
  SpatialStats::Queries::Variables.query_field(@scope, field)
63
114
  .standardize
64
115
  end
65
116
  end
117
+
118
+ def matrix_field_data
119
+ @matrix_field_data ||= Numo::DFloat.cast(field_data)
120
+ end
66
121
  end
67
122
  end
68
123
  end
@@ -2,6 +2,11 @@
2
2
 
3
3
  module SpatialStats
4
4
  module Local
5
+ ##
6
+ # Stat is the abstract base class for local stats.
7
+ # It defines the methods that are common between all classes
8
+ # and will raise a NotImplementedError on those that are specific
9
+ # for each type of statistic.
5
10
  class Stat
6
11
  # Base class for local stats
7
12
  def initialize(scope, field, weights)
@@ -11,12 +16,8 @@ module SpatialStats
11
16
  end
12
17
  attr_accessor :scope, :field, :weights
13
18
 
14
- def i
15
- raise NotImplementedError, 'method i not defined'
16
- end
17
-
18
- def i_i(_idx)
19
- raise NotImplementedError, 'method i_i not defined'
19
+ def stat
20
+ raise NotImplementedError, 'method stat not defined'
20
21
  end
21
22
 
22
23
  def expectation
@@ -27,47 +28,83 @@ module SpatialStats
27
28
  raise NotImplementedError, 'method variance not implemented'
28
29
  end
29
30
 
31
+ ##
32
+ # Z-score for each observation of the statistic.
33
+ #
34
+ # @return [Array] of the number of deviations from the mean
30
35
  def z_score
31
- numerators = i.map { |v| v - expectation }
36
+ numerators = stat.map { |v| v - expectation }
32
37
  denominators = variance.map { |v| Math.sqrt(v) }
33
38
  numerators.each_with_index.map do |numerator, idx|
34
39
  numerator / denominators[idx]
35
40
  end
36
41
  end
37
42
 
43
+ ##
44
+ # Conditional randomization algorithm used in permutation testing.
45
+ # Outputs an array of length n of Numo::DFloat matrices of
46
+ # size m x num_neighbors, where m is the number of permutations and
47
+ # num_neighbors is the number of neighbors for that observation.
48
+ #
49
+ # The values are randomly permuted values from arr that will act
50
+ # as its neighbors for that permutation.
51
+ #
52
+ # This is super important because most weight matrices are very
53
+ # sparse so the amount of shuffling/multiplication that is done
54
+ # is reduced drastically.
55
+ #
56
+ # @see https://github.com/pysal/esda/blob/master/esda/moran.py#L893
57
+ #
58
+ # @return [Array] of Numo::NArray matrices
59
+ #
38
60
  def crand(arr, permutations, rng)
39
- # conditional randomization method
40
- # will generate an n x permutations array of arrays.
41
- # For each n, i will be held the same and the values around it will
42
- # be permutated.
43
- arr.each_with_index.map do |xi, idx|
44
- tmp_arr = arr.dup
45
- tmp_arr.delete_at(idx)
46
- permutations.times.map do
47
- perm = tmp_arr.shuffle(random: rng)
48
- perm.insert(idx, xi)
49
- end
61
+ # basing this off the ESDA method
62
+ # need to get k for max_neighbors
63
+ # and wc for cardinalities of each item
64
+ # this returns an array of length n with
65
+ # (permutations x neighbors) Numo Arrays.
66
+ # This helps reduce computation time because
67
+ # we are only dealing with neighbors for each
68
+ # entry not the entire list of permutations for each entry.
69
+ n_1 = weights.n - 1
70
+
71
+ # weight counts
72
+ wc = [0] * weights.n
73
+ k = 0
74
+ (0..n_1).each do |idx|
75
+ wc[idx] = (w[idx, true] > 0).count
50
76
  end
51
- end
52
77
 
53
- # def crandi(arr, permutations, rng)
54
- # n = @weights.n
55
- # lisas = Numo::DFloat.zeros([n, permutations])
78
+ k = wc.max + 1
79
+ prange = (0..permutations - 1).to_a
80
+
81
+ arr = Numo::DFloat.cast(arr)
56
82
 
57
- # ids = (0..n - 1).to_a
58
- # rids = permutations.times.map do
59
- # ids.shuffle(random: rng)
60
- # end
61
- # p rids
83
+ ids = (0..n_1).to_a
84
+ ids_perm = (0..n_1 - 1).to_a
85
+ rids = Numo::Int32.cast(prange.map { ids_perm.sample(k, random: rng) })
62
86
 
63
- # (0..n - 1).each do |idx|
64
- # idsi = ids.dup
65
- # idsi.delete_at(idx)
66
- # ids.shuffle!(random: rng)
67
- # tmp = arr[idsi[rids[]]]
68
- # end
69
- # end
87
+ (0..n_1).map do |idx|
88
+ idsi = ids.dup
89
+ idsi.delete_at(idx)
90
+ idsi.shuffle!(random: rng)
91
+ idsi = Numo::Int32.cast(idsi)
92
+ arr[idsi[rids[true, 0..wc[idx] - 1]]]
93
+ end
94
+ end
70
95
 
96
+ ##
97
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
98
+ # Shuffles x values, recomputes +#stat+ for each variation, then compares
99
+ # to the computed one. The ratio of more extreme values to
100
+ # permutations is returned for each observation.
101
+ #
102
+ # @see https://geodacenter.github.io/glossary.html#perm
103
+ #
104
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
105
+ # @param [Integer] seed used in random number generator for shuffles.
106
+ #
107
+ # @return [Array] of p-values
71
108
  def mc(permutations = 99, seed = nil)
72
109
  # For local tests, we need to shuffle the values
73
110
  # but for each item, hold its value in place and shuffle
@@ -75,44 +112,27 @@ module SpatialStats
75
112
  # of the entire set. This will be done for each item.
76
113
  rng = gen_rng(seed)
77
114
  shuffles = crand(x, permutations, rng)
78
-
115
+ n = weights.n
79
116
  # r is the number of equal to or more extreme samples
80
- i_orig = i
81
- rs = [0] * i_orig.size
82
-
83
- # For each shuffle, we only need the spatially lagged variable
84
- # at one index, but it needs to be an array of length n.
85
- # Store a zeros array that can be mutated or duplicated and the
86
- # lagged variable at idx will only be set there.
87
- lagged = [0] * i_orig.size
88
-
89
- shuffles.each_with_index do |perms, idx|
90
- ii_orig = i_orig[idx]
91
- wi = w[idx, true] # current weight row
92
- perms.each do |perm|
93
- stat = self.class.new(scope, field, weights)
94
- stat.x = perm
95
-
96
- # avoids computing lag for entire data set
97
- # when we only care about one entry
98
- lagged_var = wi.dot(perm)
99
- z_lag = lagged.dup
100
- z_lag[idx] = lagged_var
101
- stat.z_lag = z_lag
102
-
103
- ii_new = stat.i_i(idx)
104
-
105
- # https://geodacenter.github.io/glossary.html#ppvalue
106
- # NOTE: this is inconsistent with the output from GeoDa
107
- # for local permutation tests, they seem to use greater than
108
- # not greater than or equal to. I'm going to go by the definition
109
- # in the glossary for now.
110
- if ii_orig.positive?
111
- rs[idx] += 1 if ii_new >= ii_orig
112
- else
113
- rs[idx] += 1 if ii_new <= ii_orig
114
- end
115
- end
117
+ stat_orig = stat
118
+ rs = [0] * n
119
+
120
+ ws = neighbor_weights
121
+
122
+ idx = 0
123
+ while idx < n
124
+ stat_i_orig = stat_orig[idx]
125
+
126
+ wi = Numo::DFloat.cast(ws[idx])
127
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
128
+
129
+ rs[idx] = if stat_i_orig.positive?
130
+ (stat_i_new >= stat_i_orig).count
131
+ else
132
+ (stat_i_new <= stat_i_orig).count
133
+ end
134
+
135
+ idx += 1
116
136
  end
117
137
 
118
138
  rs.map do |ri|
@@ -120,27 +140,41 @@ module SpatialStats
120
140
  end
121
141
  end
122
142
 
143
+ ##
144
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
145
+ # Shuffles y values, holds x values, recomputes +#stat+ for each variation,
146
+ # then compares to the computed one. The ratio of more extreme values to
147
+ # permutations is returned for each observation.
148
+ #
149
+ # @see https://geodacenter.github.io/glossary.html#perm
150
+ #
151
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
152
+ # @param [Integer] seed used in random number generator for shuffles.
153
+ #
154
+ # @return [Array] of p-values
123
155
  def mc_bv(permutations, seed)
124
156
  rng = gen_rng(seed)
125
157
  shuffles = crand(y, permutations, rng)
158
+ n = weights.n
126
159
 
127
- # r is the number of equal to or more extreme samples
128
- i_orig = i
129
- rs = [0] * i_orig.size
130
- shuffles.each_with_index do |perms, idx|
131
- ii_orig = i_orig[idx]
132
- perms.each do |perm|
133
- stat = self.class.new(@scope, @x_field, @y_field, @weights)
134
- stat.x = x
135
- stat.y = perm
136
- ii_new = stat.i_i(idx)
137
-
138
- if ii_orig.positive?
139
- rs[idx] += 1 if ii_new >= ii_orig
140
- else
141
- rs[idx] += 1 if ii_new <= ii_orig
142
- end
143
- end
160
+ stat_orig = stat
161
+ rs = [0] * n
162
+
163
+ ws = neighbor_weights
164
+
165
+ idx = 0
166
+ while idx < n
167
+ stat_i_orig = stat_orig[idx]
168
+ wi = Numo::DFloat.cast(ws[idx])
169
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
170
+
171
+ rs[idx] = if stat_i_orig.positive?
172
+ (stat_i_new >= stat_i_orig).count
173
+ else
174
+ (stat_i_new <= stat_i_orig).count
175
+ end
176
+
177
+ idx += 1
144
178
  end
145
179
 
146
180
  rs.map do |ri|
@@ -148,6 +182,22 @@ module SpatialStats
148
182
  end
149
183
  end
150
184
 
185
+ ##
186
+ # Determines what quadrant an observation is in, based on its value
187
+ # compared to its neighbors. This does not work for all stats, since
188
+ # it requires that values can be negative.
189
+ #
190
+ # In a standardized array of z, high values are values greater than 0
191
+ # and its neighbors are determined by the spatial lag and if that is
192
+ # positive then its neighbors would be high, low otherwise.
193
+ #
194
+ # Quadrants are:
195
+ # [HH] a high value surrounded by other high values
196
+ # [LH] a low value surrounded by high values
197
+ # [LL] a low value surrounded by low values
198
+ # [HL] a high value surrounded by low values
199
+ #
200
+ # @return [Array] of labels
151
201
  def quads
152
202
  # https://github.com/pysal/esda/blob/master/esda/moran.py#L925
153
203
  w = @weights.full
@@ -173,6 +223,14 @@ module SpatialStats
173
223
 
174
224
  private
175
225
 
226
# Abstract hook for the statistic at a single observation; concrete
# statistics are expected to override it.
# NOTE(review): the subclass signature is not visible here (presumably it
# takes an observation index) - confirm. Any arguments are accepted and
# ignored so the caller always reaches the NotImplementedError below
# instead of failing earlier with an ArgumentError.
#
# @raise [NotImplementedError] always
def stat_i(*_args)
  raise NotImplementedError, 'method stat_i not defined'
end
229
+
230
# Abstract hook that computes the permuted statistic for one observation;
# concrete statistics must override it.
#
# +mc+ and +mc_bv+ invoke this hook with three arguments (the neighbor
# weights, the permuted values and the observation index). Any arguments
# are accepted and ignored here so that a subclass which forgets to
# override the hook fails with NotImplementedError rather than an
# ArgumentError about arity.
#
# @raise [NotImplementedError] always
def mc_i(*_args)
  raise NotImplementedError, 'method mc_i not defined'
end
233
+
176
234
  def w
177
235
  weights.standardized
178
236
  end
@@ -184,6 +242,20 @@ module SpatialStats
184
242
  Random.new
185
243
  end
186
244
  end
245
+
246
# Records the non-zero weights of every row of the standardized weights
# table (+w+) in variable-length arrays, keeping their column order.
#
# @return [Array<Array>] one array of non-zero weights per row
def neighbor_weights
  # Array.new with a block creates an independent array for each row,
  # unlike a `[[]] * n` placeholder, which repeats one shared array.
  Array.new(weights.n) do |idx|
    neighbors = []
    w[idx, true].each { |wij| neighbors << wij if wij != 0 }
    neighbors
  end
end
187
259
  end
188
260
  end
189
261
  end