spatial_stats 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,24 @@
2
2
 
3
3
  module SpatialStats
4
4
  module Local
5
+ ##
6
+ # MultivariateGeary works like univariate Geary, except that it takes
7
+ # an array of data fields, rather than one data field. It measures the
8
+ # extent to which the average distance in attribute space between
9
+ # values and their neighbors differs from what it would be under spatial
10
+ # randomness.
11
+ #
12
+ # Functionally, C is computed by averaging the C values for each attribute
13
+ # at a certain location, under a univariate context.
5
14
  class MultivariateGeary < Stat
15
+ ##
16
+ # A new instance of MultivariateGeary
17
+ #
18
+ # @param [ActiveRecord::Relation] scope
19
+ # @param [Array<Symbol, String>] fields to query from scope
20
+ # @param [WeightsMatrix] weights to define relationship between observations in scope
21
+ #
22
+ # @return [MultivariateGeary]
6
23
  def initialize(scope, fields, weights)
7
24
  @scope = scope
8
25
  @fields = fields
@@ -10,14 +27,33 @@ module SpatialStats
10
27
  end
11
28
  attr_accessor :scope, :fields, :weights
12
29
 
13
- def i
30
+ ##
31
+ # Computes the stat for MultivariateGeary.
32
+ #
33
+ # @see https://geodacenter.github.io/workbook/6b_local_adv/lab6b.html#concept-5
34
+ #
35
+ # @return [Array] of C values for each observation.
36
+ def stat
14
37
  m = fields.size
15
38
  gearys = fields.map do |field|
16
- Geary.new(scope, field, weights).i
39
+ Geary.new(scope, field, weights).stat
17
40
  end
18
41
  gearys.transpose.map { |x| x.reduce(:+) / m }
19
42
  end
43
+ alias c stat
20
44
 
45
+ ##
46
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
47
+ # Shuffles all tuples, recomputes +#stat+ for each variation, then compares
48
+ # to the computed one. The ratio of more extreme values to
49
+ # permutations is returned for each observation.
50
+ #
51
+ # @see https://geodacenter.github.io/glossary.html#perm
52
+ #
53
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
54
+ # @param [Integer] seed used in random number generator for shuffles.
55
+ #
56
+ # @return [Array] of p-values
21
57
  def mc(permutations = 99, seed = nil)
22
58
  # in this case, one tuple of vals is held constant, then
23
59
  # the rest are shuffled, so for crand we will pass in an arr
@@ -28,26 +64,26 @@ module SpatialStats
28
64
  indices = (0..(n - 1)).to_a
29
65
  shuffles = crand(indices, permutations, rng)
30
66
 
31
- i_orig = i
32
- rs = [0] * i_orig.size
33
- shuffles.each_with_index do |perms, idx|
34
- ii_orig = i_orig[idx]
35
- perms.each do |perm|
36
- # essentially reimplement i here, but only use i_i
37
- m = fields.size
38
- gearys = fields.each_with_index.map do |field, field_idx|
39
- geary = Geary.new(scope, field, weights)
40
- geary.x = field_data[field_idx].values_at(*perm)
41
- geary.i_i(idx)
42
- end
43
- ii_new = gearys.sum { |x| x / m }
44
-
45
- if ii_orig.positive?
46
- rs[idx] += 1 if ii_new >= ii_orig
47
- else
48
- rs[idx] += 1 if ii_new <= ii_orig
49
- end
50
- end
67
+ stat_orig = stat
68
+ rs = [0] * n
69
+
70
+ ws = neighbor_weights
71
+
72
+ idx = 0
73
+ while idx < n
74
+ stat_i_orig = stat_orig[idx]
75
+ wi = Numo::DFloat.cast(ws[idx])
76
+
77
+ # for each field, compute the C value at that index.
78
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
79
+
80
+ rs[idx] = if stat_i_orig.positive?
81
+ (stat_i_new >= stat_i_orig).count
82
+ else
83
+ (stat_i_new <= stat_i_orig).count
84
+ end
85
+
86
+ idx += 1
51
87
  end
52
88
 
53
89
  rs.map do |ri|
@@ -57,12 +93,31 @@ module SpatialStats
57
93
 
58
94
  private
59
95
 
96
+ def mc_i(wi, perms, idx)
97
+ m = fields.size
98
+ permutations = perms.shape[0]
99
+
100
+ cs = Numo::DFloat.zeros(m, permutations)
101
+ (0..m - 1).each do |mi|
102
+ z = field_data[mi]
103
+ zs = matrix_field_data[mi, true][perms]
104
+ c = (z[idx] - zs)**2
105
+
106
+ cs[mi, true] = (wi * c).sum(1)
107
+ end
108
+ cs.mean(0)
109
+ end
110
+
60
111
  def field_data
61
112
  @field_data ||= fields.map do |field|
62
113
  SpatialStats::Queries::Variables.query_field(@scope, field)
63
114
  .standardize
64
115
  end
65
116
  end
117
+
118
+ def matrix_field_data
119
+ @matrix_field_data ||= Numo::DFloat.cast(field_data)
120
+ end
66
121
  end
67
122
  end
68
123
  end
@@ -2,6 +2,11 @@
2
2
 
3
3
  module SpatialStats
4
4
  module Local
5
+ ##
6
+ # Stat is the abstract base class for local stats.
7
+ # It defines the methods that are common between all classes
8
+ # and will raise a NotImplementedError on those that are specific
9
+ # for each type of statistic.
5
10
  class Stat
6
11
  # Base class for local stats
7
12
  def initialize(scope, field, weights)
@@ -11,12 +16,8 @@ module SpatialStats
11
16
  end
12
17
  attr_accessor :scope, :field, :weights
13
18
 
14
- def i
15
- raise NotImplementedError, 'method i not defined'
16
- end
17
-
18
- def i_i(_idx)
19
- raise NotImplementedError, 'method i_i not defined'
19
+ def stat
20
+ raise NotImplementedError, 'method stat not defined'
20
21
  end
21
22
 
22
23
  def expectation
@@ -27,47 +28,83 @@ module SpatialStats
27
28
  raise NotImplementedError, 'method variance not implemented'
28
29
  end
29
30
 
31
+ ##
32
+ # Z-score for each observation of the statistic.
33
+ #
34
+ # @return [Array] of the number of standard deviations from the mean
30
35
  def z_score
31
- numerators = i.map { |v| v - expectation }
36
+ numerators = stat.map { |v| v - expectation }
32
37
  denominators = variance.map { |v| Math.sqrt(v) }
33
38
  numerators.each_with_index.map do |numerator, idx|
34
39
  numerator / denominators[idx]
35
40
  end
36
41
  end
37
42
 
43
+ ##
44
+ # Conditional randomization algorithm used in permutation testing.
45
+ # Outputs an array of length n of Numo::DFloat matrices of
46
+ # size m x num_neighbors, where m is the number of permutations and
47
+ # num_neighbors is the number of neighbors for that observation.
48
+ #
49
+ # The values are randomly permuted values from arr that will act
50
+ # as its neighbors for that permutation.
51
+ #
52
+ # This is super important because most weight matrices are very
53
+ # sparse so the amount of shuffling/multiplication that is done
54
+ # is reduced drastically.
55
+ #
56
+ # @see https://github.com/pysal/esda/blob/master/esda/moran.py#L893
57
+ #
58
+ # @return [Array] of Numo::NArray matrices
59
+ #
38
60
  def crand(arr, permutations, rng)
39
- # conditional randomization method
40
- # will generate an n x permutations array of arrays.
41
- # For each n, i will be held the same and the values around it will
42
- # be permutated.
43
- arr.each_with_index.map do |xi, idx|
44
- tmp_arr = arr.dup
45
- tmp_arr.delete_at(idx)
46
- permutations.times.map do
47
- perm = tmp_arr.shuffle(random: rng)
48
- perm.insert(idx, xi)
49
- end
61
+ # basing this off the ESDA method
62
+ # need to get k for max_neighbors
63
+ # and wc for cardinalities of each item
64
+ # this returns an array of length n with
65
+ # (permutations x neighbors) Numo Arrays.
66
+ # This helps reduce computation time because
67
+ # we are only dealing with neighbors for each
68
+ # entry not the entire list of permutations for each entry.
69
+ n_1 = weights.n - 1
70
+
71
+ # weight counts
72
+ wc = [0] * weights.n
73
+ k = 0
74
+ (0..n_1).each do |idx|
75
+ wc[idx] = (w[idx, true] > 0).count
50
76
  end
51
- end
52
77
 
53
- # def crandi(arr, permutations, rng)
54
- # n = @weights.n
55
- # lisas = Numo::DFloat.zeros([n, permutations])
78
+ k = wc.max + 1
79
+ prange = (0..permutations - 1).to_a
80
+
81
+ arr = Numo::DFloat.cast(arr)
56
82
 
57
- # ids = (0..n - 1).to_a
58
- # rids = permutations.times.map do
59
- # ids.shuffle(random: rng)
60
- # end
61
- # p rids
83
+ ids = (0..n_1).to_a
84
+ ids_perm = (0..n_1 - 1).to_a
85
+ rids = Numo::Int32.cast(prange.map { ids_perm.sample(k, random: rng) })
62
86
 
63
- # (0..n - 1).each do |idx|
64
- # idsi = ids.dup
65
- # idsi.delete_at(idx)
66
- # ids.shuffle!(random: rng)
67
- # tmp = arr[idsi[rids[]]]
68
- # end
69
- # end
87
+ (0..n_1).map do |idx|
88
+ idsi = ids.dup
89
+ idsi.delete_at(idx)
90
+ idsi.shuffle!(random: rng)
91
+ idsi = Numo::Int32.cast(idsi)
92
+ arr[idsi[rids[true, 0..wc[idx] - 1]]]
93
+ end
94
+ end
70
95
 
96
+ ##
97
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
98
+ # Shuffles x values, recomputes +#stat+ for each variation, then compares
99
+ # to the computed one. The ratio of more extreme values to
100
+ # permutations is returned for each observation.
101
+ #
102
+ # @see https://geodacenter.github.io/glossary.html#perm
103
+ #
104
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
105
+ # @param [Integer] seed used in random number generator for shuffles.
106
+ #
107
+ # @return [Array] of p-values
71
108
  def mc(permutations = 99, seed = nil)
72
109
  # For local tests, we need to shuffle the values
73
110
  # but for each item, hold its value in place and shuffle
@@ -75,44 +112,27 @@ module SpatialStats
75
112
  # of the entire set. This will be done for each item.
76
113
  rng = gen_rng(seed)
77
114
  shuffles = crand(x, permutations, rng)
78
-
115
+ n = weights.n
79
116
  # r is the number of equal to or more extreme samples
80
- i_orig = i
81
- rs = [0] * i_orig.size
82
-
83
- # For each shuffle, we only need the spatially lagged variable
84
- # at one index, but it needs to be an array of length n.
85
- # Store a zeros array that can be mutated or duplicated and the
86
- # lagged variable at idx will only be set there.
87
- lagged = [0] * i_orig.size
88
-
89
- shuffles.each_with_index do |perms, idx|
90
- ii_orig = i_orig[idx]
91
- wi = w[idx, true] # current weight row
92
- perms.each do |perm|
93
- stat = self.class.new(scope, field, weights)
94
- stat.x = perm
95
-
96
- # avoids computing lag for entire data set
97
- # when we only care about one entry
98
- lagged_var = wi.dot(perm)
99
- z_lag = lagged.dup
100
- z_lag[idx] = lagged_var
101
- stat.z_lag = z_lag
102
-
103
- ii_new = stat.i_i(idx)
104
-
105
- # https://geodacenter.github.io/glossary.html#ppvalue
106
- # NOTE: this is inconsistent with the output from GeoDa
107
- # for local permutation tests, they seem to use greater than
108
- # not greater than or equal to. I'm going to go by the definition
109
- # in the glossary for now.
110
- if ii_orig.positive?
111
- rs[idx] += 1 if ii_new >= ii_orig
112
- else
113
- rs[idx] += 1 if ii_new <= ii_orig
114
- end
115
- end
117
+ stat_orig = stat
118
+ rs = [0] * n
119
+
120
+ ws = neighbor_weights
121
+
122
+ idx = 0
123
+ while idx < n
124
+ stat_i_orig = stat_orig[idx]
125
+
126
+ wi = Numo::DFloat.cast(ws[idx])
127
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
128
+
129
+ rs[idx] = if stat_i_orig.positive?
130
+ (stat_i_new >= stat_i_orig).count
131
+ else
132
+ (stat_i_new <= stat_i_orig).count
133
+ end
134
+
135
+ idx += 1
116
136
  end
117
137
 
118
138
  rs.map do |ri|
@@ -120,27 +140,41 @@ module SpatialStats
120
140
  end
121
141
  end
122
142
 
143
+ ##
144
+ # Permutation test to determine pseudo p-values of the +#stat+ method.
145
+ # Shuffles y values, holds x values, recomputes +#stat+ for each variation,
146
+ # then compares to the computed one. The ratio of more extreme values to
147
+ # permutations is returned for each observation.
148
+ #
149
+ # @see https://geodacenter.github.io/glossary.html#perm
150
+ #
151
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
152
+ # @param [Integer] seed used in random number generator for shuffles.
153
+ #
154
+ # @return [Array] of p-values
123
155
  def mc_bv(permutations, seed)
124
156
  rng = gen_rng(seed)
125
157
  shuffles = crand(y, permutations, rng)
158
+ n = weights.n
126
159
 
127
- # r is the number of equal to or more extreme samples
128
- i_orig = i
129
- rs = [0] * i_orig.size
130
- shuffles.each_with_index do |perms, idx|
131
- ii_orig = i_orig[idx]
132
- perms.each do |perm|
133
- stat = self.class.new(@scope, @x_field, @y_field, @weights)
134
- stat.x = x
135
- stat.y = perm
136
- ii_new = stat.i_i(idx)
137
-
138
- if ii_orig.positive?
139
- rs[idx] += 1 if ii_new >= ii_orig
140
- else
141
- rs[idx] += 1 if ii_new <= ii_orig
142
- end
143
- end
160
+ stat_orig = stat
161
+ rs = [0] * n
162
+
163
+ ws = neighbor_weights
164
+
165
+ idx = 0
166
+ while idx < n
167
+ stat_i_orig = stat_orig[idx]
168
+ wi = Numo::DFloat.cast(ws[idx])
169
+ stat_i_new = mc_i(wi, shuffles[idx], idx)
170
+
171
+ rs[idx] = if stat_i_orig.positive?
172
+ (stat_i_new >= stat_i_orig).count
173
+ else
174
+ (stat_i_new <= stat_i_orig).count
175
+ end
176
+
177
+ idx += 1
144
178
  end
145
179
 
146
180
  rs.map do |ri|
@@ -148,6 +182,22 @@ module SpatialStats
148
182
  end
149
183
  end
150
184
 
185
+ ##
186
+ # Determines what quadrant an observation is in. Based on its value
187
+ # compared to its neighbors. This does not work for all stats, since
188
+ # it requires that values be negative.
189
+ #
190
+ # In a standardized array of z, high values are values greater than 0
191
+ # and its neighbors are determined by the spatial lag and if that is
192
+ # positive then its neighbors would be high, low otherwise.
193
+ #
194
+ # Quadrants are:
195
+ # [HH] a high value surrounded by other high values
196
+ # [LH] a low value surrounded by high values
197
+ # [LL] a low value surrounded by low values
198
+ # [HL] a high value surrounded by low values
199
+ #
200
+ # @return [Array] of labels
151
201
  def quads
152
202
  # https://github.com/pysal/esda/blob/master/esda/moran.py#L925
153
203
  w = @weights.full
@@ -173,6 +223,14 @@ module SpatialStats
173
223
 
174
224
  private
175
225
 
226
+ def stat_i
227
+ raise NotImplementedError, 'method stat_i not defined'
228
+ end
229
+
230
+ def mc_i
231
+ raise NotImplementedError, 'method mc_i not defined'
232
+ end
233
+
176
234
  def w
177
235
  weights.standardized
178
236
  end
@@ -184,6 +242,20 @@ module SpatialStats
184
242
  Random.new
185
243
  end
186
244
  end
245
+
246
+ def neighbor_weights
247
+ # record the non-zero weights in variable length arrays for each
248
+ # row in the weights table
249
+ ws = [[]] * weights.n
250
+ (0..weights.n - 1).each do |idx|
251
+ neighbors = []
252
+ w[idx, true].each do |wij|
253
+ neighbors << wij if wij != 0
254
+ end
255
+ ws[idx] = neighbors
256
+ end
257
+ ws
258
+ end
187
259
  end
188
260
  end
189
261
  end