spatial_stats 0.2.2 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,10 +19,28 @@ module SpatialStats
19
19
  @scope = scope
20
20
  @x_field = x_field
21
21
  @y_field = y_field
22
- @weights = weights
22
+ @weights = weights.standardize
23
23
  end
24
24
  attr_accessor :scope, :x_field, :y_field, :weights
25
25
 
26
+ ##
27
+ # A new instance of BivariateMoran, from vector and weights.
28
+ #
29
+ # @param [Array] x observations of dataset
30
+ # @param [Array] y observations of dataset
31
+ # @param [WeightsMatrix] weights to define relationships between observations
32
+ #
33
+ # @return [BivariateMoran]
34
+ def self.from_observations(x, y, weights)
35
+ n = weights.n
36
+ raise ArgumentError, 'Data size != weights.n' if x.size != n || y.size != n
37
+
38
+ instance = new(nil, nil, nil, weights.standardize)
39
+ instance.x = x
40
+ instance.y = y
41
+ instance
42
+ end
43
+
26
44
  ##
27
45
  # Computes the local indicator of spatial correlation for
28
46
  # x against lagged y.
@@ -62,6 +80,61 @@ module SpatialStats
62
80
  mc_bv(permutations, seed)
63
81
  end
64
82
 
83
+ ##
84
+ # Determines what quadrant an observation is in. Based on its value
85
+ # compared to its neighbors. This does not work for all stats, since
86
+ # it requires that values can be negative.
87
+ #
88
+ # In a standardized array of z, high values are values greater than 0
89
+ # and its neighbors are determined by the spatial lag and if that is
90
+ # positive then its neighbors would be high, low otherwise.
91
+ #
92
+ # Quadrants are:
93
+ # [HH] a high value surrounded by other high values
94
+ # [LH] a low value surrounded by high values
95
+ # [LL] a low value surrounded by low values
96
+ # [HL] a high value surrounded by low values
97
+ #
98
+ # @return [Array] of labels
99
+ def quads
100
+ # https://github.com/pysal/esda/blob/master/esda/moran.py#L925
101
+ z_lag = SpatialStats::Utils::Lag.neighbor_average(weights, y)
102
+ zp = x.map(&:positive?)
103
+ lp = z_lag.map(&:positive?)
104
+
105
+ # hh = zp & lp
106
+ # lh = zp ^ true & lp
107
+ # ll = zp ^ true & lp ^ true
108
+ # hl = zp & lp ^ true
109
+ hh = zp.each_with_index.map { |v, idx| v & lp[idx] }
110
+ lh = zp.each_with_index.map { |v, idx| (v ^ true) & lp[idx] }
111
+ ll = zp.each_with_index.map { |v, idx| (v ^ true) & (lp[idx] ^ true) }
112
+ hl = zp.each_with_index.map { |v, idx| v & (lp[idx] ^ true) }
113
+
114
+ # now zip lists and map them to proper terms
115
+ quad_terms = %w[HH LH LL HL]
116
+ hh.zip(lh, ll, hl).map do |feature|
117
+ quad_terms[feature.index(true)]
118
+ end
119
+ end
120
+ alias groups quads
121
+
122
+ ##
123
+ # Summary of the statistic. Computes +stat+, +mc+, and +groups+ then returns the values
124
+ # in an array of hashes.
125
+ #
126
+ # @param [Integer] permutations to run. Last digit should be 9 to produce round numbers.
127
+ # @param [Integer] seed used in random number generator for shuffles.
128
+ #
129
+ # @return [Array]
130
+ def summary(permutations = 99, seed = nil)
131
+ p_vals = mc(permutations, seed)
132
+ data = weights.keys.zip(stat, p_vals, groups)
133
+ data.map do |row|
134
+ { key: row[0], stat: row[1], p: row[2], group: row[3] }
135
+ end
136
+ end
137
+
65
138
  def x
66
139
  @x ||= SpatialStats::Queries::Variables.query_field(@scope, @x_field)
67
140
  .standardize
@@ -79,8 +152,17 @@ module SpatialStats
79
152
  x[idx] * y_lag_i
80
153
  end
81
154
 
155
+ def mc_observation_calc(stat_i_orig, stat_i_new, _permutations)
156
+ # Since moran can be positive or negative, go by this definition
157
+ if stat_i_orig.positive?
158
+ (stat_i_new >= stat_i_orig).count
159
+ else
160
+ (stat_i_new <= stat_i_orig).count
161
+ end
162
+ end
163
+
82
164
  def y_lag
83
- @y_lag ||= SpatialStats::Utils::Lag.neighbor_sum(w, y)
165
+ @y_lag ||= SpatialStats::Utils::Lag.neighbor_sum(weights, y)
84
166
  end
85
167
  end
86
168
  end
@@ -32,6 +32,25 @@ module SpatialStats
32
32
  end
33
33
  alias c stat
34
34
 
35
+ ##
36
+ # Computes the groups each observation belongs to.
37
+ # Potential groups for Geary's C are:
38
+ # [HH] High-High
39
+ # [LL] Low-Low
40
+ # [N] Negative - Group traditionally for HL and LH, but since the difference is squared they are in the same group.
41
+ #
42
+ #
43
+ # @return [Array] groups for each observation
44
+ def groups
45
+ quads.map do |quad|
46
+ if %w[HL LH].include?(quad)
47
+ 'N'
48
+ else
49
+ quad
50
+ end
51
+ end
52
+ end
53
+
35
54
  ##
36
55
  # Values of the +field+ queried from the +scope+
37
56
  #
@@ -45,9 +64,11 @@ module SpatialStats
45
64
  private
46
65
 
47
66
  def stat_i(idx)
48
- zs = Numo::DFloat.cast(z)
49
- zi = (z[idx] - zs)**2
50
- (w[idx, true] * zi).sum
67
+ # TODO: maybe don't even use stat_i
68
+ # just form all of the modified zs and then
69
+ # pass it to a loop of mulvec all implemented in c ext
70
+ zi = z.map { |val| (z[idx] - val)**2 }
71
+ weights.sparse.dot_row(zi, idx)
51
72
  end
52
73
 
53
74
  def mc_i(wi, perms, idx)
@@ -55,8 +76,17 @@ module SpatialStats
55
76
  (wi * zi).sum(1)
56
77
  end
57
78
 
58
- def w
59
- @w ||= weights.full.row_standardized
79
+ def mc_observation_calc(stat_i_orig, stat_i_new, _permutations)
80
+ # Geary cannot be negative, so we have to use this technique from
81
+ # GeoDa to determine p values. Note I slightly modified it to be inclusive
82
+ # on both tails not just the lower tail.
83
+ # https://github.com/GeoDaCenter/geoda/blob/master/Explore/LocalGearyCoordinator.cpp#L981
84
+ mean = stat_i_new.mean
85
+ if stat_i_orig <= mean
86
+ (stat_i_new <= stat_i_orig).count
87
+ else
88
+ (stat_i_new >= stat_i_orig).count
89
+ end
60
90
  end
61
91
  end
62
92
  end
@@ -14,13 +14,18 @@ module SpatialStats
14
14
  # @param [ActiveRecord::Relation] scope
15
15
  # @param [Symbol, String] field to query from scope
16
16
  # @param [WeightsMatrix] weights to define relationship between observations in scope
17
+ # @param [Boolean] star to preset if star will be true or false. Will be calculated otherwise.
17
18
  #
18
19
  # @return [GetisOrd]
19
20
  def initialize(scope, field, weights, star = nil)
20
- super(scope, field, weights)
21
+ @scope = scope
22
+ @field = field
23
+ @weights = weights
21
24
  @star = star
25
+ calc_weights
22
26
  end
23
27
  attr_accessor :star
28
+ attr_writer :x
24
29
 
25
30
  ##
26
31
  # Computes the G or G* statistic for every observation in x.
@@ -33,6 +38,25 @@ module SpatialStats
33
38
  end
34
39
  alias g stat
35
40
 
41
+ ##
42
+ # Computes the groups each observation belongs to.
43
+ # Potential groups for G are:
44
+ # [H] High
45
+ # [L] Low
46
+ #
47
+ # Group is high when standardized z is positive, low otherwise.
48
+ #
49
+ # @return [Array] groups for each observation
50
+ def groups
51
+ z.standardize.map do |val|
52
+ if val.positive?
53
+ 'H'
54
+ else
55
+ 'L'
56
+ end
57
+ end
58
+ end
59
+
36
60
  ##
37
61
  # Values of the +field+ queried from the +scope+
38
62
  #
@@ -50,7 +74,7 @@ module SpatialStats
50
74
  # @return [Boolean] of star
51
75
  def star?
52
76
  if @star.nil?
53
- @star = weights.full.trace.positive?
77
+ @star = weights.dense.trace.positive?
54
78
  else
55
79
  @star
56
80
  end
@@ -67,25 +91,29 @@ module SpatialStats
67
91
  x_lag_i / denominators[idx]
68
92
  end
69
93
 
70
- def w
71
- @w ||= begin
72
- if star?
73
- weights.full.windowed.row_standardized
74
- else
75
- weights.standardized
76
- end
94
+ def mc_observation_calc(stat_i_orig, stat_i_new, permutations)
95
+ # GetisOrd cannot be negative, so we have to use this technique from
96
+ # ESDA to determine if we should select p or 1-p.
97
+ # https://github.com/pysal/esda/blob/master/esda/getisord.py#L388
98
+ num_larger = (stat_i_new >= stat_i_orig).count
99
+ is_low = (permutations - num_larger) < num_larger
100
+ if is_low
101
+ permutations - num_larger
102
+ else
103
+ num_larger
77
104
  end
78
105
  end
79
106
 
107
+ def calc_weights
108
+ @weights = if star?
109
+ weights.window.standardize
110
+ else
111
+ weights.standardize
112
+ end
113
+ end
114
+
80
115
  def z_lag
81
- # window if star is true
82
- @z_lag ||= begin
83
- if star?
84
- SpatialStats::Utils::Lag.window_sum(w, x)
85
- else
86
- SpatialStats::Utils::Lag.neighbor_sum(w, x)
87
- end
88
- end
116
+ @z_lag ||= SpatialStats::Utils::Lag.neighbor_sum(weights, x)
89
117
  end
90
118
  alias x_lag z_lag
91
119
 
@@ -55,12 +55,11 @@ module SpatialStats
55
55
  # @return [Array] of variances for each observation
56
56
  def variance
57
57
  # formula is A - B - (E[I])**2
58
- wt = w.row_standardized
59
58
  exp = expectation
60
59
 
61
60
  vars = []
62
- a_terms = a_calc(wt)
63
- b_terms = b_calc(wt)
61
+ a_terms = a_calc
62
+ b_terms = b_calc
64
63
 
65
64
  a_terms.each_with_index do |a_term, idx|
66
65
  vars << (a_term - b_terms[idx] - (exp**2))
@@ -68,6 +67,21 @@ module SpatialStats
68
67
  vars
69
68
  end
70
69
 
70
+ ##
71
+ # Computes the groups each observation belongs to.
72
+ # Potential groups for Moran's I are:
73
+ # [HH] High-High
74
+ # [HL] High-Low
75
+ # [LH] Low-High
76
+ # [LL] Low-Low
77
+ #
78
+ # This is the same as the +#quads+ method in the +Stat+ class.
79
+ #
80
+ # @return [Array] groups for each observation
81
+ def groups
82
+ quads
83
+ end
84
+
71
85
  ##
72
86
  # Values of the +field+ queried from the +scope+
73
87
  #
@@ -85,7 +99,7 @@ module SpatialStats
85
99
  def z_lag
86
100
  # weights is already row_standardized, so we are using
87
101
  # neighbor sum instead of neighbor_average to save cost
88
- @z_lag ||= SpatialStats::Utils::Lag.neighbor_sum(w, z)
102
+ @z_lag ||= SpatialStats::Utils::Lag.neighbor_sum(weights, z)
89
103
  end
90
104
 
91
105
  private
@@ -102,6 +116,15 @@ module SpatialStats
102
116
  z[idx] * z_lag_i
103
117
  end
104
118
 
119
+ def mc_observation_calc(stat_i_orig, stat_i_new, _permutations)
120
+ # Since moran can be positive or negative, go by this definition
121
+ if stat_i_orig.positive?
122
+ (stat_i_new >= stat_i_orig).count
123
+ else
124
+ (stat_i_new <= stat_i_orig).count
125
+ end
126
+ end
127
+
105
128
  def si2
106
129
  # @si2 ||= z.sample_variance
107
130
  # we standardize so sample_variance is 1
@@ -109,20 +132,27 @@ module SpatialStats
109
132
  end
110
133
 
111
134
  # https://pro.arcgis.com/en/pro-app/tool-reference/spatial-statistics/h-local-morans-i-additional-math.htm
112
- def a_calc(wt)
113
- n = wt.shape[0]
135
+ # TODO: sparse
136
+ def a_calc
137
+ n = weights.n
114
138
  b2i = b2i_calc
139
+
140
+ wts = weights.sparse.values
141
+ row_index = weights.sparse.row_index
142
+
115
143
  a_terms = []
116
144
 
117
145
  (0..n - 1).each do |idx|
118
- sigma_term = wt[idx, true].to_a.sum { |v| v**2 }
146
+ row_range = row_index[idx]..(row_index[idx + 1] - 1)
147
+ wt = wts[row_range]
148
+ sigma_term = wt.sum { |v| v**2 }
119
149
  a_terms << (n - b2i) * sigma_term / (n - 1)
120
150
  end
121
151
  a_terms
122
152
  end
123
153
 
124
- def b_calc(wt)
125
- n = wt.shape[0]
154
+ def b_calc
155
+ n = weights.n
126
156
  b2i = b2i_calc
127
157
  b_terms = []
128
158
 
@@ -23,7 +23,7 @@ module SpatialStats
23
23
  def initialize(scope, fields, weights)
24
24
  @scope = scope
25
25
  @fields = fields
26
- @weights = weights
26
+ @weights = weights.standardize
27
27
  end
28
28
  attr_accessor :scope, :fields, :weights
29
29
 
@@ -60,37 +60,47 @@ module SpatialStats
60
60
  # of indices, which will return a list of new orders for the fields.
61
61
  # They will then be shuffled corresponding to the new indices.
62
62
  rng = gen_rng(seed)
63
- n = w.shape[0]
64
- indices = (0..(n - 1)).to_a
65
- shuffles = crand(indices, permutations, rng)
63
+ rids = crand(permutations, rng)
66
64
 
65
+ n_1 = weights.n - 1
66
+ sparse = weights.sparse
67
+ row_index = sparse.row_index
68
+ ws = sparse.values
69
+ wc = weights.wc
67
70
  stat_orig = stat
68
- rs = [0] * n
69
71
 
70
- ws = neighbor_weights
71
-
72
- idx = 0
73
- while idx < n
72
+ ids = (0..n_1).to_a
73
+ observations = Array.new(weights.n)
74
+ (0..n_1).each do |idx|
75
+ idsi = ids.dup
76
+ idsi.delete_at(idx)
77
+ idsi.shuffle!(random: rng)
78
+ idsi = Numo::Int32.cast(idsi)
79
+ sample = rids[idsi[rids[true, 0..wc[idx] - 1]]]
80
+
81
+ # account for case where there are no neighbors
82
+ row_range = row_index[idx]..(row_index[idx + 1] - 1)
83
+ if row_range.size.zero?
84
+ observations[idx] = permutations
85
+ next
86
+ end
87
+
88
+ wi = Numo::DFloat.cast(ws[row_range])
89
+ stat_i_new = mc_i(wi, sample, idx)
74
90
  stat_i_orig = stat_orig[idx]
75
- wi = Numo::DFloat.cast(ws[idx])
76
-
77
- # for each field, compute the C value at that index.
78
- stat_i_new = mc_i(wi, shuffles[idx], idx)
79
-
80
- rs[idx] = if stat_i_orig.positive?
81
- (stat_i_new >= stat_i_orig).count
82
- else
83
- (stat_i_new <= stat_i_orig).count
84
- end
85
-
86
- idx += 1
91
+ observations[idx] = mc_observation_calc(stat_i_orig, stat_i_new,
92
+ permutations)
87
93
  end
88
94
 
89
- rs.map do |ri|
95
+ observations.map do |ri|
90
96
  (ri + 1.0) / (permutations + 1.0)
91
97
  end
92
98
  end
93
99
 
100
+ def groups
101
+ raise NotImplementedError, 'groups not implemented'
102
+ end
103
+
94
104
  private
95
105
 
96
106
  def mc_i(wi, perms, idx)
@@ -108,6 +118,19 @@ module SpatialStats
108
118
  cs.mean(0)
109
119
  end
110
120
 
121
+ def mc_observation_calc(stat_i_orig, stat_i_new, _permutations)
122
+ # Geary cannot be negative, so we have to use this technique from
123
+ # GeoDa to determine p values. Note I slightly modified it to be inclusive
124
+ # on both tails not just the lower tail.
125
+ # https://github.com/GeoDaCenter/geoda/blob/master/Explore/LocalGearyCoordinator.cpp#L981
126
+ mean = stat_i_new.mean
127
+ if stat_i_orig <= mean
128
+ (stat_i_new <= stat_i_orig).count
129
+ else
130
+ (stat_i_new >= stat_i_orig).count
131
+ end
132
+ end
133
+
111
134
  def field_data
112
135
  @field_data ||= fields.map do |field|
113
136
  SpatialStats::Queries::Variables.query_field(@scope, field)