dea-tools 0.3.7.dev35__tar.gz → 0.4.1.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/PKG-INFO +1 -1
  2. dea_tools-0.4.1.dev14/Tools/dea_tools/validation.py +309 -0
  3. dea_tools-0.3.7.dev35/Tools/dea_tools/validation.py +0 -87
  4. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/.gitignore +0 -0
  5. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/LICENSE +0 -0
  6. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/README.md +0 -0
  7. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/__init__.py +0 -0
  8. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/__main__.py +0 -0
  9. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/__init__.py +0 -0
  10. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/animations.py +0 -0
  11. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/changefilmstrips.py +0 -0
  12. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/crophealth.py +0 -0
  13. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/deacoastlines.py +0 -0
  14. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/geomedian.py +0 -0
  15. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/imageexport.py +0 -0
  16. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/miningrehab.py +0 -0
  17. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/wetlandsinsighttool.py +0 -0
  18. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/app/widgetconstructors.py +0 -0
  19. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/bandindices.py +0 -0
  20. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/bom.py +0 -0
  21. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/classification.py +0 -0
  22. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/coastal.py +0 -0
  23. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/dask.py +0 -0
  24. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/datahandling.py +0 -0
  25. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/landcover.py +0 -0
  26. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/maps.py +0 -0
  27. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/plotting.py +0 -0
  28. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/spatial.py +0 -0
  29. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/temporal.py +0 -0
  30. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/waterbodies.py +0 -0
  31. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/Tools/dea_tools/wetlands.py +0 -0
  32. {dea_tools-0.3.7.dev35 → dea_tools-0.4.1.dev14}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dea-tools
3
- Version: 0.3.7.dev35
3
+ Version: 0.4.1.dev14
4
4
  Summary: Open-source tools for geospatial analysis with Digital Earth Australia, Open Data Cube, and Xarray
5
5
  Project-URL: Homepage, https://github.com/GeoscienceAustralia/dea-notebooks
6
6
  Project-URL: Repository, https://github.com/GeoscienceAustralia/dea-notebooks
@@ -0,0 +1,309 @@
1
+ # validation.py
2
+ """
3
+ Tools for validating outputs and producing accuracy assessment metrics.
4
+
5
+ License: The code in this notebook is licensed under the Apache License,
6
+ Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0). Digital Earth
7
+ Australia data is licensed under the Creative Commons by Attribution 4.0
8
+ license (https://creativecommons.org/licenses/by/4.0/).
9
+
10
+ Contact: If you need assistance, please post a question on the Open Data
11
+ Cube Discord chat (https://discord.com/invite/4hhBQVas5U) or on the GIS Stack
12
+ Exchange (https://gis.stackexchange.com/questions/ask?tags=open-data-cube)
13
+ using the `open-data-cube` tag (you can view previously asked questions
14
+ here: https://gis.stackexchange.com/questions/tagged/open-data-cube).
15
+
16
+ If you would like to report an issue with this script, you can file one
17
+ on GitHub (https://github.com/GeoscienceAustralia/dea-notebooks/issues/new).
18
+
19
+ Last modified: July 2025
20
+ """
21
+
22
+ from math import sqrt
23
+
24
+ import geopandas as gpd
25
+ import numpy as np
26
+ import pandas as pd
27
+ import xarray as xr
28
+ from scipy import stats
29
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
30
+
31
+ from .spatial import add_geobox
32
+
33
+
34
+ def eval_metrics(x, y, round=3, all_regress=False):
35
+ """
36
+ Calculate a set of common statistical metrics
37
+ based on two input actual and predicted vectors.
38
+
39
+ These include:
40
+ - Pearson correlation
41
+ - Root Mean Squared Error
42
+ - Mean Absolute Error
43
+ - R-squared
44
+ - Bias
45
+ - Linear regression parameters (slope,
46
+ p-value, intercept, standard error)
47
+
48
+ Parameters
49
+ ----------
50
+ x : numpy.array
51
+ An array providing "actual" variable values
52
+ y : numpy.array
53
+ An array providing "predicted" variable values
54
+ round : int
55
+ Number of decimal places to round each metric
56
+ to. Defaults to 3
57
+ all_regress : bool
58
+ Whether to return linear regression p-value,
59
+ intercept and standard error (in addition to
60
+ only regression slope). Defaults to False
61
+
62
+ Returns
63
+ -------
64
+ A pandas.Series containing calculated metrics
65
+ """
66
+
67
+ # Create dataframe to drop na
68
+ xy_df = pd.DataFrame({"x": x, "y": y}).dropna()
69
+
70
+ # Compute linear regression
71
+ lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)
72
+
73
+ # Calculate statistics
74
+ stats_dict = {
75
+ "Correlation": xy_df.corr().iloc[0, 1],
76
+ "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
77
+ "MAE": mean_absolute_error(xy_df.x, xy_df.y),
78
+ "R-squared": lin_reg.rvalue**2,
79
+ "Bias": (xy_df.y - xy_df.x).mean(),
80
+ "Regression slope": lin_reg.slope,
81
+ }
82
+
83
+ # Additional regression params
84
+ if all_regress:
85
+ stats_dict.update({
86
+ "Regression p-value": lin_reg.pvalue,
87
+ "Regression intercept": lin_reg.intercept,
88
+ "Regression standard error": lin_reg.stderr,
89
+ })
90
+
91
+ # Return metrics as a rounded pandas.Series
92
+ return pd.Series(stats_dict).round(round)
93
+
94
+
95
+ def xr_random_sampling(
96
+ da,
97
+ n=None,
98
+ sampling="stratified_random",
99
+ manual_class_ratios=None,
100
+ oversample_factor=5,
101
+ out_fname=None,
102
+ verbose=True,
103
+ ):
104
+ """
105
+ Efficient and scalable random sampling of a 2D classified xarray.DataArray.
106
+ Returns a GeoDataFrame of point samples based on specified sampling strategy.
107
+
108
+ Parameters
109
+ ----------
110
+ da : xarray.DataArray
111
+ A classified 2-dimensional xarray.DataArray
112
+ n : int
113
+ Total number of points to sample. Ignored if providing
114
+ a dictionary of {class:numofpoints} to 'manual_class_ratios'
115
+ sampling : str, optional
116
+ The sampling strategy to use. Options include:
117
+ 'stratified_random' = Create points that are randomly
118
+ distributed within each class, where each class has a
119
+ number of points proportional to its relative area.
120
+ 'equal_stratified_random' = Create points that are randomly
121
+ distributed within each class, where each class has the
122
+ same number of points.
123
+ 'random' = Create points that are randomly distributed
124
+ throughout the image.
125
+ 'manual' = user defined, each class is allocated a
126
+ specified number of points, supply a manual_class_ratios
127
+ dictionary mapping number of points to each class
128
+ manual_class_ratios : dict, optional
129
+ If setting sampling to 'manual', then provide a dictionary
130
+ of type {'class': numofpoints} mapping the number of points
131
+ to generate for each class.
132
+ oversample_factor : float, optional (default=5)
133
+ A multiplier used to increase the number of random candidate pixels
134
+ initially drawn when sampling very large classes (>1 billion pixels).
135
+ For such large classes, the function randomly samples a subset of
136
+ pixel coordinates and checks which ones match the target class.
137
+ To reduce the chance of undersampling, `oversample_factor` controls
138
+ how many candidate coordinates are initially drawn.
139
+ For example, if 100 samples are required and `oversample_factor=5`,
140
+ 500 random (x, y) coordinates will be sampled first. Only those matching
141
+ the class will be retained and then randomly subsampled down to the desired
142
+ number of samples. If too few valid matches are found, a warning is issued.
143
+ Increasing this value can improve success rates when sampling sparse or
144
+ spatially fragmented classes in large datasets, at the cost of more memory
145
+ and computation.
146
+ out_fname : str, optional
147
+ If providing a filepath name, e.g. 'sample_points.geojson', the
148
+ function will export a geojson (or shapefile) of the sampling
149
+ points to file.
150
+ verbose: bool, optional (default=True)
151
+ If True, print statements will track progress and print warnings
152
+
153
+ Returns
154
+ -------
155
+ geopandas.GeoDataFrame
156
+
157
+ """
158
+ # perform checks on the inputs
159
+ if sampling not in [
160
+ "stratified_random",
161
+ "equal_stratified_random",
162
+ "random",
163
+ "manual",
164
+ ]:
165
+ raise ValueError(
166
+ "Sampling strategy must be one of 'stratified_random', 'equal_stratified_random', 'random', or 'manual'"
167
+ )
168
+
169
+ if "time" in da.dims:
170
+ raise ValueError("Input DataArray must not have a 'time' dimension.")
171
+
172
+ if len(da.dims) > 2:
173
+ raise ValueError("Input DataArray must not have more than two dimensions")
174
+
175
+ if not isinstance(da, xr.DataArray):
176
+ raise ValueError("This function only accepts xarray.DataArrays as input")
177
+
178
+ # Ensure da has a .odc.* accessor using odc.geo.
179
+ da = add_geobox(da)
180
+
181
+ # Obtain spatial dim names
182
+ y_dim, x_dim = da.odc.spatial_dims
183
+
184
+ # grab data as numpy arrays and count classes
185
+ data = da.values
186
+
187
+ unique_classes, class_counts = np.unique(data[~np.isnan(data)], return_counts=True)
188
+
189
+ unique_classes = unique_classes.astype(int)
190
+
191
+ # store our samples in a list
192
+ samples = []
193
+
194
+ if sampling == "random":
195
+ # first check num of samples doesn't exceed pixels
196
+ total_valid = (~np.isnan(data)).sum()
197
+ if n > total_valid:
198
+ raise ValueError("Requested more samples than available valid pixels.")
199
+
200
+ if verbose:
201
+ print(f"Sampling {n} points")
202
+
203
+ # determine flat indices of the non-Nans
204
+ flat_indices = np.flatnonzero(~np.isnan(data))
205
+
206
+ # sample the flat indices
207
+ sampled = np.random.choice(flat_indices, size=n, replace=False)
208
+
209
+ # get coords and class values from sample indices
210
+ for idx in sampled:
211
+ y, x = np.unravel_index(idx, data.shape)
212
+ y_val = da[y_dim].values[y]
213
+ x_val = da[x_dim].values[x]
214
+ cls = data[y, x]
215
+ samples.append((y_val, x_val, int(cls)))
216
+
217
+ elif sampling in ["stratified_random", "equal_stratified_random", "manual"]:
218
+ if sampling == "equal_stratified_random":
219
+ # divide n by the number of classes
220
+ n_per_class = int(np.ceil(n / len(unique_classes)))
221
+ class_sample_sizes = dict.fromkeys(unique_classes, n_per_class)
222
+
223
+ elif sampling == "stratified_random":
224
+ # calculate relative proportions of classes.
225
+ proportions = class_counts / class_counts.sum()
226
+ class_sample_sizes = {cls: int(np.round(n * prop)) for cls, prop in zip(unique_classes, proportions)}
227
+
228
+ elif sampling == "manual":
229
+ if not isinstance(manual_class_ratios, dict):
230
+ raise ValueError("Must provide manual_class_ratios for manual sampling.")
231
+
232
+ class_sample_sizes = {int(k): int(v) for k, v in manual_class_ratios.items()}
233
+
234
+ for cls in class_sample_sizes:
235
+ sample_size = class_sample_sizes[cls]
236
+
237
+ if verbose:
238
+ print(f"Class {cls}: sampling {sample_size} points")
239
+
240
+ class_count = (data == cls).sum()
241
+
242
+ if class_count > 1e9: # For v. large classes, sample random coords first and check matches
243
+ # Try oversampling until we get enough
244
+ n_try = int(sample_size * oversample_factor)
245
+ rand_x = np.random.choice(np.arange(len(da.x)), n_try, replace=False)
246
+
247
+ rand_y = np.random.choice(np.arange(len(da.y)), n_try, replace=False)
248
+
249
+ # find matches with class id
250
+ match = data[rand_y, rand_x] == cls
251
+ rand_y, rand_x = rand_y[match], rand_x[match]
252
+
253
+ # check if matches is less than requested sample size
254
+ # and return samples with a warning
255
+ if len(rand_y) < sample_size:
256
+ if verbose:
257
+ print(
258
+ f"Warning: insufficient matches for class {cls}, "
259
+ f"try increasing oversampling. Returning {len(rand_y)} matches"
260
+ )
261
+
262
+ idx = np.random.choice(np.arange(len(rand_y)), size=len(rand_y), replace=False)
263
+ for i in idx:
264
+ y = da[y_dim].values[rand_y[i]]
265
+ x = da[x_dim].values[rand_x[i]]
266
+ samples.append((y, x, cls))
267
+
268
+ else:
269
+ # If more matches than samples, then randomly sample the matches so we get the
270
+ # right number of samples.
271
+ idx = np.random.choice(np.arange(len(rand_y)), size=sample_size, replace=False)
272
+ for i in idx:
273
+ y = da[y_dim].values[rand_y[i]]
274
+ x = da[x_dim].values[rand_x[i]]
275
+ samples.append((y, x, cls))
276
+
277
+ else:
278
+ # if class size is less than a billion, then sample class mask
279
+ class_mask = data == cls
280
+ flat_indices = np.flatnonzero(class_mask)
281
+
282
+ # Check if enough pixels exist
283
+ if flat_indices.size < sample_size:
284
+ if verbose:
285
+ print(f"Warning: not enough pixels in class {cls} for given sample size, skipping")
286
+ continue
287
+
288
+ # Randomly sample from those flat indices
289
+ sampled = np.random.choice(flat_indices, size=sample_size, replace=False)
290
+
291
+ # Convert flat indices to (y, x), then to coordinates
292
+ for idx in sampled:
293
+ y_idx, x_idx = np.unravel_index(idx, data.shape)
294
+ y = da[y_dim].values[y_idx]
295
+ x = da[x_dim].values[x_idx]
296
+ samples.append((y, x, cls))
297
+
298
+ if len(samples) == 0:
299
+ raise RuntimeError("No samples collected. Check input conditions.")
300
+
301
+ # Add samples to geodataframe
302
+ df = pd.DataFrame(samples, columns=["y", "x", "class"])
303
+ gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y), crs=f"EPSG:{da.odc.crs.epsg}")
304
+ gdf = gdf.drop(["x", "y"], axis=1)
305
+
306
+ if out_fname:
307
+ gdf.to_file(out_fname)
308
+
309
+ return gdf
@@ -1,87 +0,0 @@
1
- ## validation.py
2
- """
3
- Tools for validating outputs and producing accuracy assessment metrics.
4
-
5
- License: The code in this notebook is licensed under the Apache License,
6
- Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0). Digital Earth
7
- Australia data is licensed under the Creative Commons by Attribution 4.0
8
- license (https://creativecommons.org/licenses/by/4.0/).
9
-
10
- Contact: If you need assistance, please post a question on the Open Data
11
- Cube Discord chat (https://discord.com/invite/4hhBQVas5U) or on the GIS Stack
12
- Exchange (https://gis.stackexchange.com/questions/ask?tags=open-data-cube)
13
- using the `open-data-cube` tag (you can view previously asked questions
14
- here: https://gis.stackexchange.com/questions/tagged/open-data-cube).
15
-
16
- If you would like to report an issue with this script, you can file one
17
- on GitHub (https://github.com/GeoscienceAustralia/dea-notebooks/issues/new).
18
-
19
- Last modified: April 2023
20
- """
21
-
22
- from math import sqrt
23
-
24
- import pandas as pd
25
- from scipy import stats
26
- from sklearn.metrics import mean_absolute_error, mean_squared_error
27
-
28
-
29
- def eval_metrics(x, y, round=3, all_regress=False):
30
- """
31
- Calculate a set of common statistical metrics
32
- based on two input actual and predicted vectors.
33
-
34
- These include:
35
- - Pearson correlation
36
- - Root Mean Squared Error
37
- - Mean Absolute Error
38
- - R-squared
39
- - Bias
40
- - Linear regression parameters (slope,
41
- p-value, intercept, standard error)
42
-
43
- Parameters
44
- ----------
45
- x : numpy.array
46
- An array providing "actual" variable values
47
- y : numpy.array
48
- An array providing "predicted" variable values
49
- round : int
50
- Number of decimal places to round each metric
51
- to. Defaults to 3
52
- all_regress : bool
53
- Whether to return linear regression p-value,
54
- intercept and standard error (in addition to
55
- only regression slope). Defaults to False
56
-
57
- Returns
58
- -------
59
- A pandas.Series containing calculated metrics
60
- """
61
-
62
- # Create dataframe to drop na
63
- xy_df = pd.DataFrame({"x": x, "y": y}).dropna()
64
-
65
- # Compute linear regression
66
- lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)
67
-
68
- # Calculate statistics
69
- stats_dict = {
70
- "Correlation": xy_df.corr().iloc[0, 1],
71
- "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
72
- "MAE": mean_absolute_error(xy_df.x, xy_df.y),
73
- "R-squared": lin_reg.rvalue**2,
74
- "Bias": (xy_df.y - xy_df.x).mean(),
75
- "Regression slope": lin_reg.slope,
76
- }
77
-
78
- # Additional regression params
79
- if all_regress:
80
- stats_dict.update({
81
- "Regression p-value": lin_reg.pvalue,
82
- "Regression intercept": lin_reg.intercept,
83
- "Regression standard error": lin_reg.stderr,
84
- })
85
-
86
- # Return as
87
- return pd.Series(stats_dict).round(round)
File without changes