disfor 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
disfor-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.3
2
+ Name: disfor
3
+ Version: 0.2.0
4
+ Summary: Utilities to load and filter DISFOR dataset. The dataset provides forest disturbance agent labels for use with Sentinel-2.
5
+ Author: Jonas Viehweger
6
+ Author-email: Jonas Viehweger <jonas.viehweger@joanneum.at>
7
+ License: EUPL-1.2
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Classifier: Programming Language :: Python :: 3.14
12
+ Classifier: License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)
13
+ Requires-Dist: imagecodecs>=2025.8.2
14
+ Requires-Dist: matplotlib>=3.10.6
15
+ Requires-Dist: numpy>=2.3.3
16
+ Requires-Dist: polars>=1.33.1
17
+ Requires-Dist: pooch>=1.8.2
18
+ Requires-Dist: pyarrow>=21.0.0
19
+ Requires-Dist: scikit-learn>=1.7.2
20
+ Requires-Dist: tifffile>=2025.9.30
21
+ Requires-Dist: zstandard>=0.25.0
22
+ Requires-Dist: torch>=2.8.0 ; extra == 'torch'
23
+ Requires-Dist: lightning>=2.5.5 ; extra == 'torch'
24
+ Requires-Python: >=3.12
25
+ Project-URL: Documentation, https://jr-digital.github.io/DISFOR/
26
+ Project-URL: Homepage, https://github.com/JR-DIGITAL/DISFOR
27
+ Project-URL: Issues, https://github.com/JR-DIGITAL/DISFOR/issues
28
+ Project-URL: Repository, https://github.com/JR-DIGITAL/DISFOR
29
+ Provides-Extra: torch
30
+ Description-Content-Type: text/markdown
31
+
32
+ # DISFOR <img align="right" width="150" height="150" src="docs/assets/logo.svg">
33
+
34
+ **DISFOR** offers dense labelled satellite time-series data on forest disturbance timing and agents of disturbance. It contains 3823 unique time-series.
35
+ Each time-series corresponds to a single 10x10m Sentinel-2 pixel.
36
+
37
+ ![Example image chips](docs/assets/examples.png)
38
+
39
+ ## Installation
40
+
41
+ The package can be installed from PyPI:
42
+
43
+ `pip install disfor`
44
+
45
+ ## Usage
46
+
47
+ The data itself is available on Hugging Face: https://huggingface.co/datasets/JR-DIGITAL/DISFOR.
48
+ There are detailed usage guides available in the documentation at: https://jr-digital.github.io/DISFOR/usage/dataset-overview/
49
+
50
+ ## Dataset Overview
51
+
52
+ There are four main parts of this dataset:
53
+
54
+ - `samples.parquet`: Providing location and metadata of sampled points
55
+ - `labels.parquet`: Providing labels for each sampled time-series
56
+ - `pixel_data.parquet`: Providing Sentinel-2 band data for each acquisition in the time-series
57
+ - Sentinel-2 Chips: Image chip time-series for each sample
58
+
59
+ ### samples.parquet
60
+
61
+ This contains the sampled points along with metadata on the points. It provides the following columns:
62
+
63
+ | Column name | Description |
64
+ | --- | --- |
65
+ | sample_id | Unique sample ID for each sample point |
66
+ | original_sample_id | Sample ID of the point in the original publication of the dataset |
67
+ | interpreter | Shorthand code for the interpreter who labelled this sample |
68
+ | dataset | Number of the original sampling campaign in which this point was labelled |
69
+ | source | The ancillary data source used to interpret the agent |
70
+ | source_description | A long text description of the used source. Link to the original data if available |
71
+ | s2_tile | If available, which Sentinel 2 Tile the sample intersects |
72
+ | cluster_id | Unique ID to group samples which are spatio-temporally autocorrelated |
73
+ | cluster_description | What type of cluster it is |
74
+ | comment | Free text comment about the interpretation of the sampled point |
75
+ | confidence | Confidence of sampling: high where both timing and agent are confident, medium where only the timing is confident |
76
+ | geometry | Coordinates of the sampled point. In CRS EPSG:4326 |
77
+
78
+ ### labels.parquet
79
+
80
+ This contains the time-series labels for each sampled point in `samples.parquet`. The following columns are available:
81
+
82
+ | Column name | Description |
83
+ | --- | --- |
84
+ | sample_id | Taken from sample table |
85
+ | original_sample_id | Taken from sample table |
86
+ | dataset | Taken from sample table |
87
+ | label | Interpreted class of the segment (see next table) |
88
+ | original_label | The label which was originally assigned and remapped to label |
89
+ | start | Start date of the segment |
90
+ | end | End date of the segment |
91
+ | start_next_label | Start date of the next label. Some labels are encoded as events (Clear Cuts for example) and are not immediately followed by another label; this column allows a full segmentation of the time-series. Null if it is the last label of the sample |
92
+
93
+ The provided label is a hierarchical label, following this hierarchy:
94
+
95
+ <table border="1" cellspacing="0" cellpadding="6">
96
+ <thead>
97
+ <tr>
98
+ <th>Level 1</th>
99
+ <th>Level 2</th>
100
+ <th>Level 3</th>
101
+ </tr>
102
+ </thead>
103
+ <tbody>
104
+ <!-- 100 - Healthy Vegetation -->
105
+ <tr>
106
+ <td rowspan="4">100 - Healthy Vegetation</td>
107
+ <td>110 - Undisturbed Forest</td>
108
+ <td></td>
109
+ </tr>
110
+ <tr>
111
+ <td rowspan="3">120 - Revegetation</td>
112
+ <td>121 - With Trees (after clear cut)</td>
113
+ </tr>
114
+ <tr>
115
+ <td>122 - Canopy closing (after thinning/defoliation)</td>
116
+ </tr>
117
+ <tr>
118
+ <td>123 - Without Trees (shrubs and grasses, no reforestation visible)</td>
119
+ </tr>
120
+ <tr>
121
+ <td rowspan="14">200 - Disturbed</td>
122
+ <td rowspan="3">210 - Planned</td>
123
+ <td>211 - Clear Cut</td>
124
+ </tr>
125
+ <tr>
126
+ <td>212 - Thinning</td>
127
+ </tr>
128
+ <tr>
129
+ <td>213 - Forestry Mulching (Non Forest Vegetation Removal)</td>
130
+ </tr>
131
+ <tr>
132
+ <td rowspan="2">220 - Salvage</td>
133
+ <td>221 - After Biotic Disturbances</td>
134
+ </tr>
135
+ <tr>
136
+ <td>222 - After Abiotic Disturbances</td>
137
+ </tr>
138
+ <tr>
139
+ <td rowspan="2">230 - Biotic</td>
140
+ <td>231 - Bark Beetle</td>
141
+ </tr>
142
+ <tr>
143
+ <td>232 - Gypsy Moth (temporal segment of visible disturbance)</td>
144
+ </tr>
145
+ <tr>
146
+ <td rowspan="5">240 - Abiotic</td>
147
+ <td>241 - Drought</td>
148
+ </tr>
149
+ <tr>
150
+ <td>242 - Wildfire</td>
151
+ </tr>
152
+ <tr>
153
+ <td>243 - Wind</td>
154
+ </tr>
155
+ <tr>
156
+ <td>244 - Avalanche</td>
157
+ </tr>
158
+ <tr>
159
+ <td>245 - Flood</td>
160
+ </tr>
161
+ </tbody>
162
+ </table>
163
+
164
+ This mapping from label numbers to text is also available in `classes.json`.
165
+
166
+ ### pixel_data.parquet
167
+
168
+ This dataset provides the Sentinel-2 time-series of spectral values from which the labels were interpreted. The following columns are available:
169
+
170
+ | Column name | Datatype | Description |
171
+ | --- | --- | --- |
172
+ | sample_id | UINT16 | Taken from sample table |
173
+ | timestamp | DATE | UTC date of the S2 acquisition |
174
+ | label | UINT16 | Interpreted class of the segment, see previous table |
175
+ | clear | BOOL | True if the pixel is clear (SCL value any of 2,4,5,6) |
176
+ | percent_clear_4x4 [8x8, 16x16, 32x32] | UINT8 | The percentage of clear pixels (SCL in 2,4,5,6) within a 4x4, 8x8, 16x16 or 32x32 pixel image chip |
177
+ | B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12 | UINT16 | DN value for the spectral band |
178
+ | SCL | UINT8 | Sentinel 2 Scene Classification Value |
179
+
180
+ ### Sentinel-2 Chips
181
+
182
+ The files `disfor-<start-id>-<end-id>.tar.zst` provide tarballs with Sentinel-2 chips for each sample. The chips are of size 32x32px,
183
+ the sampled point is always at `[16,16]`. The available bands are: `B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12`.
184
+ Sentinel-2 bands with a native resolution of 20m (B11, B12) were resampled to 10m using nearest neighbor resampling.
185
+
186
+ The file structure in each tarball is:
187
+
188
+ `tiffs/<sample_id>/YYYY-MM-DD.tif`
189
+
190
+ ## Train Test Split
191
+
192
+ There is a train test split available which was constructed to reduce spatial autocorrelation and information leakage between the sets.
193
+ Two JSONs with lists of sample_ids are available in
194
+
195
+ - `train_ids.json`
196
+ - `val_ids.json`
disfor-0.2.0/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # DISFOR <img align="right" width="150" height="150" src="docs/assets/logo.svg">
2
+
3
+ **DISFOR** offers dense labelled satellite time-series data on forest disturbance timing and agents of disturbance. It contains 3823 unique time-series.
4
+ Each time-series corresponds to a single 10x10m Sentinel-2 pixel.
5
+
6
+ ![Example image chips](docs/assets/examples.png)
7
+
8
+ ## Installation
9
+
10
+ The package can be installed from PyPI:
11
+
12
+ `pip install disfor`
13
+
14
+ ## Usage
15
+
16
+ The data itself is available on Hugging Face: https://huggingface.co/datasets/JR-DIGITAL/DISFOR.
17
+ There are detailed usage guides available in the documentation at: https://jr-digital.github.io/DISFOR/usage/dataset-overview/
18
+
19
+ ## Dataset Overview
20
+
21
+ There are four main parts of this dataset:
22
+
23
+ - `samples.parquet`: Providing location and metadata of sampled points
24
+ - `labels.parquet`: Providing labels for each sampled time-series
25
+ - `pixel_data.parquet`: Providing Sentinel-2 band data for each acquisition in the time-series
26
+ - Sentinel-2 Chips: Image chip time-series for each sample
27
+
28
+ ### samples.parquet
29
+
30
+ This contains the sampled points along with metadata on the points. It provides the following columns:
31
+
32
+ | Column name | Description |
33
+ | --- | --- |
34
+ | sample_id | Unique sample ID for each sample point |
35
+ | original_sample_id | Sample ID of the point in the original publication of the dataset |
36
+ | interpreter | Shorthand code for the interpreter who labelled this sample |
37
+ | dataset | Number of the original sampling campaign in which this point was labelled |
38
+ | source | The ancillary data source used to interpret the agent |
39
+ | source_description | A long text description of the used source. Link to the original data if available |
40
+ | s2_tile | If available, which Sentinel 2 Tile the sample intersects |
41
+ | cluster_id | Unique ID to group samples which are spatio-temporally autocorrelated |
42
+ | cluster_description | What type of cluster it is |
43
+ | comment | Free text comment about the interpretation of the sampled point |
44
+ | confidence | Confidence of sampling: high where both timing and agent are confident, medium where only the timing is confident |
45
+ | geometry | Coordinates of the sampled point. In CRS EPSG:4326 |
46
+
47
+ ### labels.parquet
48
+
49
+ This contains the time-series labels for each sampled point in `samples.parquet`. The following columns are available:
50
+
51
+ | Column name | Description |
52
+ | --- | --- |
53
+ | sample_id | Taken from sample table |
54
+ | original_sample_id | Taken from sample table |
55
+ | dataset | Taken from sample table |
56
+ | label | Interpreted class of the segment (see next table) |
57
+ | original_label | The label which was originally assigned and remapped to label |
58
+ | start | Start date of the segment |
59
+ | end | End date of the segment |
60
+ | start_next_label | Start date of the next label. Some labels are encoded as events (Clear Cuts for example) and are not immediately followed by another label; this column allows a full segmentation of the time-series. Null if it is the last label of the sample |
61
+
62
+ The provided label is a hierarchical label, following this hierarchy:
63
+
64
+ <table border="1" cellspacing="0" cellpadding="6">
65
+ <thead>
66
+ <tr>
67
+ <th>Level 1</th>
68
+ <th>Level 2</th>
69
+ <th>Level 3</th>
70
+ </tr>
71
+ </thead>
72
+ <tbody>
73
+ <!-- 100 - Healthy Vegetation -->
74
+ <tr>
75
+ <td rowspan="4">100 - Healthy Vegetation</td>
76
+ <td>110 - Undisturbed Forest</td>
77
+ <td></td>
78
+ </tr>
79
+ <tr>
80
+ <td rowspan="3">120 - Revegetation</td>
81
+ <td>121 - With Trees (after clear cut)</td>
82
+ </tr>
83
+ <tr>
84
+ <td>122 - Canopy closing (after thinning/defoliation)</td>
85
+ </tr>
86
+ <tr>
87
+ <td>123 - Without Trees (shrubs and grasses, no reforestation visible)</td>
88
+ </tr>
89
+ <tr>
90
+ <td rowspan="14">200 - Disturbed</td>
91
+ <td rowspan="3">210 - Planned</td>
92
+ <td>211 - Clear Cut</td>
93
+ </tr>
94
+ <tr>
95
+ <td>212 - Thinning</td>
96
+ </tr>
97
+ <tr>
98
+ <td>213 - Forestry Mulching (Non Forest Vegetation Removal)</td>
99
+ </tr>
100
+ <tr>
101
+ <td rowspan="2">220 - Salvage</td>
102
+ <td>221 - After Biotic Disturbances</td>
103
+ </tr>
104
+ <tr>
105
+ <td>222 - After Abiotic Disturbances</td>
106
+ </tr>
107
+ <tr>
108
+ <td rowspan="2">230 - Biotic</td>
109
+ <td>231 - Bark Beetle</td>
110
+ </tr>
111
+ <tr>
112
+ <td>232 - Gypsy Moth (temporal segment of visible disturbance)</td>
113
+ </tr>
114
+ <tr>
115
+ <td rowspan="5">240 - Abiotic</td>
116
+ <td>241 - Drought</td>
117
+ </tr>
118
+ <tr>
119
+ <td>242 - Wildfire</td>
120
+ </tr>
121
+ <tr>
122
+ <td>243 - Wind</td>
123
+ </tr>
124
+ <tr>
125
+ <td>244 - Avalanche</td>
126
+ </tr>
127
+ <tr>
128
+ <td>245 - Flood</td>
129
+ </tr>
130
+ </tbody>
131
+ </table>
132
+
133
+ This mapping from label numbers to text is also available in `classes.json`.
134
+
135
+ ### pixel_data.parquet
136
+
137
+ This dataset provides the Sentinel-2 time-series of spectral values from which the labels were interpreted. The following columns are available:
138
+
139
+ | Column name | Datatype | Description |
140
+ | --- | --- | --- |
141
+ | sample_id | UINT16 | Taken from sample table |
142
+ | timestamp | DATE | UTC date of the S2 acquisition |
143
+ | label | UINT16 | Interpreted class of the segment, see previous table |
144
+ | clear | BOOL | True if the pixel is clear (SCL value any of 2,4,5,6) |
145
+ | percent_clear_4x4 [8x8, 16x16, 32x32] | UINT8 | The percentage of clear pixels (SCL in 2,4,5,6) within a 4x4, 8x8, 16x16 or 32x32 pixel image chip |
146
+ | B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12 | UINT16 | DN value for the spectral band |
147
+ | SCL | UINT8 | Sentinel 2 Scene Classification Value |
148
+
149
+ ### Sentinel-2 Chips
150
+
151
+ The files `disfor-<start-id>-<end-id>.tar.zst` provide tarballs with Sentinel-2 chips for each sample. The chips are of size 32x32px,
152
+ the sampled point is always at `[16,16]`. The available bands are: `B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12`.
153
+ Sentinel-2 bands with a native resolution of 20m (B11, B12) were resampled to 10m using nearest neighbor resampling.
154
+
155
+ The file structure in each tarball is:
156
+
157
+ `tiffs/<sample_id>/YYYY-MM-DD.tif`
158
+
159
+ ## Train Test Split
160
+
161
+ There is a train test split available which was constructed to reduce spatial autocorrelation and information leakage between the sets.
162
+ Two JSONs with lists of sample_ids are available in
163
+
164
+ - `train_ids.json`
165
+ - `val_ids.json`
@@ -0,0 +1,73 @@
1
+ [project]
2
+ name = "disfor"
3
+ version = "0.2.0"
4
+ description = "Utilities to load and filter DISFOR dataset. The dataset provides forest disturbance agent labels for use with Sentinel-2."
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Jonas Viehweger", email = "jonas.viehweger@joanneum.at" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ license = {text = "EUPL-1.2"}
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "Programming Language :: Python :: 3.12",
14
+ "Programming Language :: Python :: 3.13",
15
+ "Programming Language :: Python :: 3.14",
16
+ "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)"
17
+ ]
18
+ dependencies = [
19
+ "imagecodecs>=2025.8.2",
20
+ "matplotlib>=3.10.6",
21
+ "numpy>=2.3.3",
22
+ "polars>=1.33.1",
23
+ "pooch>=1.8.2",
24
+ "pyarrow>=21.0.0",
25
+ "scikit-learn>=1.7.2",
26
+ "tifffile>=2025.9.30",
27
+ "zstandard>=0.25.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ torch = [
32
+ "torch>=2.8.0",
33
+ "lightning>=2.5.5",
34
+ ]
35
+
36
+ [dependency-groups]
37
+ dev = [
38
+ "hypothesis>=6.140.3",
39
+ "pre-commit>=4.3.0",
40
+ "pytest>=8.4.2",
41
+ "ipykernel>=6.30.1",
42
+ "rasterio>=1.4.3",
43
+ "pystac-client>=0.9.0",
44
+ "sentinelhub>=3.11.3",
45
+ ]
46
+ docs = [
47
+ "folium>=0.20.0",
48
+ "geopandas>=1.1.1",
49
+ "mapclassify>=2.10.0",
50
+ "matplotlib>=3.10.6",
51
+ "mkdocs-autorefs>=1.4.3",
52
+ "mkdocs-jupyter>=0.25.1",
53
+ "mkdocs-material>=9.6.23",
54
+ "mkdocstrings[python]>=0.30.1",
55
+ "plotly>=6.4.0",
56
+ "plotnine>=0.15.0",
57
+ ]
58
+
59
+ [project.urls]
60
+ Homepage = "https://github.com/JR-DIGITAL/DISFOR"
61
+ Documentation = "https://jr-digital.github.io/DISFOR/"
62
+ Repository = "https://github.com/JR-DIGITAL/DISFOR"
63
+ Issues = "https://github.com/JR-DIGITAL/DISFOR/issues"
64
+
65
+ [tool.uv]
66
+ default-groups = ["dev", "docs"]
67
+
68
+ [build-system]
69
+ requires = ["uv_build>=0.8.22,<0.9.0"]
70
+ build-backend = "uv_build"
71
+
72
+ [tool.pytest.ini_options]
73
+ testpaths = [ "tests" ]
@@ -0,0 +1,6 @@
1
+ import importlib.metadata
2
+ from .io import get
3
+
4
# Expose the installed distribution's version as ``disfor.__version__``.
try:
    __version__ = importlib.metadata.version("disfor")
except importlib.metadata.PackageNotFoundError:
    # No distribution metadata is available (for example when the package is
    # imported straight from a source tree that was never installed). Fall
    # back to a PEP 440 placeholder instead of making the import itself fail.
    __version__ = "0.0.0+unknown"

# Public API of the top-level package.
__all__ = ["get"]
@@ -0,0 +1,25 @@
1
# Display names for the numeric DISFOR class codes.
#
# The codes are hierarchical: the hundreds digit selects the level-1 class,
# the tens digit the level-2 class and the ones digit the level-3 class
# (e.g. 211 "Clear Cut" sits under 210 "Planned", which sits under
# 200 "Disturbed").

# Level-1 branch 100: vegetation that is healthy or recovering.
_VEGETATION_CLASSES = {
    100: "Healthy Vegetation",
    110: "Undisturbed Forest",
    120: "Revegetation",
    121: "With Trees (after clear cut)",
    122: "Canopy closing (after thinning/defoliation)",
    123: "Without Trees (shrubs and grasses, no reforestation visible)",
}

# Level-1 branch 200: disturbances, grouped by agent category.
_DISTURBANCE_CLASSES = {
    200: "Disturbed",
    # 210 - planned interventions
    210: "Planned",
    211: "Clear Cut",
    212: "Thinning",
    213: "Forestry Mulching (Non Forest Vegetation Removal)",
    # 220 - salvage operations
    220: "Salvage",
    221: "After Biotic Disturbance",
    222: "After Abiotic Disturbance",
    # 230 - biotic agents
    230: "Biotic",
    231: "Bark Beetle (with decline)",
    232: "Gypsy Moth (temporary)",
    # 240 - abiotic agents
    240: "Abiotic",
    241: "Drought",
    242: "Wildfire",
    243: "Wind",
    244: "Avalanche",
    245: "Flood",
}

# Merged in ascending code order so iteration order matches the hierarchy.
CLASSES = {**_VEGETATION_CLASSES, **_DISTURBANCE_CLASSES}
@@ -0,0 +1,33 @@
1
+ import importlib.util
2
+ from typing import TYPE_CHECKING
3
+ from .generic import GenericDataset
4
+ from .tabular import TabularDataset
5
+
6
# Optional-dependency gate: the monotemporal datasets are only imported when
# the ``lightning`` package can be located in the current environment.
_HAS_LIGHTNING = importlib.util.find_spec("lightning") is not None

if _HAS_LIGHTNING:
    from .monotemporal import MonoTemporalClassification
    from .monotemporal import MonoTemporalClassificationDataModule
elif TYPE_CHECKING:
    # TYPE_CHECKING is False at runtime, so this branch never executes; it
    # only lets static type checkers see the real classes even when
    # ``lightning`` is not installed.
    from .monotemporal import MonoTemporalClassification
    from .monotemporal import MonoTemporalClassificationDataModule
else:

    class MonoTemporalClassification:
        """Placeholder used when ``lightning`` is unavailable.

        The name stays importable, but creating an instance raises
        ``ImportError`` pointing at the ``disfor[torch]`` extra.
        """

        def __init__(self, *args, **kwargs):
            raise ImportError("Install 'disfor[torch]' to use pytorch datasets.")

    class MonoTemporalClassificationDataModule(MonoTemporalClassification):
        """Placeholder data module; inherits the failing constructor."""


# Names re-exported by the ``datasets`` subpackage.
__all__ = [
    "GenericDataset",
    "TabularDataset",
    "MonoTemporalClassification",
    "MonoTemporalClassificationDataModule",
]