PyPI - disfor - Versions diffs - 0.2.0__tar.gz - Mend

disfor 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

disfor-0.2.0/PKG-INFO +196 -0
disfor-0.2.0/README.md +165 -0
disfor-0.2.0/pyproject.toml +73 -0
disfor-0.2.0/src/disfor/__init__.py +6 -0
disfor-0.2.0/src/disfor/const.py +25 -0
disfor-0.2.0/src/disfor/datasets/__init__.py +33 -0
disfor-0.2.0/src/disfor/datasets/generic.py +539 -0
disfor-0.2.0/src/disfor/datasets/monotemporal.py +225 -0
disfor-0.2.0/src/disfor/datasets/tabular.py +28 -0
disfor-0.2.0/src/disfor/io.py +138 -0
disfor-0.2.0/src/disfor/utils.py +226 -0

disfor-0.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,196 @@
+Metadata-Version: 2.3
+Name: disfor
+Version: 0.2.0
+Summary: Utilities to load and filter DISFOR dataset. The dataset provides forest disturbance agent labels for use with Sentinel-2.
+Author: Jonas Viehweger
+Author-email: Jonas Viehweger <jonas.viehweger@joanneum.at>
+License: EUPL-1.2
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)
+Requires-Dist: imagecodecs>=2025.8.2
+Requires-Dist: matplotlib>=3.10.6
+Requires-Dist: numpy>=2.3.3
+Requires-Dist: polars>=1.33.1
+Requires-Dist: pooch>=1.8.2
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: scikit-learn>=1.7.2
+Requires-Dist: tifffile>=2025.9.30
+Requires-Dist: zstandard>=0.25.0
+Requires-Dist: torch>=2.8.0 ; extra == 'torch'
+Requires-Dist: lightning>=2.5.5 ; extra == 'torch'
+Requires-Python: >=3.12
+Project-URL: Documentation, https://jr-digital.github.io/DISFOR/
+Project-URL: Homepage, https://github.com/JR-DIGITAL/DISFOR
+Project-URL: Issues, https://github.com/JR-DIGITAL/DISFOR/issues
+Project-URL: Repository, https://github.com/JR-DIGITAL/DISFOR
+Provides-Extra: torch
+Description-Content-Type: text/markdown
+# DISFOR <img align="right" width="150" height="150" src="docs/assets/logo.svg">
+**DISFOR** offers dense labelled satellite time-series data on forest disturbance timing and agents of disturbance. It contains 3823 unique time-series.
+Each time-series corresponds to a single 10x10m Sentinel-2 pixel.
+![Example image chips](docs/assets/examples.png)
+## Installation
+The package can be installed from PyPI:
+`pip install disfor`
+## Usage
+The data itself is available on Hugging Face: https://huggingface.co/datasets/JR-DIGITAL/DISFOR.
+There are detailed usage guides available in the documentation at: https://jr-digital.github.io/DISFOR/usage/dataset-overview/
+## Dataset Overview
+There are four main parts of this dataset:
+- `samples.parquet`: Providing location and metadata of sampled points
+- `labels.parquet`: Providing labels for each sampled time-series
+- `pixel_data.parquet`: Providing Sentinel-2 band data for each acquisition in the time-series
+- Sentinel-2 Chips: Image chip time-series for each sample
+### samples.parquet
+This contains the sampled points along with metadata on the points. It provides the following columns:
+| Column name | Description |
+| --- | --- |
+| sample_id | Unique sample ID for each sample point |
+| original_sample_id | Sample ID of the point in the original publication of the dataset |
+| interpreter | Shorthand code for the interpreter who labelled this sample |
+| dataset | Number of the original sampling campaign in which this point was labelled |
+| source | The ancillary data source used to interpret the agent |
+| source_description | A long text description of the used source. Link to the original data if available |
+| s2_tile | If available, which Sentinel 2 Tile the sample intersects |
+| cluster_id | Unique ID to group samples which are spatio-temporally autocorrelated |
+| cluster_description | What type of cluster it is |
+| comment | Free text comment about the interpretation of the sampled point |
+| confidence | Confidence of sampling: high where both timing and agent are confident, medium where only the timing is confident |
+| geometry | Coordinates of the sampled point. In CRS EPSG:4326 |
+### labels.parquet
+This contains the time-series labels for each sampled point in `samples.parquet`. The following columns are available:
+| Column name | Description |
+| --- | --- |
+| sample_id | Taken from sample table |
+| original_sample_id | Taken from sample table |
+| dataset | Taken from sample table |
+| label | Interpreted class of the segment (see next table) |
+| original_label | The label which was originally assigned and remapped to label |
+| start | Start date of the segment |
+| end | End date of the segment |
+| start_next_label | Start date of the next label. Some labels are encoded as events (Clear Cuts for example) and are not immediately followed by another label; this column allows a full segmentation of the time-series. Null if it is the last label of the sample |
+The provided label is a hierarchical label, following this hierarchy:
+<table border="1" cellspacing="0" cellpadding="6">
+  <thead>
+    <tr>
+      <th>Level 1</th>
+      <th>Level 2</th>
+      <th>Level 3</th>
+    </tr>
+  </thead>
+  <tbody>
+    <!-- 100 - Healthy Vegetation -->
+    <tr>
+      <td rowspan="4">100 - Healthy Vegetation</td>
+      <td>110 - Undisturbed Forest</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td rowspan="3">120 - Revegetation</td>
+      <td>121 - With Trees (after clear cut)</td>
+    </tr>
+    <tr>
+      <td>122 - Canopy closing (after thinning/defoliation)</td>
+    </tr>
+    <tr>
+      <td>123 - Without Trees (shrubs and grasses, no reforestation visible)</td>
+    </tr>
+    <tr>
+      <td rowspan="14">200 - Disturbed</td>
+      <td rowspan="3">210 - Planned</td>
+      <td>211 - Clear Cut</td>
+    </tr>
+    <tr>
+      <td>212 - Thinning</td>
+    </tr>
+    <tr>
+      <td>213 - Forestry Mulching (Non Forest Vegetation Removal)</td>
+    </tr>
+    <tr>
+      <td rowspan="2">220 - Salvage</td>
+      <td>221 - After Biotic Disturbances</td>
+    </tr>
+    <tr>
+      <td>222 - After Abiotic Disturbances</td>
+    </tr>
+    <tr>
+      <td rowspan="2">230 - Biotic</td>
+      <td>231 - Bark Beetle</td>
+    </tr>
+    <tr>
+      <td>232 - Gypsy Moth (temporal segment of visible disturbance)</td>
+    </tr>
+    <tr>
+      <td rowspan="5">240 - Abiotic</td>
+      <td>241 - Drought</td>
+    </tr>
+    <tr>
+      <td>242 - Wildfire</td>
+    </tr>
+    <tr>
+      <td>243 - Wind</td>
+    </tr>
+    <tr>
+      <td>244 - Avalanche</td>
+    </tr>
+    <tr>
+      <td>245 - Flood</td>
+    </tr>
+  </tbody>
+</table>
+This mapping from label numbers to text is also available in `classes.json`.
+### pixel_data.parquet
+This dataset provides the Sentinel-2 time-series of spectral values from which the labels were interpreted. The following columns are available:
+| Column name | Datatype | Description |
+| --- | --- | --- |
+| sample_id | UINT16 | Taken from sample table |
+| timestamp | DATE | UTC date of the S2 acquisition |
+| label | UINT16 | Interpreted class of the segment, see previous table |
+| clear | BOOL | True if the pixel is clear (SCL value any of 2,4,5,6) |
+| percent_clear_4x4 [8x8, 16x16, 32x32] | UINT8 | The percentage of clear pixels (SCL in 2,4,5,6) within a 4x4, 8x8, 16x16 or 32x32 pixel image chip |
+| B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12 | UINT16 | DN value for the spectral band |
+| SCL | UINT8 | Sentinel 2 Scene Classification Value |
+### Sentinel-2 Chips
+The files `disfor-<start-id>-<end-id>.tar.zst` provide tarballs with Sentinel-2 chips for each sample. The chips are of size 32x32px,
+the sampled point is always at `[16,16]`. The available bands are: `B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12`.
+Sentinel-2 bands with a native resolution of 20m (B11, B12) were resampled to 10m using nearest neighbor resampling.
+The file structure in each tarball is:
+`tiffs/<sample_id>/YYYY-MM-DD.tif`
+## Train Test Split
+There is a train test split available which was constructed to reduce spatial autocorrelation and information leakage between the sets.
+Two JSONs with lists of sample_ids are available in
+- `train_ids.json`
+- `val_ids.json`

disfor-0.2.0/README.md ADDED Viewed

@@ -0,0 +1,165 @@
+# DISFOR <img align="right" width="150" height="150" src="docs/assets/logo.svg">
+**DISFOR** offers dense labelled satellite time-series data on forest disturbance timing and agents of disturbance. It contains 3823 unique time-series.
+Each time-series corresponds to a single 10x10m Sentinel-2 pixel.
+![Example image chips](docs/assets/examples.png)
+## Installation
+The package can be installed from PyPI:
+`pip install disfor`
+## Usage
+The data itself is available on Hugging Face: https://huggingface.co/datasets/JR-DIGITAL/DISFOR.
+There are detailed usage guides available in the documentation at: https://jr-digital.github.io/DISFOR/usage/dataset-overview/
+## Dataset Overview
+There are four main parts of this dataset:
+- `samples.parquet`: Providing location and metadata of sampled points
+- `labels.parquet`: Providing labels for each sampled time-series
+- `pixel_data.parquet`: Providing Sentinel-2 band data for each acquisition in the time-series
+- Sentinel-2 Chips: Image chip time-series for each sample
+### samples.parquet
+This contains the sampled points along with metadata on the points. It provides the following columns:
+| Column name | Description |
+| --- | --- |
+| sample_id | Unique sample ID for each sample point |
+| original_sample_id | Sample ID of the point in the original publication of the dataset |
+| interpreter | Shorthand code for the interpreter who labelled this sample |
+| dataset | Number of the original sampling campaign in which this point was labelled |
+| source | The ancillary data source used to interpret the agent |
+| source_description | A long text description of the used source. Link to the original data if available |
+| s2_tile | If available, which Sentinel 2 Tile the sample intersects |
+| cluster_id | Unique ID to group samples which are spatio-temporally autocorrelated |
+| cluster_description | What type of cluster it is |
+| comment | Free text comment about the interpretation of the sampled point |
+| confidence | Confidence of sampling: high where both timing and agent are confident, medium where only the timing is confident |
+| geometry | Coordinates of the sampled point. In CRS EPSG:4326 |
+### labels.parquet
+This contains the time-series labels for each sampled point in `samples.parquet`. The following columns are available:
+| Column name | Description |
+| --- | --- |
+| sample_id | Taken from sample table |
+| original_sample_id | Taken from sample table |
+| dataset | Taken from sample table |
+| label | Interpreted class of the segment (see next table) |
+| original_label | The label which was originally assigned and remapped to label |
+| start | Start date of the segment |
+| end | End date of the segment |
+| start_next_label | Start date of the next label. Some labels are encoded as events (Clear Cuts for example) and are not immediately followed by another label; this column allows a full segmentation of the time-series. Null if it is the last label of the sample |
+The provided label is a hierarchical label, following this hierarchy:
+<table border="1" cellspacing="0" cellpadding="6">
+  <thead>
+    <tr>
+      <th>Level 1</th>
+      <th>Level 2</th>
+      <th>Level 3</th>
+    </tr>
+  </thead>
+  <tbody>
+    <!-- 100 - Healthy Vegetation -->
+    <tr>
+      <td rowspan="4">100 - Healthy Vegetation</td>
+      <td>110 - Undisturbed Forest</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td rowspan="3">120 - Revegetation</td>
+      <td>121 - With Trees (after clear cut)</td>
+    </tr>
+    <tr>
+      <td>122 - Canopy closing (after thinning/defoliation)</td>
+    </tr>
+    <tr>
+      <td>123 - Without Trees (shrubs and grasses, no reforestation visible)</td>
+    </tr>
+    <tr>
+      <td rowspan="14">200 - Disturbed</td>
+      <td rowspan="3">210 - Planned</td>
+      <td>211 - Clear Cut</td>
+    </tr>
+    <tr>
+      <td>212 - Thinning</td>
+    </tr>
+    <tr>
+      <td>213 - Forestry Mulching (Non Forest Vegetation Removal)</td>
+    </tr>
+    <tr>
+      <td rowspan="2">220 - Salvage</td>
+      <td>221 - After Biotic Disturbances</td>
+    </tr>
+    <tr>
+      <td>222 - After Abiotic Disturbances</td>
+    </tr>
+    <tr>
+      <td rowspan="2">230 - Biotic</td>
+      <td>231 - Bark Beetle</td>
+    </tr>
+    <tr>
+      <td>232 - Gypsy Moth (temporal segment of visible disturbance)</td>
+    </tr>
+    <tr>
+      <td rowspan="5">240 - Abiotic</td>
+      <td>241 - Drought</td>
+    </tr>
+    <tr>
+      <td>242 - Wildfire</td>
+    </tr>
+    <tr>
+      <td>243 - Wind</td>
+    </tr>
+    <tr>
+      <td>244 - Avalanche</td>
+    </tr>
+    <tr>
+      <td>245 - Flood</td>
+    </tr>
+  </tbody>
+</table>
+This mapping from label numbers to text is also available in `classes.json`.
+### pixel_data.parquet
+This dataset provides the Sentinel-2 time-series of spectral values from which the labels were interpreted. The following columns are available:
+| Column name | Datatype | Description |
+| --- | --- | --- |
+| sample_id | UINT16 | Taken from sample table |
+| timestamp | DATE | UTC date of the S2 acquisition |
+| label | UINT16 | Interpreted class of the segment, see previous table |
+| clear | BOOL | True if the pixel is clear (SCL value any of 2,4,5,6) |
+| percent_clear_4x4 [8x8, 16x16, 32x32] | UINT8 | The percentage of clear pixels (SCL in 2,4,5,6) within a 4x4, 8x8, 16x16 or 32x32 pixel image chip |
+| B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12 | UINT16 | DN value for the spectral band |
+| SCL | UINT8 | Sentinel 2 Scene Classification Value |
+### Sentinel-2 Chips
+The files `disfor-<start-id>-<end-id>.tar.zst` provide tarballs with Sentinel-2 chips for each sample. The chips are of size 32x32px,
+the sampled point is always at `[16,16]`. The available bands are: `B02, B03, B04, B05, B06, B07, B08, B8A, B11, B12`.
+Sentinel-2 bands with a native resolution of 20m (B11, B12) were resampled to 10m using nearest neighbor resampling.
+The file structure in each tarball is:
+`tiffs/<sample_id>/YYYY-MM-DD.tif`
+## Train Test Split
+There is a train test split available which was constructed to reduce spatial autocorrelation and information leakage between the sets.
+Two JSONs with lists of sample_ids are available in
+- `train_ids.json`
+- `val_ids.json`

disfor-0.2.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,73 @@
+[project]
+name = "disfor"
+version = "0.2.0"
+description = "Utilities to load and filter DISFOR dataset. The dataset provides forest disturbance agent labels for use with Sentinel-2."
+readme = "README.md"
+authors = [
+    { name = "Jonas Viehweger", email = "jonas.viehweger@joanneum.at" }
+]
+requires-python = ">=3.12"
+license = {text = "EUPL-1.2"}
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)"
+]
+dependencies = [
+    "imagecodecs>=2025.8.2",
+    "matplotlib>=3.10.6",
+    "numpy>=2.3.3",
+    "polars>=1.33.1",
+    "pooch>=1.8.2",
+    "pyarrow>=21.0.0",
+    "scikit-learn>=1.7.2",
+    "tifffile>=2025.9.30",
+    "zstandard>=0.25.0",
+]
+[project.optional-dependencies]
+torch = [
+    "torch>=2.8.0",
+    "lightning>=2.5.5",
+]
+[dependency-groups]
+dev = [
+    "hypothesis>=6.140.3",
+    "pre-commit>=4.3.0",
+    "pytest>=8.4.2",
+    "ipykernel>=6.30.1",
+    "rasterio>=1.4.3",
+    "pystac-client>=0.9.0",
+    "sentinelhub>=3.11.3",
+]
+docs = [
+    "folium>=0.20.0",
+    "geopandas>=1.1.1",
+    "mapclassify>=2.10.0",
+    "matplotlib>=3.10.6",
+    "mkdocs-autorefs>=1.4.3",
+    "mkdocs-jupyter>=0.25.1",
+    "mkdocs-material>=9.6.23",
+    "mkdocstrings[python]>=0.30.1",
+    "plotly>=6.4.0",
+    "plotnine>=0.15.0",
+]
+[project.urls]
+Homepage = "https://github.com/JR-DIGITAL/DISFOR"
+Documentation = "https://jr-digital.github.io/DISFOR/"
+Repository = "https://github.com/JR-DIGITAL/DISFOR"
+Issues = "https://github.com/JR-DIGITAL/DISFOR/issues"
+[tool.uv]
+default-groups = ["dev", "docs"]
+[build-system]
+requires = ["uv_build>=0.8.22,<0.9.0"]
+build-backend = "uv_build"
+[tool.pytest.ini_options]
+testpaths = [ "tests" ]

disfor-0.2.0/src/disfor/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+import importlib.metadata
+from .io import get
+__version__ = importlib.metadata.version("disfor")
+__all__ = ["get"]

disfor-0.2.0/src/disfor/const.py ADDED Viewed

@@ -0,0 +1,25 @@
+CLASSES = {
+    100: "Healthy Vegetation",
+    110: "Undisturbed Forest",
+    120: "Revegetation",
+    121: "With Trees (after clear cut)",
+    122: "Canopy closing (after thinning/defoliation)",
+    123: "Without Trees (shrubs and grasses, no reforestation visible)",
+    200: "Disturbed",
+    210: "Planned",
+    211: "Clear Cut",
+    212: "Thinning",
+    213: "Forestry Mulching (Non Forest Vegetation Removal)",
+    220: "Salvage",
+    221: "After Biotic Disturbance",
+    222: "After Abiotic Disturbance",
+    230: "Biotic",
+    231: "Bark Beetle (with decline)",
+    232: "Gypsy Moth (temporary)",
+    240: "Abiotic",
+    241: "Drought",
+    242: "Wildfire",
+    243: "Wind",
+    244: "Avalanche",
+    245: "Flood",
+}

disfor-0.2.0/src/disfor/datasets/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+import importlib.util
+from typing import TYPE_CHECKING
+from .generic import GenericDataset
+from .tabular import TabularDataset
+_HAS_LIGHTNING = importlib.util.find_spec("lightning") is not None
+if _HAS_LIGHTNING:
+    from .monotemporal import (
+        MonoTemporalClassification,
+        MonoTemporalClassificationDataModule,
+    )
+elif TYPE_CHECKING:
+    from .monotemporal import (
+        MonoTemporalClassification,
+        MonoTemporalClassificationDataModule,
+    )
+else:
+    class MonoTemporalClassification:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("Install 'disfor[torch]' to use pytorch datasets.")
+    class MonoTemporalClassificationDataModule(MonoTemporalClassification):
+        pass
+__all__ = [
+    "GenericDataset",
+    "TabularDataset",
+    "MonoTemporalClassification",
+    "MonoTemporalClassificationDataModule",
+]