piscal-processor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- piscal_processor-0.1.0/PKG-INFO +98 -0
- piscal_processor-0.1.0/README.md +83 -0
- piscal_processor-0.1.0/pyproject.toml +32 -0
- piscal_processor-0.1.0/setup.cfg +4 -0
- piscal_processor-0.1.0/src/piscal_processor/__init__.py +51 -0
- piscal_processor-0.1.0/src/piscal_processor/cli.py +156 -0
- piscal_processor-0.1.0/src/piscal_processor/converter.py +226 -0
- piscal_processor-0.1.0/src/piscal_processor/export.py +70 -0
- piscal_processor-0.1.0/src/piscal_processor/parser.py +140 -0
- piscal_processor-0.1.0/src/piscal_processor/schema.py +228 -0
- piscal_processor-0.1.0/src/piscal_processor/storage.py +231 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/PKG-INFO +98 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/SOURCES.txt +20 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/dependency_links.txt +1 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/entry_points.txt +3 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/requires.txt +9 -0
- piscal_processor-0.1.0/src/piscal_processor.egg-info/top_level.txt +1 -0
- piscal_processor-0.1.0/tests/test_converter.py +64 -0
- piscal_processor-0.1.0/tests/test_export.py +49 -0
- piscal_processor-0.1.0/tests/test_parser.py +89 -0
- piscal_processor-0.1.0/tests/test_schema.py +32 -0
- piscal_processor-0.1.0/tests/test_storage.py +64 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: piscal-processor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PISCAL CSV/Parquet processing: convert, export, and schema utilities for curves and measurements.
|
|
5
|
+
Project-URL: Repository, https://github.com/kolpacksoftware/piscal-processor
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: pandas>=2.0.0
|
|
9
|
+
Requires-Dist: pyarrow>=10.0.0
|
|
10
|
+
Provides-Extra: s3
|
|
11
|
+
Requires-Dist: s3fs>=2023.0.0; extra == "s3"
|
|
12
|
+
Requires-Dist: fsspec>=2023.0.0; extra == "s3"
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
15
|
+
|
|
16
|
+
# piscal-processor
|
|
17
|
+
|
|
18
|
+
PISCAL CSV/Parquet processing: convert curves to Parquet, export to CSV/TSV, and use standard schemas. PISCAL is used at [LeafWeb.org](https://leafweb.org).
|
|
19
|
+
|
|
20
|
+
Install from PyPI, a local clone, or a Git URL (see below).
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
**From PyPI** (when published):
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install piscal-processor
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Optional S3 support** (e.g. for `s3a://` paths):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install piscal-processor[s3]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**From a local clone** (development):
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e /path/to/piscal-processor
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**From a Git URL** (CI or private install): use a personal access token or SSH:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install "piscal-processor @ git+https://github.com/kolpacksoftware/piscal-processor.git@main"
|
|
46
|
+
# or
|
|
47
|
+
pip install "piscal-processor @ git+ssh://git@github.com/kolpacksoftware/piscal-processor.git@main"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
You can pin a branch (`@main`), tag (`@v0.1.0`), or commit (`@abc1234`).
|
|
51
|
+
|
|
52
|
+
## CLI
|
|
53
|
+
|
|
54
|
+
**Convert** PISCAL CSV files to Parquet (metadata + measurements):
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
piscal-processor /path/to/csv_dir --output-dir parquet_output
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Options: `--no-discover-pathway-subdirs`, `--source-pathway`, `--metadata-name`, `--measurements-name`. Input can be a local path or `s3a://` URI.
|
|
61
|
+
|
|
62
|
+
**Export** measurement Parquet to CSV or TSV:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
piscal-processor-export curve_measurements.parquet -o out.tsv --format tsv --columns curve_id,AnetCO2,PARi
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Library
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from piscal_processor import convert_curves, export_curves, get_backend
|
|
72
|
+
|
|
73
|
+
# Convert CSVs to DataFrames (or write Parquet via converter.normalize_and_write_parquet)
|
|
74
|
+
backend = get_backend("/path/to/csv_dir")
|
|
75
|
+
meta_df, meas_df = convert_curves("/path/to/csv_dir", backend, source_pathway="C3")
|
|
76
|
+
|
|
77
|
+
# Export measurements to CSV/TSV
|
|
78
|
+
export_curves(meas_df, "out.tsv", columns=["curve_id", "AnetCO2", "PARi"], format="tsv")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Schema constants and parser helpers are also available:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from piscal_processor import STANDARD_MEASUREMENT_COLUMNS, STANDARD_METADATA_COLUMNS
|
|
85
|
+
from piscal_processor.parser import parse_csv_line, parse_key_value_section
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Tests
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install -e ".[dev]"
|
|
92
|
+
pytest
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Documentation
|
|
96
|
+
|
|
97
|
+
- `docs/csv_structure.md`: CSV file structure (header, site/parameter blocks, measurement table).
|
|
98
|
+
- `docs/inputformat.txt`: PISCAL input file specification (official format description).
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# piscal-processor
|
|
2
|
+
|
|
3
|
+
PISCAL CSV/Parquet processing: convert curves to Parquet, export to CSV/TSV, and use standard schemas. PISCAL is used at [LeafWeb.org](https://leafweb.org).
|
|
4
|
+
|
|
5
|
+
Install from PyPI, a local clone, or a Git URL (see below).
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
**From PyPI** (when published):
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install piscal-processor
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
**Optional S3 support** (e.g. for `s3a://` paths):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install piscal-processor[s3]
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
**From a local clone** (development):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e /path/to/piscal-processor
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**From a Git URL** (CI or private install): use a personal access token or SSH:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install "piscal-processor @ git+https://github.com/kolpacksoftware/piscal-processor.git@main"
|
|
31
|
+
# or
|
|
32
|
+
pip install "piscal-processor @ git+ssh://git@github.com/kolpacksoftware/piscal-processor.git@main"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
You can pin a branch (`@main`), tag (`@v0.1.0`), or commit (`@abc1234`).
|
|
36
|
+
|
|
37
|
+
## CLI
|
|
38
|
+
|
|
39
|
+
**Convert** PISCAL CSV files to Parquet (metadata + measurements):
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
piscal-processor /path/to/csv_dir --output-dir parquet_output
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Options: `--no-discover-pathway-subdirs`, `--source-pathway`, `--metadata-name`, `--measurements-name`. Input can be a local path or `s3a://` URI.
|
|
46
|
+
|
|
47
|
+
**Export** measurement Parquet to CSV or TSV:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
piscal-processor-export curve_measurements.parquet -o out.tsv --format tsv --columns curve_id,AnetCO2,PARi
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Library
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from piscal_processor import convert_curves, export_curves, get_backend
|
|
57
|
+
|
|
58
|
+
# Convert CSVs to DataFrames (or write Parquet via converter.normalize_and_write_parquet)
|
|
59
|
+
backend = get_backend("/path/to/csv_dir")
|
|
60
|
+
meta_df, meas_df = convert_curves("/path/to/csv_dir", backend, source_pathway="C3")
|
|
61
|
+
|
|
62
|
+
# Export measurements to CSV/TSV
|
|
63
|
+
export_curves(meas_df, "out.tsv", columns=["curve_id", "AnetCO2", "PARi"], format="tsv")
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Schema constants and parser helpers are also available:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from piscal_processor import STANDARD_MEASUREMENT_COLUMNS, STANDARD_METADATA_COLUMNS
|
|
70
|
+
from piscal_processor.parser import parse_csv_line, parse_key_value_section
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Tests
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[dev]"
|
|
77
|
+
pytest
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Documentation
|
|
81
|
+
|
|
82
|
+
- `docs/csv_structure.md`: CSV file structure (header, site/parameter blocks, measurement table).
|
|
83
|
+
- `docs/inputformat.txt`: PISCAL input file specification (official format description).
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "piscal-processor"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "PISCAL CSV/Parquet processing: convert, export, and schema utilities for curves and measurements."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas>=2.0.0",
|
|
13
|
+
"pyarrow>=10.0.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
s3 = ["s3fs>=2023.0.0", "fsspec>=2023.0.0"]
|
|
18
|
+
dev = ["pytest>=7.0.0"]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
piscal-processor = "piscal_processor.cli:main"
|
|
22
|
+
piscal-processor-export = "piscal_processor.cli:export_main"
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Repository = "https://github.com/kolpacksoftware/piscal-processor"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
29
|
+
|
|
30
|
+
[tool.pytest.ini_options]
|
|
31
|
+
testpaths = ["tests"]
|
|
32
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
piscal-processor: PISCAL CSV/Parquet processing: convert curves, export to CSV/TSV, and use standard schemas.
|
|
3
|
+
|
|
4
|
+
Use as a library:
|
|
5
|
+
from piscal_processor import convert_curves, export_curves, get_backend
|
|
6
|
+
meta, meas = convert_curves(input_dir, get_backend(input_dir))
|
|
7
|
+
export_curves(meas, "out.tsv", columns=["curve_id", "AnetCO2"], format="tsv")
|
|
8
|
+
|
|
9
|
+
CLI: piscal-processor (convert), piscal-processor-export (export to CSV/TSV).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from piscal_processor.converter import convert_curves, parse_curve_file
|
|
13
|
+
from piscal_processor.export import export_curves, extract_columns
|
|
14
|
+
from piscal_processor.parser import (
|
|
15
|
+
next_nonempty,
|
|
16
|
+
normalize_scalar,
|
|
17
|
+
parse_csv_line,
|
|
18
|
+
parse_key_value_section,
|
|
19
|
+
parse_triplet,
|
|
20
|
+
)
|
|
21
|
+
from piscal_processor.schema import (
|
|
22
|
+
MEASUREMENT_COLUMN_ALIASES,
|
|
23
|
+
MEASUREMENT_STRING_COLUMNS,
|
|
24
|
+
METADATA_COLUMN_ALIASES,
|
|
25
|
+
NUMERIC_METADATA_COLUMNS,
|
|
26
|
+
STANDARD_MEASUREMENT_COLUMNS,
|
|
27
|
+
STANDARD_METADATA_COLUMNS,
|
|
28
|
+
)
|
|
29
|
+
from piscal_processor.storage import FilesystemBackend, S3Backend, StorageBackend, get_backend
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"convert_curves",
|
|
33
|
+
"parse_curve_file",
|
|
34
|
+
"export_curves",
|
|
35
|
+
"extract_columns",
|
|
36
|
+
"get_backend",
|
|
37
|
+
"StorageBackend",
|
|
38
|
+
"FilesystemBackend",
|
|
39
|
+
"S3Backend",
|
|
40
|
+
"next_nonempty",
|
|
41
|
+
"normalize_scalar",
|
|
42
|
+
"parse_csv_line",
|
|
43
|
+
"parse_key_value_section",
|
|
44
|
+
"parse_triplet",
|
|
45
|
+
"STANDARD_MEASUREMENT_COLUMNS",
|
|
46
|
+
"STANDARD_METADATA_COLUMNS",
|
|
47
|
+
"MEASUREMENT_COLUMN_ALIASES",
|
|
48
|
+
"METADATA_COLUMN_ALIASES",
|
|
49
|
+
"MEASUREMENT_STRING_COLUMNS",
|
|
50
|
+
"NUMERIC_METADATA_COLUMNS",
|
|
51
|
+
]
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Command-line interface for piscal-processor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from piscal_processor.converter import (
|
|
9
|
+
_pathway_subdirs_from_csv_paths,
|
|
10
|
+
convert_curves,
|
|
11
|
+
normalize_and_write_parquet,
|
|
12
|
+
)
|
|
13
|
+
from piscal_processor.export import export_curves
|
|
14
|
+
from piscal_processor.storage import get_backend
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _parse_convert_args() -> argparse.Namespace:
|
|
18
|
+
parser = argparse.ArgumentParser(
|
|
19
|
+
description="Convert PISCAL CSV curve files into metadata and measurement Parquet tables."
|
|
20
|
+
)
|
|
21
|
+
parser.add_argument(
|
|
22
|
+
"input_dir",
|
|
23
|
+
type=str,
|
|
24
|
+
help="Directory containing the CSV files (one curve per file). Local path or s3a:// URI.",
|
|
25
|
+
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--no-discover-pathway-subdirs",
|
|
28
|
+
action="store_true",
|
|
29
|
+
help="Do not discover subdirs; treat input_dir as the folder of CSVs.",
|
|
30
|
+
)
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"--source-pathway",
|
|
33
|
+
type=str,
|
|
34
|
+
default=None,
|
|
35
|
+
help="Explicit pathway_subtype label (e.g. C4_NAD-ME). Only with --no-discover-pathway-subdirs.",
|
|
36
|
+
)
|
|
37
|
+
parser.add_argument(
|
|
38
|
+
"--output-dir",
|
|
39
|
+
type=str,
|
|
40
|
+
default="parquet_output",
|
|
41
|
+
help="Destination directory for Parquet output.",
|
|
42
|
+
)
|
|
43
|
+
parser.add_argument(
|
|
44
|
+
"--metadata-name",
|
|
45
|
+
default="curve_metadata.parquet",
|
|
46
|
+
help="Filename for the metadata parquet output.",
|
|
47
|
+
)
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--measurements-name",
|
|
50
|
+
default="curve_measurements.parquet",
|
|
51
|
+
help="Filename for the measurement parquet output.",
|
|
52
|
+
)
|
|
53
|
+
return parser.parse_args()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_export_args() -> argparse.Namespace:
|
|
57
|
+
parser = argparse.ArgumentParser(
|
|
58
|
+
description="Export measurement Parquet to CSV or TSV (optionally selected columns)."
|
|
59
|
+
)
|
|
60
|
+
parser.add_argument(
|
|
61
|
+
"measurements_parquet",
|
|
62
|
+
type=str,
|
|
63
|
+
help="Path to curve_measurements.parquet (local or s3a://).",
|
|
64
|
+
)
|
|
65
|
+
parser.add_argument(
|
|
66
|
+
"-o", "--output",
|
|
67
|
+
type=str,
|
|
68
|
+
required=True,
|
|
69
|
+
help="Output CSV or TSV path.",
|
|
70
|
+
)
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
"--columns",
|
|
73
|
+
type=str,
|
|
74
|
+
default=None,
|
|
75
|
+
help="Comma-separated column names to export (default: all).",
|
|
76
|
+
)
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
"--format",
|
|
79
|
+
type=str,
|
|
80
|
+
choices=("csv", "tsv"),
|
|
81
|
+
default="csv",
|
|
82
|
+
help="Output format: csv or tsv.",
|
|
83
|
+
)
|
|
84
|
+
return parser.parse_args()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def main() -> None:
    """Main entry: convert CSV files to Parquet (default subcommand).

    Three modes, all writing via :func:`normalize_and_write_parquet`:
      * ``--no-discover-pathway-subdirs``: convert ``input_dir`` directly,
        tagging rows with ``--source-pathway`` if given.
      * discovery finds no CSV-bearing subdirs: convert ``input_dir`` directly
        with no pathway tag.
      * otherwise: convert each subdir separately into
        ``<output_dir>/<parent>_<sub>`` with that label as the pathway tag.
    """
    args = _parse_convert_args()
    output_dir = Path(args.output_dir)

    def _convert_and_write(input_path: str, backend, dest: Path, source_pathway) -> None:
        # One convert + write round trip, shared by all three modes above.
        metadata_df, measurement_df = convert_curves(
            input_path, backend, source_pathway=source_pathway
        )
        normalize_and_write_parquet(
            metadata_df,
            measurement_df,
            dest,
            metadata_name=args.metadata_name,
            measurements_name=args.measurements_name,
        )

    read_backend = get_backend(args.input_dir)

    if args.no_discover_pathway_subdirs:
        _convert_and_write(args.input_dir, read_backend, output_dir, args.source_pathway)
        print(f"Wrote metadata to {output_dir / args.metadata_name}")
        print(f"Wrote measurements to {output_dir / args.measurements_name}")
        return

    csv_paths = read_backend.list_csv_paths(args.input_dir)
    subdirs = _pathway_subdirs_from_csv_paths(args.input_dir, csv_paths)
    # Last path component of input_dir, used as the prefix of pathway labels.
    parent = args.input_dir.rstrip("/").split("/")[-1]

    if not subdirs:
        _convert_and_write(args.input_dir, read_backend, output_dir, None)
        print(f"Wrote metadata to {output_dir / args.metadata_name}")
        print(f"Wrote measurements to {output_dir / args.measurements_name}")
        return

    for sub in subdirs:
        input_path = f"{args.input_dir.rstrip('/')}/{sub}"
        source_pathway = f"{parent}_{sub}"
        sub_output = output_dir / source_pathway
        _convert_and_write(input_path, read_backend, sub_output, source_pathway)
        print(f"Wrote {source_pathway} to {sub_output}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def export_main() -> None:
    """CLI entry for export: read a measurements Parquet table, write CSV/TSV."""
    opts = _parse_export_args()
    storage = get_backend(opts.measurements_parquet)
    frame = storage.read_parquet(opts.measurements_parquet)
    # --columns is a comma-separated list; whitespace around names is ignored.
    selected = None
    if opts.columns:
        selected = [name.strip() for name in opts.columns.split(",")]
    export_curves(frame, opts.output, columns=selected, format=opts.format)
    print(f"Exported to {opts.output}")
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""Convert PISCAL CSV curves into metadata and measurement DataFrames and Parquet."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Set, Tuple
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from piscal_processor.parser import (
|
|
12
|
+
next_nonempty,
|
|
13
|
+
normalize_scalar,
|
|
14
|
+
parse_csv_line,
|
|
15
|
+
parse_key_value_section,
|
|
16
|
+
parse_triplet,
|
|
17
|
+
)
|
|
18
|
+
from piscal_processor.schema import (
|
|
19
|
+
MEASUREMENT_COLUMN_ALIASES,
|
|
20
|
+
MEASUREMENT_STRING_COLUMNS,
|
|
21
|
+
METADATA_COLUMN_ALIASES,
|
|
22
|
+
NUMERIC_METADATA_COLUMNS,
|
|
23
|
+
STANDARD_MEASUREMENT_COLUMNS,
|
|
24
|
+
STANDARD_METADATA_COLUMNS,
|
|
25
|
+
)
|
|
26
|
+
from piscal_processor.storage import StorageBackend, get_backend
|
|
27
|
+
|
|
28
|
+
OPTIONAL_MEASUREMENT_COLUMNS = ["BLCond"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _pathway_subdirs_from_csv_paths(input_dir: str, csv_paths: List[str]) -> List[str]:
|
|
32
|
+
"""Return sorted list of immediate subdir names under input_dir that contain CSVs."""
|
|
33
|
+
base = input_dir.rstrip("/")
|
|
34
|
+
if not base:
|
|
35
|
+
return []
|
|
36
|
+
subdirs: Set[str] = set()
|
|
37
|
+
for p in csv_paths:
|
|
38
|
+
p_str = str(p)
|
|
39
|
+
if p_str.startswith(base):
|
|
40
|
+
rest = p_str[len(base) :].lstrip("/")
|
|
41
|
+
if rest:
|
|
42
|
+
subdirs.add(rest.split("/")[0])
|
|
43
|
+
return sorted(subdirs)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def pad_row(row: List[str], target_len: int) -> List[str]:
    """Normalize a CSV row to exactly ``target_len`` cells.

    Short rows are right-padded with empty strings, long rows are truncated,
    and a row that already has the target length is returned as-is.
    """
    deficit = target_len - len(row)
    if deficit > 0:
        return row + [""] * deficit
    if deficit < 0:
        return row[:target_len]
    return row
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def coerce_numeric_columns(df: pd.DataFrame, skip: set[str] | None = None) -> pd.DataFrame:
|
|
56
|
+
"""Convert string columns to numeric when they contain numeric data."""
|
|
57
|
+
skip = skip or set()
|
|
58
|
+
for column in df.columns:
|
|
59
|
+
if column in skip:
|
|
60
|
+
continue
|
|
61
|
+
numeric = pd.to_numeric(df[column], errors="coerce")
|
|
62
|
+
if numeric.notna().any():
|
|
63
|
+
df[column] = numeric
|
|
64
|
+
return df
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def parse_curve_file(uri: str, backend: StorageBackend) -> Tuple[Dict[str, object], pd.DataFrame]:
    """Parse a single PISCAL CSV file into metadata dict and measurements DataFrame.

    The file is consumed strictly in order: a key/value header section, a
    site triplet (headers/units/values), a parameter triplet, then the
    measurement table (header line, units line, data rows).

    Args:
        uri: Location of the CSV file, resolvable by ``backend``.
        backend: Storage backend providing read_text/stem/name for ``uri``.

    Returns:
        ``(metadata_row, measurements_df)`` — one metadata dict for the curve
        and a DataFrame with one row per measurement line.
    """
    lines = backend.read_text(uri).splitlines()

    # Sections must be parsed in file order; each parser returns the next
    # line index so the following section picks up where the previous ended.
    idx, general_info = parse_key_value_section(lines)
    idx, site_headers, _site_units, site_values = parse_triplet(lines, idx)
    site_data = {
        header.strip(): normalize_scalar(value)
        for header, value in zip(site_headers, site_values)
    }

    # Parameter keys are prefixed with "param_" to avoid colliding with
    # general/site keys when everything is merged into metadata_row below.
    idx, param_headers, param_units, param_values = parse_triplet(lines, idx)
    param_data = {
        f"param_{header.strip()}": normalize_scalar(value)
        for header, value in zip(param_headers, param_values)
    }

    # Measurement table: first non-empty line is the header row, the next
    # non-empty line is the units row.
    idx = next_nonempty(lines, idx)
    measurement_headers = parse_csv_line(lines[idx])
    idx += 1
    idx = next_nonempty(lines, idx)
    measurement_units = parse_csv_line(lines[idx])
    idx += 1

    # Remaining non-empty lines are data rows, padded/truncated to the
    # header width so the DataFrame constructor gets a rectangular table.
    data_rows: List[List[str]] = []
    while idx < len(lines):
        line = lines[idx]
        idx += 1
        if not line.strip():
            continue
        row = parse_csv_line(line)
        if not row:
            continue
        data_rows.append(pad_row(row, len(measurement_headers)))

    measurements_df = pd.DataFrame(data_rows, columns=measurement_headers)
    # Map source column names onto the standard schema names.
    measurements_df = measurements_df.rename(
        columns={k: v for k, v in MEASUREMENT_COLUMN_ALIASES.items() if k in measurements_df.columns}
    )
    measurements_df.insert(0, "curve_id", backend.stem(uri))

    # Promote selected header/site scalars onto every measurement row,
    # keeping a stable column order right after curve_id.
    insert_idx = 1
    if "SpeciesSampled" in site_data:
        measurements_df.insert(insert_idx, "SpeciesSampled", site_data.get("SpeciesSampled"))
        insert_idx += 1
    if "Major species" in general_info:
        measurements_df.insert(insert_idx, "Major_species", general_info.get("Major species"))
        insert_idx += 1
    if "Photosynthetic pathway" in general_info:
        measurements_df.insert(insert_idx, "Photosynthetic_pathway", general_info.get("Photosynthetic pathway"))
        insert_idx += 1

    # -9999 (with optional .0...) and "NA" are missing-value markers in the
    # source data; normalize both to pd.NA before numeric coercion.
    measurements_df.replace(r"^-9999(\.0+)?$", pd.NA, regex=True, inplace=True)
    measurements_df.replace("NA", pd.NA, inplace=True)

    # Coerce everything numeric except known identifier/text columns.
    coerce_numeric_columns(
        measurements_df,
        skip={"HHMMSS", "curve_id", "SpeciesSampled", "Major_species", "Photosynthetic_pathway", "DataType", "ObsDate"},
    )

    # Guarantee optional columns exist so downstream alignment is uniform.
    for col in OPTIONAL_MEASUREMENT_COLUMNS:
        if col not in measurements_df.columns:
            measurements_df[col] = pd.NA

    # Units are stored as JSON strings so the metadata table stays flat.
    metadata_row: Dict[str, object] = {
        "curve_id": backend.stem(uri),
        "source_file": backend.name(uri),
        **general_info,
        **site_data,
        **param_data,
        "measurement_units": json.dumps(
            {key.strip(): unit.strip() for key, unit in zip(measurement_headers, measurement_units)}
        ),
        "parameter_units": json.dumps(
            {key.replace("param_", ""): unit.strip() for key, unit in zip(param_headers, param_units)}
        ),
    }

    return metadata_row, measurements_df
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def convert_curves(
    input_dir: str | Path,
    backend: StorageBackend,
    *,
    source_pathway: str | None = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Convert all CSV files in a directory into combined metadata and measurement DataFrames.

    Args:
        input_dir: Directory (local path or backend-specific URI) to scan.
        backend: Storage backend used to list and read the CSV files.
        source_pathway: Optional pathway_subtype label; when given it is added
            as a column to both returned DataFrames.

    Returns:
        ``(metadata_df, measurement_df)`` — one metadata row per curve file,
        and all measurement rows concatenated.

    Raises:
        FileNotFoundError: If no CSV files are found under ``input_dir``.
    """
    curve_files = backend.list_csv_paths(input_dir)
    if not curve_files:
        raise FileNotFoundError(f"No CSV files found in {input_dir}")

    metadata_rows: List[Dict[str, object]] = []
    measurement_frames: List[pd.DataFrame] = []

    for uri in curve_files:
        metadata_row, measurement_df = parse_curve_file(uri, backend)
        metadata_rows.append(metadata_row)
        measurement_frames.append(measurement_df)

    all_columns = STANDARD_MEASUREMENT_COLUMNS

    def _align_frame(df: pd.DataFrame) -> pd.DataFrame:
        # Reindex each per-file frame to the standard column set, then drop
        # columns that are entirely NA in this frame so the outer-join concat
        # below does not force their dtype.
        out = df.reindex(columns=all_columns, fill_value=pd.NA)
        for col in OPTIONAL_MEASUREMENT_COLUMNS:
            if col in out.columns:
                out[col] = pd.to_numeric(out[col], errors="coerce")
        all_na = out.columns[out.isna().all()].tolist()
        if all_na:
            out = out.drop(columns=all_na)
        return out

    measurement_frames = [_align_frame(df) for df in measurement_frames]

    metadata_df = pd.DataFrame(metadata_rows)
    measurement_df = pd.concat(measurement_frames, ignore_index=True, join="outer")
    # Final reindex restores the full standard column set and order (columns
    # dropped as all-NA in every frame come back as NA columns here).
    measurement_df = measurement_df.reindex(columns=STANDARD_MEASUREMENT_COLUMNS)

    if source_pathway:
        metadata_df.insert(1, "pathway_subtype", source_pathway)
        measurement_df["pathway_subtype"] = source_pathway

    return metadata_df, measurement_df
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def normalize_and_write_parquet(
    metadata_df: pd.DataFrame,
    measurement_df: pd.DataFrame,
    output_dir: Path,
    *,
    metadata_name: str = "curve_metadata.parquet",
    measurements_name: str = "curve_measurements.parquet",
) -> None:
    """Normalize schema and write both Parquet tables under output_dir.

    Args:
        metadata_df: Per-curve metadata table (a normalized copy is written;
            the caller's frame is not modified).
        measurement_df: Measurement table. NOTE(review): string columns listed
            in MEASUREMENT_STRING_COLUMNS are normalized *in place* on the
            caller's frame.
        output_dir: Destination directory for both Parquet files.
        metadata_name: Filename for the metadata Parquet file.
        measurements_name: Filename for the measurements Parquet file.
    """
    metadata_path = output_dir / metadata_name
    measurement_path = output_dir / measurements_name
    write_backend = get_backend(str(output_dir))
    write_backend.ensure_output_parent(str(metadata_path))
    write_backend.ensure_output_parent(str(measurement_path))

    # Map metadata column names onto the standard schema, then drop any
    # duplicate columns the rename may have produced (first occurrence wins).
    metadata_df = metadata_df.rename(
        columns={k: v for k, v in METADATA_COLUMN_ALIASES.items() if k in metadata_df.columns}
    )
    if metadata_df.columns.duplicated().any():
        metadata_df = metadata_df.loc[:, ~metadata_df.columns.duplicated(keep="first")]
    metadata_df = metadata_df.reindex(columns=STANDARD_METADATA_COLUMNS)

    # Force object/string metadata cells to plain str (NA preserved) so the
    # Parquet writer sees homogeneous column types.
    for col in metadata_df.select_dtypes(include=["object", "string"]).columns:
        metadata_df[col] = metadata_df[col].apply(lambda x: pd.NA if pd.isna(x) else str(x))
    for col in NUMERIC_METADATA_COLUMNS:
        if col in metadata_df.columns:
            metadata_df[col] = pd.to_numeric(metadata_df[col], errors="coerce").astype("float64")

    # Normalize declared string measurement columns to pandas "string" dtype.
    for col in MEASUREMENT_STRING_COLUMNS:
        if col in measurement_df.columns:
            s = measurement_df[col]
            measurement_df[col] = s.apply(lambda x: pd.NA if pd.isna(x) else str(x)).astype("string")

    write_backend.write_parquet(metadata_df, metadata_path)
    write_backend.write_parquet(measurement_df, measurement_path)