disdrodb 0.0.21__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disdrodb/__init__.py +132 -15
- disdrodb/_config.py +4 -2
- disdrodb/_version.py +9 -4
- disdrodb/api/checks.py +264 -237
- disdrodb/api/configs.py +4 -8
- disdrodb/api/create_directories.py +235 -290
- disdrodb/api/info.py +217 -26
- disdrodb/api/io.py +295 -269
- disdrodb/api/path.py +597 -173
- disdrodb/api/search.py +486 -0
- disdrodb/{metadata/scripts → cli}/disdrodb_check_metadata_archive.py +12 -7
- disdrodb/{utils/pandas.py → cli/disdrodb_data_archive_directory.py} +9 -18
- disdrodb/cli/disdrodb_download_archive.py +86 -0
- disdrodb/cli/disdrodb_download_metadata_archive.py +53 -0
- disdrodb/cli/disdrodb_download_station.py +84 -0
- disdrodb/{api/scripts → cli}/disdrodb_initialize_station.py +22 -10
- disdrodb/cli/disdrodb_metadata_archive_directory.py +32 -0
- disdrodb/{data_transfer/scripts/disdrodb_download_station.py → cli/disdrodb_open_data_archive.py} +22 -22
- disdrodb/cli/disdrodb_open_logs_directory.py +69 -0
- disdrodb/{data_transfer/scripts/disdrodb_upload_station.py → cli/disdrodb_open_metadata_archive.py} +22 -24
- disdrodb/cli/disdrodb_open_metadata_directory.py +71 -0
- disdrodb/cli/disdrodb_open_product_directory.py +74 -0
- disdrodb/cli/disdrodb_open_readers_directory.py +32 -0
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0.py +38 -31
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0_station.py +32 -30
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0a.py +30 -21
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0a_station.py +24 -33
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0b.py +30 -21
- disdrodb/{l0/scripts → cli}/disdrodb_run_l0b_station.py +25 -34
- disdrodb/cli/disdrodb_run_l0c.py +130 -0
- disdrodb/cli/disdrodb_run_l0c_station.py +129 -0
- disdrodb/cli/disdrodb_run_l1.py +122 -0
- disdrodb/cli/disdrodb_run_l1_station.py +121 -0
- disdrodb/cli/disdrodb_run_l2e.py +122 -0
- disdrodb/cli/disdrodb_run_l2e_station.py +122 -0
- disdrodb/cli/disdrodb_run_l2m.py +122 -0
- disdrodb/cli/disdrodb_run_l2m_station.py +122 -0
- disdrodb/cli/disdrodb_upload_archive.py +105 -0
- disdrodb/cli/disdrodb_upload_station.py +98 -0
- disdrodb/configs.py +90 -25
- disdrodb/data_transfer/__init__.py +22 -0
- disdrodb/data_transfer/download_data.py +87 -90
- disdrodb/data_transfer/upload_data.py +64 -37
- disdrodb/data_transfer/zenodo.py +15 -18
- disdrodb/docs.py +1 -1
- disdrodb/issue/__init__.py +17 -4
- disdrodb/issue/checks.py +10 -23
- disdrodb/issue/reader.py +9 -12
- disdrodb/issue/writer.py +14 -17
- disdrodb/l0/__init__.py +17 -26
- disdrodb/l0/check_configs.py +35 -23
- disdrodb/l0/check_standards.py +32 -42
- disdrodb/l0/configs/{Thies_LPM → LPM}/bins_diameter.yml +44 -44
- disdrodb/l0/configs/{Thies_LPM → LPM}/bins_velocity.yml +40 -40
- disdrodb/l0/configs/LPM/l0a_encodings.yml +80 -0
- disdrodb/l0/configs/{Thies_LPM → LPM}/l0b_cf_attrs.yml +62 -59
- disdrodb/l0/configs/{Thies_LPM → LPM}/l0b_encodings.yml +9 -9
- disdrodb/l0/configs/{Thies_LPM → LPM}/raw_data_format.yml +245 -245
- disdrodb/l0/configs/{OTT_Parsivel → PARSIVEL}/bins_diameter.yml +66 -66
- disdrodb/l0/configs/{OTT_Parsivel → PARSIVEL}/bins_velocity.yml +64 -64
- disdrodb/l0/configs/PARSIVEL/l0a_encodings.yml +32 -0
- disdrodb/l0/configs/{OTT_Parsivel → PARSIVEL}/l0b_cf_attrs.yml +22 -20
- disdrodb/l0/configs/{OTT_Parsivel → PARSIVEL}/l0b_encodings.yml +17 -17
- disdrodb/l0/configs/{OTT_Parsivel → PARSIVEL}/raw_data_format.yml +77 -77
- disdrodb/l0/configs/{OTT_Parsivel2 → PARSIVEL2}/bins_diameter.yml +64 -64
- disdrodb/l0/configs/{OTT_Parsivel2 → PARSIVEL2}/bins_velocity.yml +64 -64
- disdrodb/l0/configs/PARSIVEL2/l0a_encodings.yml +39 -0
- disdrodb/l0/configs/{OTT_Parsivel2 → PARSIVEL2}/l0b_cf_attrs.yml +24 -22
- disdrodb/l0/configs/{OTT_Parsivel2 → PARSIVEL2}/l0b_encodings.yml +20 -20
- disdrodb/l0/configs/{OTT_Parsivel2 → PARSIVEL2}/raw_data_format.yml +98 -98
- disdrodb/l0/configs/{RD_80 → RD80}/bins_diameter.yml +40 -40
- disdrodb/l0/configs/RD80/l0a_encodings.yml +16 -0
- disdrodb/l0/configs/{RD_80 → RD80}/l0b_cf_attrs.yml +3 -3
- disdrodb/l0/configs/RD80/l0b_encodings.yml +135 -0
- disdrodb/l0/configs/{RD_80 → RD80}/raw_data_format.yml +48 -48
- disdrodb/l0/l0_reader.py +216 -340
- disdrodb/l0/l0a_processing.py +237 -208
- disdrodb/l0/l0b_nc_processing.py +227 -80
- disdrodb/l0/l0b_processing.py +93 -173
- disdrodb/l0/l0c_processing.py +627 -0
- disdrodb/l0/readers/{ARM → LPM/ARM}/ARM_LPM.py +36 -58
- disdrodb/l0/readers/LPM/AUSTRALIA/MELBOURNE_2007_LPM.py +226 -0
- disdrodb/l0/readers/LPM/BRAZIL/CHUVA_LPM.py +185 -0
- disdrodb/l0/readers/LPM/BRAZIL/GOAMAZON_LPM.py +183 -0
- disdrodb/l0/readers/LPM/ITALY/GID_LPM.py +179 -0
- disdrodb/l0/readers/{UK → LPM/UK}/DIVEN.py +14 -35
- disdrodb/l0/readers/PARSIVEL/AUSTRALIA/MELBOURNE_2007_PARSIVEL.py +157 -0
- disdrodb/l0/readers/PARSIVEL/CHINA/CHONGQING.py +113 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/ARCTIC_2021.py +40 -57
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/COMMON_2011.py +37 -54
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/DAVOS_2009_2011.py +34 -51
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/EPFL_2009.py +34 -51
- disdrodb/l0/readers/{EPFL/PARADISO_2014.py → PARSIVEL/EPFL/EPFL_ROOF_2008.py} +38 -50
- disdrodb/l0/readers/PARSIVEL/EPFL/EPFL_ROOF_2010.py +105 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/EPFL_ROOF_2011.py +34 -51
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/EPFL_ROOF_2012.py +33 -51
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/GENEPI_2007.py +25 -44
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/GRAND_ST_BERNARD_2007.py +25 -44
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/GRAND_ST_BERNARD_2007_2.py +25 -44
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/HPICONET_2010.py +34 -51
- disdrodb/l0/readers/{EPFL/EPFL_ROOF_2010.py → PARSIVEL/EPFL/HYMEX_LTE_SOP2.py} +37 -50
- disdrodb/l0/readers/PARSIVEL/EPFL/HYMEX_LTE_SOP3.py +111 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/HYMEX_LTE_SOP4.py +36 -54
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/LOCARNO_2018.py +34 -52
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/LOCARNO_2019.py +38 -56
- disdrodb/l0/readers/PARSIVEL/EPFL/PARADISO_2014.py +105 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/PARSIVEL_2007.py +27 -45
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/PLATO_2019.py +24 -44
- disdrodb/l0/readers/PARSIVEL/EPFL/RACLETS_2019.py +140 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/RACLETS_2019_WJF.py +41 -59
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/RIETHOLZBACH_2011.py +34 -51
- disdrodb/l0/readers/PARSIVEL/EPFL/SAMOYLOV_2017.py +117 -0
- disdrodb/l0/readers/PARSIVEL/EPFL/SAMOYLOV_2019.py +137 -0
- disdrodb/l0/readers/{EPFL → PARSIVEL/EPFL}/UNIL_2022.py +42 -55
- disdrodb/l0/readers/PARSIVEL/GPM/IFLOODS.py +104 -0
- disdrodb/l0/readers/{GPM → PARSIVEL/GPM}/LPVEX.py +29 -48
- disdrodb/l0/readers/PARSIVEL/GPM/MC3E.py +184 -0
- disdrodb/l0/readers/PARSIVEL/NCAR/CCOPE_2015.py +113 -0
- disdrodb/l0/readers/{NCAR/VORTEX_SE_2016_P1.py → PARSIVEL/NCAR/OWLES_MIPS.py} +46 -72
- disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +125 -0
- disdrodb/l0/readers/{NCAR/OWLES_MIPS.py → PARSIVEL/NCAR/PLOWS_MIPS.py} +45 -64
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +114 -0
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2010.py +176 -0
- disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2010_UF.py +183 -0
- disdrodb/l0/readers/{ARM/ARM_LD.py → PARSIVEL2/ARM/ARM_PARSIVEL2.py} +27 -50
- disdrodb/l0/readers/PARSIVEL2/BRAZIL/CHUVA_PARSIVEL2.py +163 -0
- disdrodb/l0/readers/PARSIVEL2/BRAZIL/GOAMAZON_PARSIVEL2.py +163 -0
- disdrodb/l0/readers/{DENMARK → PARSIVEL2/DENMARK}/EROSION_nc.py +14 -35
- disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +119 -0
- disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +104 -0
- disdrodb/l0/readers/PARSIVEL2/GPM/NSSTC.py +176 -0
- disdrodb/l0/readers/PARSIVEL2/ITALY/GID_PARSIVEL2.py +32 -0
- disdrodb/l0/readers/PARSIVEL2/MEXICO/OH_IIUNAM_nc.py +56 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +120 -0
- disdrodb/l0/readers/{NCAR → PARSIVEL2/NCAR}/PECAN_MIPS.py +45 -64
- disdrodb/l0/readers/PARSIVEL2/NCAR/RELAMPAGO_PARSIVEL2.py +181 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/SNOWIE_PJ.py +160 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/SNOWIE_SB.py +160 -0
- disdrodb/l0/readers/{NCAR/PLOWS_MIPS.py → PARSIVEL2/NCAR/VORTEX_SE_2016_P1.py} +49 -66
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +118 -0
- disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +152 -0
- disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT.py +166 -0
- disdrodb/l0/readers/{NCAR/RELAMPAGO_RD80.py → RD80/BRAZIL/CHUVA_RD80.py} +36 -60
- disdrodb/l0/readers/{BRAZIL → RD80/BRAZIL}/GOAMAZON_RD80.py +36 -55
- disdrodb/l0/readers/{NCAR → RD80/NCAR}/CINDY_2011_RD80.py +35 -54
- disdrodb/l0/readers/{BRAZIL/CHUVA_RD80.py → RD80/NCAR/RELAMPAGO_RD80.py} +40 -54
- disdrodb/l0/readers/template_reader_raw_netcdf_data.py +62 -0
- disdrodb/l0/readers/{reader_template.py → template_reader_raw_text_data.py} +20 -44
- disdrodb/l0/routines.py +885 -581
- disdrodb/l0/standards.py +72 -236
- disdrodb/l0/template_tools.py +104 -109
- disdrodb/l1/__init__.py +17 -0
- disdrodb/l1/beard_model.py +716 -0
- disdrodb/l1/encoding_attrs.py +620 -0
- disdrodb/l1/fall_velocity.py +260 -0
- disdrodb/l1/filters.py +192 -0
- disdrodb/l1/processing.py +200 -0
- disdrodb/l1/resampling.py +236 -0
- disdrodb/l1/routines.py +357 -0
- disdrodb/l1_env/__init__.py +17 -0
- disdrodb/l1_env/routines.py +38 -0
- disdrodb/l2/__init__.py +17 -0
- disdrodb/l2/empirical_dsd.py +1735 -0
- disdrodb/l2/event.py +388 -0
- disdrodb/l2/processing.py +519 -0
- disdrodb/l2/processing_options.py +213 -0
- disdrodb/l2/routines.py +868 -0
- disdrodb/metadata/__init__.py +9 -2
- disdrodb/metadata/checks.py +165 -118
- disdrodb/metadata/download.py +81 -0
- disdrodb/metadata/geolocation.py +146 -0
- disdrodb/metadata/info.py +20 -13
- disdrodb/metadata/manipulation.py +1 -1
- disdrodb/metadata/reader.py +59 -8
- disdrodb/metadata/search.py +77 -144
- disdrodb/metadata/standards.py +7 -8
- disdrodb/metadata/writer.py +8 -14
- disdrodb/psd/__init__.py +38 -0
- disdrodb/psd/fitting.py +2146 -0
- disdrodb/psd/models.py +774 -0
- disdrodb/routines.py +1176 -0
- disdrodb/scattering/__init__.py +28 -0
- disdrodb/scattering/axis_ratio.py +344 -0
- disdrodb/scattering/routines.py +456 -0
- disdrodb/utils/__init__.py +17 -0
- disdrodb/utils/attrs.py +208 -0
- disdrodb/utils/cli.py +269 -0
- disdrodb/utils/compression.py +60 -42
- disdrodb/utils/dask.py +62 -0
- disdrodb/utils/decorators.py +110 -0
- disdrodb/utils/directories.py +107 -46
- disdrodb/utils/encoding.py +127 -0
- disdrodb/utils/list.py +29 -0
- disdrodb/utils/logger.py +168 -46
- disdrodb/utils/time.py +657 -0
- disdrodb/utils/warnings.py +30 -0
- disdrodb/utils/writer.py +57 -0
- disdrodb/utils/xarray.py +138 -47
- disdrodb/utils/yaml.py +0 -1
- disdrodb/viz/__init__.py +17 -0
- disdrodb/viz/plots.py +17 -0
- disdrodb-0.1.0.dist-info/METADATA +321 -0
- disdrodb-0.1.0.dist-info/RECORD +216 -0
- {disdrodb-0.0.21.dist-info → disdrodb-0.1.0.dist-info}/WHEEL +1 -1
- disdrodb-0.1.0.dist-info/entry_points.txt +30 -0
- disdrodb/data_transfer/scripts/disdrodb_download_archive.py +0 -53
- disdrodb/data_transfer/scripts/disdrodb_upload_archive.py +0 -57
- disdrodb/l0/configs/OTT_Parsivel/l0a_encodings.yml +0 -32
- disdrodb/l0/configs/OTT_Parsivel2/l0a_encodings.yml +0 -39
- disdrodb/l0/configs/RD_80/l0a_encodings.yml +0 -16
- disdrodb/l0/configs/RD_80/l0b_encodings.yml +0 -135
- disdrodb/l0/configs/Thies_LPM/l0a_encodings.yml +0 -80
- disdrodb/l0/io.py +0 -257
- disdrodb/l0/l0_processing.py +0 -1091
- disdrodb/l0/readers/AUSTRALIA/MELBOURNE_2007_OTT.py +0 -178
- disdrodb/l0/readers/AUSTRALIA/MELBOURNE_2007_THIES.py +0 -247
- disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py +0 -204
- disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py +0 -183
- disdrodb/l0/readers/BRAZIL/GOAMAZON_LPM.py +0 -204
- disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py +0 -183
- disdrodb/l0/readers/CHINA/CHONGQING.py +0 -131
- disdrodb/l0/readers/EPFL/EPFL_ROOF_2008.py +0 -128
- disdrodb/l0/readers/EPFL/HYMEX_LTE_SOP2.py +0 -127
- disdrodb/l0/readers/EPFL/HYMEX_LTE_SOP3.py +0 -129
- disdrodb/l0/readers/EPFL/RACLETS_2019.py +0 -158
- disdrodb/l0/readers/EPFL/SAMOYLOV_2017.py +0 -136
- disdrodb/l0/readers/EPFL/SAMOYLOV_2019.py +0 -158
- disdrodb/l0/readers/FRANCE/SIRTA_OTT2.py +0 -138
- disdrodb/l0/readers/GPM/GCPEX.py +0 -123
- disdrodb/l0/readers/GPM/IFLOODS.py +0 -123
- disdrodb/l0/readers/GPM/MC3E.py +0 -123
- disdrodb/l0/readers/GPM/NSSTC.py +0 -164
- disdrodb/l0/readers/ITALY/GID.py +0 -199
- disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py +0 -92
- disdrodb/l0/readers/NCAR/CCOPE_2015.py +0 -133
- disdrodb/l0/readers/NCAR/PECAN_FP3.py +0 -137
- disdrodb/l0/readers/NCAR/PECAN_MOBILE.py +0 -144
- disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py +0 -195
- disdrodb/l0/readers/NCAR/SNOWIE_PJ.py +0 -172
- disdrodb/l0/readers/NCAR/SNOWIE_SB.py +0 -179
- disdrodb/l0/readers/NCAR/VORTEX2_2009.py +0 -133
- disdrodb/l0/readers/NCAR/VORTEX2_2010.py +0 -188
- disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py +0 -191
- disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py +0 -135
- disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py +0 -170
- disdrodb/l0/readers/NETHERLANDS/DELFT.py +0 -187
- disdrodb/l0/readers/SPAIN/SBEGUERIA.py +0 -179
- disdrodb/l0/scripts/disdrodb_run_l0b_concat.py +0 -93
- disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py +0 -85
- disdrodb/utils/netcdf.py +0 -452
- disdrodb/utils/scripts.py +0 -102
- disdrodb-0.0.21.dist-info/AUTHORS.md +0 -18
- disdrodb-0.0.21.dist-info/METADATA +0 -186
- disdrodb-0.0.21.dist-info/RECORD +0 -168
- disdrodb-0.0.21.dist-info/entry_points.txt +0 -15
- /disdrodb/l0/configs/{RD_80 → RD80}/bins_velocity.yml +0 -0
- /disdrodb/l0/manuals/{Thies_LPM.pdf → LPM.pdf} +0 -0
- /disdrodb/l0/manuals/{ODM_470.pdf → ODM470.pdf} +0 -0
- /disdrodb/l0/manuals/{OTT_Parsivel.pdf → PARSIVEL.pdf} +0 -0
- /disdrodb/l0/manuals/{OTT_Parsivel2.pdf → PARSIVEL2.pdf} +0 -0
- /disdrodb/l0/manuals/{PWS_100.pdf → PWS100.pdf} +0 -0
- /disdrodb/l0/manuals/{RD_80.pdf → RD80.pdf} +0 -0
- {disdrodb-0.0.21.dist-info → disdrodb-0.1.0.dist-info/licenses}/LICENSE +0 -0
- {disdrodb-0.0.21.dist-info → disdrodb-0.1.0.dist-info}/top_level.txt +0 -0
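The listing above shows two sweeping renames: sensor config directories lose their vendor prefixes (Thies_LPM → LPM, OTT_Parsivel → PARSIVEL, OTT_Parsivel2 → PARSIVEL2, RD_80 → RD80), and every reader moves under a sensor-named directory (readers/&lt;SENSOR&gt;/&lt;DATA_SOURCE&gt;/&lt;READER&gt;.py). For code that imported reader modules by path, this is a breaking change. A minimal sketch of resolving a reader module under the 0.1.0 layout — get_reader_module is a hypothetical helper written for illustration, not part of the disdrodb API:

import importlib

def get_reader_module(sensor_name: str, data_source: str, reader_name: str):
    # 0.0.21 layout: disdrodb.l0.readers.<DATA_SOURCE>.<READER>
    # 0.1.0 layout:  disdrodb.l0.readers.<SENSOR>.<DATA_SOURCE>.<READER>
    module_path = f"disdrodb.l0.readers.{sensor_name}.{data_source}.{reader_name}"
    return importlib.import_module(module_path)

# Example: the EPFL ARCTIC_2021 reader, now filed under PARSIVEL
# reader_module = get_reader_module("PARSIVEL", "EPFL", "ARCTIC_2021")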
disdrodb/l0/l0a_processing.py
CHANGED
@@ -19,7 +19,6 @@
 """Functions to process raw text files into DISDRODB L0A Apache Parquet."""
 
 
-import inspect
 import logging
 import os
 from typing import Union
@@ -39,7 +38,6 @@ from disdrodb.utils.directories import create_directory, remove_if_exists
 
 # Logger
 from disdrodb.utils.logger import (
-    log_debug,
     log_error,
     log_info,
     log_warning,
@@ -55,7 +53,7 @@ pd.set_option("mode.chained_assignment", None)  # Avoid SettingWithCopyWarning
 #### Raw file readers
 
 
-def _preprocess_reader_kwargs(reader_kwargs: dict) -> dict:
+def preprocess_reader_kwargs(reader_kwargs: dict) -> dict:
     """Preprocess arguments required to read raw text file into Pandas.
 
     Parameters
@@ -86,10 +84,20 @@ def _preprocess_reader_kwargs(reader_kwargs: dict) -> dict:
     return reader_kwargs
 
 
-def read_raw_file(
+def check_matching_column_number(df, column_names):
+    """Check the number of columns in the dataframe matches the length of column names."""
+    n_columns = len(df.columns)
+    n_expected_columns = len(column_names)
+    if n_columns != n_expected_columns:
+        msg = f"The dataframe has {n_columns} columns, while {n_expected_columns} are expected !."
+        raise ValueError(msg)
+
+
+def read_raw_text_file(
     filepath: str,
     column_names: list,
     reader_kwargs: dict,
+    logger=None,  # noqa
 ) -> pd.DataFrame:
     """Read a raw file into a dataframe.
 
@@ -100,7 +108,12 @@ def read_raw_file(
     column_names : list
         Column names.
     reader_kwargs : dict
-        Pandas pd.read_csv arguments.
+        Pandas ``pd.read_csv`` arguments.
+    logger : logging.Logger
+        Logger object.
+        The default is ``None``.
+        If ``None``, the logger is created using the module name.
+        If ``logger`` is passed, it will be used to log messages.
 
     Returns
     -------
@@ -108,7 +121,7 @@ def read_raw_file(
         Pandas dataframe.
     """
     # Preprocess reader_kwargs
-    reader_kwargs = _preprocess_reader_kwargs(reader_kwargs)
+    reader_kwargs = preprocess_reader_kwargs(reader_kwargs)
 
     # Enforce all raw files columns with dtype = 'object'
     dtype = "object"
@@ -117,8 +130,17 @@ def read_raw_file(
     try:
         df = pd.read_csv(filepath, names=column_names, dtype=dtype, **reader_kwargs)
     except pd.errors.EmptyDataError:
-        msg = f"
-
+        msg = f"The following file is empty: {filepath}"
+        raise ValueError(msg)
+
+    # Check the dataframe is not empty
+    if len(df.index) == 0:
+        msg = f"The following file is empty: {filepath}"
+        raise ValueError(msg)
+
+    # Check dataframe column number matches columns_names
+    if column_names is not None:
+        check_matching_column_number(df, column_names)
 
     # Return dataframe
     return df
@@ -128,45 +150,19 @@ def read_raw_file(
 #### L0A checks and homogenization
 
 
-def 
-    """
-    if df_sanitizer_fun is None:
-        return None
-    if not callable(df_sanitizer_fun):
-        raise ValueError("'df_sanitizer_fun' must be a function.")
-    if not np.all(np.isin(inspect.getfullargspec(df_sanitizer_fun).args, ["df"])):
-        raise ValueError("The `df_sanitizer_fun` must have only `df` as input argument!")
-
-
-def _check_not_empty_dataframe(df, verbose=False):
-    if len(df.index) == 0:
-        msg = " - The file is empty and has been skipped."
-        log_error(logger=logger, msg=msg, verbose=False)
-        raise ValueError(msg)
-
-
-def _check_matching_column_number(df, column_names, verbose=False):
-    n_columns = len(df.columns)
-    n_expected_columns = len(column_names)
-    if n_columns != n_expected_columns:
-        msg = f" - The dataframe has {n_columns} columns, while {n_expected_columns} are expected !."
-        log_error(logger, msg, verbose)
-        raise ValueError(msg)
-
-
-def remove_rows_with_missing_time(df: pd.DataFrame, verbose: bool = False):
-    """Remove dataframe rows where the "time" is NaT.
+def remove_rows_with_missing_time(df: pd.DataFrame, logger=logger, verbose: bool = False):
+    """Remove dataframe rows where the ``"time"`` is ``NaT``.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     verbose : bool
-        Whether to verbose the processing.
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with valid timesteps.
     """
     # Get the number of rows of the dataframe
@@ -175,32 +171,31 @@ def remove_rows_with_missing_time(df: pd.DataFrame, verbose: bool = False):
     df = df.dropna(subset="time", axis=0)
     # If no valid timesteps, raise error
     if len(df.index) == 0:
-        msg = "
-        log_error(logger=logger, msg=msg, verbose=False)
+        msg = "There are not valid timestep."
         raise ValueError(msg)
     # Otherwise, report the number of invalid timesteps
     n_invalid_timesteps = n_rows - len(df)
     if n_invalid_timesteps > 0:
-        msg = f"
+        msg = f"{n_invalid_timesteps} rows had invalid timesteps and were discarded."
         log_warning(logger=logger, msg=msg, verbose=verbose)
     return df
 
 
-def remove_duplicated_timesteps(df: pd.DataFrame, verbose: bool = False):
+def remove_duplicated_timesteps(df: pd.DataFrame, logger=None, verbose: bool = False):
     """Remove duplicated timesteps.
 
     It keep only the first timestep occurrence !
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     verbose : bool
-        Whether to verbose the processing.
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with valid unique timesteps.
     """
     values, counts = np.unique(df["time"], return_counts=True)
@@ -208,11 +203,13 @@ def remove_duplicated_timesteps(df: pd.DataFrame, verbose: bool = False):
     values_duplicates = values[idx_duplicates].astype("M8[s]")
     # If there are duplicated timesteps
     if len(values_duplicates) > 0:
+        # TODO: raise error if duplicated timesteps have different values !
+
         # Drop duplicated timesteps (keeping the first occurrence)
         df = df.drop_duplicates(subset="time", keep="first")
         # Report the values of duplicated timesteps
         msg = (
-            f"
+            f"The following timesteps occurred more than once: {values_duplicates}. Only the first occurrence"
             " selected."
         )
         log_warning(logger=logger, msg=msg, verbose=verbose)
@@ -225,13 +222,12 @@ def drop_timesteps(df, timesteps):
     # Check there are row left
     if len(df) == 0:
         msg = "No rows left after removing problematic timesteps. Maybe you need to adjust the issue YAML file."
-        log_warning(logger=logger, msg=msg, verbose=False)
         raise ValueError(msg)
     return df
 
 
 def drop_time_periods(df, time_periods):
-    """Drop problematic 
+    """Drop problematic time periods."""
     for time_period in time_periods:
         if len(df) > 0:
             start_time = time_period[0]
@@ -240,25 +236,26 @@ def drop_time_periods(df, time_periods):
     # Check there are row left
     if len(df) == 0:
         msg = "No rows left after removing problematic time_periods. Maybe you need to adjust the issue YAML file."
-        log_warning(logger=logger, msg=msg, verbose=False)
         raise ValueError(msg)
 
     return df
 
 
-def remove_issue_timesteps(df, issue_dict, verbose=False):
+def remove_issue_timesteps(df, issue_dict, logger=None, verbose=False):
     """Drop dataframe rows with timesteps listed in the issue dictionary.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     issue_dict : dict
-        Issue dictionary
+        Issue dictionary.
+    verbose : bool
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with problematic timesteps removed.
 
     """
@@ -286,24 +283,21 @@ def remove_issue_timesteps(df, issue_dict, verbose=False):
     return df
 
 
-def cast_column_dtypes(df: pd.DataFrame, sensor_name: str, verbose: bool = False) -> pd.DataFrame:
-    """Convert 'object' dataframe columns into DISDRODB L0A dtype standards.
+def cast_column_dtypes(df: pd.DataFrame, sensor_name: str) -> pd.DataFrame:
+    """Convert ``'object'`` dataframe columns into DISDRODB L0A dtype standards.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
-    verbose : bool
-        Whether to verbose the processing.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with corrected columns types.
     """
-
     # Cast dataframe to dtypes
     dtype_dict = get_l0a_dtype(sensor_name)
     # Ensure time column is saved with seconds resolution
@@ -321,26 +315,23 @@ def cast_column_dtypes(df: pd.DataFrame, sensor_name: str, verbose: bool = False
             df[column] = df[column].astype(dtype_dict[column])
         except ValueError as e:
             msg = f"ValueError: The column {column} has {e}"
-            log_error(logger=logger, msg=msg, verbose=False)
             raise ValueError(msg)
     return df
 
 
-def coerce_corrupted_values_to_nan(df: pd.DataFrame, sensor_name: str, verbose: bool = False) -> pd.DataFrame:
-    """Coerce corrupted values in dataframe numeric columns to np.nan
+def coerce_corrupted_values_to_nan(df: pd.DataFrame, sensor_name: str) -> pd.DataFrame:
+    """Coerce corrupted values in dataframe numeric columns to ``np.nan``.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
-    verbose : bool
-        Whether to verbose the processing.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with string columns without corrupted values.
     """
     # Cast dataframe to dtypes
@@ -359,21 +350,19 @@ def coerce_corrupted_values_to_nan(df: pd.DataFrame, sensor_name: str, verbose:
     return df
 
 
-def strip_string_spaces(df: pd.DataFrame, sensor_name: str, verbose: bool = False) -> pd.DataFrame:
+def strip_string_spaces(df: pd.DataFrame, sensor_name: str) -> pd.DataFrame:
     """Strip leading/trailing spaces from dataframe string columns.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
-    verbose : bool
-        Whether to verbose the processing.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe with string columns without leading/trailing spaces.
     """
     # Cast dataframe to dtypes
@@ -390,13 +379,13 @@ def strip_string_spaces(df: pd.DataFrame, sensor_name: str, verbose: bool = Fals
         try:
             df[column] = df[column].str.strip()
         except AttributeError:
-            msg = f"
-            log_error(logger=logger, msg=msg, verbose=False)
+            msg = f"The column {column} is not a string/object dtype."
             raise AttributeError(msg)
     return df
 
 
-def 
+def strip_delimiter(string):
+    """Remove the first and last delimiter occurrence from a string."""
     if not isinstance(string, str):
         return string
     split_str = infer_split_str(string=string)
@@ -415,12 +404,12 @@ def strip_delimiter_from_raw_arrays(df):
     available_fields = list(df.columns[np.isin(df.columns, possible_fields)])
     # Loop over the fields and strip away the delimiter
     for field in available_fields:
-        df[field] = df[field].apply(
+        df[field] = df[field].apply(strip_delimiter)
     # Return the dataframe
     return df
 
 
-def 
+def is_raw_array_string_not_corrupted(string):
     """Check if the raw array is corrupted."""
     if not isinstance(string, str):
         return False
@@ -445,32 +434,32 @@ def remove_corrupted_rows(df):
     # Loop over the fields and remove corrupted ones
     for field in available_fields:
         if len(df) != 0:
-            df = df[df[field].apply(
+            df = df[df[field].apply(is_raw_array_string_not_corrupted)]
     # Check if there are rows left
     if len(df) == 0:
         raise ValueError("No remaining rows after data corruption checks.")
     # If only one row available, raise also error
     if len(df) == 1:
-        raise ValueError("Only 1 row remains after data corruption checks. Check the file.")
+        raise ValueError("Only 1 row remains after data corruption checks. Check the raw file and maybe delete it.")
     # Return the dataframe
     return df
 
 
-def replace_nan_flags(df, sensor_name, verbose):
-    """Set values corresponding to nan_flags to np.nan
+def replace_nan_flags(df, sensor_name, logger=None, verbose=False):
+    """Set values corresponding to ``nan_flags`` to ``np.nan``.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
     verbose : bool
-        Whether to verbose the processing.
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe without nan_flags values.
     """
     # Get dictionary of nan flags
@@ -486,26 +475,26 @@ def replace_nan_flags(df, sensor_name, verbose):
         if n_nan_flags_values > 0:
             msg = f"In variable {var}, {n_nan_flags_values} values were nan_flags and were replaced to np.nan."
             log_info(logger=logger, msg=msg, verbose=verbose)
-            df[var]
+            df.loc[is_a_nan_flag, var] = np.nan
     # Return dataframe
     return df
 
 
-def set_nan_outside_data_range(df, sensor_name, verbose):
-    """Set values outside the data range as np.nan
+def set_nan_outside_data_range(df, sensor_name, logger=None, verbose=False):
+    """Set values outside the data range as ``np.nan``.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
     verbose : bool
-        Whether to verbose the processing.
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe without values outside the expected data range.
     """
     # Get dictionary of data_range
@@ -530,21 +519,21 @@ def set_nan_outside_data_range(df, sensor_name, verbose):
     return df
 
 
-def set_nan_invalid_values(df, sensor_name, verbose):
-    """Set invalid (class) values to np.nan
+def set_nan_invalid_values(df, sensor_name, logger=None, verbose=False):
+    """Set invalid (class) values to ``np.nan``.
 
     Parameters
    ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     sensor_name : str
         Name of the sensor.
     verbose : bool
-        Whether to verbose the processing.
+        Whether to verbose the processing. The default is ``False``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe without invalid values.
     """
     # Get dictionary of valid values
@@ -566,14 +555,12 @@ def set_nan_invalid_values(df, sensor_name, verbose):
     return df
 
 
-def process_raw_file(
-    filepath,
-    column_names,
-    reader_kwargs,
-    df_sanitizer_fun,
+def sanitize_df(
+    df,
     sensor_name,
     verbose=True,
-    issue_dict={},
+    issue_dict=None,
+    logger=None,
 ):
     """Read and parse a raw text files into a L0A dataframe.
 
@@ -581,63 +568,41 @@ def process_raw_file(
     ----------
     filepath : str
         File path
-    column_names : list
-        Columns names.
-    reader_kwargs : dict
-        Pandas `read_csv` arguments.
-    df_sanitizer_fun : object, optional
-        Sanitizer function to format the datafame.
     sensor_name : str
         Name of the sensor.
     verbose : bool
-        Whether to verbose the processing.
-        The default is True
+        Whether to verbose the processing. The default is ``True``.
     issue_dict : dict
         Issue dictionary providing information on timesteps to remove.
-        The default is an empty dictionary {}
-        Valid issue_dict key are 'timesteps' and 'time_periods'
+        The default is an empty dictionary ``{}``.
+        Valid issue_dict key are ``'timesteps'`` and ``'time_periods'``.
         Valid issue_dict values are list of datetime64 values (with second accuracy).
-        To correctly format and check the validity of the issue_dict
-        the disdrodb.l0.issue.check_issue_dict function.
+        To correctly format and check the validity of the ``issue_dict``, use
+        the ``disdrodb.l0.issue.check_issue_dict`` function.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe
     """
-
-
-
-    df = read_raw_file(
-        filepath=filepath,
-        column_names=column_names,
-        reader_kwargs=reader_kwargs,
-    )
-
-    # - Check if file empty
-    _check_not_empty_dataframe(df=df, verbose=verbose)
-
-    # - Check dataframe column number matches columns_names
-    _check_matching_column_number(df, column_names, verbose=False)
-
-    # - Sanitize the dataframe with a custom function
-    if df_sanitizer_fun is not None:
-        df = df_sanitizer_fun(df)
+    # Define the issue dictionary
+    # - If None, set to empty dictionary
+    issue_dict = {} if issue_dict is None else issue_dict
 
     # - Remove rows with time NaT
-    df = remove_rows_with_missing_time(df, verbose=verbose)
+    df = remove_rows_with_missing_time(df, logger=logger, verbose=verbose)
 
     # - Remove duplicated timesteps
-    df = remove_duplicated_timesteps(df, verbose=verbose)
+    df = remove_duplicated_timesteps(df, logger=logger, verbose=verbose)
 
     # - Filter out problematic tiemsteps reported in the issue YAML file
-    df = remove_issue_timesteps(df, issue_dict=issue_dict, verbose=verbose)
+    df = remove_issue_timesteps(df, issue_dict=issue_dict, logger=logger, verbose=verbose)
 
     # - Coerce numeric columns corrupted values to np.nan
-    df = coerce_corrupted_values_to_nan(df, sensor_name=sensor_name, verbose=verbose)
+    df = coerce_corrupted_values_to_nan(df, sensor_name=sensor_name)
 
     # - Strip trailing/leading space from string columns
-    df = strip_string_spaces(df, sensor_name=sensor_name, verbose=verbose)
+    df = strip_string_spaces(df, sensor_name=sensor_name)
 
     # - Strip first and last delimiter from the raw arrays
     df = strip_delimiter_from_raw_arrays(df)
@@ -646,16 +611,19 @@ def process_raw_file(
     df = remove_corrupted_rows(df)
 
     # - Cast dataframe to dtypes
-    df = cast_column_dtypes(df, sensor_name=sensor_name, verbose=verbose)
+    df = cast_column_dtypes(df, sensor_name=sensor_name)
 
     # - Replace nan flags values with np.nans
-    df = replace_nan_flags(df, sensor_name=sensor_name, verbose=verbose)
+    df = replace_nan_flags(df, sensor_name=sensor_name, logger=logger, verbose=verbose)
 
     # - Set values outside the data range to np.nan
-    df = set_nan_outside_data_range(df, sensor_name=sensor_name, verbose=verbose)
+    df = set_nan_outside_data_range(df, sensor_name=sensor_name, logger=logger, verbose=verbose)
 
     # - Replace invalid values with np.nan
-    df = set_nan_invalid_values(df, sensor_name=sensor_name, verbose=verbose)
+    df = set_nan_invalid_values(df, sensor_name=sensor_name, logger=logger, verbose=verbose)
+
+    # - Sort by time
+    df = df.sort_values("time")
 
     # ------------------------------------------------------.
     # - Check column names agrees to DISDRODB standards
@@ -677,23 +645,23 @@ def write_l0a(
     df: pd.DataFrame,
     filepath: str,
     force: bool = False,
+    logger=None,
     verbose: bool = False,
 ):
     """Save the dataframe into an Apache Parquet file.
 
     Parameters
     ----------
-    df : 
+    df : pandas.DataFrame
         Input dataframe.
     filepath : str
         Output file path.
     force : bool, optional
         Whether to overwrite existing data.
-        If True
-        If False
+        If ``True``, overwrite existing data into destination directories.
+        If ``False``, raise an error if there are already data into destination directories. This is the default.
     verbose : bool, optional
-        Whether to verbose the processing.
-        The default is False.
+        Whether to verbose the processing. The default is ``False``.
 
     Raises
     ------
@@ -702,7 +670,6 @@ def write_l0a(
     NotImplementedError
         The input dataframe can not be processed.
     """
-
     # -------------------------------------------------------------------------.
     # Create station directory if does not exist
     create_directory(os.path.dirname(filepath))
@@ -710,7 +677,7 @@ def write_l0a(
     # Check if the file already exists
     # - If force=True --> Remove it
     # - If force=False --> Raise error
-    remove_if_exists(filepath, force=force)
+    remove_if_exists(filepath, force=force, logger=logger)
 
     # -------------------------------------------------------------------------.
     # Define writing options
@@ -727,20 +694,18 @@ def write_l0a(
             row_group_size=row_group_size,
         )
         msg = f"The Pandas Dataframe has been written as an Apache Parquet file to {filepath}."
-        log_info(logger=logger, msg=msg, verbose=
+        log_info(logger=logger, msg=msg, verbose=verbose)
     except Exception as e:
-        msg = f"
-        log_error(logger=logger, msg=msg, verbose=False)
+        msg = f"The Pandas DataFrame cannot be written as an Apache Parquet file. The error is: \n {e}."
         raise ValueError(msg)
     # -------------------------------------------------------------------------.
-    return None
 
 
-
-#### L0A
+####--------------------------------------------------------------------------.
+#### DISDRODB L0A product reader
 
 
-def concatenate_dataframe(list_df: list, verbose: bool = False) -> pd.DataFrame:
+def concatenate_dataframe(list_df: list, logger=None, verbose: bool = False) -> pd.DataFrame:
     """Concatenate a list of dataframes.
 
     Parameters
@@ -748,12 +713,12 @@ def concatenate_dataframe(list_df: list, verbose: bool = False) -> pd.DataFrame:
     list_df : list
         List of dataframes.
     verbose : bool, optional
-        If True
-        If False
+        If ``True``, print messages.
+        If ``False``, no print.
 
     Returns
     -------
-
+    pandas.DataFrame
         Concatenated dataframe.
 
     Raises
@@ -769,39 +734,111 @@ def concatenate_dataframe(list_df: list, verbose: bool = False) -> pd.DataFrame:
         return df
 
     # Log
-    msg = "
-    log_info(logger, msg, verbose)
+    msg = "Concatenation of dataframes started."
+    log_info(logger=logger, msg=msg, verbose=verbose)
 
     # Concatenate the dataframe
     try:
         df = pd.concat(list_df, axis=0, ignore_index=True)
-
-        # Drop duplicated values
-        df = df.drop_duplicates(subset="time")
-
         # Sort by increasing time
         df = df.sort_values(by="time")
 
     except (AttributeError, TypeError) as e:
-        msg = f"
-        log_error(logger=logger, msg=msg, verbose=False)
+        msg = f"Can not concatenate the files. \n Error: {e}"
        raise ValueError(msg)
 
     # Log
-    msg = "
-    log_info(logger, msg, verbose)
+    msg = "Concatenation of dataframes has finished."
+    log_info(logger=logger, msg=msg, verbose=verbose)
 
     # Return dataframe
     return df
 
 
-def read_raw_files(
+def _read_l0a(filepath: str, verbose: bool = False, logger=None, debugging_mode: bool = False) -> pd.DataFrame:
+    # Log
+    msg = f"Reading L0 Apache Parquet file at {filepath} started."
+    log_info(logger=logger, msg=msg, verbose=verbose)
+    # Open file
+    df = pd.read_parquet(filepath)
+    if debugging_mode:
+        df = df.iloc[0:100]
+    # Log
+    msg = f"Reading L0 Apache Parquet file at {filepath} ended."
+    log_info(logger=logger, msg=msg, verbose=verbose)
+    return df
+
+
+def read_l0a_dataframe(
+    filepaths: Union[str, list],
+    verbose: bool = False,
+    logger=None,
+    debugging_mode: bool = False,
+) -> pd.DataFrame:
+    """Read DISDRODB L0A Apache Parquet file(s).
+
+    Parameters
+    ----------
+    filepaths : str or list
+        Either a list or a single filepath.
+    verbose : bool
+        Whether to print detailed processing information into terminal.
+        The default is ``False``.
+    debugging_mode : bool
+        If ``True``, it reduces the amount of data to process.
+        If filepaths is a list, it reads only the first 3 files.
+        For each file it select only the first 100 rows.
+        The default is ``False``.
+
+    Returns
+    -------
+    pandas.DataFrame
+        L0A Dataframe.
+
+    """
+    from disdrodb.l0.l0a_processing import concatenate_dataframe
+
+    # ----------------------------------------
+    # Check filepaths validity
+    if not isinstance(filepaths, (list, str)):
+        raise TypeError("Expecting filepaths to be a string or a list of strings.")
+
+    # ----------------------------------------
+    # If filepath is a string, convert to list
+    if isinstance(filepaths, str):
+        filepaths = [filepaths]
+    # ---------------------------------------------------
+    # If debugging_mode=True, it reads only the first 3 filepaths
+    if debugging_mode:
+        filepaths = filepaths[0:3]  # select first 3 filepaths
+
+    # ---------------------------------------------------
+    # Define the list of dataframe
+    list_df = [
+        _read_l0a(filepath, verbose=verbose, logger=logger, debugging_mode=debugging_mode) for filepath in filepaths
+    ]
+
+    # Concatenate dataframe
+    df = concatenate_dataframe(list_df, logger=logger, verbose=verbose)
+
+    # Ensure time is in nanoseconds
+    df["time"] = df["time"].astype("M8[ns]")
+
+    # ---------------------------------------------------
+    # Return dataframe
+    return df
+
+
+####---------------------------------------------------------------------------.
+#### L0A Utility
+
+
+def read_raw_text_files(
     filepaths: Union[list, str],
-
-
-
-
-    df_sanitizer_fun: object = None,
+    reader,
+    sensor_name,
+    verbose=True,
+    logger=None,
 ) -> pd.DataFrame:
     """Read and parse a list for raw files into a dataframe.
 
@@ -809,20 +846,17 @@ def read_raw_files(
     ----------
     filepaths : Union[list,str]
         File(s) path(s)
-
-
-
-        Pandas `read_csv` arguments.
+    reader:
+        DISDRODB reader function.
+        Format: reader(filepath, logger=None)
     sensor_name : str
         Name of the sensor.
     verbose : bool
-        Whether to verbose the processing.
-    df_sanitizer_fun : object, optional
-        Sanitizer function to format the datafame.
+        Whether to verbose the processing. The default is ``True``.
 
     Returns
     -------
-
+    pandas.DataFrame
         Dataframe
 
     Raises
@@ -831,7 +865,6 @@ def read_raw_files(
         Input parameters can not be used or the raw file can not be processed.
 
     """
-
     # ------------------------------------------------------.
     # Check input list
     if isinstance(filepaths, str):
@@ -840,54 +873,50 @@ def read_raw_files(
         raise ValueError("'filepaths' must contains at least 1 filepath.")
 
     # ------------------------------------------------------.
-
+    # Loop over all raw files
     n_files = len(filepaths)
     processed_file_counter = 0
     list_skipped_files_msg = []
    list_df = []
     for filepath in filepaths:
+        # Try read the raw text file
         try:
-
-
-
-
-                reader_kwargs=reader_kwargs,
-                df_sanitizer_fun=df_sanitizer_fun,
+            df = reader(filepath, logger=logger)
+            # Sanitize the dataframe
+            df = sanitize_df(
+                df=df,
                 sensor_name=sensor_name,
+                logger=logger,
                 verbose=verbose,
             )
-
             # Append dataframe to the list
             list_df.append(df)
-
             # Update the logger
             processed_file_counter += 1
-            msg = f"
-
+            msg = f"Raw file '{filepath}' processed successfully ({processed_file_counter}/{n_files})."
+            log_info(logger=logger, msg=msg, verbose=verbose)
 
-        #
+        # Skip the file if the processing fails
         except Exception as e:
             # Update the logger
-            msg = f"
-
+            msg = f"{filepath} has been skipped. The error is: {e}."
+            log_error(logger=logger, msg=msg, verbose=verbose)
             list_skipped_files_msg.append(msg)
 
     # Update logger
-    msg = f"
+    msg = f"{len(list_skipped_files_msg)} of {n_files} have been skipped."
     log_info(logger=logger, msg=msg, verbose=verbose)
-    logger.info("---")
-    logger.info(msg)
-    logger.info("---")
 
     ##----------------------------------------------------------------.
-
+    # Concatenate the dataframe
    if len(list_df) == 0:
-        raise ValueError(
-    df = concatenate_dataframe(list_df, verbose=verbose)
-
-    # - Remove rows with duplicate timestep (keep the first)
-    df = df.drop_duplicates(subset=["time"], keep="first")
+        raise ValueError("Any raw file could be read!")
+    df = concatenate_dataframe(list_df, verbose=verbose, logger=logger)
 
     # ------------------------------------------------------.
+    # Enforce output time to be [ns]
+    # --> For compatibility with xarray
+    df["time"] = df["time"].astype("M8[ns]")
+
     # Return the dataframe
     return df
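Taken together, this diff replaces the 0.0.21 pipeline — read_raw_files/process_raw_file driven by column_names, reader_kwargs and a df_sanitizer_fun — with a reader-callable flow: a reader with signature reader(filepath, logger=None) returns the raw dataframe, and sanitize_df applies the L0A checks. A sketch of how the new public functions compose, using only the signatures visible in this diff; the reader body, column names and file paths are illustrative (real columns must match DISDRODB L0A standards for the checks to pass):

import pandas as pd

from disdrodb.l0.l0a_processing import (
    read_l0a_dataframe,
    read_raw_text_file,
    read_raw_text_files,
    write_l0a,
)


def my_reader(filepath, logger=None):
    # Illustrative column names and pd.read_csv kwargs for a ';'-delimited file
    column_names = ["time", "raw_drop_number"]
    reader_kwargs = {"delimiter": ";", "header": None}
    df = read_raw_text_file(
        filepath=filepath,
        column_names=column_names,
        reader_kwargs=reader_kwargs,
        logger=logger,
    )
    # sanitize_df expects a 'time' column; parse it into datetimes
    df["time"] = pd.to_datetime(df["time"], errors="coerce")
    return df


# Reads each file with the reader, sanitizes it, concatenates the results,
# and enforces nanosecond time resolution for xarray compatibility.
df = read_raw_text_files(
    filepaths=["station1_2020.txt", "station1_2021.txt"],  # illustrative paths
    reader=my_reader,
    sensor_name="PARSIVEL2",
    verbose=True,
)

# Write the L0A Apache Parquet file (force=True overwrites existing data)
write_l0a(df, filepath="L0A.station1.parquet", force=True)

# Read L0A Parquet files back; debugging_mode limits to 3 files x 100 rows
df_l0a = read_l0a_dataframe("L0A.station1.parquet", debugging_mode=True)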