disdrodb 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. disdrodb/__init__.py +68 -34
  2. disdrodb/_config.py +5 -4
  3. disdrodb/_version.py +16 -3
  4. disdrodb/accessor/__init__.py +20 -0
  5. disdrodb/accessor/methods.py +125 -0
  6. disdrodb/api/checks.py +177 -24
  7. disdrodb/api/configs.py +3 -3
  8. disdrodb/api/info.py +13 -13
  9. disdrodb/api/io.py +281 -22
  10. disdrodb/api/path.py +184 -195
  11. disdrodb/api/search.py +18 -9
  12. disdrodb/cli/disdrodb_create_summary.py +103 -0
  13. disdrodb/cli/disdrodb_create_summary_station.py +91 -0
  14. disdrodb/cli/disdrodb_run_l0.py +1 -1
  15. disdrodb/cli/disdrodb_run_l0_station.py +1 -1
  16. disdrodb/cli/disdrodb_run_l0a_station.py +1 -1
  17. disdrodb/cli/disdrodb_run_l0b.py +1 -1
  18. disdrodb/cli/disdrodb_run_l0b_station.py +3 -3
  19. disdrodb/cli/disdrodb_run_l0c.py +1 -1
  20. disdrodb/cli/disdrodb_run_l0c_station.py +3 -3
  21. disdrodb/cli/disdrodb_run_l1_station.py +2 -2
  22. disdrodb/cli/disdrodb_run_l2e_station.py +2 -2
  23. disdrodb/cli/disdrodb_run_l2m_station.py +2 -2
  24. disdrodb/configs.py +149 -4
  25. disdrodb/constants.py +61 -0
  26. disdrodb/data_transfer/download_data.py +127 -11
  27. disdrodb/etc/configs/attributes.yaml +339 -0
  28. disdrodb/etc/configs/encodings.yaml +473 -0
  29. disdrodb/etc/products/L1/global.yaml +13 -0
  30. disdrodb/etc/products/L2E/10MIN.yaml +12 -0
  31. disdrodb/etc/products/L2E/1MIN.yaml +1 -0
  32. disdrodb/etc/products/L2E/global.yaml +22 -0
  33. disdrodb/etc/products/L2M/10MIN.yaml +12 -0
  34. disdrodb/etc/products/L2M/GAMMA_ML.yaml +8 -0
  35. disdrodb/etc/products/L2M/NGAMMA_GS_LOG_ND_MAE.yaml +6 -0
  36. disdrodb/etc/products/L2M/NGAMMA_GS_ND_MAE.yaml +6 -0
  37. disdrodb/etc/products/L2M/NGAMMA_GS_Z_MAE.yaml +6 -0
  38. disdrodb/etc/products/L2M/global.yaml +26 -0
  39. disdrodb/issue/writer.py +2 -0
  40. disdrodb/l0/__init__.py +13 -0
  41. disdrodb/l0/configs/LPM/l0b_cf_attrs.yml +4 -4
  42. disdrodb/l0/configs/PARSIVEL/l0b_cf_attrs.yml +1 -1
  43. disdrodb/l0/configs/PARSIVEL/l0b_encodings.yml +3 -3
  44. disdrodb/l0/configs/PARSIVEL/raw_data_format.yml +1 -1
  45. disdrodb/l0/configs/PARSIVEL2/l0b_cf_attrs.yml +5 -5
  46. disdrodb/l0/configs/PARSIVEL2/l0b_encodings.yml +3 -3
  47. disdrodb/l0/configs/PARSIVEL2/raw_data_format.yml +1 -1
  48. disdrodb/l0/configs/PWS100/l0b_cf_attrs.yml +4 -4
  49. disdrodb/l0/configs/PWS100/raw_data_format.yml +1 -1
  50. disdrodb/l0/l0a_processing.py +37 -32
  51. disdrodb/l0/l0b_nc_processing.py +118 -8
  52. disdrodb/l0/l0b_processing.py +30 -65
  53. disdrodb/l0/l0c_processing.py +369 -259
  54. disdrodb/l0/readers/LPM/ARM/ARM_LPM.py +7 -0
  55. disdrodb/l0/readers/LPM/NETHERLANDS/DELFT_LPM_NC.py +66 -0
  56. disdrodb/l0/readers/LPM/SLOVENIA/{CRNI_VRH.py → UL.py} +3 -0
  57. disdrodb/l0/readers/LPM/SWITZERLAND/INNERERIZ_LPM.py +195 -0
  58. disdrodb/l0/readers/PARSIVEL/GPM/PIERS.py +0 -2
  59. disdrodb/l0/readers/PARSIVEL/JAPAN/JMA.py +4 -1
  60. disdrodb/l0/readers/PARSIVEL/NCAR/PECAN_MOBILE.py +1 -1
  61. disdrodb/l0/readers/PARSIVEL/NCAR/VORTEX2_2009.py +1 -1
  62. disdrodb/l0/readers/PARSIVEL2/ARM/ARM_PARSIVEL2.py +4 -0
  63. disdrodb/l0/readers/PARSIVEL2/BELGIUM/ILVO.py +168 -0
  64. disdrodb/l0/readers/PARSIVEL2/CANADA/UQAM_NC.py +69 -0
  65. disdrodb/l0/readers/PARSIVEL2/DENMARK/DTU.py +165 -0
  66. disdrodb/l0/readers/PARSIVEL2/FINLAND/FMI_PARSIVEL2.py +69 -0
  67. disdrodb/l0/readers/PARSIVEL2/FRANCE/ENPC_PARSIVEL2.py +255 -134
  68. disdrodb/l0/readers/PARSIVEL2/FRANCE/OSUG.py +525 -0
  69. disdrodb/l0/readers/PARSIVEL2/FRANCE/SIRTA_PARSIVEL2.py +1 -1
  70. disdrodb/l0/readers/PARSIVEL2/GPM/GCPEX.py +9 -7
  71. disdrodb/l0/readers/PARSIVEL2/KIT/BURKINA_FASO.py +1 -1
  72. disdrodb/l0/readers/PARSIVEL2/KIT/TEAMX.py +123 -0
  73. disdrodb/l0/readers/PARSIVEL2/{NETHERLANDS/DELFT.py → MPI/BCO_PARSIVEL2.py} +41 -71
  74. disdrodb/l0/readers/PARSIVEL2/MPI/BOWTIE.py +220 -0
  75. disdrodb/l0/readers/PARSIVEL2/NASA/APU.py +120 -0
  76. disdrodb/l0/readers/PARSIVEL2/NASA/LPVEX.py +109 -0
  77. disdrodb/l0/readers/PARSIVEL2/NCAR/FARM_PARSIVEL2.py +1 -0
  78. disdrodb/l0/readers/PARSIVEL2/NCAR/PECAN_FP3.py +1 -1
  79. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_MIPS.py +126 -0
  80. disdrodb/l0/readers/PARSIVEL2/NCAR/PERILS_PIPS.py +165 -0
  81. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_P2.py +1 -1
  82. disdrodb/l0/readers/PARSIVEL2/NCAR/VORTEX_SE_2016_PIPS.py +20 -12
  83. disdrodb/l0/readers/PARSIVEL2/NETHERLANDS/DELFT_NC.py +5 -0
  84. disdrodb/l0/readers/PARSIVEL2/SPAIN/CENER.py +144 -0
  85. disdrodb/l0/readers/PARSIVEL2/SPAIN/CR1000DL.py +201 -0
  86. disdrodb/l0/readers/PARSIVEL2/SPAIN/LIAISE.py +137 -0
  87. disdrodb/l0/readers/PARSIVEL2/USA/C3WE.py +146 -0
  88. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100.py +105 -99
  89. disdrodb/l0/readers/PWS100/FRANCE/ENPC_PWS100_SIRTA.py +151 -0
  90. disdrodb/l1/__init__.py +5 -0
  91. disdrodb/l1/fall_velocity.py +46 -0
  92. disdrodb/l1/filters.py +34 -20
  93. disdrodb/l1/processing.py +46 -45
  94. disdrodb/l1/resampling.py +77 -66
  95. disdrodb/l1_env/routines.py +18 -3
  96. disdrodb/l2/__init__.py +7 -0
  97. disdrodb/l2/empirical_dsd.py +58 -10
  98. disdrodb/l2/processing.py +268 -117
  99. disdrodb/metadata/checks.py +132 -125
  100. disdrodb/metadata/standards.py +3 -1
  101. disdrodb/psd/fitting.py +631 -345
  102. disdrodb/psd/models.py +9 -6
  103. disdrodb/routines/__init__.py +54 -0
  104. disdrodb/{l0/routines.py → routines/l0.py} +316 -355
  105. disdrodb/{l1/routines.py → routines/l1.py} +76 -116
  106. disdrodb/routines/l2.py +1019 -0
  107. disdrodb/{routines.py → routines/wrappers.py} +98 -10
  108. disdrodb/scattering/__init__.py +16 -4
  109. disdrodb/scattering/axis_ratio.py +61 -37
  110. disdrodb/scattering/permittivity.py +504 -0
  111. disdrodb/scattering/routines.py +746 -184
  112. disdrodb/summary/__init__.py +17 -0
  113. disdrodb/summary/routines.py +4196 -0
  114. disdrodb/utils/archiving.py +434 -0
  115. disdrodb/utils/attrs.py +68 -125
  116. disdrodb/utils/cli.py +5 -5
  117. disdrodb/utils/compression.py +30 -1
  118. disdrodb/utils/dask.py +121 -9
  119. disdrodb/utils/dataframe.py +61 -7
  120. disdrodb/utils/decorators.py +31 -0
  121. disdrodb/utils/directories.py +35 -15
  122. disdrodb/utils/encoding.py +37 -19
  123. disdrodb/{l2 → utils}/event.py +15 -173
  124. disdrodb/utils/logger.py +14 -7
  125. disdrodb/utils/manipulations.py +81 -0
  126. disdrodb/utils/routines.py +166 -0
  127. disdrodb/utils/subsetting.py +214 -0
  128. disdrodb/utils/time.py +35 -177
  129. disdrodb/utils/writer.py +20 -7
  130. disdrodb/utils/xarray.py +5 -4
  131. disdrodb/viz/__init__.py +13 -0
  132. disdrodb/viz/plots.py +398 -0
  133. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/METADATA +4 -3
  134. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/RECORD +139 -98
  135. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/entry_points.txt +2 -0
  136. disdrodb/l1/encoding_attrs.py +0 -642
  137. disdrodb/l2/processing_options.py +0 -213
  138. disdrodb/l2/routines.py +0 -868
  139. /disdrodb/l0/readers/PARSIVEL/SLOVENIA/{UL_FGG.py → UL.py} +0 -0
  140. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/WHEEL +0 -0
  141. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/licenses/LICENSE +0 -0
  142. {disdrodb-0.1.2.dist-info → disdrodb-0.1.4.dist-info}/top_level.txt +0 -0
@@ -47,7 +47,7 @@ number_particles:
47
47
  sensor_temperature:
48
48
  description: Temperature in sensor housing
49
49
  long_name: Temperature of the sensor
50
- units: "C"
50
+ units: "degC"
51
51
  sensor_serial_number:
52
52
  description: Sensor serial number
53
53
  long_name: Serial number of the sensor
@@ -105,15 +105,15 @@ error_code:
105
105
  sensor_temperature_pcb:
106
106
  description: Temperature in printed circuit board
107
107
  long_name: Sensor PCB temperature
108
- units: "C"
108
+ units: "degC"
109
109
  sensor_temperature_receiver:
110
110
  description: Temperature in right sensor head
111
111
  long_name: Sensor receiver temperature
112
- units: "C"
112
+ units: "degC"
113
113
  sensor_temperature_trasmitter:
114
114
  description: Temperature in left sensor head
115
115
  long_name: Sensor trasmitter temperature
116
- units: "C"
116
+ units: "degC"
117
117
  rainfall_rate_16_bit_30:
118
118
  description: Rainfall rate
119
119
  long_name: Rainfall rate max 30 mm/h 16 bit
@@ -161,7 +161,7 @@ raw_drop_number:
161
161
  air_temperature:
162
162
  description: "Air temperature in degrees Celsius (C)"
163
163
  long_name: Air temperature
164
- units: "C"
164
+ units: "degC"
165
165
  relative_humidity:
166
166
  description: "Relative humidity in percent (%)"
167
167
  long_name: Relative humidity
@@ -102,7 +102,7 @@ sensor_temperature:
102
102
  chunksizes: 5000
103
103
  _FillValue: 127
104
104
  sensor_serial_number:
105
- dtype: object
105
+ dtype: str
106
106
  zlib: false
107
107
  complevel: 3
108
108
  shuffle: true
@@ -110,7 +110,7 @@ sensor_serial_number:
110
110
  contiguous: false
111
111
  chunksizes: 5000
112
112
  firmware_iop:
113
- dtype: object
113
+ dtype: str
114
114
  zlib: false
115
115
  complevel: 3
116
116
  shuffle: true
@@ -118,7 +118,7 @@ firmware_iop:
118
118
  contiguous: false
119
119
  chunksizes: 5000
120
120
  firmware_dsp:
121
- dtype: object
121
+ dtype: str
122
122
  zlib: false
123
123
  complevel: 3
124
124
  shuffle: true
@@ -15,7 +15,7 @@ rainfall_accumulated_32bit:
15
15
  n_naturals: 4
16
16
  data_range:
17
17
  - 0
18
- - 300.0
18
+ - 9999.0
19
19
  nan_flags: null
20
20
  field_number: "02"
21
21
  weather_code_synop_4680:
@@ -25,7 +25,7 @@ sensor_status:
25
25
  air_temperature:
26
26
  description: "Air temperature in degrees Celsius"
27
27
  long_name: Air temperature
28
- units: "C"
28
+ units: "degC"
29
29
  relative_humidity:
30
30
  description: "Relative humidity in percent (%)"
31
31
  long_name: Relative humidity
@@ -33,15 +33,15 @@ relative_humidity:
33
33
  wetbulb_temperature:
34
34
  description: "Wet bulb temperature in degrees Celsius"
35
35
  long_name: Wet bulb temperature
36
- units: "C"
36
+ units: "degC"
37
37
  air_temperature_max:
38
38
  description: "Maximum air temperature in degrees Celsius"
39
39
  long_name: Maximum air temperature
40
- units: "C"
40
+ units: "degC"
41
41
  air_temperature_min:
42
42
  description: "Minimum air temperature in degrees Celsius"
43
43
  long_name: Minimum air temperature
44
- units: "C"
44
+ units: "degC"
45
45
  rainfall_rate:
46
46
  description: Rainfall rate
47
47
  long_name: Rainfall rate
@@ -5,7 +5,7 @@ mor_visibility:
5
5
  n_naturals: 4
6
6
  data_range:
7
7
  - 0
8
- - 9999.9
8
+ - 20000
9
9
  nan_flags: null
10
10
  field_number: "20"
11
11
  weather_code_synop_4680:
@@ -18,13 +18,13 @@
18
18
  # -----------------------------------------------------------------------------.
19
19
  """Functions to process raw text files into DISDRODB L0A Apache Parquet."""
20
20
 
21
-
22
21
  import logging
23
22
  import os
24
23
  from typing import Union
25
24
 
26
25
  import numpy as np
27
26
  import pandas as pd
27
+ import pyarrow.parquet as pq
28
28
 
29
29
  from disdrodb.l0.check_standards import check_l0a_column_names, check_l0a_standards
30
30
  from disdrodb.l0.l0b_processing import infer_split_str
@@ -130,11 +130,15 @@ def read_raw_text_file(
130
130
  try:
131
131
  df = pd.read_csv(filepath, names=column_names, dtype=dtype, **reader_kwargs)
132
132
  except pd.errors.EmptyDataError:
133
+ # if isinstance(filepath, zipfile.ZipExtFile):
134
+ # filepath = filepath.name
133
135
  msg = f"The following file is empty: {filepath}"
134
136
  raise ValueError(msg)
135
137
 
136
138
  # Check the dataframe is not empty
137
139
  if len(df.index) == 0:
140
+ # if isinstance(filepath, zipfile.ZipExtFile):
141
+ # filepath = filepath.name
138
142
  msg = f"The following file is empty: {filepath}"
139
143
  raise ValueError(msg)
140
144
 
@@ -265,13 +269,15 @@ def remove_issue_timesteps(df, issue_dict, logger=None, verbose=False):
265
269
  # Retrieve timesteps and time_periods
266
270
  timesteps = issue_dict.get("timesteps", None)
267
271
  time_periods = issue_dict.get("time_periods", None)
272
+ timesteps = [] if timesteps is None else timesteps
273
+ time_periods = [] if time_periods is None else time_periods
268
274
 
269
275
  # Drop rows of specified timesteps
270
- if timesteps:
276
+ if len(timesteps) > 0:
271
277
  df = drop_timesteps(df=df, timesteps=timesteps)
272
278
 
273
279
  # Drop rows within specified time_period
274
- if time_periods:
280
+ if len(time_periods) > 0:
275
281
  df = drop_time_periods(df, time_periods=time_periods)
276
282
 
277
283
  # Report number of dropped rows
@@ -413,6 +419,8 @@ def is_raw_array_string_not_corrupted(string):
413
419
  """Check if the raw array is corrupted."""
414
420
  if not isinstance(string, str):
415
421
  return False
422
+ if string in ["", "NAN", "NaN"]:
423
+ return True
416
424
  split_str = infer_split_str(string=string)
417
425
  list_values = string.split(split_str)
418
426
  values = pd.to_numeric(list_values, errors="coerce")
@@ -625,6 +633,9 @@ def sanitize_df(
625
633
  # - Sort by time
626
634
  df = df.sort_values("time")
627
635
 
636
+ # - Drop index
637
+ df = df.reset_index(drop=True)
638
+
628
639
  # ------------------------------------------------------.
629
640
  # - Check column names agrees to DISDRODB standards
630
641
  check_l0a_column_names(df, sensor_name=sensor_name)
@@ -755,24 +766,8 @@ def concatenate_dataframe(list_df: list, logger=None, verbose: bool = False) ->
755
766
  return df
756
767
 
757
768
 
758
- def _read_l0a(filepath: str, verbose: bool = False, logger=None, debugging_mode: bool = False) -> pd.DataFrame:
759
- # Log
760
- msg = f"Reading L0 Apache Parquet file at {filepath} started."
761
- log_info(logger=logger, msg=msg, verbose=verbose)
762
- # Open file
763
- df = pd.read_parquet(filepath)
764
- if debugging_mode:
765
- df = df.iloc[0:100]
766
- # Log
767
- msg = f"Reading L0 Apache Parquet file at {filepath} ended."
768
- log_info(logger=logger, msg=msg, verbose=verbose)
769
- return df
770
-
771
-
772
769
  def read_l0a_dataframe(
773
770
  filepaths: Union[str, list],
774
- verbose: bool = False,
775
- logger=None,
776
771
  debugging_mode: bool = False,
777
772
  ) -> pd.DataFrame:
778
773
  """Read DISDRODB L0A Apache Parquet file(s).
@@ -781,13 +776,10 @@ def read_l0a_dataframe(
781
776
  ----------
782
777
  filepaths : str or list
783
778
  Either a list or a single filepath.
784
- verbose : bool
785
- Whether to print detailed processing information into terminal.
786
- The default is ``False``.
787
779
  debugging_mode : bool
788
780
  If ``True``, it reduces the amount of data to process.
789
781
  If filepaths is a list, it reads only the first 3 files.
790
- For each file it select only the first 100 rows.
782
+ It selects only 100 rows sampled from the first 3 files.
791
783
  The default is ``False``.
792
784
 
793
785
  Returns
@@ -796,8 +788,6 @@ def read_l0a_dataframe(
796
788
  L0A Dataframe.
797
789
 
798
790
  """
799
- from disdrodb.l0.l0a_processing import concatenate_dataframe
800
-
801
791
  # ----------------------------------------
802
792
  # Check filepaths validity
803
793
  if not isinstance(filepaths, (list, str)):
@@ -814,16 +804,22 @@ def read_l0a_dataframe(
814
804
 
815
805
  # ---------------------------------------------------
816
806
  # Define the list of dataframe
817
- list_df = [
818
- _read_l0a(filepath, verbose=verbose, logger=logger, debugging_mode=debugging_mode) for filepath in filepaths
819
- ]
807
+ df = pq.ParquetDataset(filepaths).read().to_pandas()
820
808
 
821
- # Concatenate dataframe
822
- df = concatenate_dataframe(list_df, logger=logger, verbose=verbose)
809
+ # Reduce rows
810
+ if debugging_mode:
811
+ n_rows = min(100, len(df))
812
+ df = df.sample(n=n_rows)
823
813
 
824
814
  # Ensure time is in nanoseconds
825
815
  df["time"] = df["time"].astype("M8[ns]")
826
816
 
817
+ # Ensure sorted by time
818
+ df = df.sort_values(by="time")
819
+
820
+ # Ensure no index
821
+ df = df.reset_index(drop=True)
822
+
827
823
  # ---------------------------------------------------
828
824
  # Return dataframe
829
825
  return df
@@ -833,14 +829,15 @@ def read_l0a_dataframe(
833
829
  #### L0A Utility
834
830
 
835
831
 
836
- def read_raw_text_files(
832
+ def generate_l0a(
837
833
  filepaths: Union[list, str],
838
834
  reader,
839
835
  sensor_name,
836
+ issue_dict=None,
840
837
  verbose=True,
841
838
  logger=None,
842
839
  ) -> pd.DataFrame:
843
- """Read and parse a list for raw files into a dataframe.
840
+ """Read and parse a list of raw files and generate a DISDRODB L0A dataframe.
844
841
 
845
842
  Parameters
846
843
  ----------
@@ -851,6 +848,13 @@ def read_raw_text_files(
851
848
  Format: reader(filepath, logger=None)
852
849
  sensor_name : str
853
850
  Name of the sensor.
851
+ issue_dict : dict, optional
852
+ Issue dictionary providing information on timesteps to remove.
853
+ The default is ``None``.
854
+ Valid issue_dict keys are ``'timesteps'`` and ``'time_periods'``.
855
+ Valid issue_dict values are list of datetime64 values (with second accuracy).
856
+ To correctly format and check the validity of the ``issue_dict``, use
857
+ the ``disdrodb.l0.issue.check_issue_dict`` function.
854
858
  verbose : bool
855
859
  Whether to verbose the processing. The default is ``True``.
856
860
 
@@ -886,6 +890,7 @@ def read_raw_text_files(
886
890
  df = sanitize_df(
887
891
  df=df,
888
892
  sensor_name=sensor_name,
893
+ issue_dict=issue_dict,
889
894
  logger=logger,
890
895
  verbose=verbose,
891
896
  )
@@ -19,6 +19,7 @@
19
19
  """Functions to process DISDRODB raw netCDF files into DISDRODB L0B netCDF files."""
20
20
 
21
21
  import logging
22
+ from typing import Union
22
23
 
23
24
  import numpy as np
24
25
 
@@ -33,8 +34,8 @@ from disdrodb.l0.standards import (
33
34
  get_valid_variable_names,
34
35
  )
35
36
  from disdrodb.utils.logger import (
37
+ log_error,
36
38
  # log_warning,
37
- # log_debug,
38
39
  log_info,
39
40
  )
40
41
 
@@ -169,6 +170,8 @@ def standardize_raw_dataset(ds, dict_names, sensor_name):
169
170
 
170
171
  # If missing variables, infill with NaN array
171
172
  missing_vars = _get_missing_variables(ds, dict_names, sensor_name)
173
+ if "raw_drop_number" in missing_vars:
174
+ raise ValueError("The raw drop spectrum is not present in the netCDF file!")
172
175
  if len(missing_vars) > 0:
173
176
  ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name)
174
177
 
@@ -343,7 +346,7 @@ def drop_timesteps(ds, timesteps: list):
343
346
  # Ensure there's at least one timestep left
344
347
  if ds_filtered.sizes.get("time", 0) == 0:
345
348
  raise ValueError(
346
- "No timesteps left after removing problematic timesteps. " "Maybe you need to adjust the issue YAML file.",
349
+ "No timesteps left after removing problematic timesteps. Maybe you need to adjust the issue YAML file.",
347
350
  )
348
351
  return ds_filtered
349
352
 
@@ -419,16 +422,21 @@ def remove_issue_timesteps(
419
422
  ValueError
420
423
  If after removing specified timesteps/periods no data remains.
421
424
  """
425
+ # Retrieve number of initial rows
422
426
  n_initial = ds.sizes.get("time", 0)
423
- timesteps = issue_dict.get("timesteps", []) or []
424
- time_periods = issue_dict.get("time_periods", []) or []
427
+
428
+ # Retrieve timesteps and time_periods
429
+ timesteps = issue_dict.get("timesteps")
430
+ time_periods = issue_dict.get("time_periods")
431
+ timesteps = [] if timesteps is None else timesteps
432
+ time_periods = [] if time_periods is None else time_periods
425
433
 
426
434
  # Drop individual timesteps
427
- if timesteps:
435
+ if len(timesteps) > 0:
428
436
  ds = drop_timesteps(ds, timesteps)
429
437
 
430
438
  # Drop intervals of time
431
- if time_periods:
439
+ if len(time_periods) > 0:
432
440
  ds = drop_time_periods(ds, time_periods)
433
441
 
434
442
  # Report number dropped
@@ -454,8 +462,8 @@ def sanitize_ds(
454
462
  ----------
455
463
  ds : xarray.Dataset
456
464
  Raw xarray dataset
457
- attrs: dict
458
- Global metadata to attach as global attributes to the xr.Dataset.
465
+ metadata: dict
466
+ Station metadata to attach as global attributes to the xr.Dataset.
459
467
  sensor_name : str
460
468
  Name of the sensor.
461
469
  verbose : bool
@@ -525,3 +533,105 @@ def open_raw_netcdf_file(
525
533
  # Log information
526
534
  log_info(logger=logger, msg=f"netCDF file {filepath} has been loaded successively into xarray.", verbose=False)
527
535
  return ds
536
+
537
+
538
+ def generate_l0b_from_nc(
539
+ filepaths: Union[list, str],
540
+ reader,
541
+ sensor_name,
542
+ metadata,
543
+ issue_dict=None,
544
+ verbose=True,
545
+ logger=None,
546
+ ):
547
+ """Read and parse a list of raw netCDF files and generate a DISDRODB L0B dataset.
548
+
549
+ Parameters
550
+ ----------
551
+ filepaths : Union[list,str]
552
+ File(s) path(s)
553
+ reader:
554
+ DISDRODB reader function.
555
+ Format: reader(filepath, logger=None)
556
+ sensor_name : str
557
+ Name of the sensor.
558
+ metadata: dict
559
+ Station metadata to attach as global attributes to the xr.Dataset.
560
+ issue_dict : dict, optional
561
+ Issue dictionary providing information on timesteps to remove.
562
+ The default is ``None``.
563
+ Valid issue_dict keys are ``'timesteps'`` and ``'time_periods'``.
564
+ Valid issue_dict values are list of datetime64 values (with second accuracy).
565
+ To correctly format and check the validity of the ``issue_dict``, use
566
+ the ``disdrodb.l0.issue.check_issue_dict`` function.
567
+ verbose : bool
568
+ Whether to verbose the processing. The default is ``True``.
569
+
570
+ Returns
571
+ -------
572
+ xarray.Dataset
573
+ DISDRODB L0B Dataset.
574
+
575
+ Raises
576
+ ------
577
+ ValueError
578
+ Input parameters can not be used or the raw file can not be processed.
579
+
580
+ """
581
+ import xarray as xr
582
+
583
+ # Check input list
584
+ if isinstance(filepaths, str):
585
+ filepaths = [filepaths]
586
+ if len(filepaths) == 0:
587
+ raise ValueError("'filepaths' must contains at least 1 filepath.")
588
+
589
+ # ------------------------------------------------------.
590
+ # Loop over all raw files
591
+ n_files = len(filepaths)
592
+ processed_file_counter = 0
593
+ list_skipped_files_msg = []
594
+ list_ds = []
595
+ for filepath in filepaths:
596
+ # Try read the raw netCDF file
597
+ try:
598
+ ds = reader(filepath, logger=logger)
599
+ # Sanitize the dataset
600
+ ds = sanitize_ds(
601
+ ds=ds,
602
+ sensor_name=sensor_name,
603
+ metadata=metadata,
604
+ issue_dict=issue_dict,
605
+ verbose=verbose,
606
+ logger=logger,
607
+ )
608
+ # Append dataset to the list
609
+ list_ds.append(ds)
610
+ # Update the logger
611
+ processed_file_counter += 1
612
+ msg = f"Raw file '{filepath}' processed successfully ({processed_file_counter}/{n_files})."
613
+ log_info(logger=logger, msg=msg, verbose=verbose)
614
+
615
+ # Skip the file if the processing fails
616
+ except Exception as e:
617
+ # Update the logger
618
+ msg = f"{filepath} has been skipped. The error is: {e}."
619
+ log_error(logger=logger, msg=msg, verbose=verbose)
620
+ list_skipped_files_msg.append(msg)
621
+
622
+ # Update logger
623
+ msg = f"{len(list_skipped_files_msg)} of {n_files} have been skipped."
624
+ log_info(logger=logger, msg=msg, verbose=verbose)
625
+
626
+ # Check if there are files to concatenate
627
+ if len(list_ds) == 0:
628
+ raise ValueError("Any raw file could be read!")
629
+
630
+ ##----------------------------------------------------------------.
631
+ # Concatenate the datasets
632
+ list_ds = [ds.chunk({"time": -1}) for ds in list_ds]
633
+ ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby("time")
634
+ ds = ds.compute()
635
+
636
+ # Return the dataset
637
+ return ds
@@ -19,7 +19,6 @@
19
19
  """Functions to process DISDRODB L0A files into DISDRODB L0B netCDF files."""
20
20
 
21
21
  import logging
22
- import os
23
22
 
24
23
  import numpy as np
25
24
  import pandas as pd
@@ -43,13 +42,8 @@ from disdrodb.utils.attrs import (
43
42
  set_coordinate_attributes,
44
43
  set_disdrodb_attrs,
45
44
  )
46
- from disdrodb.utils.directories import create_directory, remove_if_exists
47
45
  from disdrodb.utils.encoding import set_encodings
48
- from disdrodb.utils.logger import (
49
- # log_warning,
50
- # log_debug,
51
- log_info,
52
- )
46
+ from disdrodb.utils.logger import log_info
53
47
  from disdrodb.utils.time import ensure_sorted_by_time
54
48
 
55
49
  logger = logging.getLogger(__name__)
@@ -246,12 +240,20 @@ def retrieve_l0b_arrays(
246
240
  unavailable_keys.append(key)
247
241
  continue
248
242
 
249
- # Ensure is a string
250
- df_series = df[key].astype(str)
243
+ # Ensure is a string, get a numpy array for each row and then stack
244
+ # - Option 1: Clear but lot of copies
245
+ # df_series = df[key].astype(str)
246
+ # list_arr = df_series.apply(_format_string_array, n_values=n_values)
247
+ # arr = np.stack(list_arr, axis=0)
248
+
249
+ # - Option 2: still copies
250
+ # arr = np.vstack(_format_string_array(s, n_values=n_values) for s in df_series.astype(str))
251
251
 
252
- # Get a numpy array for each row and then stack
253
- list_arr = df_series.apply(_format_string_array, n_values=n_values)
254
- arr = np.stack(list_arr, axis=0)
252
+ # - Option 3: more memory efficient
253
+ n_timesteps = len(df[key])
254
+ arr = np.empty((n_timesteps, n_values), dtype=float) # preallocates
255
+ for i, s in enumerate(df[key].astype(str)):
256
+ arr[i, :] = _format_string_array(s, n_values=n_values)
255
257
 
256
258
  # Retrieve dimensions
257
259
  dims_order = dims_order_dict[key]
@@ -333,18 +335,6 @@ def _set_variable_attributes(ds: xr.Dataset, sensor_name: str) -> xr.Dataset:
333
335
  return ds
334
336
 
335
337
 
336
- def _set_dataset_attrs(ds, sensor_name):
337
- """Set variable and coordinates attributes."""
338
- # - Add netCDF variable attributes
339
- # --> Attributes: long_name, units, descriptions, valid_min, valid_max
340
- ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name)
341
- # - Add netCDF coordinate attributes
342
- ds = set_coordinate_attributes(ds=ds)
343
- # - Set DISDRODB global attributes
344
- ds = set_disdrodb_attrs(ds=ds, product="L0B")
345
- return ds
346
-
347
-
348
338
  def add_dataset_crs_coords(ds):
349
339
  """Add the CRS coordinate to the xr.Dataset."""
350
340
  # TODO: define CF-compliant CRS !
@@ -386,13 +376,13 @@ def _define_dataset_variables(df, sensor_name, logger=None, verbose=False):
386
376
  return data_vars
387
377
 
388
378
 
389
- def create_l0b_from_l0a(
379
+ def generate_l0b(
390
380
  df: pd.DataFrame,
391
381
  metadata: dict,
392
382
  logger=None,
393
383
  verbose: bool = False,
394
384
  ) -> xr.Dataset:
395
- """Transform the L0A dataframe to the L0B xr.Dataset.
385
+ """Transform the DISDRODB L0A dataframe to the DISDRODB L0B xr.Dataset.
396
386
 
397
387
  Parameters
398
388
  ----------
@@ -475,16 +465,25 @@ def finalize_dataset(ds, sensor_name, metadata):
475
465
  ds = add_dataset_crs_coords(ds)
476
466
 
477
467
  # Set netCDF dimension order
468
+ # --> Required for correct encoding !
478
469
  ds = ds.transpose("time", "diameter_bin_center", ...)
479
470
 
480
- # Add netCDF variable and coordinate attributes
481
- ds = _set_dataset_attrs(ds, sensor_name)
482
-
483
471
  # Ensure variables with dtype object are converted to string
484
472
  ds = _convert_object_variables_to_string(ds)
485
473
 
474
+ # Add netCDF variable and coordinate attributes
475
+ # - Add variable attributes: long_name, units, descriptions, valid_min, valid_max
476
+ ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name)
477
+ # - Add netCDF coordinate attributes
478
+ ds = set_coordinate_attributes(ds=ds)
479
+ # - Set DISDRODB global attributes
480
+ ds = set_disdrodb_attrs(ds=ds, product="L0B")
481
+
486
482
  # Check L0B standards
487
483
  check_l0b_standards(ds)
484
+
485
+ # Set L0B encodings
486
+ ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
488
487
  return ds
489
488
 
490
489
 
@@ -503,43 +502,9 @@ def set_l0b_encodings(ds: xr.Dataset, sensor_name: str):
503
502
  xarray.Dataset
504
503
  Output xarray dataset.
505
504
  """
506
- encoding_dict = get_l0b_encodings_dict(sensor_name)
507
- ds = set_encodings(ds=ds, encoding_dict=encoding_dict)
505
+ encodings_dict = get_l0b_encodings_dict(sensor_name)
506
+ ds = set_encodings(ds=ds, encodings_dict=encodings_dict)
508
507
  return ds
509
508
 
510
509
 
511
- def write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None:
512
- """Save the xarray dataset into a NetCDF file.
513
-
514
- Parameters
515
- ----------
516
- ds : xarray.Dataset
517
- Input xarray dataset.
518
- filepath : str
519
- Output file path.
520
- sensor_name : str
521
- Name of the sensor.
522
- force : bool, optional
523
- Whether to overwrite existing data.
524
- If ``True``, overwrite existing data into destination directories.
525
- If ``False``, raise an error if there are already data into destination directories. This is the default.
526
- """
527
- # Create station directory if does not exist
528
- create_directory(os.path.dirname(filepath))
529
-
530
- # Check if the file already exists
531
- # - If force=True --> Remove it
532
- # - If force=False --> Raise error
533
- remove_if_exists(filepath, force=force)
534
-
535
- # Get sensor name from dataset
536
- sensor_name = ds.attrs.get("sensor_name")
537
-
538
- # Set encodings
539
- ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
540
-
541
- # Write netcdf
542
- ds.to_netcdf(filepath, engine="netcdf4")
543
-
544
-
545
510
  ####--------------------------------------------------------------------------.