mrio-toolbox 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,2 @@
+ from . import xarray
+ from . import pandas
@@ -0,0 +1,245 @@
+ """
+ Routines for converting between Pandas DataFrames and Parts objects.
+ """
+ 
+ import pandas as pd
+ import numpy as np
+ 
+ def to_pandas(part):
+     """Return the current Part object as a Pandas DataFrame
+ 
+     Only applicable to Parts objects with 1 or 2 dimensions.
+     """
+     if part.ndim > 2:
+         raise ValueError(
+             f"Cannot convert a Part with {part.ndim} dimensions to a DataFrame."
+         )
+     elif part.ndim == 2:
+         return pd.DataFrame(part.data,
+                             index=part.axes[0].label(True),
+                             columns=part.axes[1].label(True))
+     else:
+         return pd.DataFrame(part.data,
+                             index=part.axes[0].label(True))
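+ 
+ #Usage sketch: a DataFrame round trip, assuming `part` is a labelled
+ #1- or 2-D Part instance:
+ #
+ #    df = to_pandas(part)        #axis labels become the index/columns
+ #    part_data = make_part(df)   #data dict to rebuild a Part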
+ 
+ def make_part(df,name="from_df",
+               label_detection=False,
+               **kwargs):
+     """Load a Part object from a Pandas DataFrame
+ 
+     Parameters
+     ----------
+     df : DataFrame
+         DataFrame to load
+     name : str, optional
+         Name of the created Part, by default "from_df"
+     label_detection : bool, optional
+         Automatically detect labels, by default False.
+         If True, the DataFrame is scanned to detect labels
+         (defined as non-numeric data).
+ 
+     Returns
+     -------
+     dict
+         Data required to create the Part object
+     """
+     part_data = dict()
+     if label_detection:
+         df = autodecode_labels(df)
+     part_data["data"] = df.to_numpy()
+     ndim = df.ndim
+ 
+     labels = [convert_labels(df.index)]
+     if ndim == 2:
+         labels.append(convert_labels(df.columns))
+     labels = disambiguate_labels(labels)
+     part_data["labels"] = labels
+     part_data["groupings"] = kwargs.pop("groupings",dict())
+     part_data["metadata"] = kwargs.pop("metadata",dict())
+     part_data["name"] = name
+     for key in kwargs:
+         part_data["metadata"][key] = kwargs[key]
+     return part_data
+ 
+ def autodecode_labels(df):
+     """Automatically detect the labels from a DataFrame
+ 
+     This is done by identifying the indices and columns
+     with non-numeric values.
+     """
+     def test_selection(df,row,col):
+         """Test whether the block below and right of (row,col) is numeric"""
+         try:
+             for value in df.iloc[row:,col:].to_numpy().ravel():
+                 pd.to_numeric(value)
+             return True
+         except (ValueError,TypeError):
+             return False
+ 
+     def try_reduce(df,row,col):
+         """Try shrinking the rectangle by moving its corner down or right"""
+         if test_selection(df,row+1,col):
+             return row+1,col
+         elif test_selection(df,row,col+1):
+             return row,col+1
+         else:
+             return row+1,col+1
+ 
+     def try_expand(df,row,col):
+         """Try expanding the rectangle to the left or up"""
+         if row > 0 and test_selection(df,row-1,col):
+             return row-1,col
+         elif col > 0 and test_selection(df,row,col-1):
+             return row,col-1
+         else:
+             return row,col
+ 
+     def find_rectangle(df):
+         """Find the largest rectangle with only numeric data"""
+         row = 0
+         col = 0
+         while not test_selection(df,row,col):
+             row,col = try_reduce(df,row,col)
+         #After the first while loop, we found only numeric data
+         #We now expand to the top and the left
+         #to make sure we didn't crop numerical data
+         expanded = try_expand(df,row,col)
+         while expanded != (row,col):
+             row,col = expanded
+             expanded = try_expand(df,row,col)
+         return row,col
+ 
+     #First, we find the largest rectangle with only numeric data
+     row,col = find_rectangle(df)
+ 
+     #And we remove potential NaN axes and ensure types are ok
+     data = pd.DataFrame(
+         data=df.iloc[row:,col:],
+         dtype=np.float64)
+ 
+     #We count all-NaN axes as they offset label names
+     row_offset = data.isna().all(axis=1).sum()
+     col_offset = data.isna().all(axis=0).sum()
+ 
+     data = data.dropna(axis=0,how="all")
+     data = data.dropna(axis=1,how="all")
+ 
+     #Then, we build the labels
+     #Column labels come from the header rows above the data block
+     if row > 0:
+         if col > 0:
+             col_names = df.iloc[:row,col-1+col_offset].to_list()
+         else:
+             col_names = [None]*row
+         if row > 1:
+             labels = []
+             sel = df.iloc[:row,col:].transpose()
+             for column in sel.columns:
+                 labels.append(sel[column].dropna().unique())
+             columns = pd.MultiIndex.from_product(
+                 labels,
+                 names=col_names)
+         else:
+             columns = pd.Index(
+                 df.iloc[:row,col:].values.flatten(),
+                 name=col_names[0]
+             )
+     else:
+         columns = None
+     #Row labels come from the columns left of the data block
+     if col > 0:
+         if row > 0:
+             index_names = df.iloc[row-1+row_offset,:col].to_list()
+         else:
+             index_names = [None]*col
+         if col > 1:
+             labels = []
+             sel = df.iloc[row+row_offset:,:col]
+             for column in sel.columns:
+                 labels.append(
+                     list(sel[column].dropna().unique())
+                 )
+             index = pd.MultiIndex.from_product(
+                 labels,
+                 names=index_names)
+         else:
+             index = pd.Index(
+                 list(df.iloc[row:,:col].values.flatten()),
+                 name=index_names[0]
+             )
+     else:
+         index = None
+ 
+     #We build the formatted DataFrame
+     output = pd.DataFrame(
+         data=data.values,
+         columns=columns,
+         index=index
+     )
+ 
+     return output
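+ 
+ #Sketch of the kind of raw sheet autodecode_labels targets, with one
+ #header row and one label column (hypothetical values):
+ #
+ #    raw = pd.DataFrame([
+ #        [None, "FRA", "DEU"],
+ #        ["crop", 1.0, 2.0],
+ #        ["fuel", 3.0, 4.0],
+ #    ])
+ #    autodecode_labels(raw)   #numeric block found at (row=1, col=1)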
+ 
+ def convert_labels(index):
+     """Convert a Pandas Index to a list of {name: values} label dictionaries
+ 
+     Parameters
+     ----------
+     index : Index
+         Pandas Index to convert
+     """
+     output = []
+     if isinstance(index,pd.MultiIndex):
+         for i in range(index.nlevels):
+             name = index.names[i]
+             if name is None:
+                 name = f"level_{i}"
+             output.append(
+                 {name : list(index.levels[i].values)}
+             )
+         return output
+     if index.name is None:
+         return [{0:list(index.array)}]
+     return [{index.name:list(index.array)}]
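+ 
+ #Sketch with a hypothetical two-level index:
+ #
+ #    idx = pd.MultiIndex.from_product(
+ #        [["FRA", "DEU"], ["crop", "fuel"]],
+ #        names=["region", "sector"])
+ #    convert_labels(idx)
+ #    #[{'region': ['FRA', 'DEU']}, {'sector': ['crop', 'fuel']}]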
+ 
+ def disambiguate_labels(labels):
+     """Disambiguate the labels
+ 
+     This allows solving label ambiguity if a name was incorrectly loaded:
+     a dimension whose values already appeared under another name
+     is mapped back to the name of the first occurrence.
+ 
+     Parameters
+     ----------
+     labels : list of list of dict of str:list of str
+         Labels to disambiguate, one list of {name: values} dicts per axis
+     """
+     ordered = []
+     cleared = dict()
+     values = []
+     for label in labels:
+         ordered.append([])
+         for level in range(len(label)):
+             name = list(label[level].keys())[0]
+             value = list(label[level].values())[0]
+             if name not in cleared.keys():
+                 if value in values:
+                     #We have a duplicate
+                     #We use the first occurrence as reference
+                     ref_name = list(cleared.keys())[
+                         list(cleared.values()).index(value)
+                     ]
+                     ordered[-1].append(
+                         {ref_name:value}
+                     )
+                     cleared[name] = value
+                     continue
+                 ordered[-1].append(label[level])
+                 cleared[name] = value
+                 values.append(value)
+             else:
+                 #The name was already cleared: keep the label as is
+                 ordered[-1].append(label[level])
+ 
+     return ordered
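+ 
+ #Sketch: the second axis was loaded under a default name but carries the
+ #same values as "sector", so the first name is reused (hypothetical data):
+ #
+ #    disambiguate_labels([
+ #        [{"sector": ["crop", "fuel"]}],
+ #        [{"level_0": ["crop", "fuel"]}],
+ #    ])
+ #    #[[{'sector': ['crop', 'fuel']}], [{'sector': ['crop', 'fuel']}]]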
@@ -0,0 +1,141 @@
+ """
+ Routines for converting between xarray DataArrays and Parts objects.
+ """
+ 
+ import pandas as pd
+ import xarray as xr
+ import numpy as np
+ 
+ def to_DataArray(part):
+     """
+     Convert a Part object to an xarray DataArray
+ 
+     Labels are directly passed to the DataArray as coords.
+ 
+     Returns
+     -------
+     xr.DataArray
+         Corresponding DataArray
+     """
+     developed = part.develop()
+     old_dims = part.get_dimensions()
+     new_dims = developed.get_dimensions()
+     if old_dims != new_dims:
+         #We encode the original dimensions in the metadata
+         #because netCDF files do not support multi-level attributes
+         original_dims = [
+             dim for axe in old_dims for dim in axe+["_sep_"]
+         ]
+         #The final slice removes the trailing separator
+         part.metadata["_original_dimensions"] = original_dims[:-1]
+     coords = list()
+     for axe in developed.axes:
+         coords.append(
+             axe.label(True)
+         )
+     return xr.DataArray(
+         data=developed.data,
+         name=part.name,
+         attrs=part.metadata,
+         coords=coords
+     )
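+ 
+ #Usage sketch, assuming `part` is a labelled Part instance:
+ #
+ #    da = to_DataArray(part)
+ #    da.to_netcdf("part.nc")   #labels and metadata travel as coords/attrs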
+ 
+ def to_DataSet(mrio):
+     """
+     Convert an MRIO object to an xarray Dataset
+ 
+     Each Part becomes a data variable of the Dataset.
+     """
+     ds = xr.Dataset(
+         attrs=mrio.metadata,
+         coords=mrio.labels
+     )
+     for part in mrio.parts:
+         ds[part] = mrio.parts[part].to_xarray()
+     return ds
+ 
+ def make_part(data,**kwargs):
+     """
+     Load a Part object from an xarray DataArray
+ 
+     Parameters
+     ----------
+     data : DataArray or Dataset
+         Data to load
+     name : str, optional
+         Name of the data variable to load, by default None.
+         This can be left empty if there's a single variable in the Dataset.
+ 
+     Returns
+     -------
+     dict
+         Data required to create the Part object
+     """
+     part_data = dict()
+ 
+     if isinstance(data,xr.Dataset):
+         #Extract the data from the Dataset
+         list_vars = list(data.data_vars)
+         if len(list_vars) > 1:
+             #In ambiguous cases, the name must be provided
+             name = kwargs.get("name",None)
+         else:
+             name = list_vars[0]
+         data = data[name]
+     elif isinstance(data,xr.DataArray):
+         name = data.name
+ 
+     part_data["data"] = data.to_numpy()
+ 
+     #Format the labels
+     labels = []
+     for key in data.dims:
+         label = dict()
+         index = data.indexes[key]
+         if isinstance(index,pd.MultiIndex):
+             for i in range(index.nlevels):
+                 level_name = index.names[i]
+                 if level_name is None:
+                     level_name = i
+                 label[str(level_name)] = index.get_level_values(i).tolist()
+         else:
+             label[index.name] = index.values.tolist()
+         labels.append(label)
+     part_data["name"] = name
+     part_data["labels"] = labels
+     part_data["metadata"] = kwargs.get("metadata",dict())
+     for attr in data.attrs:
+         #Add metadata
+         part_data["metadata"][attr] = data.attrs[attr]
+     part_data["groupings"] = kwargs.get("groupings",dict())
+     return part_data
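+ 
+ #Usage sketch with a hypothetical DataArray:
+ #
+ #    da = xr.DataArray(
+ #        np.eye(2),
+ #        coords={"region": ["FRA", "DEU"], "sector": ["crop", "fuel"]},
+ #        dims=("region", "sector"),
+ #        name="A")
+ #    make_part(da)["labels"]
+ #    #[{'region': ['FRA', 'DEU']}, {'sector': ['crop', 'fuel']}]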
+ 
+ def make_mrio(data,**kwargs):
+     """
+     Load an MRIO object from an xarray Dataset
+ 
+     Parameters
+     ----------
+     data : Dataset
+         Dataset to load
+ 
+     Returns
+     -------
+     dict
+         Data required to create the MRIO object
+     list
+         Names of the parts to load
+     """
+     #Extract the data from the xarray
+     list_vars = list(data.data_vars)
+     to_load = kwargs.get("parts",list_vars)
+ 
+     mrio_data = dict()
+ 
+     labels = dict()
+     for coord in data.coords:
+         #Uncompress MultiIndex data if needed
+         if "compress" in data[coord].attrs:
+             import cf_xarray as cfxr
+             data = cfxr.decode_compress_to_multi_index(data,coord)
+         labels[coord] = data[coord].values.tolist()
+     mrio_data["labels"] = labels
+     mrio_data["groupings"] = kwargs.get("groupings",dict())
+     mrio_data["groupings"].update(data.attrs.get("groupings",dict()))
+     mrio_data["metadata"] = data.attrs
+     mrio_data["metadata"].update(kwargs.get("metadata",dict()))
+     mrio_data["parts"] = dict()
+     return {"data":mrio_data},to_load
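+ 
+ #Usage sketch, assuming `mrio` is an MRIO instance:
+ #
+ #    ds = to_DataSet(mrio)               #one data variable per Part
+ #    mrio_data, to_load = make_mrio(ds)  #labels/metadata and parts to load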
@@ -0,0 +1,3 @@
+ from mrio_toolbox.utils.loaders._loader_factory import make_loader
+ 
+ __all__ = ["make_loader"]
@@ -0,0 +1,256 @@
+ """
+ Central loading module for the mrio_toolbox package.
+ 
+ This module contains the central loading function for the mrio_toolbox package.
+ Depending on the loading mode, it calls the appropriate loader.
+ """
+ 
+ import os
+ import logging
+ import yaml
+ 
+ log = logging.getLogger(__name__)
+ 
+ class Loader:
+     """
+     Parent class for the loaders
+     """
+     def __init__(self):
+         """
+         Loaders are created with format-specific parameters.
+ 
+         They hold metadata and methods to load MRIO data.
+ 
+         A loader is created using the base class if no specific loader
+         is required, i.e., if the data is loaded directly from a dict,
+         pandas or xarray object. In that case, the loader fails when
+         used, triggering the creation of a specific loader.
+         """
+         self.load_mrio()
+ 
+     def extract_basic_info(self,**kwargs):
+         """
+         Extract basic information from the loader.
+ 
+         The function extracts the path, labels and groupings from the loader.
+         """
+         self.loader_kwargs = kwargs.pop("loader_kwargs",dict())
+         self.file = kwargs.pop("file",None)
+         self.groupings = kwargs.pop("groupings",dict())
+         self.labels = kwargs.pop("labels",dict())
+         #Remaining kwargs are metadata
+         self.metadata = kwargs
+         if isinstance(self.groupings,str):
+             self.groupings = self.load_groupings(self.groupings)
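+ 
+     #Usage sketch: recognised keywords become attributes, anything else
+     #is kept as metadata (hypothetical values):
+     #
+     #    loader.extract_basic_info(file="mrio.nc", year=2016)
+     #    loader.metadata   #-> {'year': 2016}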
47
+
48
+ def update_settings(self,**settings):
49
+ """
50
+ Update the loader settings with new parameters
51
+ """
52
+ self.loader_kwargs.update(
53
+ settings.pop("loader_kwargs",dict())
54
+ )
55
+ self.groupings.update(
56
+ settings.pop("groupings",dict())
57
+ )
58
+ self.labels.update(
59
+ settings.pop("labels",dict())
60
+ )
61
+ self.metadata.update(
62
+ settings.pop("metadata",dict())
63
+ )
64
+ self.metadata.update(settings)
+ 
+ 
+     def load_mrio(self):
+         """
+         Create an MRIO container based on the new parameters
+ 
+         The base implementation only resets the metadata, labels and
+         groupings attributes; subclasses override it to load actual data.
+         """
+         self.metadata = dict()
+         self.labels = dict()
+         self.groupings = dict()
+ 
+     def load_part(self,**kwargs):
+         """
+         Load an MRIO Part based on new or existing parameters
+ 
+         Returns
+         -------
+         dict
+             Dictionary containing the Part data
+         """
+         raise FileNotFoundError(
+             "No proper loader was initialised.\n"
+             "The loader needs to be reloaded with new instructions."
+         )
+ 
+     def set_groupings(self,groupings):
+         """
+         Update the groupings attribute of the loader
+ 
+         Parameters
+         ----------
+         groupings : dict of dict of str
+             Aggregation on labels
+         """
+         self.groupings = groupings
+ 
+     def update_attributes(self,**kwargs):
+         """
+         Update the current attributes of the loader.
+ 
+         The function updates the groupings, paths, labels and metadata attributes.
+         """
+         if "groupings" in kwargs:
+             log.debug("Update groupings")
+             self.groupings = kwargs.pop("groupings",self.groupings)
+ 
+         self.extract_path(update=True,**kwargs)
+ 
+         if "labels" in kwargs:
+             log.debug("Update labels")
+             self.format_labels(kwargs.pop("labels"))
+ 
+         for kwarg in kwargs:
+             log.debug(f"Override parameter {kwarg} with explicit value {kwargs[kwarg]}")
+             self.metadata[kwarg] = kwargs[kwarg]
+ 
+     def load_groupings(self,
+                        file,
+                        dimension=None,
+                        path=None):
+         """Load groupings from a file
+ 
+         Parameters
+         ----------
+         file : str or dict
+             Name of the file to load, or an explicit groupings dict
+         dimension : str, optional
+             Name of the dimension to load groupings for.
+             By default (None), the file is interpreted as a preset
+             of groupings on different dimensions.
+         path : path-like, optional
+             Path where the file is stored.
+             By default, the groupings are read from the
+             parameters/groupings directory in the working dir.
+         """
+         def _check_groupings(groupings,dimension):
+             """Check whether the groupings are consistent with the labels"""
+             #Iterate over a copy of the keys so groups can be dropped safely
+             for key in list(groupings.keys()):
+                 kept = []
+                 for item in groupings[key]:
+                     if item not in self.labels[dimension]:
+                         log.warning(
+                             f"Item {item} not found in {dimension} labels"
+                         )
+                     else:
+                         kept.append(item)
+                 groupings[key] = kept
+                 if len(groupings[key])==0:
+                     log.warning(f"Group {key} is empty")
+                     groupings.pop(key)
+             return groupings
+ 
+         def load_grouping(file,level,path):
+             """Load a single grouping file"""
+             path = os.path.join(path,level)
+             with open(os.path.join(path,file+'.txt')) as f:
+                 group = f.read().splitlines()
+             return {file:group}
+ 
+         if path is None:
+             path = os.path.join("parameters","groupings")
+ 
+         #If no dimension is specified, interpret as a preset
+         output = dict()
+         if isinstance(file,str):
+             log.info("Load groupings set from "+os.path.join(path,file))
+             with open(os.path.join(path,file)) as f:
+                 groupings = yaml.safe_load(f)
+         elif isinstance(file,dict):
+             groupings = file
+             output = self.groupings
+ 
+         if dimension is None:
+             dimensions = list(groupings.keys())
+             output = dict()
+             for level in dimensions:
+                 if isinstance(groupings[level],dict):
+                     #Case the preset explicitly defines a grouping
+                     groupings[level] = _check_groupings(
+                         groupings[level],level
+                     )
+                     output[level] = groupings[level]
+                     continue
+                 if isinstance(groupings[level],str):
+                     groupings[level] = [groupings[level]]
+                 if isinstance(groupings[level],list):
+                     #Otherwise, interpret as a list of groupings
+                     output[level] = dict()
+                     covered = []
+                     for item in groupings[level]:
+                         #Load all groupings
+                         groups = load_grouping(
+                             item,level,path
+                         )
+                         duplicate = [
+                             member for member in groups[item]
+                             if member in covered
+                         ]
+                         if duplicate:
+                             log.warning(
+                                 "The following items are covered in "
+                                 "multiple groupings: "+", ".join(duplicate)
+                             )
+                         covered += groups[item]
+                         output[level].update(groups)
+         else:
+             #Interpret the file as groupings for the given dimension
+             output[dimension] = _check_groupings(groupings,dimension)
+         return output
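+ 
+     #Sketch of a hypothetical preset parameters/groupings/preset.yaml:
+     #
+     #    regions:
+     #        EU: [FRA, DEU]
+     #    sectors: primary
+     #
+     #"regions" is an explicit grouping; "primary" points to a grouping file
+     #parameters/groupings/sectors/primary.txt with one item per line.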
+ 
+     def set_labels(self,labels):
+         """
+         Update the labels attribute of the loader
+ 
+         Parameters
+         ----------
+         labels : dict of str:list of str
+             Labels of the axes
+         """
+         self.labels = labels
+ 
+     def check_instructions(self,**kwargs):
+         """
+         Interpret the file argument for loading a part.
+ 
+         This method solves the ambiguity between data files and optional
+         .yaml instructions.
+         If the file argument refers to an instruction file, it is compared
+         to the current instructions.
+         If the data file or instruction file differs from the one currently
+         loaded, an exception is raised to force a reload.
+ 
+         Parameters
+         ----------
+         kwargs : additional arguments
+             May include `instructions`, the path of a yaml instruction file
+ 
+         Raises
+         ------
+         FileNotFoundError
+             If the loader needs to be reloaded with new instructions.
+         """
+         #The 'instructions' attribute is used to check whether the loader
+         #needs to be reloaded. It holds the reference to the potential
+         #yaml file used to load the data.
+         new_instructions = kwargs.get("instructions",None)
+         ref_instructions = self.metadata.get("instructions",None)
+         if new_instructions is not None and ref_instructions != new_instructions:
+             #If the instructions differ from the current ones,
+             #trigger a reload of the loader
+             log.error("The loader needs to be reloaded with new instructions.")
+             raise FileNotFoundError(
+                 "The loader needs to be reloaded with new instructions."
+             )
+ raise FileNotFoundError("The loader needs to be reloaded with new instructions.")