hestia-earth-utils 0.16.6__tar.gz → 0.16.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/PKG-INFO +2 -7
  2. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/blank_node.py +2 -2
  3. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/lookup.py +39 -73
  4. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/lookup_utils.py +4 -4
  5. hestia_earth_utils-0.16.7/hestia_earth/utils/version.py +1 -0
  6. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth_utils.egg-info/PKG-INFO +2 -7
  7. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth_utils.egg-info/SOURCES.txt +0 -1
  8. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth_utils.egg-info/requires.txt +1 -7
  9. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/setup.py +1 -5
  10. hestia_earth_utils-0.16.7/tests/test_lookup.py +128 -0
  11. hestia_earth_utils-0.16.6/hestia_earth/utils/csv_utils.py +0 -84
  12. hestia_earth_utils-0.16.6/hestia_earth/utils/version.py +0 -1
  13. hestia_earth_utils-0.16.6/tests/test_lookup.py +0 -163
  14. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/MANIFEST.in +0 -0
  15. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/README.md +0 -0
  16. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/bin/hestia-format-upload +0 -0
  17. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/bin/hestia-pivot-csv +0 -0
  18. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/__init__.py +0 -0
  19. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/api.py +0 -0
  20. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/calculation_status.py +0 -0
  21. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/cycle.py +0 -0
  22. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/date.py +0 -0
  23. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/descriptive_stats.py +0 -0
  24. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/emission.py +0 -0
  25. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/model.py +0 -0
  26. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/pipeline.py +0 -0
  27. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/pivot/__init__.py +0 -0
  28. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/pivot/_shared.py +0 -0
  29. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/pivot/pivot_csv.py +0 -0
  30. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/pivot/pivot_json.py +0 -0
  31. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/request.py +0 -0
  32. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/stats.py +0 -0
  33. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/storage/__init__.py +0 -0
  34. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/storage/_azure_client.py +0 -0
  35. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/storage/_local_client.py +0 -0
  36. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/storage/_s3_client.py +0 -0
  37. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/storage/_sns_client.py +0 -0
  38. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/table.py +0 -0
  39. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/term.py +0 -0
  40. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth/utils/tools.py +0 -0
  41. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth_utils.egg-info/dependency_links.txt +0 -0
  42. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/hestia_earth_utils.egg-info/top_level.txt +0 -0
  43. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/setup.cfg +0 -0
  44. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_api.py +0 -0
  45. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_blank_node.py +0 -0
  46. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_calculation_status.py +0 -0
  47. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_cycle.py +0 -0
  48. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_date.py +0 -0
  49. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_descriptive_stats.py +0 -0
  50. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_emission.py +0 -0
  51. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_lookup_utils.py +0 -0
  52. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_model.py +0 -0
  53. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_pipeline.py +0 -0
  54. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_request.py +0 -0
  55. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_stats.py +0 -0
  56. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_table.py +0 -0
  57. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_term.py +0 -0
  58. {hestia_earth_utils-0.16.6 → hestia_earth_utils-0.16.7}/tests/test_tools.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hestia_earth_utils
3
- Version: 0.16.6
3
+ Version: 0.16.7
4
4
  Summary: HESTIA's utils library
5
5
  Home-page: https://gitlab.com/hestia-earth/hestia-utils
6
6
  Author: HESTIA Team
@@ -13,12 +13,8 @@ Requires-Dist: hestia-earth-schema>=35.0.1
13
13
  Requires-Dist: requests>=2.24.0
14
14
  Requires-Dist: urllib3~=1.26.0
15
15
  Requires-Dist: python-dateutil>=2.8.1
16
- Requires-Dist: numpy>=2
16
+ Requires-Dist: pandas>=2
17
17
  Requires-Dist: flatten_json
18
- Provides-Extra: pivot-csv
19
- Requires-Dist: pandas>=2; extra == "pivot-csv"
20
- Provides-Extra: table
21
- Requires-Dist: pandas>=2; extra == "table"
22
18
  Dynamic: author
23
19
  Dynamic: author-email
24
20
  Dynamic: classifier
@@ -26,7 +22,6 @@ Dynamic: description
26
22
  Dynamic: description-content-type
27
23
  Dynamic: home-page
28
24
  Dynamic: license
29
- Dynamic: provides-extra
30
25
  Dynamic: requires-dist
31
26
  Dynamic: requires-python
32
27
  Dynamic: summary
@@ -5,7 +5,7 @@ from functools import reduce
5
5
  from statistics import mode, mean
6
6
  from hestia_earth.schema import TermTermType
7
7
 
8
- from .lookup import download_lookup, get_table_value, column_name
8
+ from .lookup import download_lookup, get_table_value
9
9
  from .tools import non_empty_list, non_empty_value, flatten
10
10
  from .emission import cycle_emissions_in_system_boundary
11
11
  from .model import filter_list_term_type
@@ -15,7 +15,7 @@ def get_lookup_value(blank_node: dict, column: str):
15
15
  term = blank_node.get('term', {})
16
16
  table_name = f"{term.get('termType')}.csv" if term else None
17
17
  value = get_table_value(
18
- download_lookup(table_name), 'termid', term.get('@id'), column_name(column)
18
+ download_lookup(table_name), 'term.id', term.get('@id'), column
19
19
  ) if table_name else None
20
20
  return value
21
21
 
@@ -1,15 +1,14 @@
1
1
  from functools import reduce
2
- from typing import Union
2
+ from typing import Any
3
3
  import requests
4
- import numpy
4
+ from io import StringIO
5
+ import pandas as pd
5
6
 
6
7
  from .storage import _load_from_storage
7
8
  from .request import request_url, web_url
8
- from .csv_utils import csv_str_to_recarray, csv_file_to_recarray, is_missing_value, _replace_chars
9
9
 
10
10
  _GLOSSARY_FOLDER = 'glossary/lookups'
11
11
  _memory = {}
12
- _INDEX_COL = 'termid'
13
12
 
14
13
 
15
14
  def _memory_wrapper(key: str, func):
@@ -18,6 +17,18 @@ def _memory_wrapper(key: str, func):
18
17
  return _memory[key]
19
18
 
20
19
 
20
+ def _read_csv(value: str) -> pd.DataFrame:
21
+ return pd.read_csv(value, na_values=['-', ''])
22
+
23
+
24
+ def _read_csv_from_string(data: str) -> pd.DataFrame:
25
+ return _read_csv(StringIO(data))
26
+
27
+
28
+ def is_missing_value(value):
29
+ return pd.isna(value) or value is None or value == '' or value == '-'
30
+
31
+
21
32
  def load_lookup(filepath: str, keep_in_memory: bool = False):
22
33
  """
23
34
  Import local lookup table as csv file into a `numpy.recarray`.
@@ -34,7 +45,7 @@ def load_lookup(filepath: str, keep_in_memory: bool = False):
34
45
  numpy.recarray
35
46
  The `numpy.recarray` converted from the csv content.
36
47
  """
37
- def load(): return csv_file_to_recarray(filepath)
48
+ def load(): return _read_csv(filepath)
38
49
  return _memory_wrapper(filepath, load) if keep_in_memory else load()
39
50
 
40
51
 
@@ -53,20 +64,7 @@ def _download_lookup_data(filename: str):
53
64
  return fallback()
54
65
 
55
66
 
56
- def _build_index(array: numpy.recarray):
57
- columns = list(array.dtype.names)
58
- try:
59
- return {
60
- row[_INDEX_COL]: {col: row[col] for col in columns}
61
- for row in array
62
- } if _INDEX_COL in columns else array
63
- except TypeError:
64
- return {
65
- array[_INDEX_COL].item(): {col: array[col].item() for col in columns}
66
- } if _INDEX_COL in columns else array
67
-
68
-
69
- def download_lookup(filename: str, keep_in_memory: bool = True, build_index: bool = False):
67
+ def download_lookup(filename: str, keep_in_memory: bool = True):
70
68
  """
71
69
  Download lookup table from HESTIA as csv into a `numpy.recarray`.
72
70
 
@@ -86,8 +84,7 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
86
84
  """
87
85
  def load():
88
86
  data = _download_lookup_data(filename)
89
- rec = csv_str_to_recarray(data) if data else None
90
- return (_build_index(rec) if build_index else rec) if data else None
87
+ return _read_csv_from_string(data) if data else None
91
88
 
92
89
  try:
93
90
  return _memory_wrapper(filename, load) if keep_in_memory else load()
@@ -97,19 +94,9 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
97
94
 
98
95
  def column_name(key: str):
99
96
  """
100
- Convert the column name to a usable key on a `numpy.recarray`.
101
-
102
- Parameters
103
- ----------
104
- key : str
105
- The column name.
106
-
107
- Returns
108
- -------
109
- str
110
- The column name that can be used in `get_table_value`.
97
+ Deprecated. Columns are no longer renamed.
111
98
  """
112
- return _replace_chars(key) if key else ''
99
+ return key
113
100
 
114
101
 
115
102
  def _parse_value(value: str):
@@ -124,36 +111,24 @@ def _parse_value(value: str):
124
111
  return value
125
112
 
126
113
 
127
- def _get_single_table_value(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
128
- return (
129
- data.get(col_match_with, {})[col_val] if isinstance(data, dict) else
130
- data[data[col_match] == col_match_with][col_val][0]
131
- )
132
-
133
-
134
- def _get_multiple_table_values(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
135
- def reducer(x, values):
136
- col = values[1]
137
- value = col_match_with[values[0]]
138
- return x.get(value) if isinstance(x, dict) else x[x[col] == value]
114
+ def _get_single_table_value(df: pd.DataFrame, col_match: str, col_match_with, col_val):
115
+ filtered_df = df[df[col_match] == col_match_with]
116
+ return None if filtered_df.empty else filtered_df[col_val].iloc[0]
139
117
 
140
- return reduce(reducer, enumerate(col_match), data)[col_val][0]
141
118
 
142
-
143
- def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val, default_value=''):
119
+ def get_table_value(lookup: pd.DataFrame, col_match: str, col_match_with: str, col_val: Any, default_value=''):
144
120
  """
145
121
  Get a value matched by one or more columns from a `numpy.recarray`.
146
122
 
147
123
  Parameters
148
124
  ----------
149
- lookup : dict | numpy.recarray
125
+ lookup : DataFrame
150
126
  The value returned by the `download_lookup` function.
151
127
  col_match : str
152
128
  Which `column` should be used to find data in. This will restrict the rows to search for.
153
129
  Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
154
- col_match_with
130
+ col_match_with: str
155
131
  Which column `value` should be used to find data in. This will restrict the rows to search for.
156
- Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
157
132
  col_val: str
158
133
  The column which contains the value to look for.
159
134
  default_value : Any
@@ -164,44 +139,35 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
164
139
  str
165
140
  The value found or `None` if no match.
166
141
  """
167
- single = isinstance(col_match, str) and isinstance(col_match_with, str)
168
142
  try:
169
- value = (
170
- _get_single_table_value(lookup, col_match, col_match_with, col_val) if single else
171
- _get_multiple_table_values(lookup, col_match, col_match_with, col_val)
172
- )
143
+ value = _get_single_table_value(lookup, col_match, col_match_with, col_val)
144
+ print(value, type(value))
173
145
  return default_value if is_missing_value(value) else _parse_value(value)
174
146
  except Exception:
175
147
  return None
176
148
 
177
149
 
178
- def find_term_ids_by(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with):
150
+ def find_term_ids_by(lookup: pd.DataFrame, col_match: str, col_match_with: str):
179
151
  """
180
152
  Find `term.id` values where a column matches a specific value.
181
153
 
182
154
  Parameters
183
155
  ----------
184
- lookup : dict | numpy.recarray
156
+ lookup : DataFrame
185
157
  The value returned by the `download_lookup` function.
186
158
  col_match : str
187
159
  Which `column` should be used to find data in. This will restrict the rows to search for.
188
160
  Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
189
- col_match_with
161
+ col_match_with: str
190
162
  Which column `value` should be used to find data in. This will restrict the rows to search for.
191
- Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
192
163
 
193
164
  Returns
194
165
  -------
195
166
  list[str]
196
167
  The list of `term.id` that matched the expected column value.
197
168
  """
198
- term_ids = (
199
- set([
200
- key
201
- for key, value in lookup.items()
202
- if value.get(col_match) == col_match_with
203
- ])
204
- ) if isinstance(lookup, dict) else set(list(lookup[lookup[col_match] == col_match_with].termid))
169
+ filtered_df = lookup[lookup[col_match] == col_match_with]
170
+ term_ids = filtered_df['term.id'].unique().tolist() if 'term.id' in filtered_df.columns else []
205
171
  return list(map(str, term_ids))
206
172
 
207
173
 
@@ -266,13 +232,13 @@ def extract_grouped_data_closest_date(data: str, year: int) -> str:
266
232
  return None if closest_year is None else _parse_value(data_by_date.get(closest_year))
267
233
 
268
234
 
269
- def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
235
+ def lookup_term_ids(lookup: pd.DataFrame):
270
236
  """
271
237
  Get the `term.id` values from a lookup.
272
238
 
273
239
  Parameters
274
240
  ----------
275
- lookup : dict | numpy.recarray
241
+ lookup : DataFrame
276
242
  The value returned by the `download_lookup` function.
277
243
 
278
244
  Returns
@@ -280,16 +246,16 @@ def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
280
246
  list[str]
281
247
  The `term.id` values from the lookup.
282
248
  """
283
- return lookup.keys() if isinstance(lookup, dict) else list(lookup.termid)
249
+ return list(map(str, lookup['term.id'].tolist())) if 'term.id' in lookup.columns else []
284
250
 
285
251
 
286
- def lookup_columns(lookup: Union[dict, numpy.recarray]):
252
+ def lookup_columns(lookup: pd.DataFrame):
287
253
  """
288
254
  Get the columns from a lookup.
289
255
 
290
256
  Parameters
291
257
  ----------
292
- lookup : dict | numpy.recarray
258
+ lookup : DataFrame
293
259
  The value returned by the `download_lookup` function.
294
260
 
295
261
  Returns
@@ -297,4 +263,4 @@ def lookup_columns(lookup: Union[dict, numpy.recarray]):
297
263
  list[str]
298
264
  The columns from the lookup.
299
265
  """
300
- return list(list(lookup.values())[0].keys()) if isinstance(lookup, dict) else list(lookup.dtype.names)
266
+ return list(lookup.columns)
@@ -2,7 +2,7 @@ from functools import lru_cache
2
2
  import json
3
3
  from hestia_earth.schema import SchemaType
4
4
 
5
- from .lookup import _download_lookup_data, download_lookup, get_table_value, column_name
5
+ from .lookup import _download_lookup_data, download_lookup, get_table_value
6
6
  from .api import download_hestia
7
7
  from .tools import non_empty_list, flatten
8
8
 
@@ -45,7 +45,7 @@ def _allowed_model_mapping(model: str, term_id: str, column: str):
45
45
  mapping = _allowed_mapping_data()
46
46
  value = mapping.get(term_id, {}).get(model, {}).get(column) if mapping else get_table_value(
47
47
  download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}-model-{column}.csv"),
48
- 'termid', term_id, column_name(column)
48
+ 'term.id', term_id, column
49
49
  )
50
50
  return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
51
51
 
@@ -78,7 +78,7 @@ def _allowed_mapping(term_id: str, column: str):
78
78
  mapping = _allowed_mapping_data()
79
79
  value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
80
80
  download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
81
- 'termid', term_id, column_name(column)
81
+ 'term.id', term_id, column
82
82
  )
83
83
  return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
84
84
 
@@ -174,7 +174,7 @@ def is_in_system_boundary(term_id: str) -> bool:
174
174
  column = 'inHestiaDefaultSystemBoundary'
175
175
  value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
176
176
  download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
177
- 'termid', term_id, column_name(column)
177
+ 'term.id', term_id, column
178
178
  )
179
179
  # handle numpy bool from table value
180
180
  return not (not value)
@@ -0,0 +1 @@
1
+ VERSION = '0.16.7'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hestia_earth_utils
3
- Version: 0.16.6
3
+ Version: 0.16.7
4
4
  Summary: HESTIA's utils library
5
5
  Home-page: https://gitlab.com/hestia-earth/hestia-utils
6
6
  Author: HESTIA Team
@@ -13,12 +13,8 @@ Requires-Dist: hestia-earth-schema>=35.0.1
13
13
  Requires-Dist: requests>=2.24.0
14
14
  Requires-Dist: urllib3~=1.26.0
15
15
  Requires-Dist: python-dateutil>=2.8.1
16
- Requires-Dist: numpy>=2
16
+ Requires-Dist: pandas>=2
17
17
  Requires-Dist: flatten_json
18
- Provides-Extra: pivot-csv
19
- Requires-Dist: pandas>=2; extra == "pivot-csv"
20
- Provides-Extra: table
21
- Requires-Dist: pandas>=2; extra == "table"
22
18
  Dynamic: author
23
19
  Dynamic: author-email
24
20
  Dynamic: classifier
@@ -26,7 +22,6 @@ Dynamic: description
26
22
  Dynamic: description-content-type
27
23
  Dynamic: home-page
28
24
  Dynamic: license
29
- Dynamic: provides-extra
30
25
  Dynamic: requires-dist
31
26
  Dynamic: requires-python
32
27
  Dynamic: summary
@@ -7,7 +7,6 @@ hestia_earth/utils/__init__.py
7
7
  hestia_earth/utils/api.py
8
8
  hestia_earth/utils/blank_node.py
9
9
  hestia_earth/utils/calculation_status.py
10
- hestia_earth/utils/csv_utils.py
11
10
  hestia_earth/utils/cycle.py
12
11
  hestia_earth/utils/date.py
13
12
  hestia_earth/utils/descriptive_stats.py
@@ -2,11 +2,5 @@ hestia-earth-schema>=35.0.1
2
2
  requests>=2.24.0
3
3
  urllib3~=1.26.0
4
4
  python-dateutil>=2.8.1
5
- numpy>=2
6
- flatten_json
7
-
8
- [pivot-csv]
9
- pandas>=2
10
-
11
- [table]
12
5
  pandas>=2
6
+ flatten_json
@@ -32,9 +32,5 @@ setup(
32
32
  scripts=[
33
33
  'bin/hestia-pivot-csv',
34
34
  'bin/hestia-format-upload'
35
- ],
36
- extras_require={
37
- 'pivot-csv': ['pandas>=2'],
38
- 'table': ['pandas>=2'],
39
- }
35
+ ]
40
36
  )
@@ -0,0 +1,128 @@
1
+ import pytest
2
+ import pandas as pd
3
+
4
+ from .utils import fixtures_path
5
+ from hestia_earth.utils.lookup import (
6
+ load_lookup,
7
+ get_table_value,
8
+ find_term_ids_by,
9
+ download_lookup,
10
+ extract_grouped_data,
11
+ extract_grouped_data_closest_date,
12
+ _get_single_table_value,
13
+ lookup_term_ids,
14
+ lookup_columns
15
+ )
16
+
17
+
18
+ def test_load_lookup_type():
19
+ lookup = load_lookup(f"{fixtures_path}/lookup.csv")
20
+ assert isinstance(lookup, pd.DataFrame)
21
+
22
+
23
+ @pytest.mark.parametrize(
24
+ 'col_match,col_match_with,col_val,expected',
25
+ [
26
+ ('Col1', 'val10', 'Col3', 'val30'),
27
+ ('Col1', 'val10', 'Col5', None),
28
+ ('Col1', 'val10', 'Col4', ''),
29
+ ]
30
+ )
31
+ def test_get_table_value(col_match, col_match_with, col_val, expected):
32
+ lookup = load_lookup(f"{fixtures_path}/lookup.csv")
33
+ assert get_table_value(lookup, col_match, col_match_with, col_val) == expected
34
+
35
+
36
+ def test_get_table_value_no_lookup():
37
+ assert not get_table_value(None, 'Col10', 'val10', 'Col3')
38
+
39
+
40
+ def test_get_table_value_default_value():
41
+ lookup = load_lookup(f"{fixtures_path}/lookup.csv")
42
+ assert get_table_value(lookup, 'Col2', 'val22', 'Col1') == ''
43
+
44
+ lookup = download_lookup('crop.csv')
45
+ assert get_table_value(lookup, 'term.id', 'genericCropSeed', 'Plantation_density') == ''
46
+ assert get_table_value(lookup, 'term.id', 'fixedNitrogen', 'Combustion_Factor_crop_residue') == ''
47
+
48
+
49
+ def test_find_term_ids_by():
50
+ lookup = download_lookup('crop.csv')
51
+ assert 'wheatGrain' in find_term_ids_by(lookup, 'cropGroupingFAO', 'Temporary crops')
52
+
53
+
54
+ def test_handle_missing_float_value():
55
+ filename = 'measurement.csv'
56
+ lookup = download_lookup(filename)
57
+ assert get_table_value(lookup, 'term.id', 'rainfallPeriod', 'maximum') == ''
58
+
59
+
60
+ def test_handle_missing_lookup_value():
61
+ filename = 'region-crop-cropGroupingFaostatProduction-price.csv'
62
+ lookup = download_lookup(filename)
63
+ assert get_table_value(lookup, 'term.id', 'GADM-CYP', 'Sugar crops nes') is None
64
+
65
+
66
+ def test_extract_grouped_data_no_data():
67
+ assert not extract_grouped_data('', '2000')
68
+ assert not extract_grouped_data('-', '2000')
69
+
70
+
71
+ def test_extract_grouped_data():
72
+ data = 'Average_price_per_tonne:106950.5556;1991:-;1992:-'
73
+ assert extract_grouped_data(data, 'Average_price_per_tonne') == 106950.5556
74
+ assert extract_grouped_data(data, '2010') is None
75
+
76
+
77
+ def test_extract_grouped_data_lookup():
78
+ filename = 'region-crop-cropGroupingFaostatProduction-price.csv'
79
+ lookup = download_lookup(filename)
80
+ data = get_table_value(lookup, 'term.id', 'GADM-NPL', 'Chick peas, dry')
81
+ assert extract_grouped_data(data, '2000') is None
82
+ assert extract_grouped_data(data, '2012') is not None
83
+
84
+ filename = 'region-animalProduct-animalProductGroupingFAO-price.csv'
85
+ lookup = download_lookup(filename)
86
+ data = get_table_value(lookup, 'term.id', 'GADM-NPL', 'Eggs from other birds in shell, fresh, n.e.c.')
87
+ assert extract_grouped_data(data, '2000') is None
88
+ assert extract_grouped_data(data, '2012') is not None
89
+
90
+
91
+ def test_get_single_table_value_float_values():
92
+ filename = 'ecoClimateZone.csv'
93
+ lookup = download_lookup(filename)
94
+ column = 'STEHFEST_BOUWMAN_2006_N2O-N_FACTOR'
95
+ assert _get_single_table_value(lookup, 'ecoClimateZone', 11, column) == -0.3022
96
+
97
+
98
+ def test_extract_grouped_data_closest_date_no_data():
99
+ assert not extract_grouped_data_closest_date('', 2000)
100
+ assert not extract_grouped_data_closest_date('-', 2000)
101
+
102
+
103
+ def test_extract_grouped_data_closest_date():
104
+ data = '2000:-;2001:0.1;2002:0.2;2003:0.3;2004:0.4;2005:0.5'
105
+ assert extract_grouped_data_closest_date(data, 2000) == 0.1
106
+ assert extract_grouped_data_closest_date(data, 2001) == 0.1
107
+ assert extract_grouped_data_closest_date(data, 2020) == 0.5
108
+
109
+
110
+ def test_lookup_term_ids():
111
+ assert 'wheatGrain' in lookup_term_ids(download_lookup('crop.csv'))
112
+
113
+
114
+ def test_lookup_columns():
115
+ assert 'term.id' in lookup_columns(download_lookup('crop.csv'))
116
+
117
+
118
+ def test_get_data_advanced():
119
+ lookup = download_lookup('liveAnimal.csv')
120
+ value = get_table_value(lookup, 'term.id', 'sheepRam', 'ratioCPregnancyNetEnergyPregnancyIpcc2019')
121
+ assert value == ''
122
+
123
+
124
+ def test_grouping_with_comma():
125
+ lookup = download_lookup('animalProduct.csv')
126
+ term_id = 'meatChickenReadyToCookWeight'
127
+ value = get_table_value(lookup, 'term.id', term_id, 'animalProductGroupingFAO')
128
+ assert value == 'Meat of chickens, fresh or chilled'
@@ -1,84 +0,0 @@
1
- import io
2
- import csv
3
- import re
4
- import numpy as np
5
-
6
- _MISSING_VALUE = '-'
7
- _MISSING = -99999
8
- _DELIMITER = ','
9
- _QUOTE_CHAR = '"'
10
- ENCODING = 'ISO-8859-1'
11
- # default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
12
- _DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
13
-
14
-
15
- def is_missing_value(value): return value == _MISSING_VALUE or value == _MISSING or value == str(_MISSING)
16
-
17
-
18
- def _replace_missing_values(value: str): return str(_MISSING) if str(value) == _MISSING_VALUE else value
19
-
20
-
21
- def _replace_chars(value: str): return re.sub(f'[{re.escape(_DELETE_CHARS)}]', '', value.replace(' ', '_'))
22
-
23
-
24
- def _text_to_csv(csv_content: str):
25
- return csv.reader(io.StringIO(csv_content.strip()), delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
26
-
27
-
28
- def _get_columns(csv_content: str):
29
- try:
30
- reader = _text_to_csv(csv_content)
31
- names = next(reader)
32
- return list(map(_replace_chars, names))
33
- except StopIteration:
34
- return []
35
-
36
-
37
- def _get_rows(csv_content: str):
38
- string_io = io.StringIO(csv_content.strip())
39
- try:
40
- next(string_io)
41
- except StopIteration:
42
- return
43
-
44
- return csv.reader(string_io, delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
45
-
46
-
47
- def _csv_str_to_recarray_chunks_numpy(csv_content: str, chunk_size: int = 5):
48
- names = _get_columns(csv_content)
49
- num_cols = len(names)
50
-
51
- max_size = 1000
52
- dtype = [(name, f"U{max_size}") for name in names]
53
-
54
- reader = _get_rows(csv_content)
55
-
56
- # 4. Process the file in batches
57
- chunk_rows = []
58
- for row in reader:
59
- if not row:
60
- continue
61
- if len(row) != num_cols:
62
- continue
63
-
64
- # replace missing values
65
- processed_row = tuple(_replace_missing_values(field) for field in row)
66
- chunk_rows.append(processed_row)
67
-
68
- if len(chunk_rows) >= chunk_size:
69
- yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
70
- chunk_rows = []
71
-
72
- if chunk_rows:
73
- yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
74
-
75
-
76
- def csv_str_to_recarray(csv_content: str) -> np.recarray:
77
- array_rows = list(_csv_str_to_recarray_chunks_numpy(csv_content))
78
- return np.hstack(array_rows).view(np.recarray)
79
-
80
-
81
- def csv_file_to_recarray(filepath: str):
82
- with open(filepath, 'r', encoding=ENCODING) as f:
83
- content = f.read()
84
- return csv_str_to_recarray(content)
@@ -1 +0,0 @@
1
- VERSION = '0.16.6'
@@ -1,163 +0,0 @@
1
- import numpy
2
-
3
- from .utils import fixtures_path
4
- from hestia_earth.utils.lookup import (
5
- load_lookup,
6
- column_name,
7
- get_table_value,
8
- find_term_ids_by,
9
- download_lookup,
10
- extract_grouped_data,
11
- extract_grouped_data_closest_date,
12
- _get_single_table_value,
13
- lookup_term_ids,
14
- lookup_columns
15
- )
16
-
17
-
18
- def test_load_lookup_numpy_array():
19
- lookup = load_lookup(f"{fixtures_path}/lookup.csv")
20
- assert isinstance(lookup, numpy.recarray)
21
-
22
-
23
- def test_column_name():
24
- assert column_name('Maize (corn)') == 'Maize_corn'
25
- assert column_name('grassland/pasture/meadow') == 'grasslandpasturemeadow'
26
-
27
-
28
- def test_get_table_value():
29
- lookup = load_lookup(f"{fixtures_path}/lookup.csv")
30
-
31
- # single column match
32
- assert get_table_value(lookup, column_name('Col1'), 'val10', column_name('Col3')) == 'val30'
33
- # multiple column match
34
- assert get_table_value(lookup, [
35
- column_name('Col1'),
36
- column_name('Col2'),
37
- ], [
38
- 'val10',
39
- 'val21'
40
- ], column_name('Col3')) == 'val31'
41
- # no match
42
- assert not get_table_value(lookup, column_name('Col10'), 'val10', column_name('Col3'))
43
-
44
- # column does not exist
45
- assert not get_table_value(lookup, [
46
- column_name('Col1'),
47
- column_name('Col2'),
48
- ], [
49
- 'random',
50
- 'val21'
51
- ], column_name('random'))
52
-
53
- # table does not exist
54
- assert not get_table_value(None, column_name('Col10'), 'val10', column_name('Col3'))
55
-
56
-
57
- def test_get_table_value_empty():
58
- lookup = load_lookup(f"{fixtures_path}/lookup.csv")
59
- assert get_table_value(lookup, column_name('Col1'), 'val10', column_name('Col4'), default_value=None) is None
60
- assert get_table_value(lookup, column_name('Col2'), 'val22', column_name('Col1')) == ''
61
-
62
- lookup = download_lookup('crop.csv')
63
- assert get_table_value(lookup, 'termid', 'genericCropSeed', column_name('Plantation_density')) == ''
64
-
65
-
66
- def test_find_term_ids_by():
67
- lookup = download_lookup('crop.csv')
68
- assert 'wheatGrain' in find_term_ids_by(lookup, column_name('cropGroupingFAO'), 'Temporary crops')
69
-
70
-
71
- def test_download_lookup_with_index():
72
- filename = 'crop.csv'
73
- lookup = download_lookup(filename, keep_in_memory=False, build_index=True)
74
- assert isinstance(lookup, dict) is True
75
-
76
-
77
- def test_download_lookup_without_index():
78
- filename = 'crop.csv'
79
- lookup = download_lookup(filename, keep_in_memory=False, build_index=False)
80
- assert isinstance(lookup, numpy.recarray) is True
81
-
82
-
83
- def test_handle_missing_float_value():
84
- filename = 'measurement.csv'
85
- lookup = download_lookup(filename)
86
- assert get_table_value(lookup, 'termid', 'rainfallPeriod', 'maximum') == ''
87
-
88
-
89
- def test_handle_missing_string_value():
90
- filename = 'crop.csv'
91
- lookup = download_lookup(filename)
92
- assert get_table_value(lookup, 'termid', 'fixedNitrogen', 'combustion_factor_crop_residue') is None
93
-
94
-
95
- def test_handle_missing_lookup_value():
96
- filename = 'region-crop-cropGroupingFaostatProduction-price.csv'
97
- lookup = download_lookup(filename)
98
- assert get_table_value(lookup, 'termid', 'GADM-CYP', column_name('Sugar crops nes')) is None
99
-
100
-
101
- def test_extract_grouped_data_no_data():
102
- assert not extract_grouped_data('', '2000')
103
- assert not extract_grouped_data('-', '2000')
104
-
105
-
106
- def test_extract_grouped_data():
107
- data = 'Average_price_per_tonne:106950.5556;1991:-;1992:-'
108
- assert extract_grouped_data(data, 'Average_price_per_tonne') == 106950.5556
109
- assert extract_grouped_data(data, '2010') is None
110
-
111
-
112
- def test_extract_grouped_data_lookup():
113
- filename = 'region-crop-cropGroupingFaostatProduction-price.csv'
114
- lookup = download_lookup(filename)
115
- data = get_table_value(lookup, 'termid', 'GADM-NPL', column_name('Chick peas, dry'))
116
- assert extract_grouped_data(data, '2000') is None
117
- assert extract_grouped_data(data, '2012') is not None
118
-
119
- filename = 'region-animalProduct-animalProductGroupingFAO-price.csv'
120
- lookup = download_lookup(filename)
121
- data = get_table_value(lookup, 'termid', 'GADM-NPL', column_name('Eggs from other birds in shell, fresh, n.e.c.'))
122
- assert extract_grouped_data(data, '2000') is None
123
- assert extract_grouped_data(data, '2012') is not None
124
-
125
-
126
- def test_get_single_table_value_float_values():
127
- filename = 'ecoClimateZone.csv'
128
- lookup = download_lookup(filename)
129
- column = column_name('STEHFEST_BOUWMAN_2006_N2O-N_FACTOR')
130
- assert _get_single_table_value(lookup, column_name('ecoClimateZone'), '11', column) == '-0.3022'
131
-
132
-
133
- def test_extract_grouped_data_closest_date_no_data():
134
- assert not extract_grouped_data_closest_date('', 2000)
135
- assert not extract_grouped_data_closest_date('-', 2000)
136
-
137
-
138
- def test_extract_grouped_data_closest_date():
139
- data = '2000:-;2001:0.1;2002:0.2;2003:0.3;2004:0.4;2005:0.5'
140
- assert extract_grouped_data_closest_date(data, 2000) == 0.1
141
- assert extract_grouped_data_closest_date(data, 2001) == 0.1
142
- assert extract_grouped_data_closest_date(data, 2020) == 0.5
143
-
144
-
145
- def test_lookup_term_ids():
146
- assert 'wheatGrain' in lookup_term_ids(download_lookup('crop.csv'))
147
-
148
-
149
- def test_lookup_columns():
150
- assert 'termid' in lookup_columns(download_lookup('crop.csv'))
151
-
152
-
153
- def test_get_data_advanced():
154
- lookup = download_lookup('liveAnimal.csv')
155
- value = get_table_value(lookup, 'termid', 'sheepRam', column_name('ratioCPregnancyNetEnergyPregnancyIpcc2019'))
156
- assert value == ''
157
-
158
-
159
- def test_grouping_with_comma():
160
- lookup = download_lookup('animalProduct.csv')
161
- term_id = 'meatChickenReadyToCookWeight'
162
- value = get_table_value(lookup, 'termid', term_id, column_name('animalProductGroupingFAO'))
163
- assert value == 'Meat of chickens, fresh or chilled'