hestia-earth-utils 0.15.16__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hestia_earth/utils/__init__.py CHANGED
@@ -1,3 +0,0 @@
- from pkgutil import extend_path
-
- __path__ = extend_path(__path__, __name__)
hestia_earth/utils/csv_utils.py ADDED
@@ -0,0 +1,72 @@
+ import io
+ import csv
+ import re
+ import numpy as np
+
+ _MISSING_VALUE = '-'
+ _MISSING = -99999
+ _DELIMITER = ','
+ _QUOTE_CHAR = '"'
+ ENCODING = 'ISO-8859-1'
+ # default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
+ _DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
+
+
+ def is_missing_value(value): return value == _MISSING_VALUE or value == _MISSING or value == str(_MISSING)
+
+
+ def _replace_missing_values(value: str): return str(_MISSING) if str(value) == _MISSING_VALUE else value
+
+
+ def _replace_chars(value: str): return re.sub(f'[{re.escape(_DELETE_CHARS)}]', '', value.replace(' ', '_'))
+
+
+ def _text_to_csv(csv_content: str):
+     return csv.reader(io.StringIO(csv_content.strip()), delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
+
+
+ def _csv_reader_converter(field_str_bytes):
+     field_str = field_str_bytes if isinstance(field_str_bytes, str) else field_str_bytes.decode('utf-8')
+     reader = _text_to_csv(field_str)
+
+     try:
+         return _replace_missing_values(next(reader)[0].strip())
+     except StopIteration:
+         return str(_MISSING)
+
+
+ def _get_columns(csv_content: str):
+     try:
+         reader = _text_to_csv(csv_content)
+         names = next(reader)
+         return list(map(_replace_chars, names))
+     except StopIteration:
+         return []
+
+
+ def csv_str_to_recarray(csv_content: str) -> np.recarray:
+     names = _get_columns(csv_content)
+     num_cols = len(names)
+
+     converters_dict = {
+         i: _csv_reader_converter
+         for i in range(num_cols)
+     }
+
+     # TODO: find the maximum column size instead of using 1000
+     max_size = 1000
+     return np.loadtxt(
+         io.StringIO(csv_content.strip()),
+         delimiter=_DELIMITER,
+         quotechar=_QUOTE_CHAR,
+         skiprows=1,
+         converters=converters_dict,
+         dtype=[(name, f"U{max_size}") for name in names],
+         encoding=ENCODING
+     ).view(np.recarray)
+
+
+ def csv_file_to_recarray(filepath: str):
+     with open(filepath, 'r', encoding=ENCODING) as f:
+         content = f.read()
+     return csv_str_to_recarray(content)
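
A minimal sketch of how this new module behaves, assuming the code above works as written (the CSV content and column names below are illustrative, not taken from the package's lookups):

```python
from hestia_earth.utils.csv_utils import csv_str_to_recarray, is_missing_value

# hypothetical lookup content; '-' marks a missing value
content = 'term.id,Average_price_per_tonne\nwheatGrain,180\nbarleyGrain,-\n'

rec = csv_str_to_recarray(content)
# header names are sanitised by _replace_chars: 'term.id' -> 'termid'
print(rec.termid)  # ['wheatGrain' 'barleyGrain']
# '-' cells are replaced by the str(_MISSING) sentinel '-99999'
print(is_missing_value(rec.Average_price_per_tonne[1]))  # True
```

Note the `quotechar` and per-column `converters` dict arguments to `numpy.loadtxt` used above require a recent numpy, consistent with the `numpy>=2` requirement in the new METADATA.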
hestia_earth/utils/lookup.py CHANGED
@@ -1,51 +1,16 @@
  from functools import reduce
- from io import StringIO
  from typing import Union
- import re
  import requests
- import csv
  import numpy
+ import traceback

  from .storage import _load_from_storage
  from .request import request_url, web_url
+ from .csv_utils import csv_str_to_recarray, csv_file_to_recarray, is_missing_value, _replace_chars

- DELIMITER = '\t'
- ENCODING = 'ISO-8859-1'
- GLOSSARY_FOLDER = 'glossary/lookups'
+ _GLOSSARY_FOLDER = 'glossary/lookups'
  _memory = {}
- MISSING_VALUE = '-'
- MISSING = -99999
  _INDEX_COL = 'termid'
- # default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
- _DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
-
-
- def _is_missing_value(value): return value == MISSING_VALUE or value == MISSING or value == str(MISSING)
-
-
- def _replace_missing_values(value: str): return str(MISSING) if str(value) == '-' else value
-
-
- def _rewrite_csv_file_as_tab(filepath: str):
-     with open(filepath, 'r', encoding=ENCODING) as fp:
-         reader = csv.reader(fp)
-         for row in reader:
-             yield DELIMITER.join(list(map(_replace_missing_values, row)))
-
-
- def _rewrite_csv_text_as_tab(text: str):
-     reader = csv.reader(StringIO(text))
-     for row in reader:
-         yield DELIMITER.join(list(map(_replace_missing_values, row)))
-
-
- def _recfromcsv(data): return numpy.recfromcsv(data,
-                                                missing_values=MISSING_VALUE,
-                                                filling_values=MISSING,
-                                                delimiter=DELIMITER,
-                                                encoding=ENCODING,
-                                                case_sensitive=True,
-                                                deletechars=_DELETE_CHARS)


  def _memory_wrapper(key: str, func):
@@ -70,12 +35,12 @@ def load_lookup(filepath: str, keep_in_memory: bool = False):
      numpy.recarray
          The `numpy.recarray` converted from the csv content.
      """
-     def load(): return _recfromcsv(_rewrite_csv_file_as_tab(filepath))
+     def load(): return csv_file_to_recarray(filepath)
      return _memory_wrapper(filepath, load) if keep_in_memory else load()


  def _download_lookup_data(filename: str):
-     filepath = f"{GLOSSARY_FOLDER}/{filename}"
+     filepath = f"{_GLOSSARY_FOLDER}/{filename}"

      def fallback():
          url = request_url(f"{web_url()}/{filepath}")
@@ -121,12 +86,14 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
      """
      def load():
          data = _download_lookup_data(filename)
-         rec = _recfromcsv(_rewrite_csv_text_as_tab(data)) if data else None
+         rec = csv_str_to_recarray(data) if data else None
          return (_build_index(rec) if build_index else rec) if data else None

      try:
          return _memory_wrapper(filename, load) if keep_in_memory else load()
      except Exception:
+         stack = traceback.format_exc()
+         print(stack)
          return None


@@ -144,7 +111,19 @@ def column_name(key: str):
      str
          The column name that can be used in `get_table_value`.
      """
-     return re.sub('[' + re.escape(_DELETE_CHARS) + ']', '', key.replace(' ', '_')) if key else ''
+     return _replace_chars(key) if key else ''
+
+
+ def _parse_value(value: str):
+     """ Automatically converts the value to float or bool if possible """
+     try:
+         return (
+             True if str(value).lower() == 'true' else
+             False if str(value).lower() == 'false' else
+             float(value)
+         )
+     except Exception:
+         return value


  def _get_single_table_value(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
@@ -191,7 +170,7 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
              _get_single_table_value(lookup, col_match, col_match_with, col_val) if single else
              _get_multiple_table_values(lookup, col_match, col_match_with, col_val)
          )
-         return None if _is_missing_value(value) else value
+         return None if is_missing_value(value) else _parse_value(value)
      except Exception:
          return None

@@ -251,7 +230,7 @@ def extract_grouped_data(data: str, key: str) -> str:
          **{curr.split(':')[0]: curr.split(':')[1]}
      }, data.split(';'), {}) if data is not None and isinstance(data, str) and len(data) > 1 else {}
      value = grouped_data.get(key)
-     return None if _is_missing_value(value) else value
+     return None if is_missing_value(value) else _parse_value(value)


  def extract_grouped_data_closest_date(data: str, year: int) -> str:
@@ -278,13 +257,13 @@ def extract_grouped_data_closest_date(data: str, year: int) -> str:
          lambda prev, curr: {
              **prev,
              **{curr.split(':')[0]: curr.split(':')[1]}
-         } if len(curr) > 0 and not _is_missing_value(curr.split(':')[1]) else prev,
+         } if len(curr) > 0 and not is_missing_value(curr.split(':')[1]) else prev,
          data.split(';'),
          {}
      ) if data is not None and isinstance(data, str) and len(data) > 1 else {}
      dist_years = list(data_by_date.keys())
      closest_year = min(dist_years, key=lambda x: abs(int(x) - year)) if len(dist_years) > 0 else None
-     return None if closest_year is None else data_by_date.get(closest_year)
+     return None if closest_year is None else _parse_value(data_by_date.get(closest_year))


  def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
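
One behavioural consequence worth noting, shown as a sketch based on the code above (the grouped-data string below is made up): `get_table_value`, `extract_grouped_data` and `extract_grouped_data_closest_date` now pass results through `_parse_value`, so callers receive floats and booleans where the previous version returned raw strings.

```python
from hestia_earth.utils.lookup import extract_grouped_data, extract_grouped_data_closest_date

data = '2000:1.5;2005:-;2010:true'  # illustrative "key:value;key:value" grouped data

extract_grouped_data(data, '2000')             # 1.5  (float, previously the string '1.5')
extract_grouped_data(data, '2005')             # None ('-' is treated as missing)
extract_grouped_data_closest_date(data, 2011)  # True (bool, previously the string 'true')
```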
hestia_earth/utils/pivot/pivot_csv.py CHANGED
@@ -2,6 +2,7 @@ import copy
  import json
  import re
  import numpy as np
+ import pandas as pd
  from hestia_earth.schema import UNIQUENESS_FIELDS, Term, NODE_TYPES
  from hestia_earth.schema.utils.sort import get_sort_key, SORT_CONFIG
  from flatten_json import flatten as flatten_json
@@ -11,17 +12,6 @@ from ..api import find_term_ids_by_names
  from ._shared import EXCLUDE_FIELDS, EXCLUDE_PRIVATE_FIELDS, _with_csv_formatting, _filter_emissions_not_relevant


- PANDAS_IMPORT_ERROR_MSG = "Run `pip install pandas>=1.2` to use this functionality"
- try:
-     import pandas as pd
-
-     version = [int(x) for x in pd.__version__.split('+')[0].split(".")]
-     if version[0] < 1 or (version[0] == 1 and version[1] < 2):
-         raise ImportError(PANDAS_IMPORT_ERROR_MSG)
- except ImportError:
-     raise ImportError(PANDAS_IMPORT_ERROR_MSG)
-
-
  # We only want to pivot array items containing blank nodes
  # Assume these are all fields with uniqueness fields not of type Node
  def _get_blank_node_uniqueness_fields():
hestia_earth/utils/table.py CHANGED
@@ -1,22 +1,12 @@
  from functools import reduce
  import numpy as np
+ import pandas as pd
  from hestia_earth.schema import NodeType

  # __package__ = "hestia_earth.utils" # required to run interactively in vscode
  from .tools import flatten


- PANDAS_IMPORT_ERROR_MSG = "Run `pip install pandas>=1.2` to use this functionality"
- try:
-     import pandas as pd
-
-     version = [int(x) for x in pd.__version__.split('+')[0].split(".")]
-     if version[0] < 1 or (version[0] == 1 and version[1] < 2):
-         raise ImportError(PANDAS_IMPORT_ERROR_MSG)
- except ImportError:
-     raise ImportError(PANDAS_IMPORT_ERROR_MSG)
-
-
  def _replace_ids(df):
      # in columns, first letter is always lower case
      node_types = [e.value[0].lower() + e.value[1:] for e in NodeType]
@@ -74,11 +64,6 @@ def format_for_upload(filepath: str):
      pandas.DataFrame
          Formatted pandas dataframe
      """
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("Run `pip install pandas~=1.2.0` to use this functionality")
-
      df = pd.read_csv(filepath, index_col=None, na_values="")

      # replace @id with id for top-level Node
hestia_earth/utils/version.py CHANGED
@@ -1 +1 @@
- VERSION = '0.15.16'
+ VERSION = '0.16.2'
hestia_earth_utils-{0.15.16 → 0.16.2}.dist-info/METADATA RENAMED
@@ -1,22 +1,35 @@
- Metadata-Version: 2.1
- Name: hestia-earth-utils
- Version: 0.15.16
+ Metadata-Version: 2.4
+ Name: hestia_earth_utils
+ Version: 0.16.2
  Summary: HESTIA's utils library
  Home-page: https://gitlab.com/hestia-earth/hestia-utils
  Author: HESTIA Team
  Author-email: guillaumeroyer.mail@gmail.com
  License: MIT
- Platform: UNKNOWN
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
- Classifier: Programming Language :: Python :: 3.9
- Requires-Python: >=3.9
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
- Requires-Dist: hestia-earth.schema>=24.3.0
+ Requires-Dist: hestia-earth-schema>=35.0.1
  Requires-Dist: requests>=2.24.0
  Requires-Dist: urllib3~=1.26.0
  Requires-Dist: python-dateutil>=2.8.1
- Requires-Dist: numpy<2,>=1.25.0
- Requires-Dist: flatten-json
+ Requires-Dist: numpy>=2
+ Requires-Dist: flatten_json
+ Provides-Extra: pivot-csv
+ Requires-Dist: pandas>=2; extra == "pivot-csv"
+ Provides-Extra: table
+ Requires-Dist: pandas>=2; extra == "table"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  # HESTIA Utils

@@ -66,5 +79,3 @@ from hestia_earth.utils.lookup import download_lookup

  df = download_lookup('crop.csv')
  ```
-
-
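
pandas thus moves from the inline version checks (removed from pivot_csv.py and table.py above) to declared optional extras resolved at install time. A hedged sketch of what this means for consumers, inferred from the metadata above rather than from the package docs:

```python
# pip install "hestia-earth-utils[pivot-csv]"  # pulls in pandas>=2 for hestia_earth.utils.pivot.pivot_csv
# pip install "hestia-earth-utils[table]"      # pulls in pandas>=2 for hestia_earth.utils.table

# Without the extra, these modules now fail at import time instead of
# raising the old custom ImportError message:
from hestia_earth.utils.table import format_for_upload  # ModuleNotFoundError: No module named 'pandas'
```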
hestia_earth_utils-{0.15.16 → 0.16.2}.dist-info/RECORD RENAMED
@@ -1,37 +1,34 @@
- hestia_earth/__init__.py,sha256=G-d438vPx7m_ks5e9XTtM3u7LDRO5dSSukibukWmyPM,56
- hestia_earth/utils/__init__.py,sha256=qEFeq3yuf3lQKVseALmL8aPM8fpCS54B_5pry00M3hk,76
+ hestia_earth/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  hestia_earth/utils/api.py,sha256=y0gw5pCCHNnFIhM62Hok_5eDtH3QDAZdkye_1mANMNs,9654
  hestia_earth/utils/blank_node.py,sha256=1wc9zUkOvFhJS-YmuKexfIdYxfsp5KyJczLmHlW559Q,7375
  hestia_earth/utils/calculation_status.py,sha256=X7lbgVMD9luH1gj9lEcxd3_P2-u7e8ZPGCvX1czPZUo,2238
+ hestia_earth/utils/csv_utils.py,sha256=nb_ihJaTj3K5hO7cxXO1xjTLVGVX1P13m9SgquO5-XY,1990
  hestia_earth/utils/cycle.py,sha256=rFLRL9X4KQ1UrE6fEPA_gV8KmwzrZpR3Ce56zg41lRk,1326
  hestia_earth/utils/date.py,sha256=SPQ69uxHiv1o3BqIkBKkM5XX_CmS20CB7g6u2rhsdh8,1807
  hestia_earth/utils/descriptive_stats.py,sha256=EMVwFvg2OnZgKRAfireAoWY2EbrSvqR0V0bK9B53p28,1583
  hestia_earth/utils/emission.py,sha256=BhBitooLTxZSh82S982v2QfPxxTF1kmGClG_uHyWdz4,1981
- hestia_earth/utils/lookup.py,sha256=0RLqy3HPzkbhkRaO7fYoHU0jKhAYzI6QHMptMEbqTlg,10344
+ hestia_earth/utils/lookup.py,sha256=NoEv0Hd496I9kf-shYXYUwNabatjc_uO9Ade8J98oBI,9490
  hestia_earth/utils/lookup_utils.py,sha256=_k3RZ1pK-gw7jq8wn9HrPWfDl4FlEWRb8bXmgaARu0w,6716
  hestia_earth/utils/model.py,sha256=uUcrF07XmBzqLni8VSaP0HoebJnQ57kk0EOmhwYMbfI,4637
  hestia_earth/utils/pipeline.py,sha256=O-6DPtK0U1lJ51LFGa1gM6pjkBJUfxOjNjY8LxQPXV0,9588
  hestia_earth/utils/request.py,sha256=bu7hkWKmFdXl2_Feawiam_x32whlclA9oP0asJyC69k,626
  hestia_earth/utils/stats.py,sha256=4t3op10xDJbGxWJEY1Jtyl302PYWyMFwLpsSkMlzQn8,34667
- hestia_earth/utils/table.py,sha256=RrTt-KF_QzjKiCpaAueoG6La1FG-Iusxw5NMDpoRBpQ,2861
+ hestia_earth/utils/table.py,sha256=MOJDo5fQPRDogAty_UXbO9-EXFwz97m0f7--mOM17lQ,2363
  hestia_earth/utils/term.py,sha256=6LiUSc6KX3IOkfWF6fYkQ2tENCO8ENljcdDypxU6WtA,1060
  hestia_earth/utils/tools.py,sha256=9GaUJwxL-CTzEOGnRFkUQDVFelPevQSxXrf25vssCVo,4990
- hestia_earth/utils/version.py,sha256=JDQ_516e1l28tU9cgTLDX-12-rAvR8X42sQ5QDTbtTU,20
+ hestia_earth/utils/version.py,sha256=8a5HJaemwtN_jTS8fGe4SSrLufF3bwMJrcS4e735nPY,19
  hestia_earth/utils/pivot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  hestia_earth/utils/pivot/_shared.py,sha256=JnyIOzpans45DE2hSa9-4yvNhq8t08lx1IAWGJi6WPQ,1397
- hestia_earth/utils/pivot/pivot_csv.py,sha256=zaiDcig4I5lVSHPZ-2bXKKBcIRrayA0GUaw0c8H3D-w,12371
+ hestia_earth/utils/pivot/pivot_csv.py,sha256=7f6kMqeb1b3RKANLGeDgVu8G5WC-vXIijHnsJhO-CjI,12022
  hestia_earth/utils/pivot/pivot_json.py,sha256=GBu5CFgCNdFjAuKGNsk2Phgds-xp4iREa5YIrplpFwA,9801
  hestia_earth/utils/storage/__init__.py,sha256=uNX6_EHWWnNUIm4Ng7L43-cQmuc6NGFAxXye85saIXQ,922
  hestia_earth/utils/storage/_azure_client.py,sha256=sevCZni04eknMql2DgUsWG23f7u0KvsXP7me1ZUBy00,1274
  hestia_earth/utils/storage/_local_client.py,sha256=IbzziUKY0QS3ybHFfgEpELqvafa7hQnZ-DdGdjQuypE,515
  hestia_earth/utils/storage/_s3_client.py,sha256=B2yTsf-VfHcRLCKTMes4S_nCXxrZad9umyZx3b5Pu_c,3181
  hestia_earth/utils/storage/_sns_client.py,sha256=LowUatj78Egu6_Id6Rr7hZjfZx1WguS3lozB3yAwSps,347
- hestia_earth_utils-0.15.16.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
- hestia_earth_utils-0.15.16.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
- tests/pivot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/pivot/test_pivot_csv.py,sha256=aYni7o3QDPSgtVxVCspEetotgpYHY7Lz5VHf-DR89gw,8131
- tests/pivot/test_pivot_json.py,sha256=UYTAN4AZhzVicIYsU1A2VgJcctUXohjHppg6s-pqwcg,8287
- hestia_earth_utils-0.15.16.dist-info/METADATA,sha256=nTJS2R1fi2c9Lz3R7zvRuf8HKG1n7K72KoDek_C9LpU,1758
- hestia_earth_utils-0.15.16.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- hestia_earth_utils-0.15.16.dist-info/top_level.txt,sha256=1dqA9TqpOLTEgpqa-YBsmbCmmNU1y56AtfFGEceZ2A0,19
- hestia_earth_utils-0.15.16.dist-info/RECORD,,
+ hestia_earth_utils-0.16.2.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
+ hestia_earth_utils-0.16.2.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
+ hestia_earth_utils-0.16.2.dist-info/METADATA,sha256=mz11GR2ctUEK-YYl0x2s4f1UVFwUAKb4rt-L-MHnItA,2030
+ hestia_earth_utils-0.16.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ hestia_earth_utils-0.16.2.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
+ hestia_earth_utils-0.16.2.dist-info/RECORD,,
hestia_earth_utils-{0.15.16 → 0.16.2}.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.45.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

hestia_earth/__init__.py DELETED
@@ -1 +0,0 @@
- __import__('pkg_resources').declare_namespace(__name__)
tests/pivot/__init__.py DELETED
File without changes
tests/pivot/test_pivot_csv.py DELETED
@@ -1,267 +0,0 @@
- import os
- import pandas as pd
- from unittest.mock import patch, call
-
- from tests.utils import fixtures_path
- from hestia_earth.utils.pivot.pivot_csv import pivot_csv, pivot_hestia_file
-
- class_path = 'hestia_earth.utils.pivot.pivot_csv'
- fixtures_folder = os.path.join(fixtures_path, 'pivot', 'pivot_csv')
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Full tillage": "fullTillage",
-         "Diesel": "diesel",
-         "Inorganic Potassium fertiliser, unspecified (kg K2O)": "inorganicPotassiumFertiliserUnspecifiedKgK2O",
-         "Inorganic Phosphorus fertiliser, unspecified (kg P2O5)": "inorganicPhosphorusFertiliserUnspecifiedKgP2O5",
-         "Urea (kg N)": "ureaKgN",
-         "Peanut, in shell": "peanutInShell",
-     },
- )
- def test_pivot_csv_cycle(mock):
-     filepath = f"{fixtures_folder}/cycle.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/cycle-pivoted.csv", index_col=None, dtype=object
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls(
-         [
-             call(
-                 [
-                     "Diesel",
-                     "Full tillage",
-                     "Inorganic Phosphorus fertiliser, unspecified (kg P2O5)",
-                     "Inorganic Potassium fertiliser, unspecified (kg K2O)",
-                     "Peanut, in shell",
-                     "Urea (kg N)",
-                 ]
-             )
-         ]
-     )
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Eutrophication potential, excluding fate": "eutrophicationPotentialExcludingFate",
-         "GWP100": "gwp100",
-         "N2O, to air, organic fertiliser, direct": "n2OToAirOrganicFertiliserDirect",
-         "N2O, to air, inorganic fertiliser, direct": "n2OToAirInorganicFertiliserDirect",
-     },
- )
- def test_pivot_csv_impact(mock):
-     filepath = f"{fixtures_folder}/impact.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/impact-pivoted.csv", index_col=None, dtype=object
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls(
-         [
-             call(
-                 [
-                     "Eutrophication potential, excluding fate",
-                     "GWP100",
-                     "N2O, to air, inorganic fertiliser, direct",
-                     "N2O, to air, organic fertiliser, direct",
-                 ]
-             )
-         ]
-     )
-
-
- def test_pivot_csv_multinode_rows():
-     filepath = f"{fixtures_folder}/multinode-rows.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/multinode-rows-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={"Urea (kg N)": "ureaKgN"},
- )
- def test_pivot_csv_cycle_missing_ids(mock):
-     filepath = f"{fixtures_folder}/missing-ids.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/missing-ids-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls([call(["Urea (kg N)"])])
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={"Irrigated": "irrigated"},
- )
- def test_pivot_csv_empty_cells(mock):
-     filepath = f"{fixtures_folder}/empty-cells.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/empty-cells-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- def test_pivot_csv_preserves_uniqueness_fields():
-     filepath = f"{fixtures_folder}/uniqueness-fields-undifferentiating.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/uniqueness-fields-undifferentiating-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Helicopter use, operation unspecified": "helicopterUseOperationUnspecified",
-         "Cooling, with evaporative cooling tower": "coolingWithEvaporativeCoolingTower",
-         "Small tractor use, operation unspecified": "smallTractorUseOperationUnspecified",
-         "Coating seeds": "coatingSeeds",
-         "Buttage of vine": "buttageOfVine",
-     },
- )
- def test_pivot_csv_uniqueness_fields_differentiating(mock):
-     filepath = f"{fixtures_folder}/uniqueness-fields-differentiating.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/uniqueness-fields-differentiating-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls(
-         [
-             call(
-                 [
-                     "Buttage of vine",
-                     "Coating seeds",
-                     "Cooling, with evaporative cooling tower",
-                     "Helicopter use, operation unspecified",
-                     "Small tractor use, operation unspecified",
-                 ]
-             )
-         ]
-     )
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Cooling, with evaporative cooling tower": "coolingWithEvaporativeCoolingTower",
-     },
- )
- def test_pivot_csv_uniqueness_fields_non_matching(mock):
-     filepath = f"{fixtures_folder}/uniqueness-fields-non-matching.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/uniqueness-fields-non-matching-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls([call(["Cooling, with evaporative cooling tower"])])
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Nitrogen content": "nitrogenContent",
-     },
- )
- def test_pivot_csv_properties(mock):
-     filepath = f"{fixtures_folder}/properties-exception.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/properties-exception-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-     mock.assert_has_calls([call(["Nitrogen content"])])
-
-
- def test_pivot_csv_depth():
-     filepath = f"{fixtures_folder}/depth-exception.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/depth-exception-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- def test_pivot_csv_shuffled():
-     filepath = f"{fixtures_folder}/shuffled.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/shuffled-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={"Full tillage": "fullTillage", "Urea (kg N)": "ureaKgN"},
- )
- def test_pivot_csv_cycle_deep(*args):
-     filepath = f"{fixtures_folder}/deep.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/deep-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- def test_pivot_csv_non_node_arrayfields(*args):
-     filepath = f"{fixtures_folder}/non-node-arrayfields.csv"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/non-node-arrayfields-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-     df = pivot_csv(filepath)
-     assert df.to_csv() == expected.to_csv()
-
-
- @patch(
-     f"{class_path}.find_term_ids_by_names",
-     return_value={
-         "Grinding, with grinder": "grinding",
-         "Motor gasoline": "motorGasoline",
-         "Orchard density": "orchardDensity",
-     },
- )
- def test_pivot_hestia_file(*args):
-     filepath = f"{fixtures_folder}/nodes.hestia"
-     expected = pd.read_csv(
-         f"{fixtures_folder}/nodes.hestia-pivoted.csv",
-         index_col=None,
-         dtype=object,
-     )
-
-     with open(filepath) as fd:
-         hestia_file = fd.read()
-
-     df = pivot_hestia_file(hestia_file)
-     assert df.to_csv() == expected.to_csv()
tests/pivot/test_pivot_json.py DELETED
@@ -1,231 +0,0 @@
- import os
- import json
- import re
- import numpy as np
- import pandas as pd
-
- from tests.utils import fixtures_path
- from hestia_earth.utils.pivot.pivot_json import (
-     _with_csv_formatting,
-     pivot_nodes,
-     pivot_hestia_file,
- )
- from flatten_json import unflatten_list
- from hestia_earth.schema.utils.sort import SORT_CONFIG
- from hestia_earth import schema
-
- class_path = 'hestia_earth.utils.pivot.pivot_csv'
- fixtures_folder = os.path.join(fixtures_path, 'pivot', 'pivot_json')
-
- node_types = {k: getattr(schema, k)().fields for k in schema.SCHEMA_TYPES}
- name_to_ids_mapping = {
-     "Full tillage": "fullTillage",
-     "Diesel": "diesel",
-     "Motor gasoline": "motorGasoline",
-     "Inorganic Potassium fertiliser, unspecified (kg K2O)": "inorganicPotassiumFertiliserUnspecifiedKgK2O",
-     "Inorganic Phosphorus fertiliser, unspecified (kg P2O5)": "inorganicPhosphorusFertiliserUnspecifiedKgP2O5",
-     "Urea (kg N)": "ureaKgN",
-     "Peanut, in shell": "peanutInShell",
-     "Eutrophication potential, excluding fate": "eutrophicationPotentialExcludingFate",
-     "GWP100": "gwp100",
-     "N2O, to air, organic fertiliser, direct": "n2OToAirOrganicFertiliserDirect",
-     "N2O, to air, inorganic fertiliser, direct": "n2OToAirInorganicFertiliserDirect",
-     "Irrigated": "irrigated",
-     "Helicopter use, operation unspecified": "helicopterUseOperationUnspecified",
-     "Cooling, with evaporative cooling tower": "coolingWithEvaporativeCoolingTower",
-     "Small tractor use, operation unspecified": "smallTractorUseOperationUnspecified",
-     "Coating seeds": "coatingSeeds",
-     "Buttage of vine": "buttageOfVine",
-     "Nitrogen content": "nitrogenContent",
-     "Grinding, with grinder": "grinding",
-     "Orchard density": "orchardDensity",
- }
-
-
- def _get_node_type(col):
-     label = col.split(".")[0]
-     return label[0].upper() + label[1:]
-
-
- def _add_missing_fields(row, is_input, col, parent_type, prefix=""):
-     subnode_col = re.search(r"(.+?\.\d+)\.(.+)", col)
-     if not subnode_col:
-         return None
-     sub_node, deep_col = subnode_col.groups()
-     node_type = (
-         # We are not handling fields like subnode_type_A.subnode_type_B.0
-         # We are always fetching type_A in this scenario.
-         SORT_CONFIG.get(parent_type)
-         .get(sub_node.split(".")[0])
-         .get("type")
-     )
-     next_prefix = ".".join([el for el in (prefix, sub_node) if el])
-     row[f"{next_prefix}.@type"] = node_type
-     _add_missing_fields(row, is_input, deep_col, node_type, prefix=next_prefix)
-
-
- def _row_to_dict(row, is_input, parent_type):
-     row.dropna(inplace=True)
-     if is_input:
-         for col in row.index:
-             _add_missing_fields(row, is_input, col, parent_type)
-     return row.to_dict()
-
-
- def _df_to_dict(df, is_input):
-     df.index = map(lambda col: ".".join(col.split(".")[1:]), df.index)
-     df.loc["@type"] = df.name
-     dicts = df.apply(_row_to_dict, is_input=is_input, parent_type=df.name)
-     return dicts
-
-
- def _ensure_id_cols(df, name_to_ids):
-     names_df = df.filter(regex=r"\.name", axis=1)
-     for name_col in names_df.columns:
-         id_col = name_col.replace(".name", ".@id")
-         for idx, name in df[name_col].items():
-             if id_col not in df:
-                 df[id_col] = np.nan
-             if pd.isna(df.loc[idx, id_col]):
-                 df.loc[idx, id_col] = name_to_ids[name]
-
-
- def _convert_csv_to_nodes(fixture, is_input, name_to_ids):
-     """
-     Gets json fixtures or creates them from corresponding csv files.
-     Conversion for *-pivoted files is not perfect as we do not detect
-     the difference between an empty cell which should be discarded
-     (ie. header not used by a row) and a node without a value key
-     (the latter are represented in csv as field.nodeId.value = None)
-     """
-     filepath = (
-         f"{fixtures_path}/pivot/pivot_csv/{fixture}.csv"
-         if is_input
-         else f"{fixtures_path}/pivot/pivot_csv/{fixture}-pivoted.csv"
-     )
-     df = pd.read_csv(filepath, index_col=None, dtype=object)
-     df.drop(columns="-", errors="ignore", inplace=True)
-     df.replace("-", np.nan, inplace=True)
-     df.replace(
-         ["TRUE", "True", "true", "FALSE", "False", "false"],
-         [True, True, True, False, False, False],
-         inplace=True,
-     )
-     if is_input:
-         df.dropna(how="all", axis=1, inplace=True)
-     df.rename(lambda col: col.replace(".id", ".@id"), axis=1, inplace=True)
-     if is_input:
-         _ensure_id_cols(df, name_to_ids)
-     df = df.T.groupby(_get_node_type).apply(_df_to_dict, is_input)
-     nodes = [
-         node for _node_type, nodes in df.iterrows() for node in nodes if node.get("@id")
-     ]
-     return nodes
-
-
- def get_nodes_from_fixture(fixture, name_to_ids={}):
-     try:
-         with open(f"{fixtures_folder}/{fixture}.json") as file:
-             input = json.load(file, object_hook=_with_csv_formatting)["nodes"]
-         with open(f"{fixtures_folder}/{fixture}-pivoted.json") as file:
-             expected = json.load(file, object_hook=_with_csv_formatting)["nodes"]
-     except FileNotFoundError:
-         print(f"\n{fixture} not found: attempting to create from csv.\n")
-         name_to_ids.update({np.nan: np.nan})
-         input = _convert_csv_to_nodes(fixture, True, name_to_ids)
-         expected = _convert_csv_to_nodes(fixture, False, name_to_ids)
-
-         input, expected = (
-             [unflatten_list(node, ".") for node in input],
-             [unflatten_list(node, ".") for node in expected],
-         )
-         with open(f"{fixtures_folder}/{fixture}.json", "w") as file:
-             file.write(json.dumps({"nodes": input}, sort_keys=True, indent=2))
-         with open(
-             f"{fixtures_folder}/{fixture}-pivoted.json", "w"
-         ) as file:
-             file.write(json.dumps({"nodes": expected}, sort_keys=True, indent=2))
-
-     return (input, expected)
-
-
- def test_pivot_json_cycle():
-     input, expected = get_nodes_from_fixture("cycle", name_to_ids_mapping)
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_impact():
-     input, expected = get_nodes_from_fixture("impact", name_to_ids_mapping)
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_multinode_rows():
-     input, expected = get_nodes_from_fixture("multinode-rows")
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_preserves_uniqueness_fields():
-     input, expected = get_nodes_from_fixture(
-         "uniqueness-fields-undifferentiating", name_to_ids_mapping
-     )
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_uniqueness_fields_differentiating():
-     input, expected = get_nodes_from_fixture(
-         "uniqueness-fields-differentiating", name_to_ids_mapping
-     )
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- # Output differs from CSV pivoter (see https://gitlab.com/hestia-earth/hestia-utils/-/issues/32)
- def test_pivot_json_uniqueness_fields_non_matching():
-     input, expected = get_nodes_from_fixture("uniqueness-fields-non-matching", name_to_ids_mapping)
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_properties():
-     input, expected = get_nodes_from_fixture("properties-exception", name_to_ids_mapping)
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_depth():
-     input, expected = get_nodes_from_fixture("depth-exception")
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- # Output differs from CSV pivoter (see https://gitlab.com/hestia-earth/hestia-utils/-/issues/32)
- def test_pivot_json_cycle_deep():
-     input, expected = get_nodes_from_fixture("deep", name_to_ids_mapping)
-     actual = pivot_nodes(input)
-     assert expected == actual
-
-
- def test_pivot_json_node_arrayfields_merged():
-     input, expected = get_nodes_from_fixture("node-arrayfields-merged")
-     actual = pivot_nodes(input)
-
-     assert expected == actual
-
-
- def test_pivot_json_unindexed_node():
-     input, expected = get_nodes_from_fixture("unindexed-node")
-     actual = pivot_nodes(input)
-
-     assert expected == actual
-
-
- def test_pivot_hestia_file():
-     _input, expected = get_nodes_from_fixture("nodes.hestia", name_to_ids_mapping)
-     actual = pivot_hestia_file(
-         open(f"{fixtures_folder}/nodes.hestia.json", "r").read()
-     )
-     assert expected == actual