PyPI - hestia-earth-utils - Versions diffs - 0.16.1__tar.gz → 0.16.3__tar.gz - Mend

hestia-earth-utils 0.16.1.tar.gz → 0.16.3.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hestia_earth_utils
-Version: 0.16.1
+Version: 0.16.3
 Summary: HESTIA's utils library
 Home-page: https://gitlab.com/hestia-earth/hestia-utils
 Author: HESTIA Team
@@ -13,8 +13,12 @@ Requires-Dist: hestia-earth-schema>=35.0.1
 Requires-Dist: requests>=2.24.0
 Requires-Dist: urllib3~=1.26.0
 Requires-Dist: python-dateutil>=2.8.1
-Requires-Dist: numpy<2,>=1.25.0
+Requires-Dist: numpy>=2
 Requires-Dist: flatten_json
+Provides-Extra: pivot-csv
+Requires-Dist: pandas>=2; extra == "pivot-csv"
+Provides-Extra: table
+Requires-Dist: pandas>=2; extra == "table"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -22,6 +26,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary

hestia_earth_utils-0.16.3/hestia_earth/utils/csv_utils.py ADDED Viewed

@@ -0,0 +1,72 @@
+import io
+import csv
+import re
+import numpy as np
+_MISSING_VALUE = '-'
+_MISSING = -99999
+_DELIMITER = ','
+_QUOTE_CHAR = '"'
+ENCODING = 'ISO-8859-1'
+# default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
+_DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
+def is_missing_value(value): return value == _MISSING_VALUE or value == _MISSING or value == str(_MISSING)
+def _replace_missing_values(value: str): return str(_MISSING) if str(value) == _MISSING_VALUE else value
+def _replace_chars(value: str): return re.sub(f'[{re.escape(_DELETE_CHARS)}]', '', value.replace(' ', '_'))
+def _text_to_csv(csv_content: str):
+    return csv.reader(io.StringIO(csv_content.strip()), delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
+def _csv_reader_converter(field_str_bytes):
+    field_str = field_str_bytes if isinstance(field_str_bytes, str) else field_str_bytes.decode('utf-8')
+    reader = _text_to_csv(field_str)
+    try:
+        return _replace_missing_values(next(reader)[0].strip())
+    except StopIteration:
+        return str(_MISSING)
+def _get_columns(csv_content: str):
+    try:
+        reader = _text_to_csv(csv_content)
+        names = next(reader)
+        return list(map(_replace_chars, names))
+    except StopIteration:
+        return []
+def csv_str_to_recarray(csv_content: str) -> np.recarray:
+    names = _get_columns(csv_content)
+    num_cols = len(names)
+    converters_dict = {
+        i: _csv_reader_converter
+        for i in range(num_cols)
+    }
+    # TODO: find the maximum column size instead of using 1000
+    max_size = 1000
+    return np.loadtxt(
+        io.StringIO(csv_content.strip()),
+        delimiter=_DELIMITER,
+        quotechar=_QUOTE_CHAR,
+        skiprows=1,
+        converters=converters_dict,
+        dtype=[(name, f"U{max_size}") for name in names],
+        encoding=ENCODING
+    ).view(np.recarray)
+def csv_file_to_recarray(filepath: str):
+    with open(filepath, 'r', encoding=ENCODING) as f:
+        content = f.read()
+    return csv_str_to_recarray(content)

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth/utils/lookup.py RENAMED Viewed

@@ -1,51 +1,16 @@
 from functools import reduce
-from io import StringIO
 from typing import Union
-import re
 import requests
-import csv
 import numpy
+import traceback
 from .storage import _load_from_storage
 from .request import request_url, web_url
+from .csv_utils import csv_str_to_recarray, csv_file_to_recarray, is_missing_value, _replace_chars
-DELIMITER = '\t'
-ENCODING = 'ISO-8859-1'
-GLOSSARY_FOLDER = 'glossary/lookups'
+_GLOSSARY_FOLDER = 'glossary/lookups'
 _memory = {}
-MISSING_VALUE = '-'
-MISSING = -99999
 _INDEX_COL = 'termid'
-# default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
-_DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
-def _is_missing_value(value): return value == MISSING_VALUE or value == MISSING or value == str(MISSING)
-def _replace_missing_values(value: str): return str(MISSING) if str(value) == '-' else value
-def _rewrite_csv_file_as_tab(filepath: str):
-    with open(filepath, 'r', encoding=ENCODING) as fp:
-        reader = csv.reader(fp)
-        for row in reader:
-            yield DELIMITER.join(list(map(_replace_missing_values, row)))
-def _rewrite_csv_text_as_tab(text: str):
-    reader = csv.reader(StringIO(text))
-    for row in reader:
-        yield DELIMITER.join(list(map(_replace_missing_values, row)))
-def _recfromcsv(data): return numpy.recfromcsv(data,
-                                               missing_values=MISSING_VALUE,
-                                               filling_values=MISSING,
-                                               delimiter=DELIMITER,
-                                               encoding=ENCODING,
-                                               case_sensitive=True,
-                                               deletechars=_DELETE_CHARS)
 def _memory_wrapper(key: str, func):
@@ -70,12 +35,12 @@ def load_lookup(filepath: str, keep_in_memory: bool = False):
     numpy.recarray
         The `numpy.recarray` converted from the csv content.
     """
-    def load(): return _recfromcsv(_rewrite_csv_file_as_tab(filepath))
+    def load(): return csv_file_to_recarray(filepath)
     return _memory_wrapper(filepath, load) if keep_in_memory else load()
 def _download_lookup_data(filename: str):
-    filepath = f"{GLOSSARY_FOLDER}/{filename}"
+    filepath = f"{_GLOSSARY_FOLDER}/{filename}"
     def fallback():
         url = request_url(f"{web_url()}/{filepath}")
@@ -121,12 +86,14 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
     """
     def load():
         data = _download_lookup_data(filename)
-        rec = _recfromcsv(_rewrite_csv_text_as_tab(data)) if data else None
+        rec = csv_str_to_recarray(data) if data else None
         return (_build_index(rec) if build_index else rec) if data else None
     try:
         return _memory_wrapper(filename, load) if keep_in_memory else load()
     except Exception:
+        stack = traceback.format_exc()
+        print(stack)
         return None
@@ -144,7 +111,19 @@ def column_name(key: str):
     str
         The column name that can be used in `get_table_value`.
     """
-    return re.sub('[' + re.escape(_DELETE_CHARS) + ']', '', key.replace(' ', '_')) if key else ''
+    return _replace_chars(key) if key else ''
+def _parse_value(value: str):
+    """ Automatically converts the value to float or bool if possible """
+    try:
+        return (
+            True if str(value).lower() == 'true' else
+            False if str(value).lower() == 'false' else
+            float(value)
+        )
+    except Exception:
+        return value
 def _get_single_table_value(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
@@ -163,7 +142,7 @@ def _get_multiple_table_values(data: Union[dict, numpy.recarray], col_match: str
     return reduce(reducer, enumerate(col_match), data)[col_val][0]
-def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
+def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val, default_value=''):
     """
     Get a value matched by one or more columns from a `numpy.recarray`.
@@ -179,6 +158,8 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
         Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
     col_val: str
         The column which contains the value to look for.
+    default_value : Any
+        A value to return when none is found in the data.
     Returns
     -------
@@ -191,7 +172,7 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
             _get_single_table_value(lookup, col_match, col_match_with, col_val) if single else
             _get_multiple_table_values(lookup, col_match, col_match_with, col_val)
         )
-        return None if _is_missing_value(value) else value
+        return default_value if is_missing_value(value) else _parse_value(value)
     except Exception:
         return None
@@ -251,7 +232,7 @@ def extract_grouped_data(data: str, key: str) -> str:
         **{curr.split(':')[0]: curr.split(':')[1]}
     }, data.split(';'), {}) if data is not None and isinstance(data, str) and len(data) > 1 else {}
     value = grouped_data.get(key)
-    return None if _is_missing_value(value) else value
+    return None if is_missing_value(value) else _parse_value(value)
 def extract_grouped_data_closest_date(data: str, year: int) -> str:
@@ -278,13 +259,13 @@ def extract_grouped_data_closest_date(data: str, year: int) -> str:
         lambda prev, curr: {
             **prev,
             **{curr.split(':')[0]: curr.split(':')[1]}
-        } if len(curr) > 0 and not _is_missing_value(curr.split(':')[1]) else prev,
+        } if len(curr) > 0 and not is_missing_value(curr.split(':')[1]) else prev,
         data.split(';'),
         {}
     ) if data is not None and isinstance(data, str) and len(data) > 1 else {}
     dist_years = list(data_by_date.keys())
     closest_year = min(dist_years, key=lambda x: abs(int(x) - year)) if len(dist_years) > 0 else None
-    return None if closest_year is None else data_by_date.get(closest_year)
+    return None if closest_year is None else _parse_value(data_by_date.get(closest_year))
 def lookup_term_ids(lookup: Union[dict, numpy.recarray]):

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth/utils/pivot/pivot_csv.py RENAMED Viewed

@@ -2,6 +2,7 @@ import copy
 import json
 import re
 import numpy as np
+import pandas as pd
 from hestia_earth.schema import UNIQUENESS_FIELDS, Term, NODE_TYPES
 from hestia_earth.schema.utils.sort import get_sort_key, SORT_CONFIG
 from flatten_json import flatten as flatten_json
@@ -11,17 +12,6 @@ from ..api import find_term_ids_by_names
 from ._shared import EXCLUDE_FIELDS, EXCLUDE_PRIVATE_FIELDS, _with_csv_formatting, _filter_emissions_not_relevant
-PANDAS_IMPORT_ERROR_MSG = "Run `pip install pandas>=1.2` to use this functionality"
-try:
-    import pandas as pd
-    version = [int(x) for x in pd.__version__.split('+')[0].split(".")]
-    if version[0] < 1 or (version[0] == 1 and version[1] < 2):
-        raise ImportError(PANDAS_IMPORT_ERROR_MSG)
-except ImportError:
-    raise ImportError(PANDAS_IMPORT_ERROR_MSG)
 # We only want to pivot array items containing blank nodes
 # Assume these are all fields with uniqueness fields not of type Node
 def _get_blank_node_uniqueness_fields():

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth/utils/table.py RENAMED Viewed

@@ -1,22 +1,12 @@
 from functools import reduce
 import numpy as np
+import pandas as pd
 from hestia_earth.schema import NodeType
 # __package__ = "hestia_earth.utils" # required to run interactively in vscode
 from .tools import flatten
-PANDAS_IMPORT_ERROR_MSG = "Run `pip install pandas>=1.2` to use this functionality"
-try:
-    import pandas as pd
-    version = [int(x) for x in pd.__version__.split('+')[0].split(".")]
-    if version[0] < 1 or (version[0] == 1 and version[1] < 2):
-        raise ImportError(PANDAS_IMPORT_ERROR_MSG)
-except ImportError:
-    raise ImportError(PANDAS_IMPORT_ERROR_MSG)
 def _replace_ids(df):
     # in columns, first letter is always lower case
     node_types = [e.value[0].lower() + e.value[1:] for e in NodeType]
@@ -74,11 +64,6 @@ def format_for_upload(filepath: str):
     pandas.DataFrame
         Formatted pandas dataframe
     """
-    try:
-        import pandas as pd
-    except ImportError:
-        raise ImportError("Run `pip install pandas~=1.2.0` to use this functionality")
     df = pd.read_csv(filepath, index_col=None, na_values="")
     # replace @id with id for top-level Node

hestia_earth_utils-0.16.3/hestia_earth/utils/version.py ADDED Viewed

@@ -0,0 +1 @@
+VERSION = '0.16.3'

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth_utils.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hestia_earth_utils
-Version: 0.16.1
+Version: 0.16.3
 Summary: HESTIA's utils library
 Home-page: https://gitlab.com/hestia-earth/hestia-utils
 Author: HESTIA Team
@@ -13,8 +13,12 @@ Requires-Dist: hestia-earth-schema>=35.0.1
 Requires-Dist: requests>=2.24.0
 Requires-Dist: urllib3~=1.26.0
 Requires-Dist: python-dateutil>=2.8.1
-Requires-Dist: numpy<2,>=1.25.0
+Requires-Dist: numpy>=2
 Requires-Dist: flatten_json
+Provides-Extra: pivot-csv
+Requires-Dist: pandas>=2; extra == "pivot-csv"
+Provides-Extra: table
+Requires-Dist: pandas>=2; extra == "table"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -22,6 +26,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth_utils.egg-info/SOURCES.txt RENAMED Viewed

@@ -7,6 +7,7 @@ hestia_earth/utils/__init__.py
 hestia_earth/utils/api.py
 hestia_earth/utils/blank_node.py
 hestia_earth/utils/calculation_status.py
+hestia_earth/utils/csv_utils.py
 hestia_earth/utils/cycle.py
 hestia_earth/utils/date.py
 hestia_earth/utils/descriptive_stats.py

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/hestia_earth_utils.egg-info/requires.txt RENAMED Viewed

@@ -2,5 +2,11 @@ hestia-earth-schema>=35.0.1
 requests>=2.24.0
 urllib3~=1.26.0
 python-dateutil>=2.8.1
-numpy<2,>=1.25.0
+numpy>=2
 flatten_json
+[pivot-csv]
+pandas>=2
+[table]
+pandas>=2

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/setup.py RENAMED Viewed

@@ -32,5 +32,9 @@ setup(
     scripts=[
         'bin/hestia-pivot-csv',
         'bin/hestia-format-upload'
-    ]
+    ],
+    extras_require={
+        'pivot-csv': ['pandas>=2'],
+        'table': ['pandas>=2'],
+    }
 )

{hestia_earth_utils-0.16.1 → hestia_earth_utils-0.16.3}/tests/test_lookup.py RENAMED Viewed

@@ -56,8 +56,11 @@ def test_get_table_value():
 def test_get_table_value_empty():
     lookup = load_lookup(f"{fixtures_path}/lookup.csv")
-    assert get_table_value(lookup, column_name('Col1'), 'val10', column_name('Col4')) is None
-    assert get_table_value(lookup, column_name('Col2'), 'val22', column_name('Col1')) is None
+    assert get_table_value(lookup, column_name('Col1'), 'val10', column_name('Col4'), default_value=None) is None
+    assert get_table_value(lookup, column_name('Col2'), 'val22', column_name('Col1')) == ''
+    lookup = download_lookup('crop.csv')
+    assert get_table_value(lookup, 'termid', 'genericCropSeed', column_name('Plantation_density')) == ''
 def test_find_term_ids_by():
@@ -68,19 +71,19 @@ def test_find_term_ids_by():
 def test_download_lookup_with_index():
     filename = 'crop.csv'
     lookup = download_lookup(filename, keep_in_memory=False, build_index=True)
-    assert isinstance(lookup, dict)
+    assert isinstance(lookup, dict) is True
 def test_download_lookup_without_index():
     filename = 'crop.csv'
     lookup = download_lookup(filename, keep_in_memory=False, build_index=False)
-    assert isinstance(lookup, numpy.recarray)
+    assert isinstance(lookup, numpy.recarray) is True
 def test_handle_missing_float_value():
     filename = 'measurement.csv'
     lookup = download_lookup(filename)
-    assert get_table_value(lookup, 'termid', 'rainfallPeriod', 'maximum') is None
+    assert get_table_value(lookup, 'termid', 'rainfallPeriod', 'maximum') == ''
 def test_handle_missing_string_value():
@@ -102,7 +105,7 @@ def test_extract_grouped_data_no_data():
 def test_extract_grouped_data():
     data = 'Average_price_per_tonne:106950.5556;1991:-;1992:-'
-    assert extract_grouped_data(data, 'Average_price_per_tonne') == '106950.5556'
+    assert extract_grouped_data(data, 'Average_price_per_tonne') == 106950.5556
     assert extract_grouped_data(data, '2010') is None
@@ -124,7 +127,7 @@ def test_get_single_table_value_float_values():
     filename = 'ecoClimateZone.csv'
     lookup = download_lookup(filename)
     column = column_name('STEHFEST_BOUWMAN_2006_N2O-N_FACTOR')
-    assert _get_single_table_value(lookup, column_name('ecoClimateZone'), 11, column) == -0.3022
+    assert _get_single_table_value(lookup, column_name('ecoClimateZone'), '11', column) == '-0.3022'
 def test_extract_grouped_data_closest_date_no_data():
@@ -134,9 +137,9 @@ def test_extract_grouped_data_closest_date_no_data():
 def test_extract_grouped_data_closest_date():
     data = '2000:-;2001:0.1;2002:0.2;2003:0.3;2004:0.4;2005:0.5'
-    assert extract_grouped_data_closest_date(data, 2000) == '0.1'
-    assert extract_grouped_data_closest_date(data, 2001) == '0.1'
-    assert extract_grouped_data_closest_date(data, 2020) == '0.5'
+    assert extract_grouped_data_closest_date(data, 2000) == 0.1
+    assert extract_grouped_data_closest_date(data, 2001) == 0.1
+    assert extract_grouped_data_closest_date(data, 2020) == 0.5
 def test_lookup_term_ids():
@@ -145,3 +148,9 @@ def test_lookup_term_ids():
 def test_lookup_columns():
     assert 'termid' in lookup_columns(download_lookup('crop.csv'))
+def test_get_data_advanced():
+    lookup = download_lookup('liveAnimal.csv')
+    value = get_table_value(lookup, 'termid', 'sheepRam', column_name('ratioCPregnancyNetEnergyPregnancyIpcc2019'))
+    assert value == ''