PyPI - hestia-earth-utils - Versions diffs - 0.16.5__py3-none-any.whl → 0.16.7__py3-none-any.whl - Mend

hestia-earth-utils 0.16.5__py3-none-any.whl → 0.16.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

hestia_earth/utils/blank_node.py CHANGED Viewed

@@ -5,7 +5,7 @@ from functools import reduce
 from statistics import mode, mean
 from hestia_earth.schema import TermTermType
-from .lookup import download_lookup, get_table_value, column_name
+from .lookup import download_lookup, get_table_value
 from .tools import non_empty_list, non_empty_value, flatten
 from .emission import cycle_emissions_in_system_boundary
 from .model import filter_list_term_type
@@ -15,7 +15,7 @@ def get_lookup_value(blank_node: dict, column: str):
     term = blank_node.get('term', {})
     table_name = f"{term.get('termType')}.csv" if term else None
     value = get_table_value(
-        download_lookup(table_name), 'termid', term.get('@id'), column_name(column)
+        download_lookup(table_name), 'term.id', term.get('@id'), column
     ) if table_name else None
     return value

hestia_earth/utils/lookup.py CHANGED Viewed

@@ -1,15 +1,14 @@
 from functools import reduce
-from typing import Union
+from typing import Any
 import requests
-import numpy
+from io import StringIO
+import pandas as pd
 from .storage import _load_from_storage
 from .request import request_url, web_url
-from .csv_utils import csv_str_to_recarray, csv_file_to_recarray, is_missing_value, _replace_chars
 _GLOSSARY_FOLDER = 'glossary/lookups'
 _memory = {}
-_INDEX_COL = 'termid'
 def _memory_wrapper(key: str, func):
@@ -18,6 +17,18 @@ def _memory_wrapper(key: str, func):
     return _memory[key]
+def _read_csv(value: str) -> pd.DataFrame:
+    return pd.read_csv(value, na_values=['-', ''])
+def _read_csv_from_string(data: str) -> pd.DataFrame:
+    return _read_csv(StringIO(data))
+def is_missing_value(value):
+    return pd.isna(value) or value is None or value == '' or value == '-'
 def load_lookup(filepath: str, keep_in_memory: bool = False):
     """
     Import local lookup table as csv file into a `numpy.recarray`.
@@ -34,7 +45,7 @@ def load_lookup(filepath: str, keep_in_memory: bool = False):
     numpy.recarray
         The `numpy.recarray` converted from the csv content.
     """
-    def load(): return csv_file_to_recarray(filepath)
+    def load(): return _read_csv(filepath)
     return _memory_wrapper(filepath, load) if keep_in_memory else load()
@@ -43,7 +54,8 @@ def _download_lookup_data(filename: str):
     def fallback():
         url = request_url(f"{web_url()}/{filepath}")
-        return requests.get(url).content.decode('utf-8')
+        data = requests.get(url).content.decode('utf-8')
+        return data if data and '<html' not in data else None
     try:
         data = _load_from_storage(filepath, glossary=True)
@@ -52,20 +64,7 @@ def _download_lookup_data(filename: str):
         return fallback()
-def _build_index(array: numpy.recarray):
-    columns = list(array.dtype.names)
-    try:
-        return {
-            row[_INDEX_COL]: {col: row[col] for col in columns}
-            for row in array
-        } if _INDEX_COL in columns else array
-    except TypeError:
-        return {
-            array[_INDEX_COL].item(): {col: array[col].item() for col in columns}
-        } if _INDEX_COL in columns else array
-def download_lookup(filename: str, keep_in_memory: bool = True, build_index: bool = False):
+def download_lookup(filename: str, keep_in_memory: bool = True):
     """
     Download lookup table from HESTIA as csv into a `numpy.recarray`.
@@ -85,8 +84,7 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
     """
     def load():
         data = _download_lookup_data(filename)
-        rec = csv_str_to_recarray(data) if data else None
-        return (_build_index(rec) if build_index else rec) if data else None
+        return _read_csv_from_string(data) if data else None
     try:
         return _memory_wrapper(filename, load) if keep_in_memory else load()
@@ -96,19 +94,9 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
 def column_name(key: str):
     """
-    Convert the column name to a usable key on a `numpy.recarray`.
-    Parameters
-    ----------
-    key : str
-        The column name.
-    Returns
-    -------
-    str
-        The column name that can be used in `get_table_value`.
+    Deprecated. Columns are no longer renamed.
     """
-    return _replace_chars(key) if key else ''
+    return key
 def _parse_value(value: str):
@@ -123,36 +111,24 @@ def _parse_value(value: str):
         return value
-def _get_single_table_value(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
-    return (
-        data.get(col_match_with, {})[col_val] if isinstance(data, dict) else
-        data[data[col_match] == col_match_with][col_val][0]
-    )
-def _get_multiple_table_values(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
-    def reducer(x, values):
-        col = values[1]
-        value = col_match_with[values[0]]
-        return x.get(value) if isinstance(x, dict) else x[x[col] == value]
+def _get_single_table_value(df: pd.DataFrame, col_match: str, col_match_with, col_val):
+    filtered_df = df[df[col_match] == col_match_with]
+    return None if filtered_df.empty else filtered_df[col_val].iloc[0]
-    return reduce(reducer, enumerate(col_match), data)[col_val][0]
-def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val, default_value=''):
+def get_table_value(lookup: pd.DataFrame, col_match: str, col_match_with: str, col_val: Any, default_value=''):
     """
     Get a value matched by one or more columns from a `numpy.recarray`.
     Parameters
     ----------
-    lookup : dict | numpy.recarray
+    lookup : DataFrame
         The value returned by the `download_lookup` function.
     col_match : str
         Which `column` should be used to find data in. This will restrict the rows to search for.
         Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
-    col_match_with
+    col_match_with: str
         Which column `value` should be used to find data in. This will restrict the rows to search for.
-        Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
     col_val: str
         The column which contains the value to look for.
     default_value : Any
@@ -163,44 +139,35 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
     str
         The value found or `None` if no match.
     """
-    single = isinstance(col_match, str) and isinstance(col_match_with, str)
     try:
-        value = (
-            _get_single_table_value(lookup, col_match, col_match_with, col_val) if single else
-            _get_multiple_table_values(lookup, col_match, col_match_with, col_val)
-        )
+        value = _get_single_table_value(lookup, col_match, col_match_with, col_val)
+        print(value, type(value))
         return default_value if is_missing_value(value) else _parse_value(value)
     except Exception:
         return None
-def find_term_ids_by(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with):
+def find_term_ids_by(lookup: pd.DataFrame, col_match: str, col_match_with: str):
     """
     Find `term.id` values where a column matches a specific value.
     Parameters
     ----------
-    lookup : dict | numpy.recarray
+    lookup : DataFrame
         The value returned by the `download_lookup` function.
     col_match : str
         Which `column` should be used to find data in. This will restrict the rows to search for.
         Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
-    col_match_with
+    col_match_with: str
         Which column `value` should be used to find data in. This will restrict the rows to search for.
-        Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
     Returns
     -------
     list[str]
         The list of `term.id` that matched the expected column value.
     """
-    term_ids = (
-        set([
-            key
-            for key, value in lookup.items()
-            if value.get(col_match) == col_match_with
-        ])
-    ) if isinstance(lookup, dict) else set(list(lookup[lookup[col_match] == col_match_with].termid))
+    filtered_df = lookup[lookup[col_match] == col_match_with]
+    term_ids = filtered_df['term.id'].unique().tolist() if 'term.id' in filtered_df.columns else []
     return list(map(str, term_ids))
@@ -265,13 +232,13 @@ def extract_grouped_data_closest_date(data: str, year: int) -> str:
     return None if closest_year is None else _parse_value(data_by_date.get(closest_year))
-def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
+def lookup_term_ids(lookup: pd.DataFrame):
     """
     Get the `term.id` values from a lookup.
     Parameters
     ----------
-    lookup : dict | numpy.recarray
+    lookup : DataFrame
         The value returned by the `download_lookup` function.
     Returns
@@ -279,16 +246,16 @@ def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
     list[str]
         The `term.id` values from the lookup.
     """
-    return lookup.keys() if isinstance(lookup, dict) else list(lookup.termid)
+    return list(map(str, lookup['term.id'].tolist())) if 'term.id' in lookup.columns else []
-def lookup_columns(lookup: Union[dict, numpy.recarray]):
+def lookup_columns(lookup: pd.DataFrame):
     """
     Get the columns from a lookup.
     Parameters
     ----------
-    lookup : dict | numpy.recarray
+    lookup : DataFrame
         The value returned by the `download_lookup` function.
     Returns
@@ -296,4 +263,4 @@ def lookup_columns(lookup: Union[dict, numpy.recarray]):
     list[str]
         The columns from the lookup.
     """
-    return list(list(lookup.values())[0].keys()) if isinstance(lookup, dict) else list(lookup.dtype.names)
+    return list(lookup.columns)

hestia_earth/utils/lookup_utils.py CHANGED Viewed

@@ -2,7 +2,7 @@ from functools import lru_cache
 import json
 from hestia_earth.schema import SchemaType
-from .lookup import _download_lookup_data, download_lookup, get_table_value, column_name
+from .lookup import _download_lookup_data, download_lookup, get_table_value
 from .api import download_hestia
 from .tools import non_empty_list, flatten
@@ -45,7 +45,7 @@ def _allowed_model_mapping(model: str, term_id: str, column: str):
     mapping = _allowed_mapping_data()
     value = mapping.get(term_id, {}).get(model, {}).get(column) if mapping else get_table_value(
         download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}-model-{column}.csv"),
-        'termid', term_id, column_name(column)
+        'term.id', term_id, column
     )
     return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
@@ -78,7 +78,7 @@ def _allowed_mapping(term_id: str, column: str):
     mapping = _allowed_mapping_data()
     value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
         download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
-        'termid', term_id, column_name(column)
+        'term.id', term_id, column
     )
     return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
@@ -174,7 +174,7 @@ def is_in_system_boundary(term_id: str) -> bool:
     column = 'inHestiaDefaultSystemBoundary'
     value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
         download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
-        'termid', term_id, column_name(column)
+        'term.id', term_id, column
     )
     # handle numpy bool from table value
     return not (not value)

hestia_earth/utils/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- VERSION = '0.16.5'
1	+ VERSION = '0.16.7'

{hestia_earth_utils-0.16.5.dist-info → hestia_earth_utils-0.16.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hestia_earth_utils
-Version: 0.16.5
+Version: 0.16.7
 Summary: HESTIA's utils library
 Home-page: https://gitlab.com/hestia-earth/hestia-utils
 Author: HESTIA Team
@@ -13,12 +13,8 @@ Requires-Dist: hestia-earth-schema>=35.0.1
 Requires-Dist: requests>=2.24.0
 Requires-Dist: urllib3~=1.26.0
 Requires-Dist: python-dateutil>=2.8.1
-Requires-Dist: numpy>=2
+Requires-Dist: pandas>=2
 Requires-Dist: flatten_json
-Provides-Extra: pivot-csv
-Requires-Dist: pandas>=2; extra == "pivot-csv"
-Provides-Extra: table
-Requires-Dist: pandas>=2; extra == "table"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -26,7 +22,6 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
-Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary

{hestia_earth_utils-0.16.5.dist-info → hestia_earth_utils-0.16.7.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,13 @@
 hestia_earth/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hestia_earth/utils/api.py,sha256=y0gw5pCCHNnFIhM62Hok_5eDtH3QDAZdkye_1mANMNs,9654
-hestia_earth/utils/blank_node.py,sha256=1wc9zUkOvFhJS-YmuKexfIdYxfsp5KyJczLmHlW559Q,7375
+hestia_earth/utils/blank_node.py,sha256=kLjq8U0PYyq_SQ-VHGMll_3XxKdYEnHEwtCCglNT3vg,7350
 hestia_earth/utils/calculation_status.py,sha256=X7lbgVMD9luH1gj9lEcxd3_P2-u7e8ZPGCvX1czPZUo,2238
-hestia_earth/utils/csv_utils.py,sha256=BK-tci1sALmsxamSR1Y7f9O6ajTTdhggLC2pBEWhYME,2310
 hestia_earth/utils/cycle.py,sha256=rFLRL9X4KQ1UrE6fEPA_gV8KmwzrZpR3Ce56zg41lRk,1326
 hestia_earth/utils/date.py,sha256=SPQ69uxHiv1o3BqIkBKkM5XX_CmS20CB7g6u2rhsdh8,1807
 hestia_earth/utils/descriptive_stats.py,sha256=EMVwFvg2OnZgKRAfireAoWY2EbrSvqR0V0bK9B53p28,1583
 hestia_earth/utils/emission.py,sha256=BhBitooLTxZSh82S982v2QfPxxTF1kmGClG_uHyWdz4,1981
-hestia_earth/utils/lookup.py,sha256=XKmxFEH9o1Rhi4oTLteQnMAwNXiObjSX7pMfrUw8q1I,9522
-hestia_earth/utils/lookup_utils.py,sha256=_k3RZ1pK-gw7jq8wn9HrPWfDl4FlEWRb8bXmgaARu0w,6716
+hestia_earth/utils/lookup.py,sha256=Sea1EkwT1K4mb9eNQBkJHoXkvNLSg_N9eeNiUL6pLq0,8028
+hestia_earth/utils/lookup_utils.py,sha256=P3Ae2MqZWvk3f9AObNwk6Fq9AyyX279K4kR9qHX8rKQ,6667
 hestia_earth/utils/model.py,sha256=uUcrF07XmBzqLni8VSaP0HoebJnQ57kk0EOmhwYMbfI,4637
 hestia_earth/utils/pipeline.py,sha256=O-6DPtK0U1lJ51LFGa1gM6pjkBJUfxOjNjY8LxQPXV0,9588
 hestia_earth/utils/request.py,sha256=bu7hkWKmFdXl2_Feawiam_x32whlclA9oP0asJyC69k,626
@@ -16,7 +15,7 @@ hestia_earth/utils/stats.py,sha256=4t3op10xDJbGxWJEY1Jtyl302PYWyMFwLpsSkMlzQn8,3
 hestia_earth/utils/table.py,sha256=MOJDo5fQPRDogAty_UXbO9-EXFwz97m0f7--mOM17lQ,2363
 hestia_earth/utils/term.py,sha256=6LiUSc6KX3IOkfWF6fYkQ2tENCO8ENljcdDypxU6WtA,1060
 hestia_earth/utils/tools.py,sha256=9GaUJwxL-CTzEOGnRFkUQDVFelPevQSxXrf25vssCVo,4990
-hestia_earth/utils/version.py,sha256=RKa3Cna34LUgL3Ye-ubIdZ9B2mS9iFURrl2snrPa3uY,19
+hestia_earth/utils/version.py,sha256=izOjXE-oE9zdUdGeSgNJik6goDuxSRXghKlLPR0OuNE,19
 hestia_earth/utils/pivot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hestia_earth/utils/pivot/_shared.py,sha256=JnyIOzpans45DE2hSa9-4yvNhq8t08lx1IAWGJi6WPQ,1397
 hestia_earth/utils/pivot/pivot_csv.py,sha256=7f6kMqeb1b3RKANLGeDgVu8G5WC-vXIijHnsJhO-CjI,12022
@@ -26,9 +25,9 @@ hestia_earth/utils/storage/_azure_client.py,sha256=sevCZni04eknMql2DgUsWG23f7u0K
 hestia_earth/utils/storage/_local_client.py,sha256=IbzziUKY0QS3ybHFfgEpELqvafa7hQnZ-DdGdjQuypE,515
 hestia_earth/utils/storage/_s3_client.py,sha256=B2yTsf-VfHcRLCKTMes4S_nCXxrZad9umyZx3b5Pu_c,3181
 hestia_earth/utils/storage/_sns_client.py,sha256=LowUatj78Egu6_Id6Rr7hZjfZx1WguS3lozB3yAwSps,347
-hestia_earth_utils-0.16.5.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
-hestia_earth_utils-0.16.5.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
-hestia_earth_utils-0.16.5.dist-info/METADATA,sha256=PAHew6LMon2UUdlJlGUSE7plh0VzfcBXXsV3SPzLL5A,2030
-hestia_earth_utils-0.16.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-hestia_earth_utils-0.16.5.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
-hestia_earth_utils-0.16.5.dist-info/RECORD,,
+hestia_earth_utils-0.16.7.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
+hestia_earth_utils-0.16.7.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
+hestia_earth_utils-0.16.7.dist-info/METADATA,sha256=o6sR5_7DeeXBLuKWYMFmg0CWRg3O-Cynh6NVZkI1mC0,1869
+hestia_earth_utils-0.16.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hestia_earth_utils-0.16.7.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
+hestia_earth_utils-0.16.7.dist-info/RECORD,,

hestia_earth/utils/csv_utils.py DELETED Viewed

@@ -1,84 +0,0 @@
-import io
-import csv
-import re
-import numpy as np
-_MISSING_VALUE = '-'
-_MISSING = -99999
-_DELIMITER = ','
-_QUOTE_CHAR = '"'
-ENCODING = 'ISO-8859-1'
-# default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
-_DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
-def is_missing_value(value): return value == _MISSING_VALUE or value == _MISSING or value == str(_MISSING)
-def _replace_missing_values(value: str): return str(_MISSING) if str(value) == _MISSING_VALUE else value
-def _replace_chars(value: str): return re.sub(f'[{re.escape(_DELETE_CHARS)}]', '', value.replace(' ', '_'))
-def _text_to_csv(csv_content: str):
-    return csv.reader(io.StringIO(csv_content.strip()), delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
-def _get_columns(csv_content: str):
-    try:
-        reader = _text_to_csv(csv_content)
-        names = next(reader)
-        return list(map(_replace_chars, names))
-    except StopIteration:
-        return []
-def _get_rows(csv_content: str):
-    string_io = io.StringIO(csv_content.strip())
-    try:
-        next(string_io)
-    except StopIteration:
-        return
-    return csv.reader(string_io, delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
-def _csv_str_to_recarray_chunks_numpy(csv_content: str, chunk_size: int = 5):
-    names = _get_columns(csv_content)
-    num_cols = len(names)
-    max_size = 1000
-    dtype = [(name, f"U{max_size}") for name in names]
-    reader = _get_rows(csv_content)
-    # 4. Process the file in batches
-    chunk_rows = []
-    for row in reader:
-        if not row:
-            continue
-        if len(row) != num_cols:
-            continue
-        # replace missing values
-        processed_row = tuple(_replace_missing_values(field) for field in row)
-        chunk_rows.append(processed_row)
-        if len(chunk_rows) >= chunk_size:
-            yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
-            chunk_rows = []
-    if chunk_rows:
-        yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
-def csv_str_to_recarray(csv_content: str) -> np.recarray:
-    array_rows = list(_csv_str_to_recarray_chunks_numpy(csv_content))
-    return np.hstack(array_rows).view(np.recarray)
-def csv_file_to_recarray(filepath: str):
-    with open(filepath, 'r', encoding=ENCODING) as f:
-        content = f.read()
-    return csv_str_to_recarray(content)

{hestia_earth_utils-0.16.5.data → hestia_earth_utils-0.16.7.data}/scripts/hestia-format-upload RENAMED Viewed

File without changes

{hestia_earth_utils-0.16.5.data → hestia_earth_utils-0.16.7.data}/scripts/hestia-pivot-csv RENAMED Viewed

File without changes

{hestia_earth_utils-0.16.5.dist-info → hestia_earth_utils-0.16.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{hestia_earth_utils-0.16.5.dist-info → hestia_earth_utils-0.16.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

hestia-earth-utils 0.16.5__py3-none-any.whl → 0.16.7__py3-none-any.whl

hestia-earth-utils 0.16.5__py3-none-any.whl → 0.16.7__py3-none-any.whl