hestia-earth-aggregation 0.21.2__tar.gz → 0.21.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {hestia_earth_aggregation-0.21.2/hestia_earth_aggregation.egg-info → hestia_earth_aggregation-0.21.4}/PKG-INFO +1 -1
  2. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/__init__.py +22 -13
  3. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/aggregate_cycles.py +102 -50
  4. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/log.py +20 -14
  5. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/recalculate_cycles.py +1 -1
  6. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/__init__.py +226 -0
  7. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/aggregate_country_nodes.py +784 -0
  8. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/aggregate_weighted.py +197 -0
  9. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/blank_node.py +519 -0
  10. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/utils/combine.py +46 -23
  11. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/utils/completeness.py +62 -32
  12. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/utils/covariance.py +68 -40
  13. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/cycle.py +474 -0
  14. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/distribution.py +208 -0
  15. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/emission.py +70 -0
  16. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/group.py +160 -0
  17. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/input.py +28 -0
  18. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/lookup.py +64 -0
  19. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/management.py +78 -0
  20. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/measurement.py +38 -0
  21. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/practice.py +58 -0
  22. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/product.py +24 -0
  23. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/utils/property.py +13 -9
  24. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/quality_score.py +265 -0
  25. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/queries.py +503 -0
  26. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/site.py +100 -0
  27. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/source.py +22 -0
  28. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/term.py +95 -0
  29. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/utils/weights.py +184 -0
  30. hestia_earth_aggregation-0.21.4/hestia_earth/aggregation/version.py +1 -0
  31. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4/hestia_earth_aggregation.egg-info}/PKG-INFO +1 -1
  32. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/setup.py +4 -4
  33. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/tests/test_aggregation.py +1 -1
  34. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/__init__.py +0 -177
  35. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/aggregate_country_nodes.py +0 -599
  36. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/aggregate_weighted.py +0 -133
  37. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/blank_node.py +0 -401
  38. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/cycle.py +0 -327
  39. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/distribution.py +0 -148
  40. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/emission.py +0 -55
  41. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/group.py +0 -107
  42. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/input.py +0 -25
  43. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/lookup.py +0 -46
  44. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/management.py +0 -61
  45. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/measurement.py +0 -32
  46. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/practice.py +0 -56
  47. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/product.py +0 -22
  48. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/quality_score.py +0 -199
  49. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/queries.py +0 -427
  50. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/site.py +0 -82
  51. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/source.py +0 -16
  52. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/term.py +0 -75
  53. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/utils/weights.py +0 -140
  54. hestia_earth_aggregation-0.21.2/hestia_earth/aggregation/version.py +0 -1
  55. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/LICENSE +0 -0
  56. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/MANIFEST.in +0 -0
  57. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/README.md +0 -0
  58. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth/aggregation/config/Cycle/processedFood.json +0 -0
  59. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth_aggregation.egg-info/SOURCES.txt +0 -0
  60. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth_aggregation.egg-info/dependency_links.txt +0 -0
  61. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth_aggregation.egg-info/requires.txt +0 -0
  62. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/hestia_earth_aggregation.egg-info/top_level.txt +0 -0
  63. {hestia_earth_aggregation-0.21.2 → hestia_earth_aggregation-0.21.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hestia_earth_aggregation
-Version: 0.21.2
+Version: 0.21.4
 Summary: HESTIA's aggregation engine.
 Home-page: https://gitlab.com/hestia-earth/hestia-aggregation-engine
 Author: HESTIA Team
@@ -9,8 +9,10 @@ from .utils.quality_score import calculate_score
 
 def _mock_nb_distribution(include_distribution: bool):
     original_func = distribution._nb_iterations
-    distribution._nb_iterations = lambda *args: original_func(*args) if include_distribution else 0
-    not include_distribution and logger.warning('Not generating distribution.')
+    distribution._nb_iterations = lambda *args: (
+        original_func(*args) if include_distribution else 0
+    )
+    not include_distribution and logger.warning("Not generating distribution.")
 
 
 def aggregate(
@@ -20,7 +22,7 @@ def aggregate(
     end_year: int,
     source: dict = None,
     include_distribution: bool = True,
-    filter_by_country: bool = True
+    filter_by_country: bool = True,
 ):
     """
     Aggregates data from HESTIA.
@@ -53,16 +55,23 @@ def aggregate(
     _mock_nb_distribution(include_distribution)
 
     now = current_time_ms()
-    logger.info('Aggregating %s in %s for period %s to %s' + (' with distribution' if include_distribution else ''),
-                product.get('name'),
-                country.get('name'),
-                start_year,
-                end_year)
-    aggregations, countries = run_aggregate(country, product, source, start_year, end_year, filter_by_country)
-    logger.info('time=%s, unit=ms', current_time_ms() - now)
-    aggregations = [
-        recalculate(agg, product) for agg in aggregations
-    ] if should_recalculate(product) else aggregations
+    logger.info(
+        "Aggregating %s in %s for period %s to %s"
+        + (" with distribution" if include_distribution else ""),
+        product.get("name"),
+        country.get("name"),
+        start_year,
+        end_year,
+    )
+    aggregations, countries = run_aggregate(
+        country, product, source, start_year, end_year, filter_by_country
+    )
+    logger.info("time=%s, unit=ms", current_time_ms() - now)
+    aggregations = (
+        [recalculate(agg, product) for agg in aggregations]
+        if should_recalculate(product)
+        else aggregations
+    )
     aggregations = [
        calculate_score(cycle=agg, countries=countries) for agg in aggregations
     ]
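For orientation, a hypothetical call to the reworked `aggregate()` entry point could look like the sketch below. The leading `country`/`product` keyword names and the return value are inferred from the function body shown in this hunk (the full signature is not part of the diff), and the term dicts are illustrative placeholders rather than real HESTIA glossary nodes.

from hestia_earth.aggregation import aggregate

# Illustrative Term-like nodes; real inputs come from the HESTIA glossary.
country = {"@type": "Term", "@id": "GADM-FRA", "name": "France"}
product = {"@type": "Term", "@id": "wheatGrain", "name": "Wheat, grain"}

# Return value assumed to be the list of aggregated cycles.
aggregations = aggregate(
    country=country,
    product=product,
    start_year=2010,
    end_year=2019,
    source=None,
    include_distribution=False,  # disables the distribution step via _mock_nb_distribution
    filter_by_country=True,
)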
@@ -3,23 +3,36 @@ from hestia_earth.utils.tools import non_empty_list
 
 from hestia_earth.aggregation.log import logger, log_memory_usage
 from hestia_earth.aggregation.utils import CYCLE_AGGREGATION_KEYS, SITE_AGGREGATION_KEYS
-from hestia_earth.aggregation.utils.queries import find_global_nodes, find_country_nodes, download_site
+from hestia_earth.aggregation.utils.queries import (
+    find_global_nodes,
+    find_country_nodes,
+    download_site,
+)
 from hestia_earth.aggregation.utils.term import _is_global
 from hestia_earth.aggregation.utils.group import group_blank_nodes
 from hestia_earth.aggregation.utils.blank_node import cleanup_node_blank_nodes
-from hestia_earth.aggregation.utils.aggregate_weighted import aggregate as aggregate_weighted
+from hestia_earth.aggregation.utils.aggregate_weighted import (
+    aggregate as aggregate_weighted,
+)
 from hestia_earth.aggregation.utils.aggregate_country_nodes import aggregate_cycles
 from hestia_earth.aggregation.utils.weights import (
-    country_weights, country_weight_node_id, world_weights, world_weight_node_id
+    country_weights,
+    country_weight_node_id,
+    world_weights,
+    world_weight_node_id,
 )
 from hestia_earth.aggregation.utils.site import format_site
 from hestia_earth.aggregation.utils.cycle import (
-    aggregate_with_matrix, format_for_grouping, format_terms_results, format_country_results, update_cycle
+    aggregate_with_matrix,
+    format_for_grouping,
+    format_terms_results,
+    format_country_results,
+    update_cycle,
 )
 from hestia_earth.aggregation.utils.covariance import (
     init_covariance_files,
     remove_covariance_files,
-    generate_covariance_country
+    generate_covariance_country,
 )
 
 
@@ -31,37 +44,44 @@ def _aggregate_country(
     start_year: int,
     end_year: int,
     generate_weights_func=None,
-    missing_weights_node_id_func=None
+    missing_weights_node_id_func=None,
 ) -> Tuple[dict, dict]:
-    functional_unit = cycles[0].get('functionalUnit')
-    site_type = cycles[0].get('site', {}).get('siteType')
+    functional_unit = cycles[0].get("functionalUnit")
+    site_type = cycles[0].get("site", {}).get("siteType")
 
     # aggregate cycles with weights
     cycles_formatted = format_for_grouping(cycles)
     cycle_data = group_blank_nodes(
-        cycles_formatted, CYCLE_AGGREGATION_KEYS, start_year, end_year, product=product, site_type=site_type
+        cycles_formatted,
+        CYCLE_AGGREGATION_KEYS,
+        start_year,
+        end_year,
+        product=product,
+        site_type=site_type,
     )
     weights = generate_weights_func(cycle_data)
     cycle_data = cycle_data | aggregate_weighted(
         aggregate_keys=CYCLE_AGGREGATION_KEYS,
         data=cycle_data,
         weights=weights,
-        missing_weights_node_id_func=missing_weights_node_id_func
+        missing_weights_node_id_func=missing_weights_node_id_func,
     )
 
     # aggregate sites with weights
-    sites = [c.get('site') for c in cycles]
+    sites = [c.get("site") for c in cycles]
     site_data = group_blank_nodes(sites, SITE_AGGREGATION_KEYS)
     site_data = aggregate_weighted(
         aggregate_keys=SITE_AGGREGATION_KEYS,
         data=site_data,
         weights=weights,
-        missing_weights_node_id_func=missing_weights_node_id_func
+        missing_weights_node_id_func=missing_weights_node_id_func,
     )
     aggregated_site = format_site(site_data, sites)
 
     cycle_data = format_country_results(cycle_data, product, country, aggregated_site)
-    aggregated_cycle = update_cycle(country, start_year, end_year, source, functional_unit, False)(cycle_data)
+    aggregated_cycle = update_cycle(
+        country, start_year, end_year, source, functional_unit, False
+    )(cycle_data)
     return (aggregated_cycle, weights)
 
 
@@ -71,7 +91,7 @@ def aggregate_country(
     source: dict,
     start_year: int,
     end_year: int,
-    filter_by_country: bool = True
+    filter_by_country: bool = True,
 ) -> Tuple[list, list]:
     """
     Create 1 to many country-level aggregations.
@@ -100,52 +120,70 @@ def aggregate_country(
     """
     init_covariance_files()
 
-    cycles = find_country_nodes(product, start_year, end_year, country if filter_by_country else None)
+    cycles = find_country_nodes(
+        product, start_year, end_year, country if filter_by_country else None
+    )
     if not cycles:
-        logger.info('1 - No cycles to run aggregation.')
+        logger.info("1 - No cycles to run aggregation.")
         return ([], [])
 
     # combine cycles into a "master" cycle with multiple values
     cycles_aggregated = aggregate_cycles(
-        cycles=cycles,
-        product=product,
-        start_year=start_year,
-        end_year=end_year
+        cycles=cycles, product=product, start_year=start_year, end_year=end_year
     )
     if not cycles_aggregated:
-        logger.info('2 - No aggregated cycles.')
+        logger.info("2 - No aggregated cycles.")
         return ([], [])
 
-    logger.info('Cycles aggregated, generating final country aggregation...')
+    logger.info("Cycles aggregated, generating final country aggregation...")
     log_memory_usage()
 
-    functional_unit = cycles_aggregated[0].get('functionalUnit')
+    functional_unit = cycles_aggregated[0].get("functionalUnit")
     include_matrix = aggregate_with_matrix(product)
-    cycles_aggregated = non_empty_list([
-        format_terms_results(cycle, product, country) for cycle in cycles_aggregated
-    ])
-    cycles_aggregated = non_empty_list(map(
-        update_cycle(country, start_year, end_year, source, functional_unit, include_matrix),
-        cycles_aggregated
-    ))
+    cycles_aggregated = non_empty_list(
+        [format_terms_results(cycle, product, country) for cycle in cycles_aggregated]
+    )
+    cycles_aggregated = non_empty_list(
+        map(
+            update_cycle(
+                country, start_year, end_year, source, functional_unit, include_matrix
+            ),
+            cycles_aggregated,
+        )
+    )
     logger.info(f"Found {len(cycles_aggregated)} cycles at sub-country level")
     if len(cycles_aggregated) == 0:
-        logger.info('3 - No cycles to run aggregation.')
+        logger.info("3 - No cycles to run aggregation.")
         return []
 
     # step 2: use aggregated cycles to calculate country-level cycles
-    country_cycle, weights = _aggregate_country(
-        country, product, cycles_aggregated, source, start_year, end_year,
-        generate_weights_func=country_weights,
-        missing_weights_node_id_func=country_weight_node_id
-    ) if all([
-        cycles_aggregated,
-        # when not including matrix, cycles and country_cycles will be the same
-        include_matrix
-    ]) else (None, {})
+    country_cycle, weights = (
+        _aggregate_country(
+            country,
+            product,
+            cycles_aggregated,
+            source,
+            start_year,
+            end_year,
+            generate_weights_func=country_weights,
+            missing_weights_node_id_func=country_weight_node_id,
+        )
+        if all(
+            [
+                cycles_aggregated,
+                # when not including matrix, cycles and country_cycles will be the same
+                include_matrix,
+            ]
+        )
+        else (None, {})
+    )
     log_memory_usage()
 
-    country_cycle = (country_cycle | generate_covariance_country(weights=weights)) if country_cycle else None
+    country_cycle = (
+        (country_cycle | generate_covariance_country(weights=weights))
+        if country_cycle
+        else None
+    )
 
     log_memory_usage()
 
@@ -163,7 +201,7 @@ def aggregate_global(
     start_year: int,
     end_year: int,
     *args,
-    **kwargs
+    **kwargs,
 ) -> Tuple[list, list]:
     """
     Aggregate World and other regions level 0 (like `region-easter-europe`).
@@ -188,14 +226,28 @@ def aggregate_global(
         The list of countries that were used to aggregate.
     """
     cycles = find_global_nodes(product, start_year, end_year, country)
-    cycles = [cycle | {'site': download_site(cycle.get('site'), data_state='original')} for cycle in cycles]
-    countries = non_empty_list([cycle.get('site', {}).get('country') for cycle in cycles])
-
-    aggregated_cycle, *args = _aggregate_country(
-        country, product, cycles, source, start_year, end_year,
-        generate_weights_func=world_weights,
-        missing_weights_node_id_func=world_weight_node_id
-    ) if cycles else (None, {})
+    cycles = [
+        cycle | {"site": download_site(cycle.get("site"), data_state="original")}
+        for cycle in cycles
+    ]
+    countries = non_empty_list(
+        [cycle.get("site", {}).get("country") for cycle in cycles]
+    )
+
+    aggregated_cycle, *args = (
+        _aggregate_country(
+            country,
+            product,
+            cycles,
+            source,
+            start_year,
+            end_year,
+            generate_weights_func=world_weights,
+            missing_weights_node_id_func=world_weight_node_id,
+        )
+        if cycles
+        else (None, {})
+    )
     return (non_empty_list([cleanup_node_blank_nodes(aggregated_cycle)]), countries)
 
 
@@ -4,14 +4,14 @@ import platform
 import resource
 import logging
 
-LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
 
 # disable root logger
 root_logger = logging.getLogger()
 root_logger.disabled = True
 
 # create custom logger
-logger = logging.getLogger('hestia_earth.aggregation')
+logger = logging.getLogger("hestia_earth.aggregation")
 logger.removeHandler(sys.stdout)
 logger.setLevel(logging.getLevelName(LOG_LEVEL))
 
@@ -29,28 +29,28 @@ def log_to_file(filepath: str):
     formatter = logging.Formatter(
         '{"timestamp": "%(asctime)s", "level": "%(levelname)s", "logger": "%(name)s", '
         '"filename": "%(filename)s", "message": "%(message)s"}',
-        '%Y-%m-%dT%H:%M:%S%z')
-    handler = logging.FileHandler(filepath, encoding='utf-8')
+        "%Y-%m-%dT%H:%M:%S%z",
+    )
+    handler = logging.FileHandler(filepath, encoding="utf-8")
     handler.setFormatter(formatter)
     handler.setLevel(logging.getLevelName(LOG_LEVEL))
     logger.addHandler(handler)
 
 
-LOG_FILENAME = os.getenv('LOG_FILENAME')
+LOG_FILENAME = os.getenv("LOG_FILENAME")
 if LOG_FILENAME is not None:
     log_to_file(LOG_FILENAME)
 
 
-def _join_args(**kwargs): return ', '.join([f"{key}={value}" for key, value in kwargs.items()])
+def _join_args(**kwargs):
+    return ", ".join([f"{key}={value}" for key, value in kwargs.items()])
 
 
 def log_memory_usage(**kwargs):
-    factor = 1024 * (
-        1024 if platform.system() in ['Darwin', 'Windows'] else 1
-    )
+    factor = 1024 * (1024 if platform.system() in ["Darwin", "Windows"] else 1)
     value = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / factor
-    extra = (', ' + _join_args(**kwargs)) if len(kwargs.keys()) > 0 else ''
-    logger.info('memory_used=%s, unit=MB' + extra, value)
+    extra = (", " + _join_args(**kwargs)) if len(kwargs.keys()) > 0 else ""
+    logger.info("memory_used=%s, unit=MB" + extra, value)
 
 
 def debugRequirements(**kwargs):
@@ -67,7 +67,13 @@ def _sum_values(values: list):
 
 
 def debugWeights(weights: dict):
-    total_weight = _sum_values(v.get('weight') for v in weights.values()) or 100
+    total_weight = _sum_values(v.get("weight") for v in weights.values()) or 100
     for id, weight in weights.items():
-        value = weight.get('weight')
-        logger.debug('id=%s, weight=%s, ratio=%s/%s', id, value * 100 / total_weight, value, total_weight)
+        value = weight.get("weight")
+        logger.debug(
+            "id=%s, weight=%s, ratio=%s/%s",
+            id,
+            value * 100 / total_weight,
+            value,
+            total_weight,
+        )
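The `log_memory_usage` change above keeps the same unit conversion: `ru_maxrss` is reported in kilobytes on Linux but in bytes on macOS, so the divisor differs by platform. A minimal standalone sketch of that conversion (the helper name is made up and not part of the package; `resource` is POSIX-only):

import platform
import resource  # POSIX-only; unavailable on Windows

def peak_memory_mb() -> float:
    # hypothetical helper reproducing the divisor logic from log_memory_usage
    factor = 1024 * (1024 if platform.system() == "Darwin" else 1)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / factor

print(f"memory_used={peak_memory_mb():.1f}, unit=MB")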
@@ -3,7 +3,7 @@ import json
 from hestia_earth.orchestrator import run
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-CONFIG_PATH = os.path.join(CURRENT_DIR, 'config', 'Cycle')
+CONFIG_PATH = os.path.join(CURRENT_DIR, "config", "Cycle")
 
 
 def should_recalculate(product: dict):
@@ -0,0 +1,226 @@
+import os
+import json
+from decimal import Decimal
+from statistics import stdev, mean
+from hestia_earth.utils.model import linked_node
+from hestia_earth.utils.tools import non_empty_list, flatten, safe_parse_date
+
+from ..version import VERSION
+
+MIN_NB_OBSERVATIONS = 20
+CYCLE_AGGREGATION_KEYS = ["inputs", "practices", "products", "emissions"]
+SITE_AGGREGATION_KEYS = ["measurements", "management"]
+
+
+class HestiaError(Exception):
+    def __init__(self, message: str, data: dict = {}):
+        super().__init__(message)
+        self.error = {"message": message} | data
+
+    def __str__(self):
+        return f"Error downloading nodes: {json.dumps(self.error or {})}"
+
+
+def create_folders(filepath: str):
+    return os.makedirs(os.path.dirname(filepath), exist_ok=True)
+
+
+def pick(value: dict, keys: list):
+    return {k: value.get(k) for k in keys if k in value}
+
+
+def is_empty(value):
+    return value is None or (
+        value in [None, "", "-"]
+        if isinstance(value, str)
+        else (
+            len(value) == 0
+            if isinstance(value, list)
+            else len(value.keys()) == 0 if isinstance(value, dict) else False
+        )
+    )
+
+
+def remove_empty_fields(value: dict):
+    return {key: value for key, value in value.items() if not is_empty(value)}
+
+
+def _save_json(data: dict, filename: str):
+    should_run = os.getenv("DEBUG", "false") == "true"
+    if not should_run:
+        return
+    dir = os.getenv("TMP_DIR", "/tmp")
+    filepath = f"{dir}/{filename}.jsonld"
+    create_folders(filepath)
+    with open(filepath, "w") as f:
+        return json.dump(data, f, indent=2)
+
+
+def sum_data(nodes: list, key: str):
+    return sum([node.get(key, 1) for node in nodes])
+
+
+def format_aggregated_list(node_type: str, values: list):
+    nodes = non_empty_list(
+        flatten(
+            [
+                {"@id": v} if isinstance(v, str) else v.get(f"aggregated{node_type}s")
+                for v in non_empty_list(values)
+            ]
+        )
+    )
+    # build sorted list of ids
+    ids = sorted(list(set(map(lambda x: x["@id"], nodes))))
+    nodes = [{"@type": node_type, "@id": v} for v in ids]
+    return list(map(linked_node, nodes))
+
+
+def match_dates(blank_node: dict, start_year: int, end_year: int):
+    dates = blank_node.get("dates", [])
+    start_date = safe_parse_date(blank_node.get("startDate"), default=None)
+    end_date = safe_parse_date(blank_node.get("endDate"), default=None)
+    return all(
+        [
+            not dates
+            or any(
+                [
+                    int(start_year) <= safe_parse_date(date).year <= int(end_year)
+                    for date in dates
+                    if safe_parse_date(date, default=None)
+                ]
+            ),
+            not start_date
+            or not end_date
+            or any(
+                [
+                    int(start_year) <= start_date.year <= int(end_year),
+                    int(start_year) <= end_date.year <= int(end_year),
+                ]
+            ),
+        ]
+    )
+
+
+def _aggregated_node(node: dict):
+    return node | {"aggregated": True, "aggregatedVersion": VERSION}
+
+
+def _aggregated_version(node: dict):
+    keys = list(node.keys())
+    keys.remove("@type") if "@type" in keys else None
+    node["aggregated"] = node.get("aggregated", [])
+    node["aggregatedVersion"] = node.get("aggregatedVersion", [])
+    for key in keys:
+        if node.get(key) is None:
+            continue
+        if key in node["aggregated"]:
+            node.get("aggregatedVersion")[node["aggregated"].index(key)] = VERSION
+        else:
+            node["aggregated"].append(key)
+            node["aggregatedVersion"].append(VERSION)
+    return node
+
+
+def _min(values, observations: int = 0, min_observations: int = MIN_NB_OBSERVATIONS):
+    has_boolean = any([isinstance(v, bool) for v in values])
+    return (
+        None
+        if has_boolean
+        else min(values) if (observations or len(values)) >= min_observations else None
+    )
+
+
+def _max(values, observations: int = 0, min_observations: int = MIN_NB_OBSERVATIONS):
+    has_boolean = any([isinstance(v, bool) for v in values])
+    return (
+        None
+        if has_boolean
+        else max(values) if (observations or len(values)) >= min_observations else None
+    )
+
+
+def _sd(values):
+    return stdev(values) if len(values) >= 2 else None
+
+
+def _all_boolean(values: list):
+    return all([isinstance(v, bool) for v in values])
+
+
+def _numeric_weighted_average(values: list):
+    total_weight = (
+        sum(Decimal(str(weight)) for _v, weight in values) if values else Decimal(0)
+    )
+    weighted_values = [
+        Decimal(str(value)) * Decimal(str(weight)) for value, weight in values
+    ]
+    average = (
+        sum(weighted_values) / (total_weight if total_weight else 1)
+        if weighted_values
+        else None
+    )
+    return None if average is None else float(average)
+
+
+def _bool_weighted_average(values: list):
+    return mean(map(int, values)) >= 0.5
+
+
+def weighted_average(weighted_values: list):
+    values = [v for v, _w in weighted_values]
+    all_boolean = _all_boolean(values)
+    return (
+        None
+        if not values
+        else (
+            _bool_weighted_average(values)
+            if all_boolean
+            else _numeric_weighted_average(weighted_values)
+        )
+    )
+
+
+def _unique_nodes(nodes: list):
+    return sorted(
+        list({n.get("@id"): n for n in nodes}.values()), key=lambda n: n.get("@id")
+    )
+
+
+def _set_dict_single(data: dict, key: str, value, strict=False):
+    if data is not None and value is not None and (not strict or not is_empty(value)):
+        data[key] = value
+    return data
+
+
+def _set_dict_array(data: dict, key: str, value, strict=False):
+    if data is not None and value is not None and (not strict or value != 0):
+        data[key] = [value]
+    return data
+
+
+def format_evs(value: float):
+    return min([100, round(value, 2)]) if value else value
+
+
+def value_difference(value: float, expected_value: float):
+    """
+    Get the difference in percentage between a value and the expected value.
+
+    Parameters
+    ----------
+    value : float
+        The value to check.
+    expected_value : float
+        The expected value.
+
+    Returns
+    -------
+    bool
+        The difference in percentage between the value and the expected value.
+    """
+    return (
+        0
+        if (isinstance(expected_value, list) and len(expected_value) == 0)
+        or expected_value == 0
+        else (round(abs(value - expected_value) / expected_value, 4))
+    )
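As a quick, illustrative exercise of a few of the pure helpers added in the new utils/__init__.py above (values are made up; the module is internal to the package, so the import path may change between releases):

from hestia_earth.aggregation.utils import (
    pick,
    is_empty,
    weighted_average,
    value_difference,
    format_evs,
)

node = {"@id": "n1", "value": [10], "description": ""}
print(pick(node, ["@id", "value"]))        # {'@id': 'n1', 'value': [10]}
print(is_empty(node["description"]))       # True: empty strings count as empty

# weighted_average takes (value, weight) pairs; an all-boolean list
# falls back to a simple majority vote (weights ignored).
print(weighted_average([(10, 1), (20, 3)]))        # 17.5
print(weighted_average([(True, 1), (False, 1)]))   # True (mean of ints >= 0.5)

print(value_difference(110, 100))          # 0.1 -> 10% relative difference
print(format_evs(123.456))                 # 100: result is capped at 100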