hestia-earth-utils 0.16.12__py3-none-any.whl → 0.16.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,15 @@
1
+ from calendar import monthrange
2
+ from datetime import datetime
3
+ from dateutil.relativedelta import relativedelta
4
+ from enum import Enum
5
+ from math import floor
6
+ from typing import Any, Callable, Literal, Optional, Union
7
+
1
8
  from dateutil.parser import parse
2
9
  import re
3
10
 
11
+ from .tools import is_list_like, safe_parse_date
12
+
4
13
  SECOND = 1
5
14
  MINUTE = 60 * SECOND
6
15
  HOUR = 60 * MINUTE
@@ -12,6 +21,8 @@ def diff_in_days(from_date: str, to_date: str) -> float:
12
21
  """
13
22
  Return the difference in days between two dates.
14
23
 
24
+ Deprecated, use `diff_in` function with `unit = TimeUnit.DAY` instead.
25
+
15
26
  Parameters
16
27
  ----------
17
28
  from_date : str
@@ -32,6 +43,8 @@ def diff_in_years(from_date: str, to_date: str) -> float:
32
43
  """
33
44
  Return the difference in years between two dates.
34
45
 
46
+ Deprecated, use `diff_in` function with `unit = TimeUnit.YEAR` instead.
47
+
35
48
  Parameters
36
49
  ----------
37
50
  from_date : str
@@ -51,6 +64,8 @@ def is_in_days(date: str) -> bool:
51
64
  """
52
65
  Check if the date as a string contains year, month and day.
53
66
 
67
+ Deprecated, use `validate_datestr_format` with `valid_format = DatestrFormat.YEAR_MONTH_DAY` instead.
68
+
54
69
  Parameters
55
70
  ----------
56
71
  date : str
@@ -71,6 +86,8 @@ def is_in_months(date: str) -> bool:
71
86
  """
72
87
  Check if the date as a string contains year, month but no day.
73
88
 
89
+ Deprecated, use `validate_datestr_format` with `valid_format = DatestrFormat.YEAR_MONTH` instead.
90
+
74
91
  Parameters
75
92
  ----------
76
93
  date : str
@@ -84,3 +101,425 @@ def is_in_months(date: str) -> bool:
84
101
  return (
85
102
  date is not None and re.compile(r"^[\d]{4}\-[\d]{2}$").match(date) is not None
86
103
  )
104
+
105
+
106
# Year-only datestr constant.
# NOTE(review): no usage is visible in this chunk; presumably a lower bound for dates - confirm against callers.
OLDEST_DATE = "1800"
107
+
108
+
109
class TimeUnit(Enum):
    """
    Enum of the time units understood by the duration conversion and date diff helpers.

    The member values are the lowercase unit names used as keys of `DATETIME_UNIT_CONVERSIONS`.
    """

    YEAR = "year"
    MONTH = "month"
    DAY = "day"
    HOUR = "hour"
    MINUTE = "minute"
    SECOND = "second"
116
+
117
+
118
class DatestrFormat(Enum):
    """
    Enum representing ISO date formats permitted by HESTIA.

    Each member value is the `datetime.strptime` format string for that format.

    See: https://en.wikipedia.org/wiki/ISO_8601
    """

    YEAR = r"%Y"  # e.g. "2001"
    YEAR_MONTH = r"%Y-%m"  # e.g. "2001-01"
    YEAR_MONTH_DAY = r"%Y-%m-%d"  # e.g. "2001-01-01"
    YEAR_MONTH_DAY_HOUR_MINUTE_SECOND = r"%Y-%m-%dT%H:%M:%S"  # e.g. "2001-01-01T00:00:00"
    MONTH = r"--%m"  # year-less month, e.g. "--01"
    MONTH_DAY = r"--%m-%d"  # year-less month and day, e.g. "--01-01"
131
+
132
+
133
# Names of the strategies available for completing a partial datestr; these are the keys of
# `DATESTR_GAPFILL_MODE_TO_GAPFILL_FUNCTION`.
DatestrGapfillMode = Literal["start", "middle", "end"]


# Exact number of characters of a datestr in each format. `_check_datestr_format` compares the length of a
# candidate against this value in addition to parsing it with `datetime.strptime`.
DATESTR_FORMAT_TO_EXPECTED_LENGTH = {
    DatestrFormat.YEAR: len("2001"),
    DatestrFormat.YEAR_MONTH: len("2001-01"),
    DatestrFormat.YEAR_MONTH_DAY: len("2001-01-01"),
    DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND: len("2001-01-01T00:00:00"),
    DatestrFormat.MONTH: len("--01"),
    DatestrFormat.MONTH_DAY: len("--01-01"),
}


# HOUR, MINUTE and SECOND all map to the full timestamp format.
# NOTE: "DatetimeUnit" in the string below refers to the `TimeUnit` enum (the keys of this dict).
TIME_UNIT_TO_DATESTR_FORMAT = {
    TimeUnit.YEAR: DatestrFormat.YEAR,
    TimeUnit.MONTH: DatestrFormat.YEAR_MONTH,
    TimeUnit.DAY: DatestrFormat.YEAR_MONTH_DAY,
    TimeUnit.HOUR: DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND,
    TimeUnit.MINUTE: DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND,
    TimeUnit.SECOND: DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND,
}
"""
Minimum Datestr format required to express DatetimeUnit.
"""
157
+
158
+
159
_SECONDS_IN_MINUTE = 60  # 60 seconds in a minute
_MINUTES_IN_HOUR = 60  # 60 minutes in an hour
_HOURS_IN_DAY = 24  # 24 hours in a day
_MONTHS_IN_YEAR = 12  # 12 months in a year

# NOTE(review): `YEAR` is defined earlier in this module, outside the visible chunk; the comment below states
# that it is the average number of days in a year - confirm it is not expressed in seconds.
_DAYS_IN_YEAR = YEAR  # average days in a year (365.2425)
_DAYS_IN_MONTH = (
    _DAYS_IN_YEAR / _MONTHS_IN_YEAR
)  # average days in a month (365.2425/12)


# Multiplicative conversion factors, keyed by the `TimeUnit` *values* (strings), not by the enum members.
# There is no entry for converting a unit to itself; lookups for such a pair fall back to the caller's default.
DATETIME_UNIT_CONVERSIONS: dict[str, dict[str, float]] = {
    TimeUnit.YEAR.value: {
        TimeUnit.MONTH.value: _MONTHS_IN_YEAR,
        TimeUnit.DAY.value: _DAYS_IN_YEAR,
        TimeUnit.HOUR.value: _DAYS_IN_YEAR * _HOURS_IN_DAY,
        TimeUnit.MINUTE.value: _DAYS_IN_YEAR * _HOURS_IN_DAY * _MINUTES_IN_HOUR,
        TimeUnit.SECOND.value: _DAYS_IN_YEAR
        * _HOURS_IN_DAY
        * _MINUTES_IN_HOUR
        * _SECONDS_IN_MINUTE,
    },
    TimeUnit.MONTH.value: {
        TimeUnit.YEAR.value: 1 / _MONTHS_IN_YEAR,
        TimeUnit.DAY.value: _DAYS_IN_MONTH,
        TimeUnit.HOUR.value: _DAYS_IN_MONTH * _HOURS_IN_DAY,
        TimeUnit.MINUTE.value: _DAYS_IN_MONTH * _HOURS_IN_DAY * _MINUTES_IN_HOUR,
        TimeUnit.SECOND.value: _DAYS_IN_MONTH
        * _HOURS_IN_DAY
        * _MINUTES_IN_HOUR
        * _SECONDS_IN_MINUTE,
    },
    TimeUnit.DAY.value: {
        TimeUnit.YEAR.value: 1 / _DAYS_IN_YEAR,
        TimeUnit.MONTH.value: 1 / _DAYS_IN_MONTH,
        TimeUnit.HOUR.value: _HOURS_IN_DAY,
        TimeUnit.MINUTE.value: _HOURS_IN_DAY * _MINUTES_IN_HOUR,
        TimeUnit.SECOND.value: _HOURS_IN_DAY * _MINUTES_IN_HOUR * _SECONDS_IN_MINUTE,
    },
    TimeUnit.HOUR.value: {
        TimeUnit.YEAR.value: 1 / (_HOURS_IN_DAY * _DAYS_IN_YEAR),
        TimeUnit.MONTH.value: 1 / (_HOURS_IN_DAY * _DAYS_IN_MONTH),
        TimeUnit.DAY.value: 1 / (_HOURS_IN_DAY),
        TimeUnit.MINUTE.value: _MINUTES_IN_HOUR,
        TimeUnit.SECOND.value: _MINUTES_IN_HOUR * _SECONDS_IN_MINUTE,
    },
    TimeUnit.MINUTE.value: {
        TimeUnit.YEAR.value: 1 / (_MINUTES_IN_HOUR * _HOURS_IN_DAY * _DAYS_IN_YEAR),
        TimeUnit.MONTH.value: 1 / (_MINUTES_IN_HOUR * _HOURS_IN_DAY * _DAYS_IN_MONTH),
        TimeUnit.DAY.value: 1 / (_MINUTES_IN_HOUR * _HOURS_IN_DAY),
        TimeUnit.HOUR.value: 1 / _MINUTES_IN_HOUR,
        TimeUnit.SECOND.value: _SECONDS_IN_MINUTE,
    },
    TimeUnit.SECOND.value: {
        TimeUnit.YEAR.value: 1
        / (_SECONDS_IN_MINUTE * _MINUTES_IN_HOUR * _HOURS_IN_DAY * _DAYS_IN_YEAR),
        TimeUnit.MONTH.value: 1
        / (_SECONDS_IN_MINUTE * _MINUTES_IN_HOUR * _HOURS_IN_DAY * _DAYS_IN_MONTH),
        TimeUnit.DAY.value: 1 / (_SECONDS_IN_MINUTE * _MINUTES_IN_HOUR * _HOURS_IN_DAY),
        TimeUnit.HOUR.value: 1 / (_SECONDS_IN_MINUTE * _MINUTES_IN_HOUR),
        TimeUnit.MINUTE.value: 1 / _SECONDS_IN_MINUTE,
    },
}
"""
A dict of TimeUnit conversion factors with format:
```
{
    source (str): {
        dest (str): conversion_factor (float)
    }
}
```
"""
232
+
233
+
234
def _get_time_unit_conversion(
    src_unit: TimeUnit, dest_unit: TimeUnit, default_value: float = 1
):
    """
    Look up the factor that converts a quantity expressed in `src_unit` into `dest_unit`.

    Units may be given as `TimeUnit` members or as their string values. `default_value` is returned when the
    pair has no entry in `DATETIME_UNIT_CONVERSIONS` (which includes converting a unit to itself).
    """

    def _key(unit):
        # Strings are used as-is; anything else is expected to expose `.value`.
        return unit if isinstance(unit, str) else unit.value

    src_key = _key(src_unit)
    dest_key = _key(dest_unit)
    factors_from_src = DATETIME_UNIT_CONVERSIONS.get(src_key, {})
    return factors_from_src.get(dest_key, default_value)
240
+
241
+
242
def convert_duration(
    duration: float,
    src_unit: TimeUnit,
    dest_unit: TimeUnit,
    default_conversion_factor: float = 1,
):
    """
    Convert a duration from one time unit to another.

    The duration is multiplied by the factor found for the `(src_unit, dest_unit)` pair, or by
    `default_conversion_factor` when no factor is registered for that pair.
    """
    return duration * _get_time_unit_conversion(
        src_unit, dest_unit, default_conversion_factor
    )
252
+
253
+
254
def _check_datestr_format(datestr: str, format: DatestrFormat) -> bool:
    """
    Use `datetime.strptime` to determine if a datestr is in a particular ISO format.

    The datestr must have exactly the length registered for the format in
    `DATESTR_FORMAT_TO_EXPECTED_LENGTH` and must parse with the format's `strptime` pattern. Anything that is
    not a `str` is reported as not matching.
    """
    expected_length = DATESTR_FORMAT_TO_EXPECTED_LENGTH.get(format, 0)
    if not isinstance(datestr, str) or len(datestr) != expected_length:
        return False

    format_str = format.value
    if "%Y" not in format_str:
        # `strptime` assumes the year 1900 when none is given; 1900 is not a leap year, so the valid
        # datestr "--02-29" would be rejected. Parse year-less formats against a leap year instead.
        datestr = f"2000{datestr}"
        format_str = f"%Y{format_str}"

    try:
        datetime.strptime(datestr, format_str)
    except ValueError:
        return False
    return True
266
+
267
+
268
def _get_datestr_format(
    datestr: str, default: Optional[Any] = None
) -> Union[DatestrFormat, Any, None]:
    """
    Identify which of the ISO formats permitted by the HESTIA schema a datestr is written in.

    Formats are tried in `DatestrFormat` declaration order and the first match is returned. `default` is
    returned when no format matches. The input is converted with `str()` before being checked.
    """
    for date_format in DatestrFormat:
        if _check_datestr_format(str(datestr), date_format):
            return date_format
    return default
283
+
284
+
285
def validate_datestr_format(
    datestr: str,
    valid_format: Union[DatestrFormat, list[DatestrFormat]] = [
        DatestrFormat.YEAR,
        DatestrFormat.YEAR_MONTH,
        DatestrFormat.YEAR_MONTH_DAY,
    ],
):
    """
    Check whether a datestr is written in one of the accepted formats.

    `valid_format` may be a single `DatestrFormat` or a list-like of them; by default `YYYY`, `YYYY-MM` and
    `YYYY-MM-DD` are accepted. The default list is only read, never mutated.
    """
    if is_list_like(valid_format):
        accepted = valid_format
    else:
        accepted = [valid_format]
    detected = _get_datestr_format(datestr)
    return detected in accepted
296
+
297
+
298
+ def _gapfill_datestr_start(datestr: str, *_) -> str:
299
+ """
300
+ Gapfill an incomplete datestr with the earliest possible date and time.
301
+
302
+ Datestr will snap to the start of the year/month/day as appropriate.
303
+ """
304
+ return datestr + "YYYY-01-01T00:00:00"[len(datestr) :]
305
+
306
+
307
def _days_in_month(datestr: str) -> int:
    """
    Return the number of days in the month of the given datestr.

    When the datestr cannot be parsed, the minimum possible month length (28) is returned.
    """
    parsed = safe_parse_date(datestr)
    if not parsed:
        return 28
    # `monthrange` returns (weekday of first day, number of days in month).
    _, number_of_days = monthrange(parsed.year, parsed.month)
    return number_of_days
313
+
314
+
315
def _gapfill_datestr_end(datestr: str, format: DatestrFormat) -> str:
    """
    Complete a partial datestr with the latest date and time it can represent.

    A year snaps to 31 December, a year-month to the last day of that month, and the time to `23:59:59`.
    """
    if format == DatestrFormat.YEAR_MONTH:
        last_day = _days_in_month(datestr)
    else:
        # For a bare year the final month is December (31 days); for a full date the day part of the
        # template is never used.
        last_day = 31
    template = f"YYYY-12-{last_day}T23:59:59"
    return datestr + template[len(datestr) :]
324
+
325
+
326
def _gapfill_datestr_middle(datestr: str, format: DatestrFormat) -> str:
    """
    Complete a partial datestr with the instant halfway between its earliest and latest possible values.
    """
    full_format = DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND.value
    period_start = datetime.strptime(_gapfill_datestr_start(datestr), full_format)
    period_end = datetime.strptime(
        _gapfill_datestr_end(datestr, format=format), full_format
    )
    midpoint = period_start + (period_end - period_start) / 2
    # Formatting with second precision drops any sub-second part of the midpoint.
    return midpoint.strftime(full_format)
342
+
343
+
344
# Dispatch table from gap-fill mode to the function implementing it. Every function is called as
# `func(datestr, format)` by `gapfill_datestr`.
DATESTR_GAPFILL_MODE_TO_GAPFILL_FUNCTION: dict[DatestrGapfillMode, Callable] = {
    "start": _gapfill_datestr_start,
    "middle": _gapfill_datestr_middle,
    "end": _gapfill_datestr_end,
}

# Formats that `gapfill_datestr` completes; datestrs in any other format are returned without gap-filling.
_VALID_GAPFILL_DATE_FORMATS = {
    DatestrFormat.YEAR,
    DatestrFormat.YEAR_MONTH,
    DatestrFormat.YEAR_MONTH_DAY,
}
355
+
356
+
357
def gapfill_datestr(datestr: str, mode: DatestrGapfillMode = "start") -> str:
    """
    Gapfill incomplete datestrs and return them in the format `YYYY-MM-DDTHH:mm:ss`.

    `None` is returned unchanged. Datestrs that are not in the `YYYY`, `YYYY-MM` or `YYYY-MM-DD` format are
    returned as their `str()` representation without gap-filling.
    """
    if datestr is None:
        return None

    datestr_ = str(datestr)
    format_ = _get_datestr_format(datestr_)
    if format_ not in _VALID_GAPFILL_DATE_FORMATS:
        return datestr_

    gapfill = DATESTR_GAPFILL_MODE_TO_GAPFILL_FUNCTION[mode]
    return gapfill(datestr_, format_)
373
+
374
+
375
def convert_datestr(
    datestr: str,
    target_format: DatestrFormat,
    gapfill_mode: DatestrGapfillMode = "start",
) -> str:
    """
    Re-express a datestr in `target_format`.

    The datestr is gap-filled to a full timestamp using `gapfill_mode` and then formatted. Datestrs that are
    not in the `YYYY`, `YYYY-MM` or `YYYY-MM-DD` format are returned unchanged.
    """
    if not validate_datestr_format(datestr, _VALID_GAPFILL_DATE_FORMATS):
        return datestr

    full_format = DatestrFormat.YEAR_MONTH_DAY_HOUR_MINUTE_SECOND.value
    gapfilled = gapfill_datestr(datestr, gapfill_mode)
    return datetime.strptime(gapfilled, full_format).strftime(target_format.value)
389
+
390
+
391
def parse_gapfilled_datestr(
    datestr: str, gapfill_mode: DatestrGapfillMode = "start", default: Any = None
):
    """
    Gapfill a datestr using `gapfill_mode` and parse the result with `safe_parse_date`.

    `default` is forwarded to `safe_parse_date`.
    """
    gapfilled = gapfill_datestr(datestr, mode=gapfill_mode)
    return safe_parse_date(gapfilled, default=default)
395
+
396
+
397
def datestrs_match(
    datestr_a: str, datestr_b: str, mode: DatestrGapfillMode = "start"
) -> bool:
    """
    Compare two datestrs that may have different precision.

    Both datestrs are gap-filled with the same `mode` and the results are compared for equality.
    Example: with `mode="end"`, '2010' matches '2010-12-31', but not '2010-01-01'.
    """
    gapfilled_a = gapfill_datestr(datestr=datestr_a, mode=mode)
    gapfilled_b = gapfill_datestr(datestr=datestr_b, mode=mode)
    return gapfilled_a == gapfilled_b
407
+
408
+
409
def _diff_in_years_calendar(a: datetime, b: datetime, *, add_second: bool, **_) -> int:
    """
    Return the number of complete calendar years from `a` to `b` (negative when `b` is before `a`).

    When `add_second` is set, `b` is moved one second further away from `a` before the difference is taken.
    Other keyword arguments are accepted and ignored.
    """
    reverse = a > b
    if add_second:
        one_second = relativedelta(seconds=1)
        b_ = b - one_second if reverse else b + one_second
    else:
        b_ = b
    return relativedelta(b_, a).years
418
+
419
+
420
def _diff_in_months_calendar(a: datetime, b: datetime, *, add_second: bool, **_) -> int:
    """
    Return the number of complete calendar months from `a` to `b` (negative when `b` is before `a`).

    When `add_second` is set, `b` is moved one second further away from `a` before the difference is taken.
    Other keyword arguments are accepted and ignored.
    """
    reverse = a > b
    if add_second:
        one_second = relativedelta(seconds=1)
        b_ = b - one_second if reverse else b + one_second
    else:
        b_ = b
    delta = relativedelta(b_, a)
    # `relativedelta` splits the difference into years and months; fold the years back into months.
    return delta.years * 12 + delta.months
429
+
430
+
431
def _diff(
    a: datetime, b: datetime, *, unit: TimeUnit, add_second: bool, complete_only: bool
) -> Union[float, int]:
    """
    Return the difference from `a` to `b` expressed in `unit`, using average unit lengths.

    The result is negative when `b` is before `a`. When `add_second` is set, `b` is moved one second further
    away from `a` before the difference is taken. When `complete_only` is set, only complete units are
    counted.
    """
    reverse = a > b
    b_ = (
        b
        if not add_second
        else b - relativedelta(seconds=1) if reverse else b + relativedelta(seconds=1)
    )
    diff = convert_duration((b_ - a).total_seconds(), TimeUnit.SECOND, unit)
    # Truncate toward zero so that negative diffs also count complete units only (half a day backwards is
    # 0 complete days, not -1). `floor` would round negative values away from zero, unlike the calendar
    # year/month variants which truncate toward zero.
    return int(diff) if complete_only else diff
442
+
443
+
444
# Specialised diff implementations, looked up by `diff_in`. Only calendar-based year and month diffs have an
# entry; every other (unit, calendar) combination falls back to `_diff`.
DIFF_FUNCTION = {
    (TimeUnit.YEAR, True): _diff_in_years_calendar,
    (TimeUnit.MONTH, True): _diff_in_months_calendar,
}
"""
(unit: TimeUnit, calendar: bool): Callable
"""
451
+
452
+
453
def diff_in(
    a: Union[datetime, str],
    b: Union[datetime, str],
    unit: TimeUnit,
    add_second=False,
    calendar=False,
    gapfill_mode: DatestrGapfillMode = "start",
):
    """
    Calculate the difference between two dates.

    This function does NOT return the absolute difference. If `b` is before `a` the function will return a
    negative value.

    If dates are passed as datestrings, they will be gap-filled and parsed into datetime objects. Caution is
    advised when using datestrings with formats `--MM` and `--MM-DD` as these might be parsed in unexpected ways.

    Parameters
    ----------
    a : datetime | str
        The first date.

    b : datetime | str
        The second date.

    unit : TimeUnit
        The time unit to calculate the diff in.

    add_second : bool, optional, default = `False`
        A flag to determine whether to add one second to diff results.

        Set to `True` in cases where you are calculating the duration of nodes with incomplete datestrings.

        For example, a node with `"startDate"` = `"2000"` and `"endDate"` = `"2001"` will ordinarily be assumed
        to take place over the entirety of 2000 and 2001 (i.e., from `"2000-01-01T00:00:00"` to
        `"2001-12-31T23:59:59"`). However, if `add_second` = `False`, the diff in days will be slightly less
        than 731 because the final second of 2001-12-31 is not accounted for. If `True` the diff will be
        exactly 731.

    calendar : bool, optional, default = `False`
        A flag to determine whether to use calendar time units.

        If `True` the diff in years between `"2000"` and `"2001"` will be exactly 1, if `False` the diff will be
        slightly over 1 because a leap year is longer than the average year.

        If `True` the diff in months between `"2000-02"` and `"2000-03"` will be exactly 1, if `False` the diff
        will be approximately 0.95 because February is shorter than the average month.

        For all units, if `True`, only complete units will be counted. For example, the diff in days between
        `"2000-01-01T00:00:00"` and `"2000-01-01T12:00:00"` will be 0. If `False` the diff will be 0.5.

    gapfill_mode : DatestrGapfillMode, optional, default = `"start"`
        How to gapfill incomplete datestrings (`"start"`, `"middle"` or `"end"`).

    Returns
    -------
    diff : float | int
        The difference between the dates in the selected units.
    """

    def _as_datetime(value):
        if isinstance(value, datetime):
            return value
        return parse_gapfilled_datestr(value, gapfill_mode=gapfill_mode)

    a_ = _as_datetime(a)
    b_ = _as_datetime(b)

    # Calendar-aware year/month diffs have dedicated implementations; everything else uses the generic,
    # seconds-based diff.
    calendar_diff = DIFF_FUNCTION.get((unit, calendar))
    if calendar_diff is None:
        return _diff(a_, b_, unit=unit, add_second=add_second, complete_only=calendar)
    return calendar_diff(a_, b_, unit=unit, add_second=add_second)
@@ -1,6 +1,10 @@
1
1
  import json
2
2
  import numpy as np
3
- from hestia_earth.schema import EmissionMethodTier
3
+ import pandas as pd
4
+ from hestia_earth.schema import SCHEMA_TYPES, NODE_TYPES, EmissionMethodTier
5
+ from flatten_json import flatten as flatten_json
6
+
7
+ from ..tools import list_sum
4
8
 
5
9
 
6
10
  EXCLUDE_FIELDS = ["@type", "type", "@context"]
@@ -14,6 +18,20 @@ EXCLUDE_PRIVATE_FIELDS = [
14
18
  ]
15
19
 
16
20
 
21
+ # assuming column labels always camelCase
22
+ def _get_node_type_label(node_type):
23
+ return node_type[0].lower() + node_type[1:]
24
+
25
+
26
+ def _get_node_type_from_label(node_type):
27
+ return node_type[0].upper() + node_type[1:]
28
+
29
+
30
def _is_blank_node(data: dict):
    """
    Return `True` when the dict's type (`@type`, falling back to `type`) is a schema type that is not a
    top-level node type.
    """
    node_type = data.get("@type") or data.get("type")
    if node_type not in SCHEMA_TYPES:
        return False
    return node_type not in NODE_TYPES
33
+
34
+
17
35
  def _with_csv_formatting(dct):
18
36
  """
19
37
  Use as object_hook when parsing a JSON node: json.loads(node, object_hook=_with_csv_formatting).
@@ -44,7 +62,7 @@ def _filter_not_relevant(blank_node: dict):
44
62
 
45
63
  def _filter_emissions_not_relevant(node: dict):
46
64
  """
47
- Ignore all emissions where `methodTier=not relevant` so save space.
65
+ Ignore all emissions where `methodTier=not relevant` to save space.
48
66
  """
49
67
  return node | (
50
68
  {
@@ -53,3 +71,40 @@ def _filter_emissions_not_relevant(node: dict):
53
71
  if key in node
54
72
  }
55
73
  )
74
+
75
+
76
def _filter_zero_value(blank_node: dict):
    """
    Return `True` when the blank node should be kept, i.e. its `value` is not zero.

    A list `value` is reduced with `list_sum` (using `-1` as the default) before the comparison.
    """
    value = blank_node.get("value")
    if isinstance(value, list):
        value = list_sum(value, default=-1)
    return value != 0
84
+
85
+
86
+ def _filter_zero_values(node: dict):
87
+ """
88
+ Ignore all blank nodes where `value=0` to save space.
89
+ """
90
+ return node | (
91
+ {
92
+ key: list(filter(_filter_zero_value, value))
93
+ for key, value in node.items()
94
+ if isinstance(value, list)
95
+ and isinstance(value[0], dict)
96
+ and _is_blank_node(value[0])
97
+ }
98
+ )
99
+
100
+
101
def nodes_to_df(nodes: list[dict]):
    """
    Build a DataFrame with one row per node.

    Each node is nested under its camelCase type label and flattened with `.` as the key separator, so columns
    look like `<typeLabel>.<field>...`.
    """

    def _flatten(node):
        label = _get_node_type_label(node.get("@type", node.get("type")))
        return flatten_json({label: node}, ".")

    return pd.json_normalize([_flatten(node) for node in nodes])
@@ -5,7 +5,6 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from hestia_earth.schema import UNIQUENESS_FIELDS, Term, NODE_TYPES
7
7
  from hestia_earth.schema.utils.sort import get_sort_key, SORT_CONFIG
8
- from flatten_json import flatten as flatten_json
9
8
 
10
9
  # __package__ = "hestia_earth.utils" # required to run interactively in vscode
11
10
  from ..api import find_term_ids_by_names
@@ -14,6 +13,9 @@ from ._shared import (
14
13
  EXCLUDE_PRIVATE_FIELDS,
15
14
  _with_csv_formatting,
16
15
  _filter_emissions_not_relevant,
16
+ _get_node_type_label,
17
+ _get_node_type_from_label,
18
+ nodes_to_df,
17
19
  )
18
20
 
19
21
 
@@ -36,15 +38,6 @@ def _get_blank_node_uniqueness_fields():
36
38
  BLANK_NODE_UNIQUENESS_FIELDS = _get_blank_node_uniqueness_fields()
37
39
 
38
40
 
39
- # assuming column labels always camelCase
40
- def _get_node_type_label(node_type):
41
- return node_type[0].lower() + node_type[1:]
42
-
43
-
44
- def _get_node_type_from_label(node_type):
45
- return node_type[0].upper() + node_type[1:]
46
-
47
-
48
41
  def _get_names(df):
49
42
  names = []
50
43
  for node_type, array_fields in BLANK_NODE_UNIQUENESS_FIELDS.items():
@@ -283,18 +276,6 @@ def _format_and_pivot(df_in):
283
276
  return df_out
284
277
 
285
278
 
286
- def nodes_to_df(nodes: list[dict]):
287
- nodes_flattened = [
288
- flatten_json(
289
- dict([(_get_node_type_label(node.get("@type", node.get("type"))), node)]),
290
- ".",
291
- )
292
- for node in nodes
293
- ]
294
-
295
- return pd.json_normalize(nodes_flattened)
296
-
297
-
298
279
  def pivot_nodes(nodes: list[dict]):
299
280
  """
300
281
  Pivot array of nodes in dict format (e.g under the 'nodes' key of a .hestia file)
@@ -11,6 +11,7 @@ from ._shared import (
11
11
  EXCLUDE_PRIVATE_FIELDS,
12
12
  _with_csv_formatting,
13
13
  _filter_emissions_not_relevant,
14
+ _filter_zero_values,
14
15
  )
15
16
 
16
17
  pivot_exclude_fields = Term().fields
@@ -22,6 +23,8 @@ term_exclude_fields = Term().fields
22
23
  del term_exclude_fields["name"]
23
24
  term_exclude_fields.update({k: "" for k in EXCLUDE_PRIVATE_FIELDS})
24
25
 
26
+ include_all_unique_keys = ["emissions", "emissionsResourceUse"]
27
+
25
28
  # Treat properties uniqueness fields as special case for now
26
29
  PROPERTIES_VIRTUAL_UNIQUENESS_FIELD = "propertyValues"
27
30
  ADAPTED_UNIQUENESS_FIELDS = deepcopy(UNIQUENESS_FIELDS)
@@ -90,6 +93,7 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
90
93
 
91
94
  # print('\n', level, 'fields_to_pivot', fields_to_pivot)
92
95
  for field, uniqueness_fields in fields_to_pivot:
96
+ include_all_unique_fields = field in include_all_unique_keys
93
97
  # print('\nbefore processing node field', level, field, node[field], '\n')
94
98
  # Compress lists of 'Node' nodes to dict with single @id key.
95
99
  # The compressed field matches uniqueness fields like cycle.emissions.inputs.@id.
@@ -155,14 +159,13 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
155
159
  del term_data["combined_fields"][id_key]
156
160
  # print('combined_fields', field, term_id, term_data['combined_fields'], '\n')
157
161
  fields_to_include = {
158
- field: any(
159
- by_idx[idx].get(field) != by_idx[indexes[0]].get(field)
160
- for idx in indexes
162
+ k: include_all_unique_fields
163
+ or any(
164
+ by_idx[idx].get(k) != by_idx[indexes[0]].get(k) for idx in indexes
161
165
  )
162
- for field in term_data["combined_fields"].keys()
163
- if field in uniqueness_fields
164
- or field != "value"
165
- and field.split(".")[-1] not in pivot_exclude_fields
166
+ for k in term_data["combined_fields"].keys()
167
+ if k in uniqueness_fields
168
+ or (k != "value" and k.split(".")[-1] not in pivot_exclude_fields)
166
169
  }
167
170
  # print('fields_to_include', level, field, term_id, fields_to_include, '\n')
168
171
  for idx in indexes:
@@ -179,19 +182,23 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
179
182
  )
180
183
  ]
181
184
  # print('distingishing_field_fields', level, field, term_id, distingishing_field_fields, '\n')
182
- unanimous_fields = {
183
- field: term_data["combined_fields"][field]
184
- for field, not_unanimous in fields_to_include.items()
185
- if field not in distingishing_field_fields
186
- and not not_unanimous
187
- and field is not PROPERTIES_VIRTUAL_UNIQUENESS_FIELD
188
- }
189
185
  # print('unanimous_fields', level, field, term_id, unanimous_fields, '\n')
186
+ unanimous_fields = (
187
+ {}
188
+ if include_all_unique_fields
189
+ else {
190
+ field: term_data["combined_fields"][field]
191
+ for field, not_unanimous in fields_to_include.items()
192
+ if field not in distingishing_field_fields
193
+ and not not_unanimous
194
+ and field is not PROPERTIES_VIRTUAL_UNIQUENESS_FIELD
195
+ }
196
+ )
190
197
  differentiated_fields = {
191
198
  field: term[field]
192
199
  for field, not_unanimous in fields_to_include.items()
193
200
  if field not in distingishing_field_fields
194
- and not_unanimous
201
+ and (include_all_unique_fields or not_unanimous)
195
202
  and field in term
196
203
  }
197
204
  # print('differentiated_fields', level, field, term_id, differentiated_fields, '\n')
@@ -223,19 +230,35 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
223
230
  return pivoted_node
224
231
 
225
232
 
226
- def pivot_node(node: dict):
233
+ def pivot_node(
234
+ node: dict,
235
+ include_emissions_not_relevant: bool = False,
236
+ include_zero_values: bool = True,
237
+ ):
227
238
  """
228
239
  Pivot single node in dict format parsed with object_hook=_with_csv_formatting
229
240
  """
230
- return _do_pivot(_filter_emissions_not_relevant(node))
241
+ node = (
242
+ node if include_emissions_not_relevant else _filter_emissions_not_relevant(node)
243
+ )
244
+ node = node if include_zero_values else _filter_zero_values(node)
245
+ return _do_pivot(node)
231
246
 
232
247
 
233
- def pivot_json_node(json_node: str):
248
+ def pivot_json_node(
249
+ json_node: str,
250
+ include_emissions_not_relevant: bool = False,
251
+ include_zero_values: bool = True,
252
+ ):
234
253
  """
235
254
  Pivot single schema-compliant unparsed json string node
236
255
  """
237
256
  node = json.loads(json_node, object_hook=_with_csv_formatting)
238
- return pivot_node(node)
257
+ return pivot_node(
258
+ node,
259
+ include_emissions_not_relevant=include_emissions_not_relevant,
260
+ include_zero_values=include_zero_values,
261
+ )
239
262
 
240
263
 
241
264
  def pivot_hestia_file(hestia_file: str):
@@ -246,8 +269,19 @@ def pivot_hestia_file(hestia_file: str):
246
269
  return pivot_nodes(parsed.get("nodes", []))
247
270
 
248
271
 
249
- def pivot_nodes(nodes: list[dict]):
272
+ def pivot_nodes(
273
+ nodes: list[dict],
274
+ include_emissions_not_relevant: bool = False,
275
+ include_zero_values: bool = True,
276
+ ):
250
277
  """
251
278
  Pivot multiple nodes in dict format parsed with object_hook=_with_csv_formatting
252
279
  """
253
- return [pivot_node(node) for node in nodes]
280
+ return [
281
+ pivot_node(
282
+ node,
283
+ include_emissions_not_relevant=include_emissions_not_relevant,
284
+ include_zero_values=include_zero_values,
285
+ )
286
+ for node in nodes
287
+ ]
@@ -1,9 +1,11 @@
1
- import time
1
+ from collections.abc import Generator, Iterable
2
2
  from dateutil.parser import parse
3
- from statistics import mean
4
3
  from functools import reduce
5
- from math import log10, floor
6
4
  import numpy
5
+ from math import log10, floor
6
+ from statistics import mean
7
+ import time
8
+ from typing import Literal
7
9
  from hestia_earth.schema import NodeType
8
10
 
9
11
 
@@ -268,3 +270,67 @@ def pick(value: dict, keys: list) -> dict:
268
270
 
269
271
  def unique_values(values: list, key: str = "@id"):
270
272
  return list({v[key]: v for v in values}.values())
273
+
274
+
275
def is_list_like(obj) -> bool:
    """
    Tell whether the input should be treated as a collection of items.

    Return `True` for any `Iterable` or `Generator`, except `str` and `bytes`, which are treated as scalar
    values. Return `False` for everything else.
    """
    if isinstance(obj, (str, bytes)):
        return False
    return isinstance(obj, (Iterable, Generator))
281
+
282
+
283
# Maps the name of a target container to its constructor; used by `_as_list_like`.
TO_LIST_LIKE_CONSTRUCTOR = {"list": list, "set": set, "tuple": tuple}
284
+
285
+
286
def _as_list_like(obj, to: Literal["list", "set", "tuple"] = "list"):
    """
    Convert an object to either a list, set or tuple.

    An object that already is an instance of the target container is returned as-is. Any other list-like
    object is converted to the target container. Anything else is wrapped in the target container.

    `str` and `bytes` objects are not considered list-like and, therefore, will be wrapped. An unknown `to`
    value falls back to `list`.
    """
    constructor = TO_LIST_LIKE_CONSTRUCTOR.get(to, list)
    if isinstance(obj, constructor):
        return obj
    if is_list_like(obj):
        return constructor(obj)
    return constructor([obj])
301
+
302
+
303
def as_list(obj) -> list:
    """
    Return the input as a list.

    A list is returned unchanged. Any other list-like object is converted into a list. Anything else is
    wrapped in a single-item list (e.g., `[obj]`).

    `str` and `bytes` objects are not considered list-like and, therefore, will be wrapped
    (e.g., `"abc"` -> `["abc"]`).
    """
    return _as_list_like(obj, to="list")
313
+
314
+
315
def as_set(obj) -> set:
    """
    Return the input as a set.

    A set is returned unchanged. Any other list-like object is converted into a set. Anything else is wrapped
    in a single-item set (e.g., `{obj}`).

    `str` and `bytes` objects are not considered list-like and, therefore, will be wrapped
    (e.g., `"abc"` -> `{"abc"}`).
    """
    return _as_list_like(obj, to="set")
325
+
326
+
327
def as_tuple(obj) -> tuple:
    """
    Return the input as a tuple.

    A tuple is returned unchanged. Any other list-like object is converted into a tuple. Anything else is
    wrapped in a single-item tuple (e.g., `(obj, )`).

    `str` and `bytes` objects are not considered list-like and, therefore, will be wrapped
    (e.g., `"abc"` -> `("abc", )`).
    """
    return _as_list_like(obj, to="tuple")
@@ -1 +1 @@
1
- VERSION = "0.16.12"
1
+ VERSION = "0.16.14"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hestia_earth_utils
3
- Version: 0.16.12
3
+ Version: 0.16.14
4
4
  Summary: HESTIA's utils library
5
5
  Home-page: https://gitlab.com/hestia-earth/hestia-utils
6
6
  Author: HESTIA Team
@@ -3,7 +3,7 @@ hestia_earth/utils/api.py,sha256=b6g87ylAgdWvwPlDeZDv74UGfXVe1KFXap5-Rv5daSE,992
3
3
  hestia_earth/utils/blank_node.py,sha256=K_8wWACiwOqEJR71ClQFInzsIdDH3UUSwrtCZoh9V-o,7854
4
4
  hestia_earth/utils/calculation_status.py,sha256=f5b05cEFXMfFI1clirIt7v3Y9H2Nja66GYv1NCyZjf0,2381
5
5
  hestia_earth/utils/cycle.py,sha256=oo0CesLMblL8ewI4s7eXdyNjr9R9df4Vyr0iRXCYFu4,1326
6
- hestia_earth/utils/date.py,sha256=IVqZjIJda8lUbDZkEbaNN5FBQoTHdNudgKXpngm-2a4,1847
6
+ hestia_earth/utils/date.py,sha256=cwcGtYVh2K5rH7k5uO1Olovy-hzIDrvsfaiR94ZlyO0,16165
7
7
  hestia_earth/utils/descriptive_stats.py,sha256=YvDI6EWCcZWw8yCxYhqyzMCDqCu2X8DjvygmMK_AVvc,1633
8
8
  hestia_earth/utils/emission.py,sha256=rHHf5vwe-RxTOaOJ9N0MuyJOLpDnPjjxv6MHYSpPgcU,2165
9
9
  hestia_earth/utils/lookup.py,sha256=SArKqjqs_Yt5cC6TIk12WVQNToun2spZ_iNUm4fX9FA,8274
@@ -14,20 +14,20 @@ hestia_earth/utils/request.py,sha256=EfitmS13abPxfl5gSAOyHRUYhk4R2Rfv94fF9lvOT00
14
14
  hestia_earth/utils/stats.py,sha256=vTNyKcMKmX0DoodM9QEG7HF8qm2Wf-4ckMWQFWZ1VgE,34729
15
15
  hestia_earth/utils/table.py,sha256=MOJDo5fQPRDogAty_UXbO9-EXFwz97m0f7--mOM17lQ,2363
16
16
  hestia_earth/utils/term.py,sha256=aBVYuYv55nPqJPyt5mN4Fz652s_1hwUPckNUZX0pMP8,1064
17
- hestia_earth/utils/tools.py,sha256=b8_ZJ4lfRVwOFXzl4_Tbah1FmrGnzDx5QVgVLQmQUz4,5372
18
- hestia_earth/utils/version.py,sha256=z2tz1TqNic_wxJ0cdufZWrE4ryfXE9jZgzJKKWhdm4E,20
17
+ hestia_earth/utils/tools.py,sha256=WMx05cBtBR8mYQnLLBA2cgF1x2tI41514diFDeX4gLQ,7533
18
+ hestia_earth/utils/version.py,sha256=R_I9V6MKTrP_WY9UKq_aQBdaU5xIiWzkcyC-foZlSnM,20
19
19
  hestia_earth/utils/pivot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- hestia_earth/utils/pivot/_shared.py,sha256=8-AZAAitXU53BYpOFFGnuCy5VBSphyClgIqGU-kg6CM,1436
21
- hestia_earth/utils/pivot/pivot_csv.py,sha256=TtePpo9L_GKISpQMW9dwjBv2tHHE822rb-j6_bflVOA,12251
22
- hestia_earth/utils/pivot/pivot_json.py,sha256=CrFwIxRXuUemdr5-k7weloB5rHeTLX5yB9K-I3xDVmk,9977
20
+ hestia_earth/utils/pivot/_shared.py,sha256=ob8T7bziZ6YffpRx_srzzH0h96z2GZJB5uVe9NxALIg,2828
21
+ hestia_earth/utils/pivot/pivot_csv.py,sha256=1zGYuC_xlrLoX5FYSG3mt5HSc03CO-0p8J0oAmyI1VI,11778
22
+ hestia_earth/utils/pivot/pivot_json.py,sha256=yWbsuFeTQJPk4H2b7ibVALGJa3qfKGyNuPeqM8yrcsI,11030
23
23
  hestia_earth/utils/storage/__init__.py,sha256=uNX6_EHWWnNUIm4Ng7L43-cQmuc6NGFAxXye85saIXQ,922
24
24
  hestia_earth/utils/storage/_azure_client.py,sha256=mseexhzjteRDzzoFe2fEXe9MYLmvjG3sfl9-eha4ZW0,1353
25
25
  hestia_earth/utils/storage/_local_client.py,sha256=KbYqTfniIU5R5J1m_unCQip9kOz9EGIGI0OH0QvD8eo,551
26
26
  hestia_earth/utils/storage/_s3_client.py,sha256=8TCxiHfxE7G8kdp3CnEFrxgmPwfPyci3-blsowE2T7o,3146
27
27
  hestia_earth/utils/storage/_sns_client.py,sha256=pvtXYw-sQ8ns3mlDz7ld9iZp3FYSm0xSXSXMJ5IPnBc,380
28
- hestia_earth_utils-0.16.12.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
29
- hestia_earth_utils-0.16.12.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
30
- hestia_earth_utils-0.16.12.dist-info/METADATA,sha256=CZegpFMoj0MWumacYaibHi1BCg6IDaJDuIDtMlEzT3A,1870
31
- hestia_earth_utils-0.16.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
32
- hestia_earth_utils-0.16.12.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
33
- hestia_earth_utils-0.16.12.dist-info/RECORD,,
28
+ hestia_earth_utils-0.16.14.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
29
+ hestia_earth_utils-0.16.14.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
30
+ hestia_earth_utils-0.16.14.dist-info/METADATA,sha256=4i7b1a12W_HBp3NtPrR1Kr6ztiedReV_svrcDcnjKEY,1870
31
+ hestia_earth_utils-0.16.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
32
+ hestia_earth_utils-0.16.14.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
33
+ hestia_earth_utils-0.16.14.dist-info/RECORD,,