hestia-earth-utils 0.16.8__py3-none-any.whl → 0.16.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. hestia_earth/utils/api.py +78 -36
  2. hestia_earth/utils/blank_node.py +101 -60
  3. hestia_earth/utils/calculation_status.py +45 -35
  4. hestia_earth/utils/cycle.py +7 -7
  5. hestia_earth/utils/date.py +7 -2
  6. hestia_earth/utils/descriptive_stats.py +10 -6
  7. hestia_earth/utils/emission.py +26 -15
  8. hestia_earth/utils/lookup.py +62 -28
  9. hestia_earth/utils/lookup_utils.py +89 -63
  10. hestia_earth/utils/model.py +45 -40
  11. hestia_earth/utils/pipeline.py +179 -90
  12. hestia_earth/utils/pivot/_shared.py +16 -12
  13. hestia_earth/utils/pivot/pivot_csv.py +35 -18
  14. hestia_earth/utils/pivot/pivot_json.py +34 -18
  15. hestia_earth/utils/request.py +17 -6
  16. hestia_earth/utils/stats.py +89 -68
  17. hestia_earth/utils/storage/_azure_client.py +17 -6
  18. hestia_earth/utils/storage/_local_client.py +8 -3
  19. hestia_earth/utils/storage/_s3_client.py +27 -22
  20. hestia_earth/utils/storage/_sns_client.py +7 -2
  21. hestia_earth/utils/term.py +5 -5
  22. hestia_earth/utils/tools.py +50 -21
  23. hestia_earth/utils/version.py +1 -1
  24. {hestia_earth_utils-0.16.8.dist-info → hestia_earth_utils-0.16.10.dist-info}/METADATA +1 -1
  25. hestia_earth_utils-0.16.10.dist-info/RECORD +33 -0
  26. hestia_earth_utils-0.16.8.dist-info/RECORD +0 -33
  27. {hestia_earth_utils-0.16.8.data → hestia_earth_utils-0.16.10.data}/scripts/hestia-format-upload +0 -0
  28. {hestia_earth_utils-0.16.8.data → hestia_earth_utils-0.16.10.data}/scripts/hestia-pivot-csv +0 -0
  29. {hestia_earth_utils-0.16.8.dist-info → hestia_earth_utils-0.16.10.dist-info}/WHEEL +0 -0
  30. {hestia_earth_utils-0.16.8.dist-info → hestia_earth_utils-0.16.10.dist-info}/top_level.txt +0 -0
hestia_earth/utils/pipeline.py
@@ -5,14 +5,19 @@ import numpy as np
  from .tools import current_time_ms, non_empty_list, flatten
  from .api import find_related
  from .storage._s3_client import (
-     _load_from_bucket, _upload_to_bucket, _last_modified, _read_metadata, _update_metadata, _exists_in_bucket
+     _load_from_bucket,
+     _upload_to_bucket,
+     _last_modified,
+     _read_metadata,
+     _update_metadata,
+     _exists_in_bucket,
  )
  from .storage._sns_client import _get_sns_client

- PROGRESS_EXT = '.progress'
- CALC_FOLDER = 'recalculated'
- METADATA_STAGE_KEY = 'stage'
- METADATA_PROGRESS_KEY = 'calculating'
+ PROGRESS_EXT = ".progress"
+ CALC_FOLDER = "recalculated"
+ METADATA_STAGE_KEY = "stage"
+ METADATA_PROGRESS_KEY = "calculating"


  # fix error "Object of type int64 is not JSON serializable"
@@ -27,10 +32,12 @@ class NpEncoder(json.JSONEncoder):
          return super(NpEncoder, self).default(obj)


- def to_string(data: dict, indent: int = None): return json.dumps(data, indent=indent, ensure_ascii=False, cls=NpEncoder)
+ def to_string(data: dict, indent: int = None):
+     return json.dumps(data, indent=indent, ensure_ascii=False, cls=NpEncoder)


- def to_bytes(data: dict): return to_string(data).encode('utf8')
+ def to_bytes(data: dict):
+     return to_string(data).encode("utf8")


  def upload_json(bucket_name: str, file_key: str, body: dict):
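
For context, a minimal standalone sketch of the problem the NpEncoder workaround above solves (the class name and type checks below are illustrative, not the package's exact implementation):

    import json
    import numpy as np

    class NumpyEncoder(json.JSONEncoder):
        # convert numpy scalars/arrays to plain Python types before serialising
        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)

    json.dumps({"value": np.int64(3)})                    # TypeError: Object of type int64 is not JSON serializable
    json.dumps({"value": np.int64(3)}, cls=NumpyEncoder)  # '{"value": 3}'
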
@@ -38,15 +45,21 @@ def upload_json(bucket_name: str, file_key: str, body: dict):
          bucket=bucket_name,
          key=file_key,
          body=to_bytes(body),
-         content_type='application/json'
+         content_type="application/json",
      )


  def _to_file_progress(filepath: str):
-     return filepath.replace('.csv', PROGRESS_EXT).replace('.json', PROGRESS_EXT).replace('.hestia', PROGRESS_EXT)
+     return (
+         filepath.replace(".csv", PROGRESS_EXT)
+         .replace(".json", PROGRESS_EXT)
+         .replace(".hestia", PROGRESS_EXT)
+     )


- def handle_result(bucket_name: str, file_key: str, step: str, start: int, content: dict):
+ def handle_result(
+     bucket_name: str, file_key: str, step: str, start: int, content: dict
+ ):
      filepath = _to_file_progress(file_key)

      # try to read existing progress to update the time per step
@@ -55,89 +68,122 @@ def handle_result(bucket_name: str, file_key: str, step: str, start: int, conten
      except Exception:
          data = {}

-     return upload_json(bucket_name, filepath, {
-         **data,
-         'step': step,
-         'time': {
-             **(data.get('time', {}) if isinstance(data.get('time', {}), dict) else {}),
-             step: current_time_ms() - start
+     return upload_json(
+         bucket_name,
+         filepath,
+         {
+             **data,
+             "step": step,
+             "time": {
+                 **(
+                     data.get("time", {})
+                     if isinstance(data.get("time", {}), dict)
+                     else {}
+                 ),
+                 step: current_time_ms() - start,
+             },
+             **content,
          },
-         **content
-     })
+     )


  def handle_error(
-     bucket_name: str, file_key: str, step: str, start: int,
-     err: str = '', stack: str = '', errors=[], warnings=[],
-     extras: dict = {}
+     bucket_name: str,
+     file_key: str,
+     step: str,
+     start: int,
+     err: str = "",
+     stack: str = "",
+     errors=[],
+     warnings=[],
+     extras: dict = {},
  ):
-     return handle_result(bucket_name, file_key, step, start, extras | {
-         'success': False,
-         'error': {
-             'message': err,
-             'stack': stack,
-             'errors': errors,
-             'warnings': warnings
+     return handle_result(
+         bucket_name,
+         file_key,
+         step,
+         start,
+         extras
+         | {
+             "success": False,
+             "error": {
+                 "message": err,
+                 "stack": stack,
+                 "errors": errors,
+                 "warnings": warnings,
+             },
+             "warning": {"warnings": warnings},
          },
-         'warning': {
-             'warnings': warnings
-         }
-     })
+     )


- def handle_success(bucket_name: str, file_key: str, step: str, start: int, extras: dict = {}):
-     return handle_result(bucket_name, file_key, step, start, extras | {'success': True})
+ def handle_success(
+     bucket_name: str, file_key: str, step: str, start: int, extras: dict = {}
+ ):
+     return handle_result(bucket_name, file_key, step, start, extras | {"success": True})


- def publish_result(topic_arn: str, bucket_name: str, file_key: str, filepath: str, step: str, success: bool):
+ def publish_result(
+     topic_arn: str,
+     bucket_name: str,
+     file_key: str,
+     filepath: str,
+     step: str,
+     success: bool,
+ ):
      return _get_sns_client().publish(
          TopicArn=topic_arn,
-         Message=to_string({
-             'bucket': bucket_name,
-             'key': file_key,
-             'filepath': filepath
-         }),
+         Message=to_string(
+             {"bucket": bucket_name, "key": file_key, "filepath": filepath}
+         ),
          MessageAttributes={
-             'functionName': {
-                 'DataType': 'String',
-                 'StringValue': step + ('Done' if success else 'Error')
+             "functionName": {
+                 "DataType": "String",
+                 "StringValue": step + ("Done" if success else "Error"),
              }
-         }
+         },
      )


  def _parse_event_s3(event: dict):
-     return {'bucket': event['s3']['bucket']['name'], 'key': event['s3']['object']['key']}
+     return {
+         "bucket": event["s3"]["bucket"]["name"],
+         "key": event["s3"]["object"]["key"],
+     }


  def _parse_event_SNS(event: dict):
-     event = event.get('Sns', {})
-     data = json.loads(event.get('Message', '{}'))
-     attributes: dict = event.get('MessageAttributes', {})
-     data['attributes'] = {key: value.get('Value') for key, value in attributes.items()}
+     event = event.get("Sns", {})
+     data = json.loads(event.get("Message", "{}"))
+     attributes: dict = event.get("MessageAttributes", {})
+     data["attributes"] = {key: value.get("Value") for key, value in attributes.items()}
      return data


  def _parse_event_SQS(event: dict):
-     condition = event.get('requestContext', {}).get('condition')
-     return _get_data_from_event(event.get('requestPayload', {})) if condition != 'RetriesExhausted' else None
+     condition = event.get("requestContext", {}).get("condition")
+     return (
+         _get_data_from_event(event.get("requestPayload", {}))
+         if condition != "RetriesExhausted"
+         else None
+     )


  def _get_data_from_event(event): # noqa: C901
      if isinstance(event, dict):
-         if 's3' in event:
+         if "s3" in event:
              return _parse_event_s3(event)
          # invoked when running asynchronously
-         if 'Sns' in event:
+         if "Sns" in event:
              return _parse_event_SNS(event)
          # invoked through http event
-         if 'body' in event:
-             return _get_data_from_event(json.loads(event.get('body', '{}')))
+         if "body" in event:
+             return _get_data_from_event(json.loads(event.get("body", "{}")))
          # invoked through s3 put object
-         if 'Records' in event:
-             return flatten(map(_get_data_from_event, event.get('Records', [])))
+         if "Records" in event:
+             return flatten(map(_get_data_from_event, event.get("Records", [])))
          # invoked when calculation timedout or failed
-         if 'requestPayload' in event:
+         if "requestPayload" in event:
              return _parse_event_SQS(event)
          return event
      if isinstance(event, str):
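
To illustrate what these parsers consume, here is a hypothetical S3 put-object notification (bucket and key invented), following the standard AWS event shape:

    event = {
        "Records": [
            {"s3": {"bucket": {"name": "my-bucket"}, "object": {"key": "Cycle/example.jsonld"}}}
        ]
    }
    # Per the code above, _get_data_from_event walks "Records" and _parse_event_s3
    # reduces each record to {"bucket": "my-bucket", "key": "Cycle/example.jsonld"};
    # parse_event then wraps the result in a non-empty list.
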
@@ -149,34 +195,51 @@ def parse_event(event: dict):
      return non_empty_list(flatten(data) if isinstance(data, list) else [data])


- def _node_type(node: dict): return node.get('@type', node.get('type'))
+ def _node_type(node: dict):
+     return node.get("@type", node.get("type"))


- def _node_id(node: dict): return node.get('@id', node.get('id'))
+ def _node_id(node: dict):
+     return node.get("@id", node.get("id"))


- def _node_path(node: dict, folder: str = ''): return join(folder, _node_type(node), f"{_node_id(node)}.jsonld")
+ def _node_path(node: dict, folder: str = ""):
+     return join(folder, _node_type(node), f"{_node_id(node)}.jsonld")


- def _load_node(bucket: str, file_key: str): return json.loads(_load_from_bucket(bucket, file_key))
+ def _load_node(bucket: str, file_key: str):
+     return json.loads(_load_from_bucket(bucket, file_key))


- def _cache_path(node: dict): return join(_node_type(node), f"{_node_id(node)}.cache")
+ def _cache_path(node: dict):
+     return join(_node_type(node), f"{_node_id(node)}.cache")


- def _has_cache(bucket: str, node: dict): return _exists_in_bucket(bucket, _cache_path(node))
+ def _has_cache(bucket: str, node: dict):
+     return _exists_in_bucket(bucket, _cache_path(node))


- def is_calculating(bucket: str, node: dict, folder: str = ''):
-     return _read_metadata(bucket, _node_path(node, folder)).get(METADATA_PROGRESS_KEY, 'false') == 'true'
+ def is_calculating(bucket: str, node: dict, folder: str = ""):
+     return (
+         _read_metadata(bucket, _node_path(node, folder)).get(
+             METADATA_PROGRESS_KEY, "false"
+         )
+         == "true"
+     )


- def set_calculating(bucket: str, node: dict, in_progress: bool, folder: str = ''):
-     return _update_metadata(bucket, _node_path(node, folder), {METADATA_PROGRESS_KEY: str(in_progress).lower()})
+ def set_calculating(bucket: str, node: dict, in_progress: bool, folder: str = ""):
+     return _update_metadata(
+         bucket,
+         _node_path(node, folder),
+         {METADATA_PROGRESS_KEY: str(in_progress).lower()},
+     )


  def get_stage(bucket: str, node: dict, folder: str = CALC_FOLDER):
-     stage = _read_metadata(bucket, _node_path(node, folder=CALC_FOLDER)).get(METADATA_STAGE_KEY)
+     stage = _read_metadata(bucket, _node_path(node, folder=CALC_FOLDER)).get(
+         METADATA_STAGE_KEY
+     )
      return int(stage) if stage else stage


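A usage sketch for the metadata helpers above (bucket name and node id are invented; the calls read and write S3 object metadata through the storage client):

    from hestia_earth.utils.pipeline import get_stage, is_calculating, set_calculating

    node = {"@type": "Cycle", "@id": "example-cycle"}  # hypothetical node
    bucket = "my-hestia-bucket"                        # hypothetical bucket

    set_calculating(bucket, node, True)                # flag the node as being recalculated
    if is_calculating(bucket, node):
        print("last recalculation stage:", get_stage(bucket, node))
    set_calculating(bucket, node, False)               # clear the flag when done
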
@@ -196,29 +259,43 @@ def load_cache(bucket: str, node: dict):
      dict
          The cached data.
      """
-     cache_path = join(node['@type'], f"{node['@id']}.cache")
+     cache_path = join(node["@type"], f"{node['@id']}.cache")
      try:
          return json.loads(_load_from_bucket(bucket, cache_path))
      except Exception:
-         print('No cache found for', cache_path)
+         print("No cache found for", cache_path)
          return {}


- def _filter_by_type(nodes: list, type: str): return [n for n in nodes if n.get('@type', n.get('type')) == type]
+ def _filter_by_type(nodes: list, type: str):
+     return [n for n in nodes if n.get("@type", n.get("type")) == type]


- def _find_related_nodes(from_type: str, from_id: str, related_type: str, related_key: str):
-     should_find_related = related_key == 'related'
-     print('Find related nodes from API', from_type, from_id, related_key, related_type)
-     return find_related(from_type, from_id, related_type, limit=10000) if should_find_related else []
+ def _find_related_nodes(
+     from_type: str, from_id: str, related_type: str, related_key: str
+ ):
+     should_find_related = related_key == "related"
+     print("Find related nodes from API", from_type, from_id, related_key, related_type)
+     return (
+         find_related(from_type, from_id, related_type, limit=10000)
+         if should_find_related
+         else []
+     )


- def _get_cached_nodes(cache: dict, related_key: str, from_type: str, from_id: str, to_type: str):
+ def _get_cached_nodes(
+     cache: dict, related_key: str, from_type: str, from_id: str, to_type: str
+ ):
      # if key is in cache, use nodes in cache, otherwise use API
      if related_key in cache:
          nodes = _filter_by_type(cache.get(related_key, []), to_type)
-         print('Using cached data to', related_key, to_type, nodes)
-         return list(map(lambda node: {'@type': to_type, '@id': node.get('@id', node.get('id'))}, nodes))
+         print("Using cached data to", related_key, to_type, nodes)
+         return list(
+             map(
+                 lambda node: {"@type": to_type, "@id": node.get("@id", node.get("id"))},
+                 nodes,
+             )
+         )
      else:
          return _find_related_nodes(from_type, from_id, to_type, related_key)

@@ -244,15 +321,22 @@ def get_related_nodes(node: dict, related_key: str, related_type: str, cache: di
      List[dict]
          The related nodes.
      """
-     from_type = node.get('@type', node.get('type'))
-     from_id = node.get('@id', node.get('id'))
+     from_type = node.get("@type", node.get("type"))
+     from_id = node.get("@id", node.get("id"))

-     related_nodes = _get_cached_nodes(cache or {}, related_key, from_type, from_id, related_type) or []
+     related_nodes = (
+         _get_cached_nodes(cache or {}, related_key, from_type, from_id, related_type)
+         or []
+     )

-     return list({f"{node['@type']}/{node['@id']}": node for node in related_nodes}.values())
+     return list(
+         {f"{node['@type']}/{node['@id']}": node for node in related_nodes}.values()
+     )


- def get_related_nodes_data(bucket_name: str, node: dict, related_key: str, related_type: str, cache: dict):
+ def get_related_nodes_data(
+     bucket_name: str, node: dict, related_key: str, related_type: str, cache: dict
+ ):
      """
      Given a node, return all related nodes with extra data.

@@ -275,14 +359,19 @@ def get_related_nodes_data(bucket_name: str, node: dict, related_key: str, relat
      List[dict]
          The related nodes with extra data: `indexed_at`, `recalculated_at` and `recalculated_stage`.
      """
-     related_nodes = get_related_nodes(node=node, related_key=related_key, related_type=related_type, cache=cache)
+     related_nodes = get_related_nodes(
+         node=node, related_key=related_key, related_type=related_type, cache=cache
+     )

      return [
-         node | {
-             'indexed_at': _last_modified(bucket=bucket_name, key=_node_path(node)),
-             'recalculated_at': _last_modified(bucket=bucket_name, key=_node_path(node, folder=CALC_FOLDER)),
-             'recalculated_stage': get_stage(bucket_name, node),
-             'is_calculating': is_calculating(bucket_name, node)
+         node
+         | {
+             "indexed_at": _last_modified(bucket=bucket_name, key=_node_path(node)),
+             "recalculated_at": _last_modified(
+                 bucket=bucket_name, key=_node_path(node, folder=CALC_FOLDER)
+             ),
+             "recalculated_stage": get_stage(bucket_name, node),
+             "is_calculating": is_calculating(bucket_name, node),
          }
          for node in related_nodes
      ]
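
A sketch of how get_related_nodes_data might be called (bucket, node id and related type are invented; an empty cache falls back to the API, per _get_cached_nodes above):

    from hestia_earth.utils.pipeline import get_related_nodes_data

    cycle = {"@type": "Cycle", "@id": "example-cycle"}  # hypothetical node
    related = get_related_nodes_data(
        bucket_name="my-hestia-bucket",                 # hypothetical bucket
        node=cycle,
        related_key="related",
        related_type="ImpactAssessment",
        cache={},
    )
    # each entry is an {"@type", "@id"} node extended with "indexed_at",
    # "recalculated_at", "recalculated_stage" and "is_calculating"
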
hestia_earth/utils/pivot/_shared.py
@@ -5,10 +5,12 @@ from hestia_earth.schema import EmissionMethodTier

  EXCLUDE_FIELDS = ["@type", "type", "@context"]
  EXCLUDE_PRIVATE_FIELDS = [
-     "added", "addedVersion",
-     "updated", "updatedVersion",
+     "added",
+     "addedVersion",
+     "updated",
+     "updatedVersion",
      "aggregatedVersion",
-     "_cache"
+     "_cache",
  ]


@@ -17,11 +19,11 @@ def _with_csv_formatting(dct):
      Use as object_hook when parsing a JSON node: json.loads(node, object_hook=_with_csv_formatting).
      Ensures parsed JSON has field values formatted according to hestia csv conventions.
      """
-     if 'boundary' in dct:
-         dct['boundary'] = json.dumps(dct['boundary'])
+     if "boundary" in dct:
+         dct["boundary"] = json.dumps(dct["boundary"])
      for key, value in dct.items():
          if _is_scalar_list(value):
-             dct[key] = ';'.join([str(el) for el in value])
+             dct[key] = ";".join([str(el) for el in value])
      return dct


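A small illustration of the object_hook described above (the term id and values are invented, and it assumes _is_scalar_list treats a list of plain numbers as scalar, which matches its use here):

    import json
    from hestia_earth.utils.pivot._shared import _with_csv_formatting

    raw = '{"term": {"@id": "exampleTerm"}, "value": [10, 20, 30]}'
    node = json.loads(raw, object_hook=_with_csv_formatting)
    # node["value"] == "10;20;30"  -- scalar lists are joined with ";" per the CSV convention
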
@@ -37,15 +39,17 @@ def _is_scalar_list(value):


  def _filter_not_relevant(blank_node: dict):
-     return blank_node.get('methodTier') != EmissionMethodTier.NOT_RELEVANT.value
+     return blank_node.get("methodTier") != EmissionMethodTier.NOT_RELEVANT.value


  def _filter_emissions_not_relevant(node: dict):
      """
      Ignore all emissions where `methodTier=not relevant` so save space.
      """
-     return node | ({
-         key: list(filter(_filter_not_relevant, node[key]))
-         for key in ['emissions', 'emissionsResourceUse']
-         if key in node
-     })
+     return node | (
+         {
+             key: list(filter(_filter_not_relevant, node[key]))
+             for key in ["emissions", "emissionsResourceUse"]
+             if key in node
+         }
+     )
hestia_earth/utils/pivot/pivot_csv.py
@@ -9,7 +9,12 @@ from flatten_json import flatten as flatten_json

  # __package__ = "hestia_earth.utils" # required to run interactively in vscode
  from ..api import find_term_ids_by_names
- from ._shared import EXCLUDE_FIELDS, EXCLUDE_PRIVATE_FIELDS, _with_csv_formatting, _filter_emissions_not_relevant
+ from ._shared import (
+     EXCLUDE_FIELDS,
+     EXCLUDE_PRIVATE_FIELDS,
+     _with_csv_formatting,
+     _filter_emissions_not_relevant,
+ )


  # We only want to pivot array items containing blank nodes
@@ -18,11 +23,13 @@ def _get_blank_node_uniqueness_fields():
      filtered_uniqueness_fields = copy.deepcopy(UNIQUENESS_FIELDS)
      for node_type, array_fields in UNIQUENESS_FIELDS.items():
          for array_field in array_fields.keys():
-             if SORT_CONFIG[node_type][array_field]['type'] in NODE_TYPES:
+             if SORT_CONFIG[node_type][array_field]["type"] in NODE_TYPES:
                  del filtered_uniqueness_fields[node_type][array_field]
              # include `impactAssessment.@id` since it is not part of original uniqueness
-             if 'impactAssessment.id' in array_fields[array_field]:
-                 filtered_uniqueness_fields[node_type][array_field].append('impactAssessment.@id')
+             if "impactAssessment.id" in array_fields[array_field]:
+                 filtered_uniqueness_fields[node_type][array_field].append(
+                     "impactAssessment.@id"
+                 )
      return filtered_uniqueness_fields


@@ -226,15 +233,17 @@ def _do_pivot(df_in, name_id_dict):
              deep_pivoted, left_index=True, right_index=True, how="outer"
          )

-         field_cols.dropna(axis=0, how='all', inplace=True)
+         field_cols.dropna(axis=0, how="all", inplace=True)

-         with_grouped_cols = field_cols.T.groupby(
-             _get_term_index, group_keys=True
-         ).apply(
-             _group_by_term,
-             name_id_dict=name_id_dict,
-             uniqueness_fields=uniqueness_fields
-         ).T
+         with_grouped_cols = (
+             field_cols.T.groupby(_get_term_index, group_keys=True)
+             .apply(
+                 _group_by_term,
+                 name_id_dict=name_id_dict,
+                 uniqueness_fields=uniqueness_fields,
+             )
+             .T
+         )

          pivoted_terms = with_grouped_cols.apply(
              _pivot_row, axis=1, uniqueness_fields=uniqueness_fields
@@ -242,9 +251,13 @@ def _do_pivot(df_in, name_id_dict):

          # merge any duplicated columns caused by shuffled term positions
          # this operation coincidentally sorts the columns alphabetically
-         pivoted_terms = pivoted_terms.T.groupby(
-             level=pivoted_terms.columns.nlevels - 1, group_keys=False
-         ).apply(lambda term: term.bfill().iloc[0, :]).T
+         pivoted_terms = (
+             pivoted_terms.T.groupby(
+                 level=pivoted_terms.columns.nlevels - 1, group_keys=False
+             )
+             .apply(lambda term: term.bfill().iloc[0, :])
+             .T
+         )

          pivoted_terms.columns = map(
              lambda col: f"{nt_label}.{field}.{col}", pivoted_terms.columns
@@ -265,14 +278,18 @@ def _format_and_pivot(df_in):
      df_out = _do_pivot(df_in.copy(), name_id_dict)

      _sort_inplace(df_out)
-     df_out = df_out.astype('object')
+     df_out = df_out.astype("object")
      df_out.fillna("-", inplace=True)
      return df_out


  def nodes_to_df(nodes: list[dict]):
      nodes_flattened = [
-         flatten_json(dict([(_get_node_type_label(node.get("@type", node.get('type'))), node)]), '.') for node in nodes
+         flatten_json(
+             dict([(_get_node_type_label(node.get("@type", node.get("type"))), node)]),
+             ".",
+         )
+         for node in nodes
      ]

      return pd.json_normalize(nodes_flattened)
@@ -306,7 +323,7 @@ def pivot_hestia_file(hestia_file: str):
          Pandas dataframe with pivoted array terms
      """
      parsed = json.loads(hestia_file, object_hook=_with_csv_formatting)
-     nodes = parsed.get('nodes', [parsed])
+     nodes = parsed.get("nodes", [parsed])
      return pivot_nodes(nodes)


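A usage sketch for the CSV pivot entry point above (file name invented; pivot_hestia_file takes the unparsed JSON string and returns a pandas DataFrame):

    from hestia_earth.utils.pivot.pivot_csv import pivot_hestia_file

    with open("cycles.hestia") as f:  # hypothetical export containing a "nodes" array
        df = pivot_hestia_file(f.read())
    df.to_csv("cycles-pivoted.csv", index=False)
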
hestia_earth/utils/pivot/pivot_json.py
@@ -6,10 +6,17 @@ from collections import defaultdict
  from copy import deepcopy

  from hestia_earth.utils.pipeline import _node_type
- from ._shared import EXCLUDE_FIELDS, EXCLUDE_PRIVATE_FIELDS, _with_csv_formatting, _filter_emissions_not_relevant
+ from ._shared import (
+     EXCLUDE_FIELDS,
+     EXCLUDE_PRIVATE_FIELDS,
+     _with_csv_formatting,
+     _filter_emissions_not_relevant,
+ )

  pivot_exclude_fields = Term().fields
- pivot_exclude_fields.update({k: "" for k in EXCLUDE_FIELDS} | {k: "" for k in EXCLUDE_PRIVATE_FIELDS})
+ pivot_exclude_fields.update(
+     {k: "" for k in EXCLUDE_FIELDS} | {k: "" for k in EXCLUDE_PRIVATE_FIELDS}
+ )

  term_exclude_fields = Term().fields
  del term_exclude_fields["name"]
@@ -31,8 +38,10 @@ for node_type, array_fields in UNIQUENESS_FIELDS.items():
              if f not in ("properties.term.@id", "properties.value")
          ]
          # include `impactAssessment.@id` since it is not part of original uniqueness
-         if 'impactAssessment.id' in array_fields[array_field]:
-             ADAPTED_UNIQUENESS_FIELDS[node_type][array_field].append('impactAssessment.@id')
+         if "impactAssessment.id" in array_fields[array_field]:
+             ADAPTED_UNIQUENESS_FIELDS[node_type][array_field].append(
+                 "impactAssessment.@id"
+             )


  def _combine_node_ids(nodes: list):
@@ -42,13 +51,18 @@ def _combine_node_ids(nodes: list):
  def _base_pivoted_value(key: str, value, is_top_level: bool):
      # handle list of Nodes
      return (
-         _combine_node_ids(value) if isinstance(value[0], dict) and value[0].get('@type') in NODE_TYPES else
-         json.dumps(value, separators=(',', ':')) if any([
-             is_top_level,
-             key in ['distribution']
-         ]) else
-         value
-     ) if isinstance(value, list) else value
+         (
+             _combine_node_ids(value)
+             if isinstance(value[0], dict) and value[0].get("@type") in NODE_TYPES
+             else (
+                 json.dumps(value, separators=(",", ":"))
+                 if any([is_top_level, key in ["distribution"]])
+                 else value
+             )
+         )
+         if isinstance(value, list)
+         else value
+     )


  def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa: C901
@@ -57,13 +71,15 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
      if node_type not in ADAPTED_UNIQUENESS_FIELDS:
          return node
      pivoted_node = {
-         field: _base_pivoted_value(field, value, level==0)
+         field: _base_pivoted_value(field, value, level == 0)
          for field, value in node.items()
-         if all([
-             field not in ADAPTED_UNIQUENESS_FIELDS[node_type],
-             node_type != 'Term' or field not in term_exclude_fields,
-             field not in EXCLUDE_PRIVATE_FIELDS
-         ])
+         if all(
+             [
+                 field not in ADAPTED_UNIQUENESS_FIELDS[node_type],
+                 node_type != "Term" or field not in term_exclude_fields,
+                 field not in EXCLUDE_PRIVATE_FIELDS,
+             ]
+         )
      }

      fields_to_pivot = [
@@ -227,7 +243,7 @@ def pivot_hestia_file(hestia_file: str):
      Pivot json array of schema-compliant nodes on 'nodes' key of unparsed json string
      """
      parsed = json.loads(hestia_file, object_hook=_with_csv_formatting)
-     return pivot_nodes(parsed.get('nodes', []))
+     return pivot_nodes(parsed.get("nodes", []))


  def pivot_nodes(nodes: list[dict]):