hestia-earth-utils 0.16.9__py3-none-any.whl → 0.16.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hestia_earth/utils/api.py +78 -36
- hestia_earth/utils/blank_node.py +101 -60
- hestia_earth/utils/calculation_status.py +45 -35
- hestia_earth/utils/cycle.py +7 -7
- hestia_earth/utils/date.py +7 -2
- hestia_earth/utils/descriptive_stats.py +10 -6
- hestia_earth/utils/emission.py +26 -15
- hestia_earth/utils/lookup.py +62 -28
- hestia_earth/utils/lookup_utils.py +89 -63
- hestia_earth/utils/model.py +45 -40
- hestia_earth/utils/pipeline.py +179 -90
- hestia_earth/utils/pivot/_shared.py +16 -12
- hestia_earth/utils/pivot/pivot_csv.py +35 -18
- hestia_earth/utils/pivot/pivot_json.py +34 -18
- hestia_earth/utils/request.py +17 -6
- hestia_earth/utils/stats.py +89 -68
- hestia_earth/utils/storage/_azure_client.py +17 -6
- hestia_earth/utils/storage/_local_client.py +8 -3
- hestia_earth/utils/storage/_s3_client.py +27 -22
- hestia_earth/utils/storage/_sns_client.py +7 -2
- hestia_earth/utils/term.py +5 -5
- hestia_earth/utils/tools.py +50 -21
- hestia_earth/utils/version.py +1 -1
- {hestia_earth_utils-0.16.9.dist-info → hestia_earth_utils-0.16.10.dist-info}/METADATA +1 -1
- hestia_earth_utils-0.16.10.dist-info/RECORD +33 -0
- hestia_earth_utils-0.16.9.dist-info/RECORD +0 -33
- {hestia_earth_utils-0.16.9.data → hestia_earth_utils-0.16.10.data}/scripts/hestia-format-upload +0 -0
- {hestia_earth_utils-0.16.9.data → hestia_earth_utils-0.16.10.data}/scripts/hestia-pivot-csv +0 -0
- {hestia_earth_utils-0.16.9.dist-info → hestia_earth_utils-0.16.10.dist-info}/WHEEL +0 -0
- {hestia_earth_utils-0.16.9.dist-info → hestia_earth_utils-0.16.10.dist-info}/top_level.txt +0 -0
hestia_earth/utils/pipeline.py
CHANGED
@@ -5,14 +5,19 @@ import numpy as np
 from .tools import current_time_ms, non_empty_list, flatten
 from .api import find_related
 from .storage._s3_client import (
-    _load_from_bucket,
+    _load_from_bucket,
+    _upload_to_bucket,
+    _last_modified,
+    _read_metadata,
+    _update_metadata,
+    _exists_in_bucket,
 )
 from .storage._sns_client import _get_sns_client

-PROGRESS_EXT =
-CALC_FOLDER =
-METADATA_STAGE_KEY =
-METADATA_PROGRESS_KEY =
+PROGRESS_EXT = ".progress"
+CALC_FOLDER = "recalculated"
+METADATA_STAGE_KEY = "stage"
+METADATA_PROGRESS_KEY = "calculating"


 # fix error "Object of type int64 is not JSON serializable"

@@ -27,10 +32,12 @@ class NpEncoder(json.JSONEncoder):
         return super(NpEncoder, self).default(obj)


-def to_string(data: dict, indent: int = None):
+def to_string(data: dict, indent: int = None):
+    return json.dumps(data, indent=indent, ensure_ascii=False, cls=NpEncoder)


-def to_bytes(data: dict):
+def to_bytes(data: dict):
+    return to_string(data).encode("utf8")


 def upload_json(bucket_name: str, file_key: str, body: dict):
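The `NpEncoder`/`to_string` pair exists because numpy scalars raise "Object of type int64 is not JSON serializable" with the standard encoder. A minimal, self-contained sketch of the same pattern; the encoder body below is written out for illustration and is not copied from the package:

import json

import numpy as np


class NpEncoder(json.JSONEncoder):
    # Convert numpy scalar types to plain Python types before serializing.
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


data = {"value": np.int64(42), "mean": np.float64(1.5)}
# Without cls=NpEncoder the int64 value makes json.dumps raise a TypeError.
print(json.dumps(data, ensure_ascii=False, cls=NpEncoder))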
@@ -38,15 +45,21 @@ def upload_json(bucket_name: str, file_key: str, body: dict):
         bucket=bucket_name,
         key=file_key,
         body=to_bytes(body),
-        content_type=
+        content_type="application/json",
     )


 def _to_file_progress(filepath: str):
-    return
+    return (
+        filepath.replace(".csv", PROGRESS_EXT)
+        .replace(".json", PROGRESS_EXT)
+        .replace(".hestia", PROGRESS_EXT)
+    )


-def handle_result(
+def handle_result(
+    bucket_name: str, file_key: str, step: str, start: int, content: dict
+):
     filepath = _to_file_progress(file_key)

     # try to read existing progress to update the time per step
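Each uploaded file gets a sibling progress document: `_to_file_progress` swaps the data extension for `.progress`, and `handle_result` merges the elapsed time of the current step into whatever timings are already stored. A minimal sketch of that merge, with the S3 upload left out and an in-memory dict standing in for the existing progress file (the helper names below are reused from the diff, the rest is illustrative):

PROGRESS_EXT = ".progress"


def to_file_progress(filepath: str) -> str:
    # "recalculated/Cycle/abc.json" -> "recalculated/Cycle/abc.progress"
    return (
        filepath.replace(".csv", PROGRESS_EXT)
        .replace(".json", PROGRESS_EXT)
        .replace(".hestia", PROGRESS_EXT)
    )


def merge_step(existing: dict, step: str, elapsed_ms: int, content: dict) -> dict:
    # Keep the previous steps' timings and add (or overwrite) the current step.
    times = existing.get("time", {}) if isinstance(existing.get("time", {}), dict) else {}
    return {**existing, "step": step, "time": {**times, step: elapsed_ms}, **content}


progress = {"step": "index", "time": {"index": 120}}
progress = merge_step(progress, "recalculate", 950, {"success": True})
print(to_file_progress("recalculated/Cycle/abc.json"))
print(progress)  # {'step': 'recalculate', 'time': {'index': 120, 'recalculate': 950}, 'success': True}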
@@ -55,89 +68,122 @@ def handle_result(bucket_name: str, file_key: str, step: str, start: int, conten
     except Exception:
         data = {}

-    return upload_json(
-
-
-
-        **
-            step:
+    return upload_json(
+        bucket_name,
+        filepath,
+        {
+            **data,
+            "step": step,
+            "time": {
+                **(
+                    data.get("time", {})
+                    if isinstance(data.get("time", {}), dict)
+                    else {}
+                ),
+                step: current_time_ms() - start,
+            },
+            **content,
         },
-
-    })
+    )


 def handle_error(
-    bucket_name: str,
-
-
+    bucket_name: str,
+    file_key: str,
+    step: str,
+    start: int,
+    err: str = "",
+    stack: str = "",
+    errors=[],
+    warnings=[],
+    extras: dict = {},
 ):
-    return handle_result(
-
-
-
-
-
-
+    return handle_result(
+        bucket_name,
+        file_key,
+        step,
+        start,
+        extras
+        | {
+            "success": False,
+            "error": {
+                "message": err,
+                "stack": stack,
+                "errors": errors,
+                "warnings": warnings,
+            },
+            "warning": {"warnings": warnings},
         },
-
-        'warnings': warnings
-        }
-    })
+    )


-def handle_success(
-
+def handle_success(
+    bucket_name: str, file_key: str, step: str, start: int, extras: dict = {}
+):
+    return handle_result(bucket_name, file_key, step, start, extras | {"success": True})


-def publish_result(
+def publish_result(
+    topic_arn: str,
+    bucket_name: str,
+    file_key: str,
+    filepath: str,
+    step: str,
+    success: bool,
+):
     return _get_sns_client().publish(
         TopicArn=topic_arn,
-        Message=to_string(
-
-
-            'filepath': filepath
-        }),
+        Message=to_string(
+            {"bucket": bucket_name, "key": file_key, "filepath": filepath}
+        ),
         MessageAttributes={
-
-
-
+            "functionName": {
+                "DataType": "String",
+                "StringValue": step + ("Done" if success else "Error"),
             }
-        }
+        },
     )


 def _parse_event_s3(event: dict):
-    return {
+    return {
+        "bucket": event["s3"]["bucket"]["name"],
+        "key": event["s3"]["object"]["key"],
+    }


 def _parse_event_SNS(event: dict):
-    event = event.get(
-    data = json.loads(event.get(
-    attributes: dict = event.get(
-    data[
+    event = event.get("Sns", {})
+    data = json.loads(event.get("Message", "{}"))
+    attributes: dict = event.get("MessageAttributes", {})
+    data["attributes"] = {key: value.get("Value") for key, value in attributes.items()}
     return data


 def _parse_event_SQS(event: dict):
-    condition = event.get(
-    return
+    condition = event.get("requestContext", {}).get("condition")
+    return (
+        _get_data_from_event(event.get("requestPayload", {}))
+        if condition != "RetriesExhausted"
+        else None
+    )


 def _get_data_from_event(event):  # noqa: C901
     if isinstance(event, dict):
-        if
+        if "s3" in event:
             return _parse_event_s3(event)
         # invoked when running asynchronously
-        if
+        if "Sns" in event:
             return _parse_event_SNS(event)
         # invoked through http event
-        if
-            return _get_data_from_event(json.loads(event.get(
+        if "body" in event:
+            return _get_data_from_event(json.loads(event.get("body", "{}")))
         # invoked through s3 put object
-        if
-            return flatten(map(_get_data_from_event, event.get(
+        if "Records" in event:
+            return flatten(map(_get_data_from_event, event.get("Records", [])))
         # invoked when calculation timedout or failed
-        if
+        if "requestPayload" in event:
             return _parse_event_SQS(event)
         return event
     if isinstance(event, str):
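`_get_data_from_event` normalizes the different Lambda trigger shapes (direct S3 events, SNS envelopes, HTTP bodies, `Records` batches, SQS retry payloads) into plain `{bucket, key}` dicts. A self-contained sketch of the dispatch on two sample events; the payloads are hypothetical, the flattening of `Records` is simplified to a list comprehension, and the HTTP/SQS branches are omitted:

import json


def parse_s3(event: dict) -> dict:
    return {"bucket": event["s3"]["bucket"]["name"], "key": event["s3"]["object"]["key"]}


def parse_sns(event: dict) -> dict:
    sns = event.get("Sns", {})
    data = json.loads(sns.get("Message", "{}"))
    attributes = sns.get("MessageAttributes", {})
    data["attributes"] = {k: v.get("Value") for k, v in attributes.items()}
    return data


def get_data_from_event(event):
    if isinstance(event, dict):
        if "s3" in event:
            return parse_s3(event)
        if "Sns" in event:
            return parse_sns(event)
        if "Records" in event:
            return [get_data_from_event(record) for record in event.get("Records", [])]
    return event


s3_event = {"Records": [{"s3": {"bucket": {"name": "my-bucket"}, "object": {"key": "Cycle/abc.jsonld"}}}]}
sns_event = {"Sns": {"Message": json.dumps({"bucket": "my-bucket", "key": "Cycle/abc.jsonld"}),
                     "MessageAttributes": {"functionName": {"Value": "recalculateDone"}}}}
print(get_data_from_event(s3_event))
print(get_data_from_event(sns_event))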
@@ -149,34 +195,51 @@ def parse_event(event: dict):
     return non_empty_list(flatten(data) if isinstance(data, list) else [data])


-def _node_type(node: dict):
+def _node_type(node: dict):
+    return node.get("@type", node.get("type"))


-def _node_id(node: dict):
+def _node_id(node: dict):
+    return node.get("@id", node.get("id"))


-def _node_path(node: dict, folder: str =
+def _node_path(node: dict, folder: str = ""):
+    return join(folder, _node_type(node), f"{_node_id(node)}.jsonld")


-def _load_node(bucket: str, file_key: str):
+def _load_node(bucket: str, file_key: str):
+    return json.loads(_load_from_bucket(bucket, file_key))


-def _cache_path(node: dict):
+def _cache_path(node: dict):
+    return join(_node_type(node), f"{_node_id(node)}.cache")


-def _has_cache(bucket: str, node: dict):
+def _has_cache(bucket: str, node: dict):
+    return _exists_in_bucket(bucket, _cache_path(node))


-def is_calculating(bucket: str, node: dict, folder: str =
-    return
+def is_calculating(bucket: str, node: dict, folder: str = ""):
+    return (
+        _read_metadata(bucket, _node_path(node, folder)).get(
+            METADATA_PROGRESS_KEY, "false"
+        )
+        == "true"
+    )


-def set_calculating(bucket: str, node: dict, in_progress: bool, folder: str =
-    return _update_metadata(
+def set_calculating(bucket: str, node: dict, in_progress: bool, folder: str = ""):
+    return _update_metadata(
+        bucket,
+        _node_path(node, folder),
+        {METADATA_PROGRESS_KEY: str(in_progress).lower()},
+    )


 def get_stage(bucket: str, node: dict, folder: str = CALC_FOLDER):
-    stage = _read_metadata(bucket, _node_path(node, folder=CALC_FOLDER)).get(
+    stage = _read_metadata(bucket, _node_path(node, folder=CALC_FOLDER)).get(
+        METADATA_STAGE_KEY
+    )
     return int(stage) if stage else stage
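The in-progress flag and the recalculation stage live in S3 object metadata rather than in the JSON body, so they can be flipped without rewriting the node. The diff does not show how `_read_metadata`/`_update_metadata` are implemented; the snippet below is only one plausible way to do the same round-trip with boto3 directly, with placeholder bucket and key names:

import boto3

METADATA_PROGRESS_KEY = "calculating"

s3 = boto3.client("s3")
bucket, key = "my-hestia-bucket", "Cycle/abc.jsonld"  # placeholders

# Read the flag: user metadata comes back lower-cased from head_object.
metadata = s3.head_object(Bucket=bucket, Key=key).get("Metadata", {})
is_calculating = metadata.get(METADATA_PROGRESS_KEY, "false") == "true"

# Flip the flag without rewriting the body: copy the object onto itself,
# replacing its metadata.
s3.copy_object(
    Bucket=bucket,
    Key=key,
    CopySource={"Bucket": bucket, "Key": key},
    Metadata={**metadata, METADATA_PROGRESS_KEY: "true"},
    MetadataDirective="REPLACE",
)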
@@ -196,29 +259,43 @@ def load_cache(bucket: str, node: dict):
     dict
         The cached data.
     """
-    cache_path = join(node[
+    cache_path = join(node["@type"], f"{node['@id']}.cache")
     try:
         return json.loads(_load_from_bucket(bucket, cache_path))
     except Exception:
-        print(
+        print("No cache found for", cache_path)
         return {}


-def _filter_by_type(nodes: list, type: str):
+def _filter_by_type(nodes: list, type: str):
+    return [n for n in nodes if n.get("@type", n.get("type")) == type]


-def _find_related_nodes(
-
-
-
+def _find_related_nodes(
+    from_type: str, from_id: str, related_type: str, related_key: str
+):
+    should_find_related = related_key == "related"
+    print("Find related nodes from API", from_type, from_id, related_key, related_type)
+    return (
+        find_related(from_type, from_id, related_type, limit=10000)
+        if should_find_related
+        else []
+    )


-def _get_cached_nodes(
+def _get_cached_nodes(
+    cache: dict, related_key: str, from_type: str, from_id: str, to_type: str
+):
     # if key is in cache, use nodes in cache, otherwise use API
     if related_key in cache:
         nodes = _filter_by_type(cache.get(related_key, []), to_type)
-        print(
-        return list(
+        print("Using cached data to", related_key, to_type, nodes)
+        return list(
+            map(
+                lambda node: {"@type": to_type, "@id": node.get("@id", node.get("id"))},
+                nodes,
+            )
+        )
     else:
         return _find_related_nodes(from_type, from_id, to_type, related_key)
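Related nodes come either from the node's cached `related` entries or, as a fallback, from the HESTIA `find_related` API. A minimal local sketch of the cache branch only; the sample cache contents are made up and the API branch is omitted because it needs network access:

def filter_by_type(nodes: list, node_type: str) -> list:
    return [n for n in nodes if n.get("@type", n.get("type")) == node_type]


def cached_related_nodes(cache: dict, related_key: str, to_type: str) -> list:
    # Cache branch only: the real helper falls back to the find_related API
    # when related_key is missing from the cache.
    nodes = filter_by_type(cache.get(related_key, []), to_type)
    return [{"@type": to_type, "@id": n.get("@id", n.get("id"))} for n in nodes]


cache = {
    "related": [
        {"@type": "ImpactAssessment", "@id": "ia-1"},
        {"type": "Cycle", "id": "cycle-1"},
    ]
}
print(cached_related_nodes(cache, "related", "Cycle"))  # [{'@type': 'Cycle', '@id': 'cycle-1'}]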
@@ -244,15 +321,22 @@ def get_related_nodes(node: dict, related_key: str, related_type: str, cache: di
     List[dict]
         The related nodes.
     """
-    from_type = node.get(
-    from_id = node.get(
+    from_type = node.get("@type", node.get("type"))
+    from_id = node.get("@id", node.get("id"))

-    related_nodes =
+    related_nodes = (
+        _get_cached_nodes(cache or {}, related_key, from_type, from_id, related_type)
+        or []
+    )

-    return list(
+    return list(
+        {f"{node['@type']}/{node['@id']}": node for node in related_nodes}.values()
+    )


-def get_related_nodes_data(
+def get_related_nodes_data(
+    bucket_name: str, node: dict, related_key: str, related_type: str, cache: dict
+):
     """
     Given a node, return all related nodes with extra data.

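`get_related_nodes` deduplicates its results by building a dict keyed on `"{@type}/{@id}"` and keeping the values, so each node appears once. The same trick in isolation:

related_nodes = [
    {"@type": "Cycle", "@id": "abc"},
    {"@type": "Cycle", "@id": "def"},
    {"@type": "Cycle", "@id": "abc"},  # duplicate, collapsed below
]

unique = list({f"{node['@type']}/{node['@id']}": node for node in related_nodes}.values())
print(unique)  # [{'@type': 'Cycle', '@id': 'abc'}, {'@type': 'Cycle', '@id': 'def'}]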
@@ -275,14 +359,19 @@ def get_related_nodes_data(bucket_name: str, node: dict, related_key: str, relat
     List[dict]
         The related nodes with extra data: `indexed_at`, `recalculated_at` and `recalculated_stage`.
     """
-    related_nodes = get_related_nodes(
+    related_nodes = get_related_nodes(
+        node=node, related_key=related_key, related_type=related_type, cache=cache
+    )

     return [
-        node
-
-
-
-
+        node
+        | {
+            "indexed_at": _last_modified(bucket=bucket_name, key=_node_path(node)),
+            "recalculated_at": _last_modified(
+                bucket=bucket_name, key=_node_path(node, folder=CALC_FOLDER)
+            ),
+            "recalculated_stage": get_stage(bucket_name, node),
+            "is_calculating": is_calculating(bucket_name, node),
         }
        for node in related_nodes
     ]
hestia_earth/utils/pivot/_shared.py
CHANGED

@@ -5,10 +5,12 @@ from hestia_earth.schema import EmissionMethodTier

 EXCLUDE_FIELDS = ["@type", "type", "@context"]
 EXCLUDE_PRIVATE_FIELDS = [
-    "added",
-    "
+    "added",
+    "addedVersion",
+    "updated",
+    "updatedVersion",
     "aggregatedVersion",
-    "_cache"
+    "_cache",
 ]

@@ -17,11 +19,11 @@ def _with_csv_formatting(dct):
     Use as object_hook when parsing a JSON node: json.loads(node, object_hook=_with_csv_formatting).
     Ensures parsed JSON has field values formatted according to hestia csv conventions.
     """
-    if
-        dct[
+    if "boundary" in dct:
+        dct["boundary"] = json.dumps(dct["boundary"])
     for key, value in dct.items():
         if _is_scalar_list(value):
-            dct[key] =
+            dct[key] = ";".join([str(el) for el in value])
     return dct

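The object_hook runs on every JSON object as it is parsed, so scalar lists are already `;`-joined by the time a node reaches the pivot code. A self-contained sketch of the same hook, simplified: the real one also serializes `boundary` geometries and checks scalar-ness through `_is_scalar_list`:

import json


def with_csv_formatting(dct: dict) -> dict:
    # Join lists of plain scalars into a single ";"-separated cell value.
    for key, value in dct.items():
        if isinstance(value, list) and all(not isinstance(el, (dict, list)) for el in value):
            dct[key] = ";".join(str(el) for el in value)
    return dct


raw = '{"term": {"@id": "sandContent"}, "value": [34, 35.5], "depthUpper": 0}'
print(json.loads(raw, object_hook=with_csv_formatting))
# {'term': {'@id': 'sandContent'}, 'value': '34;35.5', 'depthUpper': 0}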
@@ -37,15 +39,17 @@ def _is_scalar_list(value):


 def _filter_not_relevant(blank_node: dict):
-    return blank_node.get(
+    return blank_node.get("methodTier") != EmissionMethodTier.NOT_RELEVANT.value


 def _filter_emissions_not_relevant(node: dict):
     """
     Ignore all emissions where `methodTier=not relevant` so save space.
     """
-    return node | (
-
-
-
-
+    return node | (
+        {
+            key: list(filter(_filter_not_relevant, node[key]))
+            for key in ["emissions", "emissionsResourceUse"]
+            if key in node
+        }
+    )
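Emissions whose `methodTier` is "not relevant" are stripped before pivoting to keep the output small. A runnable sketch of the same filter, with the enum value inlined as a plain string (the package compares against `EmissionMethodTier.NOT_RELEVANT.value` from hestia_earth.schema):

NOT_RELEVANT = "not relevant"  # stands in for EmissionMethodTier.NOT_RELEVANT.value


def filter_emissions_not_relevant(node: dict) -> dict:
    return node | {
        key: [e for e in node[key] if e.get("methodTier") != NOT_RELEVANT]
        for key in ["emissions", "emissionsResourceUse"]
        if key in node
    }


cycle = {
    "@type": "Cycle",
    "emissions": [
        {"term": {"@id": "ch4ToAirEntericFermentation"}, "methodTier": "tier 1"},
        {"term": {"@id": "so2ToAirSoilFlux"}, "methodTier": "not relevant"},
    ],
}
print(len(filter_emissions_not_relevant(cycle)["emissions"]))  # 1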
hestia_earth/utils/pivot/pivot_csv.py
CHANGED

@@ -9,7 +9,12 @@ from flatten_json import flatten as flatten_json

 # __package__ = "hestia_earth.utils"  # required to run interactively in vscode
 from ..api import find_term_ids_by_names
-from ._shared import
+from ._shared import (
+    EXCLUDE_FIELDS,
+    EXCLUDE_PRIVATE_FIELDS,
+    _with_csv_formatting,
+    _filter_emissions_not_relevant,
+)


 # We only want to pivot array items containing blank nodes
@@ -18,11 +23,13 @@ def _get_blank_node_uniqueness_fields():
     filtered_uniqueness_fields = copy.deepcopy(UNIQUENESS_FIELDS)
     for node_type, array_fields in UNIQUENESS_FIELDS.items():
         for array_field in array_fields.keys():
-            if SORT_CONFIG[node_type][array_field][
+            if SORT_CONFIG[node_type][array_field]["type"] in NODE_TYPES:
                 del filtered_uniqueness_fields[node_type][array_field]
             # include `impactAssessment.@id` since it is not part of original uniqueness
-            if
-                filtered_uniqueness_fields[node_type][array_field].append(
+            if "impactAssessment.id" in array_fields[array_field]:
+                filtered_uniqueness_fields[node_type][array_field].append(
+                    "impactAssessment.@id"
+                )
     return filtered_uniqueness_fields

@@ -226,15 +233,17 @@ def _do_pivot(df_in, name_id_dict):
         deep_pivoted, left_index=True, right_index=True, how="outer"
     )

-    field_cols.dropna(axis=0, how=
+    field_cols.dropna(axis=0, how="all", inplace=True)

-    with_grouped_cols =
-        _get_term_index, group_keys=True
-
-
-
-
-
+    with_grouped_cols = (
+        field_cols.T.groupby(_get_term_index, group_keys=True)
+        .apply(
+            _group_by_term,
+            name_id_dict=name_id_dict,
+            uniqueness_fields=uniqueness_fields,
+        )
+        .T
+    )

     pivoted_terms = with_grouped_cols.apply(
         _pivot_row, axis=1, uniqueness_fields=uniqueness_fields
@@ -242,9 +251,13 @@ def _do_pivot(df_in, name_id_dict):

     # merge any duplicated columns caused by shuffled term positions
     # this operation coincidentally sorts the columns alphabetically
-    pivoted_terms =
-
-
+    pivoted_terms = (
+        pivoted_terms.T.groupby(
+            level=pivoted_terms.columns.nlevels - 1, group_keys=False
+        )
+        .apply(lambda term: term.bfill().iloc[0, :])
+        .T
+    )

     pivoted_terms.columns = map(
         lambda col: f"{nt_label}.{field}.{col}", pivoted_terms.columns
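The transpose/groupby chain above collapses columns that ended up with the same label, keeping the first non-null value per row. A standalone illustration of the effect, using groupby(...).first() (which keeps the first non-null value per group) rather than the bfill/iloc chain in the diff; the column name is made up:

import pandas as pd

# Two columns carry the same label after pivoting; each row has a value in only one.
df = pd.DataFrame(
    [[1.0, None], [None, 2.0]],
    columns=["emissions.ch4ToAirEntericFermentation.value"] * 2,
)

# Transpose so the duplicated labels become index entries, group them together,
# keep the first non-null value per original row, then transpose back.
merged = df.T.groupby(level=0).first().T
print(merged)
#    emissions.ch4ToAirEntericFermentation.value
# 0                                          1.0
# 1                                          2.0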
@@ -265,14 +278,18 @@ def _format_and_pivot(df_in):
     df_out = _do_pivot(df_in.copy(), name_id_dict)

     _sort_inplace(df_out)
-    df_out = df_out.astype(
+    df_out = df_out.astype("object")
     df_out.fillna("-", inplace=True)
     return df_out


 def nodes_to_df(nodes: list[dict]):
     nodes_flattened = [
-        flatten_json(
+        flatten_json(
+            dict([(_get_node_type_label(node.get("@type", node.get("type"))), node)]),
+            ".",
+        )
+        for node in nodes
     ]

     return pd.json_normalize(nodes_flattened)
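`nodes_to_df` wraps each node under its type label and flattens it into dot-separated column names before handing the list to pandas. A small sketch of that step with a made-up, heavily trimmed node:

import pandas as pd
from flatten_json import flatten as flatten_json

# Hypothetical node; real nodes are full HESTIA schema documents.
node = {"@type": "Cycle", "@id": "abc", "site": {"@type": "Site", "@id": "xyz"}}

flat = flatten_json({"cycle": node}, ".")
# {'cycle.@type': 'Cycle', 'cycle.@id': 'abc', 'cycle.site.@type': 'Site', 'cycle.site.@id': 'xyz'}

df = pd.json_normalize([flat])
print(df.columns.tolist())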
@@ -306,7 +323,7 @@ def pivot_hestia_file(hestia_file: str):
         Pandas dataframe with pivoted array terms
     """
     parsed = json.loads(hestia_file, object_hook=_with_csv_formatting)
-    nodes = parsed.get(
+    nodes = parsed.get("nodes", [parsed])
     return pivot_nodes(nodes)

hestia_earth/utils/pivot/pivot_json.py
CHANGED

@@ -6,10 +6,17 @@ from collections import defaultdict
 from copy import deepcopy

 from hestia_earth.utils.pipeline import _node_type
-from ._shared import
+from ._shared import (
+    EXCLUDE_FIELDS,
+    EXCLUDE_PRIVATE_FIELDS,
+    _with_csv_formatting,
+    _filter_emissions_not_relevant,
+)

 pivot_exclude_fields = Term().fields
-pivot_exclude_fields.update(
+pivot_exclude_fields.update(
+    {k: "" for k in EXCLUDE_FIELDS} | {k: "" for k in EXCLUDE_PRIVATE_FIELDS}
+)

 term_exclude_fields = Term().fields
 del term_exclude_fields["name"]
@@ -31,8 +38,10 @@ for node_type, array_fields in UNIQUENESS_FIELDS.items():
         if f not in ("properties.term.@id", "properties.value")
     ]
     # include `impactAssessment.@id` since it is not part of original uniqueness
-    if
-        ADAPTED_UNIQUENESS_FIELDS[node_type][array_field].append(
+    if "impactAssessment.id" in array_fields[array_field]:
+        ADAPTED_UNIQUENESS_FIELDS[node_type][array_field].append(
+            "impactAssessment.@id"
+        )


 def _combine_node_ids(nodes: list):
@@ -42,13 +51,18 @@ def _combine_node_ids(nodes: list):
 def _base_pivoted_value(key: str, value, is_top_level: bool):
     # handle list of Nodes
     return (
-
-
-
-
-
-
-
+        (
+            _combine_node_ids(value)
+            if isinstance(value[0], dict) and value[0].get("@type") in NODE_TYPES
+            else (
+                json.dumps(value, separators=(",", ":"))
+                if any([is_top_level, key in ["distribution"]])
+                else value
+            )
+        )
+        if isinstance(value, list)
+        else value
+    )


 def _do_pivot(node, parent_node_type=None, parent_field=None, level=0):  # noqa: C901
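Top-level lists (and `distribution` values) are serialized into a single compact JSON string cell rather than kept as Python lists; passing separators=(",", ":") drops the spaces json.dumps would otherwise insert:

import json

value = [0.1, 0.25, 0.65]
print(json.dumps(value))                         # [0.1, 0.25, 0.65]
print(json.dumps(value, separators=(",", ":")))  # [0.1,0.25,0.65]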
@@ -57,13 +71,15 @@ def _do_pivot(node, parent_node_type=None, parent_field=None, level=0): # noqa:
     if node_type not in ADAPTED_UNIQUENESS_FIELDS:
         return node
     pivoted_node = {
-        field: _base_pivoted_value(field, value, level==0)
+        field: _base_pivoted_value(field, value, level == 0)
         for field, value in node.items()
-        if all(
-
-
-
-
+        if all(
+            [
+                field not in ADAPTED_UNIQUENESS_FIELDS[node_type],
+                node_type != "Term" or field not in term_exclude_fields,
+                field not in EXCLUDE_PRIVATE_FIELDS,
+            ]
+        )
     }

     fields_to_pivot = [
@@ -227,7 +243,7 @@ def pivot_hestia_file(hestia_file: str):
     Pivot json array of schema-compliant nodes on 'nodes' key of unparsed json string
     """
     parsed = json.loads(hestia_file, object_hook=_with_csv_formatting)
-    return pivot_nodes(parsed.get(
+    return pivot_nodes(parsed.get("nodes", []))


 def pivot_nodes(nodes: list[dict]):