hestia-earth-utils 0.16.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. hestia_earth_utils-0.16.10/MANIFEST.in +1 -0
  2. hestia_earth_utils-0.16.10/PKG-INFO +76 -0
  3. hestia_earth_utils-0.16.10/README.md +48 -0
  4. hestia_earth_utils-0.16.10/bin/hestia-format-upload +24 -0
  5. hestia_earth_utils-0.16.10/bin/hestia-pivot-csv +32 -0
  6. hestia_earth_utils-0.16.10/hestia_earth/utils/__init__.py +0 -0
  7. hestia_earth_utils-0.16.10/hestia_earth/utils/api.py +323 -0
  8. hestia_earth_utils-0.16.10/hestia_earth/utils/blank_node.py +249 -0
  9. hestia_earth_utils-0.16.10/hestia_earth/utils/calculation_status.py +75 -0
  10. hestia_earth_utils-0.16.10/hestia_earth/utils/cycle.py +36 -0
  11. hestia_earth_utils-0.16.10/hestia_earth/utils/date.py +86 -0
  12. hestia_earth_utils-0.16.10/hestia_earth/utils/descriptive_stats.py +53 -0
  13. hestia_earth_utils-0.16.10/hestia_earth/utils/emission.py +74 -0
  14. hestia_earth_utils-0.16.10/hestia_earth/utils/lookup.py +299 -0
  15. hestia_earth_utils-0.16.10/hestia_earth/utils/lookup_utils.py +206 -0
  16. hestia_earth_utils-0.16.10/hestia_earth/utils/model.py +172 -0
  17. hestia_earth_utils-0.16.10/hestia_earth/utils/pipeline.py +377 -0
  18. hestia_earth_utils-0.16.10/hestia_earth/utils/pivot/__init__.py +0 -0
  19. hestia_earth_utils-0.16.10/hestia_earth/utils/pivot/_shared.py +55 -0
  20. hestia_earth_utils-0.16.10/hestia_earth/utils/pivot/pivot_csv.py +348 -0
  21. hestia_earth_utils-0.16.10/hestia_earth/utils/pivot/pivot_json.py +253 -0
  22. hestia_earth_utils-0.16.10/hestia_earth/utils/request.py +31 -0
  23. hestia_earth_utils-0.16.10/hestia_earth/utils/stats.py +1037 -0
  24. hestia_earth_utils-0.16.10/hestia_earth/utils/storage/__init__.py +23 -0
  25. hestia_earth_utils-0.16.10/hestia_earth/utils/storage/_azure_client.py +47 -0
  26. hestia_earth_utils-0.16.10/hestia_earth/utils/storage/_local_client.py +22 -0
  27. hestia_earth_utils-0.16.10/hestia_earth/utils/storage/_s3_client.py +120 -0
  28. hestia_earth_utils-0.16.10/hestia_earth/utils/storage/_sns_client.py +17 -0
  29. hestia_earth_utils-0.16.10/hestia_earth/utils/table.py +78 -0
  30. hestia_earth_utils-0.16.10/hestia_earth/utils/term.py +31 -0
  31. hestia_earth_utils-0.16.10/hestia_earth/utils/tools.py +264 -0
  32. hestia_earth_utils-0.16.10/hestia_earth/utils/version.py +1 -0
  33. hestia_earth_utils-0.16.10/hestia_earth_utils.egg-info/PKG-INFO +76 -0
  34. hestia_earth_utils-0.16.10/hestia_earth_utils.egg-info/SOURCES.txt +53 -0
  35. hestia_earth_utils-0.16.10/hestia_earth_utils.egg-info/dependency_links.txt +1 -0
  36. hestia_earth_utils-0.16.10/hestia_earth_utils.egg-info/requires.txt +6 -0
  37. hestia_earth_utils-0.16.10/hestia_earth_utils.egg-info/top_level.txt +1 -0
  38. hestia_earth_utils-0.16.10/setup.cfg +4 -0
  39. hestia_earth_utils-0.16.10/setup.py +33 -0
  40. hestia_earth_utils-0.16.10/tests/test_api.py +171 -0
  41. hestia_earth_utils-0.16.10/tests/test_blank_node.py +59 -0
  42. hestia_earth_utils-0.16.10/tests/test_calculation_status.py +40 -0
  43. hestia_earth_utils-0.16.10/tests/test_cycle.py +18 -0
  44. hestia_earth_utils-0.16.10/tests/test_date.py +17 -0
  45. hestia_earth_utils-0.16.10/tests/test_descriptive_stats.py +49 -0
  46. hestia_earth_utils-0.16.10/tests/test_emission.py +51 -0
  47. hestia_earth_utils-0.16.10/tests/test_lookup.py +142 -0
  48. hestia_earth_utils-0.16.10/tests/test_lookup_utils.py +104 -0
  49. hestia_earth_utils-0.16.10/tests/test_model.py +57 -0
  50. hestia_earth_utils-0.16.10/tests/test_pipeline.py +250 -0
  51. hestia_earth_utils-0.16.10/tests/test_request.py +9 -0
  52. hestia_earth_utils-0.16.10/tests/test_stats.py +218 -0
  53. hestia_earth_utils-0.16.10/tests/test_table.py +11 -0
  54. hestia_earth_utils-0.16.10/tests/test_term.py +19 -0
  55. hestia_earth_utils-0.16.10/tests/test_tools.py +140 -0
@@ -0,0 +1 @@
1
+ include README.md
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: hestia_earth_utils
3
+ Version: 0.16.10
4
+ Summary: HESTIA's utils library
5
+ Home-page: https://gitlab.com/hestia-earth/hestia-utils
6
+ Author: HESTIA Team
7
+ Author-email: guillaumeroyer.mail@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: hestia-earth-schema>=35.0.1
13
+ Requires-Dist: requests>=2.24.0
14
+ Requires-Dist: urllib3~=1.26.0
15
+ Requires-Dist: python-dateutil>=2.8.1
16
+ Requires-Dist: pandas>=2
17
+ Requires-Dist: flatten_json
18
+ Dynamic: author
19
+ Dynamic: author-email
20
+ Dynamic: classifier
21
+ Dynamic: description
22
+ Dynamic: description-content-type
23
+ Dynamic: home-page
24
+ Dynamic: license
25
+ Dynamic: requires-dist
26
+ Dynamic: requires-python
27
+ Dynamic: summary
28
+
29
+ # HESTIA Utils
30
+
31
+ ## Install
32
+
33
+ 1. Install the module:
34
+ ```bash
35
+ pip install hestia_earth.utils
36
+ ```
37
+ 2. Add this to your environment variables:
38
+ ```
39
+ API_URL=https://api.hestia.earth
40
+ WEB_URL=https://www.hestia.earth
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ 1. To download a file from the HESTIA API:
46
+ ```python
47
+ from hestia_earth.schema import SchemaType
48
+ from hestia_earth.utils.api import download_hestia
49
+
50
+ cycle = download_hestia('cycleId', SchemaType.CYCLE)
51
+ sandContent = download_hestia('sandContent', SchemaType.TERM)
52
+ ```
53
+
54
+ 2. To search for a specific Node on HESTIA:
55
+ ```python
56
+ from hestia_earth.schema import SchemaType
57
+ from hestia_earth.utils.api import find_node_exact
58
+
59
+ source = find_node_exact(SchemaType.SOURCE, {'bibliography.title': 'My Bibliography'})
60
+ ```
61
+
62
+ 3. To get a lookup table from the local file system:
63
+ ```python
64
+ from hestia_earth.schema import SchemaType
65
+ from hestia_earth.utils.lookup import load_lookup
66
+
67
+ df = load_lookup('path/to/my/lookup.csv')
68
+ ```
69
+
70
+ 4. To get a lookup table from HESTIA:
71
+ ```python
72
+ from hestia_earth.schema import SchemaType
73
+ from hestia_earth.utils.lookup import download_lookup
74
+
75
+ df = download_lookup('crop.csv')
76
+ ```
@@ -0,0 +1,48 @@
1
+ # HESTIA Utils
2
+
3
+ ## Install
4
+
5
+ 1. Install the module:
6
+ ```bash
7
+ pip install hestia_earth.utils
8
+ ```
9
+ 2. Add this to your environment variables:
10
+ ```
11
+ API_URL=https://api.hestia.earth
12
+ WEB_URL=https://www.hestia.earth
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ 1. To download a file from the HESTIA API:
18
+ ```python
19
+ from hestia_earth.schema import SchemaType
20
+ from hestia_earth.utils.api import download_hestia
21
+
22
+ cycle = download_hestia('cycleId', SchemaType.CYCLE)
23
+ sandContent = download_hestia('sandContent', SchemaType.TERM)
24
+ ```
25
+
26
+ 2. To search for a specific Node on HESTIA:
27
+ ```python
28
+ from hestia_earth.schema import SchemaType
29
+ from hestia_earth.utils.api import find_node_exact
30
+
31
+ source = find_node_exact(SchemaType.SOURCE, {'bibliography.title': 'My Bibliography'})
32
+ ```
33
+
34
+ 3. To get a lookup table from the local file system:
35
+ ```python
36
+ from hestia_earth.schema import SchemaType
37
+ from hestia_earth.utils.lookup import load_lookup
38
+
39
+ df = load_lookup('path/to/my/lookup.csv')
40
+ ```
41
+
42
+ 4. To get a lookup table from HESTIA:
43
+ ```python
44
+ from hestia_earth.schema import SchemaType
45
+ from hestia_earth.utils.lookup import download_lookup
46
+
47
+ df = download_lookup('crop.csv')
48
+ ```
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import argparse
5
+ from hestia_earth.utils.table import format_for_upload
6
+
7
+
8
parser = argparse.ArgumentParser(description='Format data for upload.')
parser.add_argument('--input-file', type=str, required=True,
                    help='The path of the CSV or JSON file.')
parser.add_argument('--output-file', type=str,
                    help='The path of where to store the formatted CSV. Adds "-formatted" suffix by default to input.')
args = parser.parse_args()


def main():
    """
    Format the input file for upload and write the result as a CSV file.

    The destination is `--output-file` when given, otherwise the input path
    with its extension replaced by `-formatted.csv`.
    """
    import os

    src = args.input_file
    # `splitext` strips only the final extension, so dots elsewhere in the path
    # (e.g. `./data/file.csv` or `my.data/file.csv`) no longer truncate the name.
    dest = args.output_file or os.path.splitext(src)[0] + '-formatted.csv'
    df = format_for_upload(src)
    df.to_csv(dest, index=None)


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import json
4
+ import sys
5
+ import argparse
6
+ from hestia_earth.utils.pivot.pivot_json import pivot_hestia_file as pivot_hestia_file_json
7
+ from hestia_earth.utils.pivot.pivot_csv import pivot_csv, pivot_hestia_file as pivot_hestia_file_csv
8
+
9
+
10
parser = argparse.ArgumentParser(description='Pivot nodes in CSV or JSON format.')
parser.add_argument('--input-file', type=str, required=True,
                    help='The path of the CSV or JSON file.')
parser.add_argument('--output-file', type=str,
                    help='The path of where to store the pivoted file. Adds "-pivoted.csv" suffix by default to input.')

args = parser.parse_args()


def _read_text(path):
    """Return the whole text content of `path`, closing the file afterwards."""
    with open(path, 'r') as file:
        return file.read()


def main():
    """
    Pivot the input file and write the result to the destination.

    The destination is `--output-file` when given, otherwise the input path
    with its extension replaced by `-pivoted.csv`. A destination ending in
    `.csv` is written as CSV; any other destination is written as JSON under
    a top-level `nodes` key.
    """
    import os

    src = args.input_file
    # `splitext` strips only the final extension, so dots elsewhere in the path
    # (e.g. `./data/file.csv`) no longer truncate the default destination.
    dest = args.output_file or os.path.splitext(src)[0] + '-pivoted.csv'
    if dest.endswith('.csv'):
        df = pivot_csv(src) if src.endswith('.csv') else pivot_hestia_file_csv(_read_text(src))
        df.to_csv(dest, index=None)
    else:
        pivoted = pivot_hestia_file_json(_read_text(src))
        with open(dest, 'w') as file:
            file.write(json.dumps({'nodes': pivoted}, sort_keys=True, indent=4))


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,323 @@
1
+ import requests
2
+ import os
3
+ import json
4
+ from functools import cache
5
+ from hestia_earth.schema import SchemaType, NESTED_SEARCHABLE_KEYS
6
+
7
+ from .storage import _load_from_storage, _exists
8
+ from .request import request_url, api_url, api_access_token
9
+
10
+
11
def _match_key_value(key: str, value):
    """
    Build the search clause matching `key` against `value`.

    The clause is a plain `match` query. When the first dotted segment of
    `key` is listed in `NESTED_SEARCHABLE_KEYS`, the clause is wrapped in a
    `nested` query whose `path` is that first segment.
    """
    root_key, _, _ = key.partition(".")
    match_clause = {"match": {key: value}}
    if root_key not in NESTED_SEARCHABLE_KEYS:
        return match_clause
    return {"nested": {"path": root_key, "query": match_clause}}
19
+
20
+
21
def _retry_request_error(func, retry_max: int = 5):
    """
    Call `func` until it returns, retrying on JSON decoding errors.

    Parameters
    ----------
    func
        Zero-argument callable to execute.
    retry_max
        Maximum number of attempts. Defaults to `5`.

    Returns
    -------
    The value returned by the first attempt that does not raise
    `json.decoder.JSONDecodeError`.

    Raises
    ------
    json.decoder.JSONDecodeError
        The error of the last attempt, when every attempt failed.
    """
    last_error = None
    attempts_left = retry_max

    while attempts_left > 0:
        attempts_left -= 1
        try:
            return func()
        except json.decoder.JSONDecodeError as decode_error:
            # remember the failure and try again
            last_error = decode_error

    raise last_error
32
+
33
+
34
def _safe_get_request(url: str, res_error=None):
    """
    Send a GET request to `url` and return the decoded JSON response.

    The `X-Access-Token` header is added when `api_access_token()` returns a
    truthy value. The call is wrapped in `_retry_request_error`, which retries
    on `json.decoder.JSONDecodeError`.

    Parameters
    ----------
    url
        The URL to request.
    res_error
        Value returned when `requests` raises a `RequestException`.
        Defaults to `None`.
    """

    def send():
        request_headers = {"Content-Type": "application/json"}
        try:
            token = api_access_token()
            if token:
                request_headers["X-Access-Token"] = token
            response = requests.get(url, headers=request_headers)
            return response.json()
        except requests.exceptions.RequestException:
            # NOTE(review): recent `requests` versions raise a JSON decode error
            # that is itself a `RequestException`, so it would be handled here
            # instead of being retried - confirm this is intended.
            return res_error

    return _retry_request_error(send)
46
+
47
+
48
# Marks "no `res_error` given", so each failure can return its own empty dict
# instead of one dict object shared by every call.
_UNSET_RES_ERROR = object()


def _safe_post_request(url: str, body: dict, res_error=_UNSET_RES_ERROR):
    """
    Send `body` as JSON in a POST request to `url` and return the decoded JSON response.

    The `X-Access-Token` header is added when `api_access_token()` returns a
    truthy value. The call is wrapped in `_retry_request_error`, which retries
    on `json.decoder.JSONDecodeError`.

    Parameters
    ----------
    url
        The URL to request.
    body
        The payload, serialised with `json.dumps`.
    res_error
        Value returned when `requests` raises a `RequestException`.
        Defaults to a new empty `dict`.
    """

    def send():
        try:
            headers = {"Content-Type": "application/json"}
            access_token = api_access_token()
            if access_token:
                headers["X-Access-Token"] = access_token
            return requests.post(url, json.dumps(body), headers=headers).json()
        except requests.exceptions.RequestException:
            # A fresh dict per failure: callers mutating the result can no
            # longer affect what later calls receive.
            return {} if res_error is _UNSET_RES_ERROR else res_error

    return _retry_request_error(send)
60
+
61
+
62
def _parse_node_type(node_type: SchemaType):
    """Return `node_type` unchanged when it is a string, otherwise its `.value`."""
    if isinstance(node_type, str):
        return node_type
    return node_type.value
64
+
65
+
66
def node_type_to_url(node_type: SchemaType):
    """
    Return the URL path segment for `node_type`.

    The type name is suffixed with `s` and lower-cased, e.g. `"Cycle"` gives
    `"cycles"`.
    """
    type_name = _parse_node_type(node_type)
    return f"{type_name}s".lower()
68
+
69
+
70
def node_to_path(node_type: SchemaType, node_id: str, data_state=None):
    """
    Return the relative path of the `.jsonld` file of a Node.

    The path is `<type>/<node_id>.jsonld`. It is prefixed with `data_state`
    unless `data_state` is `None`, empty or `"original"`.
    """
    type_dir = _parse_node_type(node_type)
    relative_path = os.path.join(type_dir, f"{node_id}.jsonld")
    if data_state is None or data_state == "original" or len(data_state) == 0:
        # the original version is stored without a data state prefix
        return relative_path
    return os.path.join(data_state, relative_path)
77
+
78
+
79
def find_related(
    node_type: SchemaType,
    id: str,
    related_type: SchemaType,
    limit=100,
    offset=0,
    relationship=None,
):
    """
    Return the list of Nodes related to a Node through a "relationship".
    This method can be used to navigate the HESTIA Graph Database.

    Parameters
    ----------
    node_type
        The `@type` of the Node to start from. Example: use `SchemaType.Cycle` to find nodes related to a `Cycle`.
    id
        The `@id` of the Node to start from.
    related_type
        The type of Node the relation should lead to. Example: use `SchemaType.Source` to find the `Source` related
        to a `Cycle`.
    limit
        The maximum number of relationships to return. Asking for a large number might result in timeouts.
    offset
        Use with limit to paginate through the results.
    relationship
        The relationship used to connect both Nodes. See the API for more information.

    Returns
    -------
    The `results` of the response (`[]` when the key is missing). When the
    response is not a `dict` (e.g. `None` after a request error), it is
    returned as-is.
    """
    base_url = api_url()
    source_segment = node_type_to_url(node_type)
    target_segment = node_type_to_url(related_type)
    url = request_url(
        f"{base_url}/{source_segment}/{id}/{target_segment}",
        limit=limit,
        offset=offset,
        relationship=relationship,
    )
    payload = _safe_get_request(url)
    if not isinstance(payload, dict):
        # request error or unexpected payload: hand it back untouched
        return payload
    return payload.get("results", [])
116
+
117
+
118
def _exec_download_hestia(
    node_id: str, node_type=SchemaType.TERM, data_state="", mode=""
) -> dict:
    """
    Load a Node from storage, falling back to the API.

    The Node is first read through `_load_from_storage` at the path given by
    `node_to_path`; `None` is returned when nothing was loaded. If an
    `ImportError` is raised, the Node is requested from the API instead, with
    `data_state` and `mode` sent as the `dataState` and `mode` parameters.
    """
    try:
        storage_path = node_to_path(node_type, node_id, data_state)
        raw = _load_from_storage(storage_path)
        return json.loads(raw) if raw else None
    except ImportError:
        # presumably raised when no storage client can be imported - TODO confirm
        endpoint = request_url(
            f"{api_url()}/{node_type_to_url(node_type)}/{node_id}",
            dataState=data_state,
            mode=mode,
        )
        return _safe_get_request(endpoint)


# memoised variant: identical arguments return the previously downloaded result
_exec_download_hestia_cached = cache(_exec_download_hestia)
138
+
139
+
140
def download_hestia(
    node_id: str, node_type=SchemaType.TERM, data_state="", mode=""
) -> dict:
    """
    Download a Node from the HESTIA Database.

    Parameters
    ----------
    node_id
        The `@id` of the Node.
    node_type
        The `@type` of the Node.
    data_state
        Optional - the `dataState` of the Node.
        By default, the `original` version is returned.
        Use `recalculated` to download the recalculated version instead (if available).
    mode
        Optional - use `csv` to download as a CSV file, `zip` to download as a ZIP file. Defaults to `JSON`.

    Returns
    -------
    JSON
        The `JSON` content of the Node.
    """
    if _parse_node_type(node_type) == "Term":
        # requests to `Term` are cached, as the values are not likely to
        # change during a single execution
        return _exec_download_hestia_cached(node_id, node_type, data_state, mode)
    return _exec_download_hestia(node_id, node_type, data_state, mode)
171
+
172
+
173
def node_exists(node_id: str, node_type=SchemaType.TERM) -> bool:
    """
    Check whether a Node exists on the HESTIA Database.

    The storage is checked first through `_exists`. If an `ImportError` is
    raised, the Node is requested from the API and considered existing when
    the response contains an `@id`.

    Parameters
    ----------
    node_id
        The `@id` of the Node.
    node_type
        The `@type` of the Node.

    Returns
    -------
    bool
        True if the node exists, False otherwise.
    """
    try:
        return _exists(node_to_path(node_type, node_id))
    except ImportError:
        endpoint = request_url(f"{api_url()}/{node_type_to_url(node_type)}/{node_id}")
        payload = _safe_get_request(endpoint)
        return payload is not None and "@id" in payload
199
+
200
+
201
def search(
    query: dict, fields=["@type", "@id", "name"], limit=10, offset=0, sort=None
) -> list:
    """
    Execute a raw search on the HESTIA Platform.

    Parameters
    ----------
    query
        The search engine is using ElasticSearch engine version 7:
        https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html.
        All options can be used here.
    fields
        The list of fields to return. Example: ['@type', '@id']. Defaults to `['@type', '@id', 'name']`.
    limit
        Optional - limit the number of results to return. Defaults to `10`.
    offset
        Optional - use with limit to paginate the results. Defaults to `0`.
    sort : dict
        Sorting options. Please refer to the ElasticSearch version 7 documentation for use.

    Returns
    -------
    List[JSON]
        List of Nodes (as JSON) found.
    """
    payload = {
        "query": query,
        "limit": limit,
        "offset": offset,
        "fields": fields,
    }
    if sort is not None:
        # only send the sorting options when they were provided
        payload["sort"] = sort
    response = _safe_post_request(f"{api_url()}/search", payload)
    return response.get("results", [])
237
+
238
+
239
def find_node(node_type: SchemaType, args: dict, limit=10) -> list:
    """
    Find nodes on the HESTIA Platform.

    Parameters
    ----------
    node_type
        The `@type` of the Node, as a `SchemaType` or as a string.
    args
        Dictionary of key/value to exec search on. Example: use `{'bibliography.title': 'My biblio'}` on a
        `SchemaType.Source` to find all `Source`s having a `bibliography` with `title` == `My biblio`
    limit
        Optional - limit the number of results to return. Defaults to `10`.

    Returns
    -------
    List[JSON]
        List of Nodes (as JSON) found.
    """
    # `_parse_node_type` accepts both enum members and plain strings, like the
    # other functions of this module.
    must = [{"match": {"@type": _parse_node_type(node_type)}}]
    must.extend(_match_key_value(key, value) for key, value in args.items())
    return search(query={"bool": {"must": must}}, limit=limit)
264
+
265
+
266
def find_node_exact(node_type: SchemaType, args: dict) -> dict:
    """
    Find a single Node on the HESTIA Platform.

    Parameters
    ----------
    node_type
        The `@type` of the Node, as a `SchemaType` or as a string.
    args
        Dictionary of key/value to exec search on. Each key is matched on its `.keyword` field.
        Example: use `{'bibliography.title': 'My biblio'}` on a `SchemaType.Source` to find the `Source`
        having a `bibliography` with `title` == `My biblio`

    Returns
    -------
    JSON
        JSON of the node if exactly one node is found, else `None`.
    """
    # `_parse_node_type` accepts both enum members and plain strings, like the
    # other functions of this module.
    must = [{"match": {"@type": _parse_node_type(node_type)}}]
    must.extend(
        _match_key_value(f"{key}.keyword", value) for key, value in args.items()
    )
    # ask for 2 results to detect duplicates: an ambiguous match returns `None`
    results = search(query={"bool": {"must": must}}, limit=2)
    return results[0] if len(results) == 1 else None
291
+
292
+
293
# should support up to 65,000 terms, but limit to 1000 just in case
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html#terms-top-level-params
def find_term_ids_by_names(names, batch_size=1000):
    """
    Find the `@id` of the `Term`s having the given names.

    Parameters
    ----------
    names
        The names of the `Term`s to search for. Duplicates are searched only once.
    batch_size
        Optional - the maximum number of names sent per search. Defaults to `1000`.

    Returns
    -------
    dict
        Mapping of `name` to `@id`. If several results share a name, the last one returned wins.

    Raises
    ------
    Exception
        If no `@id` was found for at least one name. Names are listed in input order.
    """
    # de-duplicate while keeping the input order, so batches and the error
    # message are the same on every run
    unique_names = list(dict.fromkeys(names))
    result = {}
    for start in range(0, len(unique_names), batch_size):
        query = {
            "constant_score": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "name.keyword": unique_names[start:start + batch_size],
                                }
                            },
                            {"term": {"@type.keyword": "Term"}},
                        ]
                    }
                }
            }
        }
        for term in search(query=query, limit=batch_size, fields=["@id", "name"]):
            result[term.get("name")] = term.get("@id")
    missing_names = [name for name in unique_names if name not in result]
    if missing_names:
        raise Exception(f"Failed to find ids for names: {'; '.join(missing_names)}")
    return result