hestia-earth-utils 0.16.6__py3-none-any.whl → 0.16.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hestia_earth/utils/blank_node.py +2 -2
- hestia_earth/utils/lookup.py +39 -73
- hestia_earth/utils/lookup_utils.py +4 -4
- hestia_earth/utils/version.py +1 -1
- {hestia_earth_utils-0.16.6.dist-info → hestia_earth_utils-0.16.7.dist-info}/METADATA +2 -7
- {hestia_earth_utils-0.16.6.dist-info → hestia_earth_utils-0.16.7.dist-info}/RECORD +10 -11
- hestia_earth/utils/csv_utils.py +0 -84
- {hestia_earth_utils-0.16.6.data → hestia_earth_utils-0.16.7.data}/scripts/hestia-format-upload +0 -0
- {hestia_earth_utils-0.16.6.data → hestia_earth_utils-0.16.7.data}/scripts/hestia-pivot-csv +0 -0
- {hestia_earth_utils-0.16.6.dist-info → hestia_earth_utils-0.16.7.dist-info}/WHEEL +0 -0
- {hestia_earth_utils-0.16.6.dist-info → hestia_earth_utils-0.16.7.dist-info}/top_level.txt +0 -0
hestia_earth/utils/blank_node.py
CHANGED
|
@@ -5,7 +5,7 @@ from functools import reduce
|
|
|
5
5
|
from statistics import mode, mean
|
|
6
6
|
from hestia_earth.schema import TermTermType
|
|
7
7
|
|
|
8
|
-
from .lookup import download_lookup, get_table_value
|
|
8
|
+
from .lookup import download_lookup, get_table_value
|
|
9
9
|
from .tools import non_empty_list, non_empty_value, flatten
|
|
10
10
|
from .emission import cycle_emissions_in_system_boundary
|
|
11
11
|
from .model import filter_list_term_type
|
|
@@ -15,7 +15,7 @@ def get_lookup_value(blank_node: dict, column: str):
|
|
|
15
15
|
term = blank_node.get('term', {})
|
|
16
16
|
table_name = f"{term.get('termType')}.csv" if term else None
|
|
17
17
|
value = get_table_value(
|
|
18
|
-
download_lookup(table_name), '
|
|
18
|
+
download_lookup(table_name), 'term.id', term.get('@id'), column
|
|
19
19
|
) if table_name else None
|
|
20
20
|
return value
|
|
21
21
|
|
hestia_earth/utils/lookup.py
CHANGED
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
from functools import reduce
|
|
2
|
-
from typing import Union
|
|
2
|
+
from typing import Any
|
|
3
3
|
import requests
|
|
4
|
-
import numpy
|
|
4
|
+
from io import StringIO
|
|
5
|
+
import pandas as pd
|
|
5
6
|
|
|
6
7
|
from .storage import _load_from_storage
|
|
7
8
|
from .request import request_url, web_url
|
|
8
|
-
from .csv_utils import csv_str_to_recarray, csv_file_to_recarray, is_missing_value, _replace_chars
|
|
9
9
|
|
|
10
10
|
_GLOSSARY_FOLDER = 'glossary/lookups'
|
|
11
11
|
_memory = {}
|
|
12
|
-
_INDEX_COL = 'termid'
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
def _memory_wrapper(key: str, func):
|
|
@@ -18,6 +17,18 @@ def _memory_wrapper(key: str, func):
|
|
|
18
17
|
return _memory[key]
|
|
19
18
|
|
|
20
19
|
|
|
20
|
+
def _read_csv(value: str) -> pd.DataFrame:
|
|
21
|
+
return pd.read_csv(value, na_values=['-', ''])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _read_csv_from_string(data: str) -> pd.DataFrame:
|
|
25
|
+
return _read_csv(StringIO(data))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def is_missing_value(value):
|
|
29
|
+
return pd.isna(value) or value is None or value == '' or value == '-'
|
|
30
|
+
|
|
31
|
+
|
|
21
32
|
def load_lookup(filepath: str, keep_in_memory: bool = False):
|
|
22
33
|
"""
|
|
23
34
|
Import local lookup table as csv file into a `numpy.recarray`.
|
|
@@ -34,7 +45,7 @@ def load_lookup(filepath: str, keep_in_memory: bool = False):
|
|
|
34
45
|
numpy.recarray
|
|
35
46
|
The `numpy.recarray` converted from the csv content.
|
|
36
47
|
"""
|
|
37
|
-
def load(): return
|
|
48
|
+
def load(): return _read_csv(filepath)
|
|
38
49
|
return _memory_wrapper(filepath, load) if keep_in_memory else load()
|
|
39
50
|
|
|
40
51
|
|
|
@@ -53,20 +64,7 @@ def _download_lookup_data(filename: str):
|
|
|
53
64
|
return fallback()
|
|
54
65
|
|
|
55
66
|
|
|
56
|
-
def
|
|
57
|
-
columns = list(array.dtype.names)
|
|
58
|
-
try:
|
|
59
|
-
return {
|
|
60
|
-
row[_INDEX_COL]: {col: row[col] for col in columns}
|
|
61
|
-
for row in array
|
|
62
|
-
} if _INDEX_COL in columns else array
|
|
63
|
-
except TypeError:
|
|
64
|
-
return {
|
|
65
|
-
array[_INDEX_COL].item(): {col: array[col].item() for col in columns}
|
|
66
|
-
} if _INDEX_COL in columns else array
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def download_lookup(filename: str, keep_in_memory: bool = True, build_index: bool = False):
|
|
67
|
+
def download_lookup(filename: str, keep_in_memory: bool = True):
|
|
70
68
|
"""
|
|
71
69
|
Download lookup table from HESTIA as csv into a `numpy.recarray`.
|
|
72
70
|
|
|
@@ -86,8 +84,7 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
|
|
|
86
84
|
"""
|
|
87
85
|
def load():
|
|
88
86
|
data = _download_lookup_data(filename)
|
|
89
|
-
|
|
90
|
-
return (_build_index(rec) if build_index else rec) if data else None
|
|
87
|
+
return _read_csv_from_string(data) if data else None
|
|
91
88
|
|
|
92
89
|
try:
|
|
93
90
|
return _memory_wrapper(filename, load) if keep_in_memory else load()
|
|
@@ -97,19 +94,9 @@ def download_lookup(filename: str, keep_in_memory: bool = True, build_index: boo
|
|
|
97
94
|
|
|
98
95
|
def column_name(key: str):
|
|
99
96
|
"""
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
Parameters
|
|
103
|
-
----------
|
|
104
|
-
key : str
|
|
105
|
-
The column name.
|
|
106
|
-
|
|
107
|
-
Returns
|
|
108
|
-
-------
|
|
109
|
-
str
|
|
110
|
-
The column name that can be used in `get_table_value`.
|
|
97
|
+
Deprecated. Columns are no longer renamed.
|
|
111
98
|
"""
|
|
112
|
-
return
|
|
99
|
+
return key
|
|
113
100
|
|
|
114
101
|
|
|
115
102
|
def _parse_value(value: str):
|
|
@@ -124,36 +111,24 @@ def _parse_value(value: str):
|
|
|
124
111
|
return value
|
|
125
112
|
|
|
126
113
|
|
|
127
|
-
def _get_single_table_value(
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
data[data[col_match] == col_match_with][col_val][0]
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def _get_multiple_table_values(data: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val):
|
|
135
|
-
def reducer(x, values):
|
|
136
|
-
col = values[1]
|
|
137
|
-
value = col_match_with[values[0]]
|
|
138
|
-
return x.get(value) if isinstance(x, dict) else x[x[col] == value]
|
|
114
|
+
def _get_single_table_value(df: pd.DataFrame, col_match: str, col_match_with, col_val):
|
|
115
|
+
filtered_df = df[df[col_match] == col_match_with]
|
|
116
|
+
return None if filtered_df.empty else filtered_df[col_val].iloc[0]
|
|
139
117
|
|
|
140
|
-
return reduce(reducer, enumerate(col_match), data)[col_val][0]
|
|
141
118
|
|
|
142
|
-
|
|
143
|
-
def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_match_with, col_val, default_value=''):
|
|
119
|
+
def get_table_value(lookup: pd.DataFrame, col_match: str, col_match_with: str, col_val: Any, default_value=''):
|
|
144
120
|
"""
|
|
145
121
|
Get a value matched by one or more columns from a `numpy.recarray`.
|
|
146
122
|
|
|
147
123
|
Parameters
|
|
148
124
|
----------
|
|
149
|
-
lookup :
|
|
125
|
+
lookup : DataFrame
|
|
150
126
|
The value returned by the `download_lookup` function.
|
|
151
127
|
col_match : str
|
|
152
128
|
Which `column` should be used to find data in. This will restrict the rows to search for.
|
|
153
129
|
Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
|
|
154
|
-
col_match_with
|
|
130
|
+
col_match_with: str
|
|
155
131
|
Which column `value` should be used to find data in. This will restrict the rows to search for.
|
|
156
|
-
Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
|
|
157
132
|
col_val: str
|
|
158
133
|
The column which contains the value to look for.
|
|
159
134
|
default_value : Any
|
|
@@ -164,44 +139,35 @@ def get_table_value(lookup: Union[dict, numpy.recarray], col_match: str, col_mat
|
|
|
164
139
|
str
|
|
165
140
|
The value found or `None` if no match.
|
|
166
141
|
"""
|
|
167
|
-
single = isinstance(col_match, str) and isinstance(col_match_with, str)
|
|
168
142
|
try:
|
|
169
|
-
value = (
|
|
170
|
-
|
|
171
|
-
_get_multiple_table_values(lookup, col_match, col_match_with, col_val)
|
|
172
|
-
)
|
|
143
|
+
value = _get_single_table_value(lookup, col_match, col_match_with, col_val)
|
|
144
|
+
print(value, type(value))
|
|
173
145
|
return default_value if is_missing_value(value) else _parse_value(value)
|
|
174
146
|
except Exception:
|
|
175
147
|
return None
|
|
176
148
|
|
|
177
149
|
|
|
178
|
-
def find_term_ids_by(lookup:
|
|
150
|
+
def find_term_ids_by(lookup: pd.DataFrame, col_match: str, col_match_with: str):
|
|
179
151
|
"""
|
|
180
152
|
Find `term.id` values where a column matches a specific value.
|
|
181
153
|
|
|
182
154
|
Parameters
|
|
183
155
|
----------
|
|
184
|
-
lookup :
|
|
156
|
+
lookup : DataFrame
|
|
185
157
|
The value returned by the `download_lookup` function.
|
|
186
158
|
col_match : str
|
|
187
159
|
Which `column` should be used to find data in. This will restrict the rows to search for.
|
|
188
160
|
Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match_with`.
|
|
189
|
-
col_match_with
|
|
161
|
+
col_match_with: str
|
|
190
162
|
Which column `value` should be used to find data in. This will restrict the rows to search for.
|
|
191
|
-
Can be a single `str` or a list of `str`. If a list is used, must be the same length as `col_match`.
|
|
192
163
|
|
|
193
164
|
Returns
|
|
194
165
|
-------
|
|
195
166
|
list[str]
|
|
196
167
|
The list of `term.id` that matched the expected column value.
|
|
197
168
|
"""
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
key
|
|
201
|
-
for key, value in lookup.items()
|
|
202
|
-
if value.get(col_match) == col_match_with
|
|
203
|
-
])
|
|
204
|
-
) if isinstance(lookup, dict) else set(list(lookup[lookup[col_match] == col_match_with].termid))
|
|
169
|
+
filtered_df = lookup[lookup[col_match] == col_match_with]
|
|
170
|
+
term_ids = filtered_df['term.id'].unique().tolist() if 'term.id' in filtered_df.columns else []
|
|
205
171
|
return list(map(str, term_ids))
|
|
206
172
|
|
|
207
173
|
|
|
@@ -266,13 +232,13 @@ def extract_grouped_data_closest_date(data: str, year: int) -> str:
|
|
|
266
232
|
return None if closest_year is None else _parse_value(data_by_date.get(closest_year))
|
|
267
233
|
|
|
268
234
|
|
|
269
|
-
def lookup_term_ids(lookup:
|
|
235
|
+
def lookup_term_ids(lookup: pd.DataFrame):
|
|
270
236
|
"""
|
|
271
237
|
Get the `term.id` values from a lookup.
|
|
272
238
|
|
|
273
239
|
Parameters
|
|
274
240
|
----------
|
|
275
|
-
lookup :
|
|
241
|
+
lookup : DataFrame
|
|
276
242
|
The value returned by the `download_lookup` function.
|
|
277
243
|
|
|
278
244
|
Returns
|
|
@@ -280,16 +246,16 @@ def lookup_term_ids(lookup: Union[dict, numpy.recarray]):
|
|
|
280
246
|
list[str]
|
|
281
247
|
The `term.id` values from the lookup.
|
|
282
248
|
"""
|
|
283
|
-
return lookup.
|
|
249
|
+
return list(map(str, lookup['term.id'].tolist())) if 'term.id' in lookup.columns else []
|
|
284
250
|
|
|
285
251
|
|
|
286
|
-
def lookup_columns(lookup:
|
|
252
|
+
def lookup_columns(lookup: pd.DataFrame):
|
|
287
253
|
"""
|
|
288
254
|
Get the columns from a lookup.
|
|
289
255
|
|
|
290
256
|
Parameters
|
|
291
257
|
----------
|
|
292
|
-
lookup :
|
|
258
|
+
lookup : DataFrame
|
|
293
259
|
The value returned by the `download_lookup` function.
|
|
294
260
|
|
|
295
261
|
Returns
|
|
@@ -297,4 +263,4 @@ def lookup_columns(lookup: Union[dict, numpy.recarray]):
|
|
|
297
263
|
list[str]
|
|
298
264
|
The columns from the lookup.
|
|
299
265
|
"""
|
|
300
|
-
return list(
|
|
266
|
+
return list(lookup.columns)
|
|
@@ -2,7 +2,7 @@ from functools import lru_cache
|
|
|
2
2
|
import json
|
|
3
3
|
from hestia_earth.schema import SchemaType
|
|
4
4
|
|
|
5
|
-
from .lookup import _download_lookup_data, download_lookup, get_table_value
|
|
5
|
+
from .lookup import _download_lookup_data, download_lookup, get_table_value
|
|
6
6
|
from .api import download_hestia
|
|
7
7
|
from .tools import non_empty_list, flatten
|
|
8
8
|
|
|
@@ -45,7 +45,7 @@ def _allowed_model_mapping(model: str, term_id: str, column: str):
|
|
|
45
45
|
mapping = _allowed_mapping_data()
|
|
46
46
|
value = mapping.get(term_id, {}).get(model, {}).get(column) if mapping else get_table_value(
|
|
47
47
|
download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}-model-{column}.csv"),
|
|
48
|
-
'
|
|
48
|
+
'term.id', term_id, column
|
|
49
49
|
)
|
|
50
50
|
return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
|
|
51
51
|
|
|
@@ -78,7 +78,7 @@ def _allowed_mapping(term_id: str, column: str):
|
|
|
78
78
|
mapping = _allowed_mapping_data()
|
|
79
79
|
value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
|
|
80
80
|
download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
|
|
81
|
-
'
|
|
81
|
+
'term.id', term_id, column
|
|
82
82
|
)
|
|
83
83
|
return (value or _ALLOW_ALL).split(';') if isinstance(value, str) else _ALLOW_ALL
|
|
84
84
|
|
|
@@ -174,7 +174,7 @@ def is_in_system_boundary(term_id: str) -> bool:
|
|
|
174
174
|
column = 'inHestiaDefaultSystemBoundary'
|
|
175
175
|
value = mapping.get(term_id, {}).get(column) if mapping else get_table_value(
|
|
176
176
|
download_lookup(f"{(download_hestia(term_id) or {}).get('termType')}.csv"),
|
|
177
|
-
'
|
|
177
|
+
'term.id', term_id, column
|
|
178
178
|
)
|
|
179
179
|
# handle numpy bool from table value
|
|
180
180
|
return not (not value)
|
hestia_earth/utils/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
VERSION = '0.16.6'
|
|
1
|
+
VERSION = '0.16.7'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hestia_earth_utils
|
|
3
|
-
Version: 0.16.6
|
|
3
|
+
Version: 0.16.7
|
|
4
4
|
Summary: HESTIA's utils library
|
|
5
5
|
Home-page: https://gitlab.com/hestia-earth/hestia-utils
|
|
6
6
|
Author: HESTIA Team
|
|
@@ -13,12 +13,8 @@ Requires-Dist: hestia-earth-schema>=35.0.1
|
|
|
13
13
|
Requires-Dist: requests>=2.24.0
|
|
14
14
|
Requires-Dist: urllib3~=1.26.0
|
|
15
15
|
Requires-Dist: python-dateutil>=2.8.1
|
|
16
|
-
Requires-Dist:
|
|
16
|
+
Requires-Dist: pandas>=2
|
|
17
17
|
Requires-Dist: flatten_json
|
|
18
|
-
Provides-Extra: pivot-csv
|
|
19
|
-
Requires-Dist: pandas>=2; extra == "pivot-csv"
|
|
20
|
-
Provides-Extra: table
|
|
21
|
-
Requires-Dist: pandas>=2; extra == "table"
|
|
22
18
|
Dynamic: author
|
|
23
19
|
Dynamic: author-email
|
|
24
20
|
Dynamic: classifier
|
|
@@ -26,7 +22,6 @@ Dynamic: description
|
|
|
26
22
|
Dynamic: description-content-type
|
|
27
23
|
Dynamic: home-page
|
|
28
24
|
Dynamic: license
|
|
29
|
-
Dynamic: provides-extra
|
|
30
25
|
Dynamic: requires-dist
|
|
31
26
|
Dynamic: requires-python
|
|
32
27
|
Dynamic: summary
|
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
hestia_earth/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
hestia_earth/utils/api.py,sha256=y0gw5pCCHNnFIhM62Hok_5eDtH3QDAZdkye_1mANMNs,9654
|
|
3
|
-
hestia_earth/utils/blank_node.py,sha256=
|
|
3
|
+
hestia_earth/utils/blank_node.py,sha256=kLjq8U0PYyq_SQ-VHGMll_3XxKdYEnHEwtCCglNT3vg,7350
|
|
4
4
|
hestia_earth/utils/calculation_status.py,sha256=X7lbgVMD9luH1gj9lEcxd3_P2-u7e8ZPGCvX1czPZUo,2238
|
|
5
|
-
hestia_earth/utils/csv_utils.py,sha256=BK-tci1sALmsxamSR1Y7f9O6ajTTdhggLC2pBEWhYME,2310
|
|
6
5
|
hestia_earth/utils/cycle.py,sha256=rFLRL9X4KQ1UrE6fEPA_gV8KmwzrZpR3Ce56zg41lRk,1326
|
|
7
6
|
hestia_earth/utils/date.py,sha256=SPQ69uxHiv1o3BqIkBKkM5XX_CmS20CB7g6u2rhsdh8,1807
|
|
8
7
|
hestia_earth/utils/descriptive_stats.py,sha256=EMVwFvg2OnZgKRAfireAoWY2EbrSvqR0V0bK9B53p28,1583
|
|
9
8
|
hestia_earth/utils/emission.py,sha256=BhBitooLTxZSh82S982v2QfPxxTF1kmGClG_uHyWdz4,1981
|
|
10
|
-
hestia_earth/utils/lookup.py,sha256=
|
|
11
|
-
hestia_earth/utils/lookup_utils.py,sha256=
|
|
9
|
+
hestia_earth/utils/lookup.py,sha256=Sea1EkwT1K4mb9eNQBkJHoXkvNLSg_N9eeNiUL6pLq0,8028
|
|
10
|
+
hestia_earth/utils/lookup_utils.py,sha256=P3Ae2MqZWvk3f9AObNwk6Fq9AyyX279K4kR9qHX8rKQ,6667
|
|
12
11
|
hestia_earth/utils/model.py,sha256=uUcrF07XmBzqLni8VSaP0HoebJnQ57kk0EOmhwYMbfI,4637
|
|
13
12
|
hestia_earth/utils/pipeline.py,sha256=O-6DPtK0U1lJ51LFGa1gM6pjkBJUfxOjNjY8LxQPXV0,9588
|
|
14
13
|
hestia_earth/utils/request.py,sha256=bu7hkWKmFdXl2_Feawiam_x32whlclA9oP0asJyC69k,626
|
|
@@ -16,7 +15,7 @@ hestia_earth/utils/stats.py,sha256=4t3op10xDJbGxWJEY1Jtyl302PYWyMFwLpsSkMlzQn8,3
|
|
|
16
15
|
hestia_earth/utils/table.py,sha256=MOJDo5fQPRDogAty_UXbO9-EXFwz97m0f7--mOM17lQ,2363
|
|
17
16
|
hestia_earth/utils/term.py,sha256=6LiUSc6KX3IOkfWF6fYkQ2tENCO8ENljcdDypxU6WtA,1060
|
|
18
17
|
hestia_earth/utils/tools.py,sha256=9GaUJwxL-CTzEOGnRFkUQDVFelPevQSxXrf25vssCVo,4990
|
|
19
|
-
hestia_earth/utils/version.py,sha256=
|
|
18
|
+
hestia_earth/utils/version.py,sha256=izOjXE-oE9zdUdGeSgNJik6goDuxSRXghKlLPR0OuNE,19
|
|
20
19
|
hestia_earth/utils/pivot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
20
|
hestia_earth/utils/pivot/_shared.py,sha256=JnyIOzpans45DE2hSa9-4yvNhq8t08lx1IAWGJi6WPQ,1397
|
|
22
21
|
hestia_earth/utils/pivot/pivot_csv.py,sha256=7f6kMqeb1b3RKANLGeDgVu8G5WC-vXIijHnsJhO-CjI,12022
|
|
@@ -26,9 +25,9 @@ hestia_earth/utils/storage/_azure_client.py,sha256=sevCZni04eknMql2DgUsWG23f7u0K
|
|
|
26
25
|
hestia_earth/utils/storage/_local_client.py,sha256=IbzziUKY0QS3ybHFfgEpELqvafa7hQnZ-DdGdjQuypE,515
|
|
27
26
|
hestia_earth/utils/storage/_s3_client.py,sha256=B2yTsf-VfHcRLCKTMes4S_nCXxrZad9umyZx3b5Pu_c,3181
|
|
28
27
|
hestia_earth/utils/storage/_sns_client.py,sha256=LowUatj78Egu6_Id6Rr7hZjfZx1WguS3lozB3yAwSps,347
|
|
29
|
-
hestia_earth_utils-0.16.
|
|
30
|
-
hestia_earth_utils-0.16.
|
|
31
|
-
hestia_earth_utils-0.16.
|
|
32
|
-
hestia_earth_utils-0.16.
|
|
33
|
-
hestia_earth_utils-0.16.
|
|
34
|
-
hestia_earth_utils-0.16.
|
|
28
|
+
hestia_earth_utils-0.16.7.data/scripts/hestia-format-upload,sha256=IhLAHHPJqRgUcht-M_EUEsRMbRbMfshig07o488zscM,703
|
|
29
|
+
hestia_earth_utils-0.16.7.data/scripts/hestia-pivot-csv,sha256=0YBuGuyPO8rytod6iwWEKiQdSlr9JLuD001k6U5t6no,1163
|
|
30
|
+
hestia_earth_utils-0.16.7.dist-info/METADATA,sha256=o6sR5_7DeeXBLuKWYMFmg0CWRg3O-Cynh6NVZkI1mC0,1869
|
|
31
|
+
hestia_earth_utils-0.16.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
32
|
+
hestia_earth_utils-0.16.7.dist-info/top_level.txt,sha256=q0QxKEYx9uLpAD5ZtC7Ypq29smEPfOzEAn7Xv8XHGOQ,13
|
|
33
|
+
hestia_earth_utils-0.16.7.dist-info/RECORD,,
|
hestia_earth/utils/csv_utils.py
DELETED
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
import io
|
|
2
|
-
import csv
|
|
3
|
-
import re
|
|
4
|
-
import numpy as np
|
|
5
|
-
|
|
6
|
-
_MISSING_VALUE = '-'
|
|
7
|
-
_MISSING = -99999
|
|
8
|
-
_DELIMITER = ','
|
|
9
|
-
_QUOTE_CHAR = '"'
|
|
10
|
-
ENCODING = 'ISO-8859-1'
|
|
11
|
-
# default: " !#$%&'()*+,-./:;<=>?@[\\]^{|}~"
|
|
12
|
-
_DELETE_CHARS = " !#$%&'()*,./:;<=>?@^{|}~"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def is_missing_value(value): return value == _MISSING_VALUE or value == _MISSING or value == str(_MISSING)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def _replace_missing_values(value: str): return str(_MISSING) if str(value) == _MISSING_VALUE else value
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def _replace_chars(value: str): return re.sub(f'[{re.escape(_DELETE_CHARS)}]', '', value.replace(' ', '_'))
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def _text_to_csv(csv_content: str):
|
|
25
|
-
return csv.reader(io.StringIO(csv_content.strip()), delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _get_columns(csv_content: str):
|
|
29
|
-
try:
|
|
30
|
-
reader = _text_to_csv(csv_content)
|
|
31
|
-
names = next(reader)
|
|
32
|
-
return list(map(_replace_chars, names))
|
|
33
|
-
except StopIteration:
|
|
34
|
-
return []
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _get_rows(csv_content: str):
|
|
38
|
-
string_io = io.StringIO(csv_content.strip())
|
|
39
|
-
try:
|
|
40
|
-
next(string_io)
|
|
41
|
-
except StopIteration:
|
|
42
|
-
return
|
|
43
|
-
|
|
44
|
-
return csv.reader(string_io, delimiter=_DELIMITER, quotechar=_QUOTE_CHAR)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def _csv_str_to_recarray_chunks_numpy(csv_content: str, chunk_size: int = 5):
|
|
48
|
-
names = _get_columns(csv_content)
|
|
49
|
-
num_cols = len(names)
|
|
50
|
-
|
|
51
|
-
max_size = 1000
|
|
52
|
-
dtype = [(name, f"U{max_size}") for name in names]
|
|
53
|
-
|
|
54
|
-
reader = _get_rows(csv_content)
|
|
55
|
-
|
|
56
|
-
# 4. Process the file in batches
|
|
57
|
-
chunk_rows = []
|
|
58
|
-
for row in reader:
|
|
59
|
-
if not row:
|
|
60
|
-
continue
|
|
61
|
-
if len(row) != num_cols:
|
|
62
|
-
continue
|
|
63
|
-
|
|
64
|
-
# replace missing values
|
|
65
|
-
processed_row = tuple(_replace_missing_values(field) for field in row)
|
|
66
|
-
chunk_rows.append(processed_row)
|
|
67
|
-
|
|
68
|
-
if len(chunk_rows) >= chunk_size:
|
|
69
|
-
yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
|
|
70
|
-
chunk_rows = []
|
|
71
|
-
|
|
72
|
-
if chunk_rows:
|
|
73
|
-
yield np.array(chunk_rows, dtype=dtype).view(np.recarray)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def csv_str_to_recarray(csv_content: str) -> np.recarray:
|
|
77
|
-
array_rows = list(_csv_str_to_recarray_chunks_numpy(csv_content))
|
|
78
|
-
return np.hstack(array_rows).view(np.recarray)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def csv_file_to_recarray(filepath: str):
|
|
82
|
-
with open(filepath, 'r', encoding=ENCODING) as f:
|
|
83
|
-
content = f.read()
|
|
84
|
-
return csv_str_to_recarray(content)
|
{hestia_earth_utils-0.16.6.data → hestia_earth_utils-0.16.7.data}/scripts/hestia-format-upload
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|