geometallurgy 0.4.12__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff compares the content of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
- elphick/geomet/__init__.py +11 -11
- elphick/geomet/base.py +1133 -1133
- elphick/geomet/block_model.py +319 -358
- elphick/geomet/config/__init__.py +1 -1
- elphick/geomet/config/config_read.py +39 -39
- elphick/geomet/config/flowsheet_example_partition.yaml +31 -31
- elphick/geomet/config/flowsheet_example_simple.yaml +25 -25
- elphick/geomet/config/mc_config.yml +35 -35
- elphick/geomet/data/downloader.py +39 -39
- elphick/geomet/data/register.csv +12 -12
- elphick/geomet/datasets/__init__.py +2 -2
- elphick/geomet/datasets/datasets.py +47 -47
- elphick/geomet/datasets/downloader.py +40 -40
- elphick/geomet/datasets/register.csv +12 -12
- elphick/geomet/datasets/sample_data.py +196 -196
- elphick/geomet/extras.py +35 -35
- elphick/geomet/flowsheet/__init__.py +1 -1
- elphick/geomet/flowsheet/flowsheet.py +1216 -1216
- elphick/geomet/flowsheet/loader.py +99 -99
- elphick/geomet/flowsheet/operation.py +256 -256
- elphick/geomet/flowsheet/stream.py +39 -39
- elphick/geomet/interval_sample.py +641 -641
- elphick/geomet/io.py +379 -379
- elphick/geomet/plot.py +147 -147
- elphick/geomet/sample.py +28 -28
- elphick/geomet/utils/amenability.py +49 -49
- elphick/geomet/utils/block_model_converter.py +93 -93
- elphick/geomet/utils/components.py +136 -136
- elphick/geomet/utils/data.py +49 -49
- elphick/geomet/utils/estimates.py +108 -108
- elphick/geomet/utils/interp.py +193 -193
- elphick/geomet/utils/interp2.py +134 -134
- elphick/geomet/utils/layout.py +72 -72
- elphick/geomet/utils/moisture.py +61 -61
- elphick/geomet/utils/output.html +617 -0
- elphick/geomet/utils/pandas.py +378 -378
- elphick/geomet/utils/parallel.py +29 -29
- elphick/geomet/utils/partition.py +63 -63
- elphick/geomet/utils/size.py +51 -51
- elphick/geomet/utils/timer.py +80 -80
- elphick/geomet/utils/viz.py +56 -56
- elphick/geomet/validate.py.hide +176 -176
- {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/LICENSE +21 -21
- {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/METADATA +7 -5
- geometallurgy-0.4.13.dist-info/RECORD +49 -0
- {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/WHEEL +1 -1
- geometallurgy-0.4.12.dist-info/RECORD +0 -48
- {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/entry_points.txt +0 -0
elphick/geomet/io.py
CHANGED
@@ -1,379 +1,379 @@
All 379 lines of the module are removed and re-added by this diff; the before and after text is identical, so the content is shown once below.

import json
import logging
import tokenize
from abc import abstractmethod, ABC
from io import StringIO
from pathlib import Path
from typing import Optional

import pyarrow as pa
import os

import numpy as np
import pandas as pd
from omf import OMFReader, VolumeGridGeometry
import pyarrow.parquet as pq
from pandera import DataFrameSchema


class BaseReader(ABC):

    def __init__(self, file_path: Path):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.file_path: Path = file_path
        self.variables_in_file: list[str] = []
        self.records_in_file: int = 0

    @staticmethod
    def _parse_query_columns(query) -> list[str]:
        # Create a list to store the column names
        column_names = []

        # Tokenize the query string
        for token in tokenize.generate_tokens(StringIO(query).readline):
            token_type, token_string, _, _, _ = token

            # If the token is a name, and it's not a built-in Python name, add it to the list
            if token_type == tokenize.NAME and token_string not in __builtins__:
                column_names.append(token_string)

        return column_names

    @abstractmethod
    def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None) -> pd.DataFrame:
        pass

    @abstractmethod
    def get_index(self) -> pd.Index:
        pass

    def validate(self, schema_file: Path, data: Optional[pd.DataFrame]) -> pd.DataFrame:
        """Validate using a pandera schema

        This method does not leverage multiprocessing, and loads the entire dataframe into memory.
        Args:
            schema_file: The path to the schema yaml file
            data: The data to validate, if not provided, the underlying read method will be called.
        Returns:
            The coerced DataFrame after validation
        """
        import pandera as pa
        schema: DataFrameSchema = pa.DataFrameSchema.from_yaml(schema_file)
        if data:
            df = data
        else:
            df = self.read()
        schema.validate(df, lazy=True, inplace=True)
        return df

    def preprocess(self, negative_to_nan_threshold: Optional[float] = -1,
                   not_detected_assays_threshold: Optional[float] = 0.5) -> pd.DataFrame:
        """
        Preprocess the data by managing negative values.
        Args:
            negative_to_nan_threshold: Values below this threshold will be replaced with NaN
            not_detected_assays_threshold: Values above this threshold will be replaced with half the absolute value

        Returns:
            The preprocessed DataFrame, with no negatives and no values above the not_detected_assays_threshold.

        """
        if negative_to_nan_threshold > 0:
            raise ValueError("The negative_to_nan_threshold must be less than or equal to zero or None.")
        if not_detected_assays_threshold > 0:
            raise ValueError("The not_detected_assays_threshold must be less than or equal to zero or None")

        df = self.read()

        # detect numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if negative_to_nan_threshold:
            df.loc[df[numeric_cols] < negative_to_nan_threshold, numeric_cols] = np.nan
        if not_detected_assays_threshold:
            mask = (df[numeric_cols] > not_detected_assays_threshold) and (df[numeric_cols] < 0)
            df.loc[mask, numeric_cols] = np.abs(df.loc[mask, numeric_cols]) / 2
        return df


class ParquetFileReader(BaseReader):
    """
    Read a Parquet file
    """

    def __init__(self, file_path: Path):
        """
        Initialize the parquet reader. While not enforced, it is expected that the file is indexed by x, y, z, or
        x, y, z, dx, dy, dz
        Args:
            file_path: The path to the Parquet file.
        """
        super().__init__(file_path)
        self.variables_in_file = self._get_parquet_columns()
        self.records_in_file = self._get_parquet_length()

    def _get_parquet_columns(self):
        parquet_file = pq.ParquetFile(self.file_path)
        metadata: dict = self.get_parquet_metadata()
        cols = [col for col in parquet_file.schema.names if col not in metadata['index_columns']]
        return cols

    def _get_parquet_length(self):
        parquet_file = pq.ParquetFile(self.file_path)
        return parquet_file.metadata.num_rows

    def get_parquet_metadata(self) -> dict:
        parquet_file = pq.ParquetFile(self.file_path)
        pd_metadata_bytes = parquet_file.metadata.metadata.get(b'pandas')
        pd_metadata_str: str = pd_metadata_bytes.decode('utf-8')
        return json.loads(pd_metadata_str)

    def get_index(self) -> pd.Index:
        parquet_file = pq.ParquetFile(self.file_path)
        pd_metadata: dict = self.get_parquet_metadata()
        index_columns = pd_metadata['index_columns']
        # deal with the single range index case
        if len(index_columns) == 1:
            if index_columns[0].get('kind') == 'range':
                df_index = pd.Index(
                    range(index_columns[0].get('start'), index_columns[0].get('stop'), index_columns[0].get('step')))
            else:
                df_index = pd.Index(parquet_file.read(columns=index_columns[0].get('name')).to_pandas())
        else:
            # extract the pd.MultiIndex
            df_index = parquet_file.read(columns=index_columns).to_pandas().index
        return df_index

    def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
             with_index: bool = True) -> pd.DataFrame:
        # If no columns are specified, load all columns
        if not columns:
            columns = self.variables_in_file
        else:
            # Check if the columns specified are valid
            for col in columns:
                if col not in self.variables_in_file:
                    raise ValueError(f"Column '{col}' not found in the Parquet file: {self.file_path}. "
                                     f"Available columns are: {self.variables_in_file}")

        # If a query is specified, parse it to find the columns involved
        if query:
            query_columns = self._parse_query_columns(query)
            # Load only the columns involved in the query
            parquet_file = pq.ParquetFile(self.file_path)
            df_query = parquet_file.read(columns=query_columns).to_pandas()  # Apply the query to the DataFrame
            df_query = df_query.query(query)
            # Get the indices of the rows that match the query
            query_indices = df_query.index
            # Load the remaining columns, but only for the rows that match the query
            remaining_columns = [col for col in columns if col not in query_columns]
            if remaining_columns:
                chunks = []
                for col in remaining_columns:
                    df_col = parquet_file.read(columns=[col]).to_pandas()
                    chunks.append(df_col.loc[query_indices])
                # Concatenate the query DataFrame and the remaining DataFrame
                df = pd.concat([df_query, *chunks], axis=1)
            else:
                df = df_query
            if with_index:
                df_index: pd.Index = self.get_index()[query_indices]
                df.set_index(df_index, inplace=True, drop=True)

        else:
            # If no query is specified, load the specified columns
            df = pd.read_parquet(self.file_path, columns=columns)
            if with_index is False:
                df.reset_index(drop=True, inplace=True)

        return df


class OMFFileReader(BaseReader):
    """
    Read an OMF file
    """

    def __init__(self, file_path, element: str):
        """
        Initialize the OMF file reader. The element must be a VolumeElement in the OMF file.
        Args:
            file_path: The path to the OMF file
            element: The name of the element in the OMF file to be validated. E.g. 'Block Model'
        """
        super().__init__(file_path)

        # check that the element provided is a valid VolumeElement in the OMF file.
        self.elements = OMFReader(str(file_path)).get_project().elements
        self.element_names = [e.name for e in self.elements]
        if element not in self.element_names:
            raise ValueError(f"Element '{element}' not found in the OMF file: {file_path}. Available elements are:"
                             f" {list(self.elements.keys())}")
        elif self.get_element_by_name(element).__class__.__name__ != 'VolumeElement':
            raise ValueError(f"Element '{element}' is not a VolumeElement in the OMF file: {file_path}")

        self.element = self.get_element_by_name(element)

        self.variables_in_file = [v.name for v in self.element.data]
        self.records_in_file = len(self.element.data[0].array.array)

    def get_element_by_name(self, element_name: str):
        # get the index of the element in order to index into elements
        element_index = self.element_names.index(element_name)
        return self.elements[element_index]

    def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
             with_index: bool = True) -> pd.DataFrame:
        # Get the VolumeElement from the OMF file
        # volume_element = OMFReader(self.file_path).get_project().elements[self.element]

        # If no columns are specified, load all columns
        if not columns:
            columns = self.variables_in_file
        else:
            # Check if the columns specified are valid
            for col in columns:
                if col not in self.variables_in_file:
                    raise ValueError(f"Column '{col}' not found in the VolumeElement: {self.element}")

        # If a query is specified, parse it to find the columns involved
        if query:
            query_columns = self._parse_query_columns(query)
            # Load only the columns involved in the query
            df_query: pd.DataFrame = self.read_volume_variables(self.element, variables=query_columns)
            # Apply the query to the DataFrame
            df_query = df_query.query(query)
            # Get the indices of the rows that match the query
            query_indices = df_query.index
            # Load the remaining columns, but only for the rows that match the query
            remaining_columns = [col for col in columns if col not in query_columns]
            if remaining_columns:
                chunks = []
                for col in remaining_columns:
                    data_array = self.read_volume_variables(self.element, variables=[col])
                    # Filter the numpy array using the query indices
                    filtered_data_array = data_array[query_indices]
                    # Convert the filtered numpy array to a DataFrame
                    chunks.append(pd.DataFrame(filtered_data_array, columns=[col]))
                # Concatenate the query DataFrame and the remaining DataFrame
                df = pd.concat([df_query, *chunks], axis=1)
            else:
                df = df_query
        else:
            # If no query is specified, load the specified columns
            df = self.read_volume_variables(self.element, variables=columns)

        # add the index
        if with_index:
            df.set_index(self.get_index(), inplace=True, drop=True)

        return df

    def get_index(self) -> pd.MultiIndex:

        geometry: VolumeGridGeometry = self.element.geometry
        ox, oy, oz = geometry.origin

        # Make coordinates (points) along each axis, i, j, k
        i = ox + np.cumsum(geometry.tensor_u)
        i = np.insert(i, 0, ox)
        j = oy + np.cumsum(self.element.geometry.tensor_v)
        j = np.insert(j, 0, oy)
        k = oz + np.cumsum(self.element.geometry.tensor_w)
        k = np.insert(k, 0, oz)

        # convert to centroids
        x, y, z = (i[1:] + i[:-1]) / 2, (j[1:] + j[:-1]) / 2, (k[1:] + k[:-1]) / 2
        xx, yy, zz = np.meshgrid(x, y, z, indexing="ij")

        # Calculate dx, dy, dz
        dxx, dyy, dzz = np.meshgrid(geometry.tensor_u, geometry.tensor_v, geometry.tensor_w, indexing="ij")

        # TODO: consider rotation

        index = pd.MultiIndex.from_arrays([xx.ravel("F"), yy.ravel("F"), zz.ravel("F"),
                                           dxx.ravel("F"), dyy.ravel("F"), dzz.ravel("F")],
                                          names=['x', 'y', 'z', 'dx', 'dy', 'dz'])

        if len(index) != self.records_in_file:
            raise ValueError(f"The length of the index ({len(index)}) does not match the number of records"
                             f" in the VolumeElement ({self.records_in_file})")

        return index

    def read_volume_variables(self, element: str, variables: list[str]) -> pd.DataFrame:
        # Loop over the variables
        chunks: list[pd.DataFrame] = []
        for variable in variables:
            # Check if the variable exists in the VolumeElement
            if variable not in self.variables_in_file:
                raise ValueError(f"Variable '{variable}' not found in the VolumeElement: {element}")
            chunks.append(self._get_variable_by_name(variable).ravel())

        # Concatenate all chunks into a single DataFrame
        return pd.DataFrame(np.vstack(chunks), index=variables).T

    def _get_variable_by_name(self, variable_name: str):
        # get the index of the variable in order to index into elements
        variable_index = self.variables_in_file.index(variable_name)
        return self.element.data[variable_index].array.array


class ParquetFileWriter:

    def __init__(self):
        pass

    @classmethod
    def from_column_generator(cls, index: pd.Index, column_generator):

        # Path to the final output file
        output_file = "final.parquet"

        # Temp directory for storing parquet columns
        temp_dir = "temp/"

        # Ensure the temp directory exists
        os.makedirs(temp_dir, exist_ok=True)

        # Write the index to a separate Parquet file
        index_table = pa.Table.from_pandas(index.to_frame('index'))
        pq.write_table(index_table, temp_dir + "index.parquet")
        index_pf = pq.ParquetFile(temp_dir + "index.parquet")

        for i, column in enumerate(column_generator):
            # Write each column to a temporary parquet file
            table = pa.Table.from_pandas(column.to_frame())
            pq.write_table(table, temp_dir + f"column_{i}.parquet")

        # Collect paths to the temporary Parquet files
        paths = [temp_dir + file for file in os.listdir(temp_dir) if file != "index.parquet"]

        # Create a ParquetWriter for the final output file
        first_pf = pq.ParquetFile(paths[0])
        writer = pq.ParquetWriter(output_file, first_pf.schema)

        for i in range(index_pf.num_row_groups):
            # Read index chunk
            index_chunk = index_pf.read_row_group(i).to_pandas()

            # Dataframe to store chunk data
            df = pd.DataFrame(index=index_chunk['index'])

            for path in paths:
                pf = pq.ParquetFile(path)
                # Read data chunk
                data_chunk = pf.read_row_group(i).to_pandas()

                # Concatenate data chunk to the dataframe
                df = pd.concat([df, data_chunk], axis=1)

            # Write the chunk to the output file
            writer.write_table(pa.Table.from_pandas(df))

        # Close the writer and release resources
        writer.close()

        # Remove temporary files
        for file in os.listdir(temp_dir):
            os.remove(temp_dir + file)
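For orientation only, and not part of the released package or this diff: a minimal sketch of how the two readers defined in elphick/geomet/io.py might be driven. The file paths, column names, element name and query string below are assumptions chosen for illustration, not values taken from the package.

```python
from pathlib import Path

from elphick.geomet.io import OMFFileReader, ParquetFileReader

# Hypothetical inputs - substitute real block-model files and variable names.
parquet_path = Path("block_model.parquet")
omf_path = Path("block_model.omf")

# Parquet: inspect what is available, then read a column subset filtered by a pandas-style query.
pq_reader = ParquetFileReader(parquet_path)
print(pq_reader.variables_in_file, pq_reader.records_in_file)
df_fe = pq_reader.read(columns=["Fe", "Al2O3"], query="Fe > 58")

# OMF: read variables from a VolumeElement (e.g. one named 'Block Model'),
# indexed by the x, y, z, dx, dy, dz block centroids and sizes.
omf_reader = OMFFileReader(omf_path, element="Block Model")
df_blocks = omf_reader.read(columns=omf_reader.variables_in_file)
```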