geometallurgy-0.4.12-py3-none-any.whl → geometallurgy-0.4.13-py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (48)
  1. elphick/geomet/__init__.py +11 -11
  2. elphick/geomet/base.py +1133 -1133
  3. elphick/geomet/block_model.py +319 -358
  4. elphick/geomet/config/__init__.py +1 -1
  5. elphick/geomet/config/config_read.py +39 -39
  6. elphick/geomet/config/flowsheet_example_partition.yaml +31 -31
  7. elphick/geomet/config/flowsheet_example_simple.yaml +25 -25
  8. elphick/geomet/config/mc_config.yml +35 -35
  9. elphick/geomet/data/downloader.py +39 -39
  10. elphick/geomet/data/register.csv +12 -12
  11. elphick/geomet/datasets/__init__.py +2 -2
  12. elphick/geomet/datasets/datasets.py +47 -47
  13. elphick/geomet/datasets/downloader.py +40 -40
  14. elphick/geomet/datasets/register.csv +12 -12
  15. elphick/geomet/datasets/sample_data.py +196 -196
  16. elphick/geomet/extras.py +35 -35
  17. elphick/geomet/flowsheet/__init__.py +1 -1
  18. elphick/geomet/flowsheet/flowsheet.py +1216 -1216
  19. elphick/geomet/flowsheet/loader.py +99 -99
  20. elphick/geomet/flowsheet/operation.py +256 -256
  21. elphick/geomet/flowsheet/stream.py +39 -39
  22. elphick/geomet/interval_sample.py +641 -641
  23. elphick/geomet/io.py +379 -379
  24. elphick/geomet/plot.py +147 -147
  25. elphick/geomet/sample.py +28 -28
  26. elphick/geomet/utils/amenability.py +49 -49
  27. elphick/geomet/utils/block_model_converter.py +93 -93
  28. elphick/geomet/utils/components.py +136 -136
  29. elphick/geomet/utils/data.py +49 -49
  30. elphick/geomet/utils/estimates.py +108 -108
  31. elphick/geomet/utils/interp.py +193 -193
  32. elphick/geomet/utils/interp2.py +134 -134
  33. elphick/geomet/utils/layout.py +72 -72
  34. elphick/geomet/utils/moisture.py +61 -61
  35. elphick/geomet/utils/output.html +617 -0
  36. elphick/geomet/utils/pandas.py +378 -378
  37. elphick/geomet/utils/parallel.py +29 -29
  38. elphick/geomet/utils/partition.py +63 -63
  39. elphick/geomet/utils/size.py +51 -51
  40. elphick/geomet/utils/timer.py +80 -80
  41. elphick/geomet/utils/viz.py +56 -56
  42. elphick/geomet/validate.py.hide +176 -176
  43. {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/LICENSE +21 -21
  44. {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/METADATA +7 -5
  45. geometallurgy-0.4.13.dist-info/RECORD +49 -0
  46. {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/WHEEL +1 -1
  47. geometallurgy-0.4.12.dist-info/RECORD +0 -48
  48. {geometallurgy-0.4.12.dist-info → geometallurgy-0.4.13.dist-info}/entry_points.txt +0 -0
elphick/geomet/io.py CHANGED
@@ -1,379 +1,379 @@
- import json
- import logging
- import tokenize
- from abc import abstractmethod, ABC
- from io import StringIO
- from pathlib import Path
- from typing import Optional
-
- import pyarrow as pa
- import os
-
- import numpy as np
- import pandas as pd
- from omf import OMFReader, VolumeGridGeometry
- import pyarrow.parquet as pq
- from pandera import DataFrameSchema
-
-
- class BaseReader(ABC):
-
-     def __init__(self, file_path: Path):
-         self.logger = logging.getLogger(self.__class__.__name__)
-         self.file_path: Path = file_path
-         self.variables_in_file: list[str] = []
-         self.records_in_file: int = 0
-
-     @staticmethod
-     def _parse_query_columns(query) -> list[str]:
-         # Create a list to store the column names
-         column_names = []
-
-         # Tokenize the query string
-         for token in tokenize.generate_tokens(StringIO(query).readline):
-             token_type, token_string, _, _, _ = token
-
-             # If the token is a name, and it's not a built-in Python name, add it to the list
-             if token_type == tokenize.NAME and token_string not in __builtins__:
-                 column_names.append(token_string)
-
-         return column_names
-
-     @abstractmethod
-     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None) -> pd.DataFrame:
-         pass
-
-     @abstractmethod
-     def get_index(self) -> pd.Index:
-         pass
-
-     def validate(self, schema_file: Path, data: Optional[pd.DataFrame]) -> pd.DataFrame:
-         """Validate using a pandera schema
-
-         This method does not leverage multiprocessing, and loads the entire dataframe into memory.
-         Args:
-             schema_file: The path to the schema yaml file
-             data: The data to validate, if not provided, the underlying read method will be called.
-         Returns:
-             The coerced DataFrame after validation
-         """
-         import pandera as pa
-         schema: DataFrameSchema = pa.DataFrameSchema.from_yaml(schema_file)
-         if data:
-             df = data
-         else:
-             df = self.read()
-         schema.validate(df, lazy=True, inplace=True)
-         return df
-
-     def preprocess(self, negative_to_nan_threshold: Optional[float] = -1,
-                    not_detected_assays_threshold: Optional[float] = 0.5) -> pd.DataFrame:
-         """
-         Preprocess the data by managing negative values.
-         Args:
-             negative_to_nan_threshold: Values below this threshold will be replaced with NaN
-             not_detected_assays_threshold: Values above this threshold will be replaced with half the absolute value
-
-         Returns:
-             The preprocessed DataFrame, with no negatives and no values above the not_detected_assays_threshold.
-
-         """
-         if negative_to_nan_threshold > 0:
-             raise ValueError("The negative_to_nan_threshold must be less than or equal to zero or None.")
-         if not_detected_assays_threshold > 0:
-             raise ValueError("The not_detected_assays_threshold must be less than or equal to zero or None")
-
-         df = self.read()
-
-         # detect numeric columns
-         numeric_cols = df.select_dtypes(include=[np.number]).columns
-
-         if negative_to_nan_threshold:
-             df.loc[df[numeric_cols] < negative_to_nan_threshold, numeric_cols] = np.nan
-         if not_detected_assays_threshold:
-             mask = (df[numeric_cols] > not_detected_assays_threshold) and (df[numeric_cols] < 0)
-             df.loc[mask, numeric_cols] = np.abs(df.loc[mask, numeric_cols]) / 2
-         return df
-
-
- class ParquetFileReader(BaseReader):
-     """
-     Read a Parquet file
-     """
-
-     def __init__(self, file_path: Path):
-         """
-         Initialize the parquet reader. While not enforced, it is expected that the file is indexed by x, y, z, or
-         x, y, z, dx, dy, dz
-         Args:
-             file_path: The path to the Parquet file.
-         """
-         super().__init__(file_path)
-         self.variables_in_file = self._get_parquet_columns()
-         self.records_in_file = self._get_parquet_length()
-
-     def _get_parquet_columns(self):
-         parquet_file = pq.ParquetFile(self.file_path)
-         metadata: dict = self.get_parquet_metadata()
-         cols = [col for col in parquet_file.schema.names if col not in metadata['index_columns']]
-         return cols
-
-     def _get_parquet_length(self):
-         parquet_file = pq.ParquetFile(self.file_path)
-         return parquet_file.metadata.num_rows
-
-     def get_parquet_metadata(self) -> dict:
-         parquet_file = pq.ParquetFile(self.file_path)
-         pd_metadata_bytes = parquet_file.metadata.metadata.get(b'pandas')
-         pd_metadata_str: str = pd_metadata_bytes.decode('utf-8')
-         return json.loads(pd_metadata_str)
-
-     def get_index(self) -> pd.Index:
-         parquet_file = pq.ParquetFile(self.file_path)
-         pd_metadata: dict = self.get_parquet_metadata()
-         index_columns = pd_metadata['index_columns']
-         # deal with the single range index case
-         if len(index_columns) == 1:
-             if index_columns[0].get('kind') == 'range':
-                 df_index = pd.Index(
-                     range(index_columns[0].get('start'), index_columns[0].get('stop'), index_columns[0].get('step')))
-             else:
-                 df_index = pd.Index(parquet_file.read(columns=index_columns[0].get('name')).to_pandas())
-         else:
-             # extract the pd.MultiIndex
-             df_index = parquet_file.read(columns=index_columns).to_pandas().index
-         return df_index
-
-     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
-              with_index: bool = True) -> pd.DataFrame:
-         # If no columns are specified, load all columns
-         if not columns:
-             columns = self.variables_in_file
-         else:
-             # Check if the columns specified are valid
-             for col in columns:
-                 if col not in self.variables_in_file:
-                     raise ValueError(f"Column '{col}' not found in the Parquet file: {self.file_path}. "
-                                      f"Available columns are: {self.variables_in_file}")
-
-         # If a query is specified, parse it to find the columns involved
-         if query:
-             query_columns = self._parse_query_columns(query)
-             # Load only the columns involved in the query
-             parquet_file = pq.ParquetFile(self.file_path)
-             df_query = parquet_file.read(columns=query_columns).to_pandas() # Apply the query to the DataFrame
-             df_query = df_query.query(query)
-             # Get the indices of the rows that match the query
-             query_indices = df_query.index
-             # Load the remaining columns, but only for the rows that match the query
-             remaining_columns = [col for col in columns if col not in query_columns]
-             if remaining_columns:
-                 chunks = []
-                 for col in remaining_columns:
-                     df_col = parquet_file.read(columns=[col]).to_pandas()
-                     chunks.append(df_col.loc[query_indices])
-                 # Concatenate the query DataFrame and the remaining DataFrame
-                 df = pd.concat([df_query, *chunks], axis=1)
-             else:
-                 df = df_query
-             if with_index:
-                 df_index: pd.Index = self.get_index()[query_indices]
-                 df.set_index(df_index, inplace=True, drop=True)
-
-         else:
-             # If no query is specified, load the specified columns
-             df = pd.read_parquet(self.file_path, columns=columns)
-             if with_index is False:
-                 df.reset_index(drop=True, inplace=True)
-
-         return df
-
-
- class OMFFileReader(BaseReader):
-     """
-     Read an OMF file
-     """
-
-     def __init__(self, file_path, element: str):
-         """
-         Initialize the OMF file reader. The element must be a VolumeElement in the OMF file.
-         Args:
-             file_path: The path to the OMF file
-             element: The name of the element in the OMF file to be validated. E.g. 'Block Model'
-         """
-         super().__init__(file_path)
-
-         # check that the element provided is a valid VolumeElement in the OMF file.
-         self.elements = OMFReader(str(file_path)).get_project().elements
-         self.element_names = [e.name for e in self.elements]
-         if element not in self.element_names:
-             raise ValueError(f"Element '{element}' not found in the OMF file: {file_path}. Available elements are:"
-                              f" {list(self.elements.keys())}")
-         elif self.get_element_by_name(element).__class__.__name__ != 'VolumeElement':
-             raise ValueError(f"Element '{element}' is not a VolumeElement in the OMF file: {file_path}")
-
-         self.element = self.get_element_by_name(element)
-
-         self.variables_in_file = [v.name for v in self.element.data]
-         self.records_in_file = len(self.element.data[0].array.array)
-
-     def get_element_by_name(self, element_name: str):
-         # get the index of the element in order to index into elements
-         element_index = self.element_names.index(element_name)
-         return self.elements[element_index]
-
-     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
-              with_index: bool = True) -> pd.DataFrame:
-         # Get the VolumeElement from the OMF file
-         # volume_element = OMFReader(self.file_path).get_project().elements[self.element]
-
-         # If no columns are specified, load all columns
-         if not columns:
-             columns = self.variables_in_file
-         else:
-             # Check if the columns specified are valid
-             for col in columns:
-                 if col not in self.variables_in_file:
-                     raise ValueError(f"Column '{col}' not found in the VolumeElement: {self.element}")
-
-         # If a query is specified, parse it to find the columns involved
-         if query:
-             query_columns = self._parse_query_columns(query)
-             # Load only the columns involved in the query
-             df_query: pd.DataFrame = self.read_volume_variables(self.element, variables=query_columns)
-             # Apply the query to the DataFrame
-             df_query = df_query.query(query)
-             # Get the indices of the rows that match the query
-             query_indices = df_query.index
-             # Load the remaining columns, but only for the rows that match the query
-             remaining_columns = [col for col in columns if col not in query_columns]
-             if remaining_columns:
-                 chunks = []
-                 for col in remaining_columns:
-                     data_array = self.read_volume_variables(self.element, variables=[col])
-                     # Filter the numpy array using the query indices
-                     filtered_data_array = data_array[query_indices]
-                     # Convert the filtered numpy array to a DataFrame
-                     chunks.append(pd.DataFrame(filtered_data_array, columns=[col]))
-                 # Concatenate the query DataFrame and the remaining DataFrame
-                 df = pd.concat([df_query, *chunks], axis=1)
-             else:
-                 df = df_query
-         else:
-             # If no query is specified, load the specified columns
-             df = self.read_volume_variables(self.element, variables=columns)
-
-         # add the index
-         if with_index:
-             df.set_index(self.get_index(), inplace=True, drop=True)
-
-         return df
-
-     def get_index(self) -> pd.MultiIndex:
-
-         geometry: VolumeGridGeometry = self.element.geometry
-         ox, oy, oz = geometry.origin
-
-         # Make coordinates (points) along each axis, i, j, k
-         i = ox + np.cumsum(geometry.tensor_u)
-         i = np.insert(i, 0, ox)
-         j = oy + np.cumsum(self.element.geometry.tensor_v)
-         j = np.insert(j, 0, oy)
-         k = oz + np.cumsum(self.element.geometry.tensor_w)
-         k = np.insert(k, 0, oz)
-
-         # convert to centroids
-         x, y, z = (i[1:] + i[:-1]) / 2, (j[1:] + j[:-1]) / 2, (k[1:] + k[:-1]) / 2
-         xx, yy, zz = np.meshgrid(x, y, z, indexing="ij")
-
-         # Calculate dx, dy, dz
-         dxx, dyy, dzz = np.meshgrid(geometry.tensor_u, geometry.tensor_v, geometry.tensor_w, indexing="ij")
-
-         # TODO: consider rotation
-
-         index = pd.MultiIndex.from_arrays([xx.ravel("F"), yy.ravel("F"), zz.ravel("F"),
-                                            dxx.ravel("F"), dyy.ravel("F"), dzz.ravel("F")],
-                                           names=['x', 'y', 'z', 'dx', 'dy', 'dz'])
-
-         if len(index) != self.records_in_file:
-             raise ValueError(f"The length of the index ({len(index)}) does not match the number of records"
-                              f" in the VolumeElement ({self.records_in_file})")
-
-         return index
-
-     def read_volume_variables(self, element: str, variables: list[str]) -> pd.DataFrame:
-         # Loop over the variables
-         chunks: list[pd.DataFrame] = []
-         for variable in variables:
-             # Check if the variable exists in the VolumeElement
-             if variable not in self.variables_in_file:
-                 raise ValueError(f"Variable '{variable}' not found in the VolumeElement: {element}")
-             chunks.append(self._get_variable_by_name(variable).ravel())
-
-         # Concatenate all chunks into a single DataFrame
-         return pd.DataFrame(np.vstack(chunks), index=variables).T
-
-     def _get_variable_by_name(self, variable_name: str):
-         # get the index of the variable in order to index into elements
-         variable_index = self.variables_in_file.index(variable_name)
-         return self.element.data[variable_index].array.array
-
-
- class ParquetFileWriter:
-
-     def __init__(self):
-         pass
-
-     @classmethod
-     def from_column_generator(cls, index: pd.Index, column_generator):
-
-         # Path to the final output file
-         output_file = "final.parquet"
-
-         # Temp directory for storing parquet columns
-         temp_dir = "temp/"
-
-         # Ensure the temp directory exists
-         os.makedirs(temp_dir, exist_ok=True)
-
-         # Write the index to a separate Parquet file
-         index_table = pa.Table.from_pandas(index.to_frame('index'))
-         pq.write_table(index_table, temp_dir + "index.parquet")
-         index_pf = pq.ParquetFile(temp_dir + "index.parquet")
-
-         for i, column in enumerate(column_generator):
-             # Write each column to a temporary parquet file
-             table = pa.Table.from_pandas(column.to_frame())
-             pq.write_table(table, temp_dir + f"column_{i}.parquet")
-
-         # Collect paths to the temporary Parquet files
-         paths = [temp_dir + file for file in os.listdir(temp_dir) if file != "index.parquet"]
-
-         # Create a ParquetWriter for the final output file
-         first_pf = pq.ParquetFile(paths[0])
-         writer = pq.ParquetWriter(output_file, first_pf.schema)
-
-         for i in range(index_pf.num_row_groups):
-             # Read index chunk
-             index_chunk = index_pf.read_row_group(i).to_pandas()
-
-             # Dataframe to store chunk data
-             df = pd.DataFrame(index=index_chunk['index'])
-
-             for path in paths:
-                 pf = pq.ParquetFile(path)
-                 # Read data chunk
-                 data_chunk = pf.read_row_group(i).to_pandas()
-
-                 # Concatenate data chunk to the dataframe
-                 df = pd.concat([df, data_chunk], axis=1)
-
-             # Write the chunk to the output file
-             writer.write_table(pa.Table.from_pandas(df))
-
-         # Close the writer and release resources
-         writer.close()
-
-         # Remove temporary files
-         for file in os.listdir(temp_dir):
-             os.remove(temp_dir + file)
+ import json
+ import logging
+ import tokenize
+ from abc import abstractmethod, ABC
+ from io import StringIO
+ from pathlib import Path
+ from typing import Optional
+
+ import pyarrow as pa
+ import os
+
+ import numpy as np
+ import pandas as pd
+ from omf import OMFReader, VolumeGridGeometry
+ import pyarrow.parquet as pq
+ from pandera import DataFrameSchema
+
+
+ class BaseReader(ABC):
+
+     def __init__(self, file_path: Path):
+         self.logger = logging.getLogger(self.__class__.__name__)
+         self.file_path: Path = file_path
+         self.variables_in_file: list[str] = []
+         self.records_in_file: int = 0
+
+     @staticmethod
+     def _parse_query_columns(query) -> list[str]:
+         # Create a list to store the column names
+         column_names = []
+
+         # Tokenize the query string
+         for token in tokenize.generate_tokens(StringIO(query).readline):
+             token_type, token_string, _, _, _ = token
+
+             # If the token is a name, and it's not a built-in Python name, add it to the list
+             if token_type == tokenize.NAME and token_string not in __builtins__:
+                 column_names.append(token_string)
+
+         return column_names
+
+     @abstractmethod
+     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None) -> pd.DataFrame:
+         pass
+
+     @abstractmethod
+     def get_index(self) -> pd.Index:
+         pass
+
+     def validate(self, schema_file: Path, data: Optional[pd.DataFrame]) -> pd.DataFrame:
+         """Validate using a pandera schema
+
+         This method does not leverage multiprocessing, and loads the entire dataframe into memory.
+         Args:
+             schema_file: The path to the schema yaml file
+             data: The data to validate, if not provided, the underlying read method will be called.
+         Returns:
+             The coerced DataFrame after validation
+         """
+         import pandera as pa
+         schema: DataFrameSchema = pa.DataFrameSchema.from_yaml(schema_file)
+         if data:
+             df = data
+         else:
+             df = self.read()
+         schema.validate(df, lazy=True, inplace=True)
+         return df
+
+     def preprocess(self, negative_to_nan_threshold: Optional[float] = -1,
+                    not_detected_assays_threshold: Optional[float] = 0.5) -> pd.DataFrame:
+         """
+         Preprocess the data by managing negative values.
+         Args:
+             negative_to_nan_threshold: Values below this threshold will be replaced with NaN
+             not_detected_assays_threshold: Values above this threshold will be replaced with half the absolute value
+
+         Returns:
+             The preprocessed DataFrame, with no negatives and no values above the not_detected_assays_threshold.
+
+         """
+         if negative_to_nan_threshold > 0:
+             raise ValueError("The negative_to_nan_threshold must be less than or equal to zero or None.")
+         if not_detected_assays_threshold > 0:
+             raise ValueError("The not_detected_assays_threshold must be less than or equal to zero or None")
+
+         df = self.read()
+
+         # detect numeric columns
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+         if negative_to_nan_threshold:
+             df.loc[df[numeric_cols] < negative_to_nan_threshold, numeric_cols] = np.nan
+         if not_detected_assays_threshold:
+             mask = (df[numeric_cols] > not_detected_assays_threshold) and (df[numeric_cols] < 0)
+             df.loc[mask, numeric_cols] = np.abs(df.loc[mask, numeric_cols]) / 2
+         return df
+
+
+ class ParquetFileReader(BaseReader):
+     """
+     Read a Parquet file
+     """
+
+     def __init__(self, file_path: Path):
+         """
+         Initialize the parquet reader. While not enforced, it is expected that the file is indexed by x, y, z, or
+         x, y, z, dx, dy, dz
+         Args:
+             file_path: The path to the Parquet file.
+         """
+         super().__init__(file_path)
+         self.variables_in_file = self._get_parquet_columns()
+         self.records_in_file = self._get_parquet_length()
+
+     def _get_parquet_columns(self):
+         parquet_file = pq.ParquetFile(self.file_path)
+         metadata: dict = self.get_parquet_metadata()
+         cols = [col for col in parquet_file.schema.names if col not in metadata['index_columns']]
+         return cols
+
+     def _get_parquet_length(self):
+         parquet_file = pq.ParquetFile(self.file_path)
+         return parquet_file.metadata.num_rows
+
+     def get_parquet_metadata(self) -> dict:
+         parquet_file = pq.ParquetFile(self.file_path)
+         pd_metadata_bytes = parquet_file.metadata.metadata.get(b'pandas')
+         pd_metadata_str: str = pd_metadata_bytes.decode('utf-8')
+         return json.loads(pd_metadata_str)
+
+     def get_index(self) -> pd.Index:
+         parquet_file = pq.ParquetFile(self.file_path)
+         pd_metadata: dict = self.get_parquet_metadata()
+         index_columns = pd_metadata['index_columns']
+         # deal with the single range index case
+         if len(index_columns) == 1:
+             if index_columns[0].get('kind') == 'range':
+                 df_index = pd.Index(
+                     range(index_columns[0].get('start'), index_columns[0].get('stop'), index_columns[0].get('step')))
+             else:
+                 df_index = pd.Index(parquet_file.read(columns=index_columns[0].get('name')).to_pandas())
+         else:
+             # extract the pd.MultiIndex
+             df_index = parquet_file.read(columns=index_columns).to_pandas().index
+         return df_index
+
+     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
+              with_index: bool = True) -> pd.DataFrame:
+         # If no columns are specified, load all columns
+         if not columns:
+             columns = self.variables_in_file
+         else:
+             # Check if the columns specified are valid
+             for col in columns:
+                 if col not in self.variables_in_file:
+                     raise ValueError(f"Column '{col}' not found in the Parquet file: {self.file_path}. "
+                                      f"Available columns are: {self.variables_in_file}")
+
+         # If a query is specified, parse it to find the columns involved
+         if query:
+             query_columns = self._parse_query_columns(query)
+             # Load only the columns involved in the query
+             parquet_file = pq.ParquetFile(self.file_path)
+             df_query = parquet_file.read(columns=query_columns).to_pandas() # Apply the query to the DataFrame
+             df_query = df_query.query(query)
+             # Get the indices of the rows that match the query
+             query_indices = df_query.index
+             # Load the remaining columns, but only for the rows that match the query
+             remaining_columns = [col for col in columns if col not in query_columns]
+             if remaining_columns:
+                 chunks = []
+                 for col in remaining_columns:
+                     df_col = parquet_file.read(columns=[col]).to_pandas()
+                     chunks.append(df_col.loc[query_indices])
+                 # Concatenate the query DataFrame and the remaining DataFrame
+                 df = pd.concat([df_query, *chunks], axis=1)
+             else:
+                 df = df_query
+             if with_index:
+                 df_index: pd.Index = self.get_index()[query_indices]
+                 df.set_index(df_index, inplace=True, drop=True)
+
+         else:
+             # If no query is specified, load the specified columns
+             df = pd.read_parquet(self.file_path, columns=columns)
+             if with_index is False:
+                 df.reset_index(drop=True, inplace=True)
+
+         return df
+
+
+ class OMFFileReader(BaseReader):
+     """
+     Read an OMF file
+     """
+
+     def __init__(self, file_path, element: str):
+         """
+         Initialize the OMF file reader. The element must be a VolumeElement in the OMF file.
+         Args:
+             file_path: The path to the OMF file
+             element: The name of the element in the OMF file to be validated. E.g. 'Block Model'
+         """
+         super().__init__(file_path)
+
+         # check that the element provided is a valid VolumeElement in the OMF file.
+         self.elements = OMFReader(str(file_path)).get_project().elements
+         self.element_names = [e.name for e in self.elements]
+         if element not in self.element_names:
+             raise ValueError(f"Element '{element}' not found in the OMF file: {file_path}. Available elements are:"
+                              f" {list(self.elements.keys())}")
+         elif self.get_element_by_name(element).__class__.__name__ != 'VolumeElement':
+             raise ValueError(f"Element '{element}' is not a VolumeElement in the OMF file: {file_path}")
+
+         self.element = self.get_element_by_name(element)
+
+         self.variables_in_file = [v.name for v in self.element.data]
+         self.records_in_file = len(self.element.data[0].array.array)
+
+     def get_element_by_name(self, element_name: str):
+         # get the index of the element in order to index into elements
+         element_index = self.element_names.index(element_name)
+         return self.elements[element_index]
+
+     def read(self, columns: Optional[list[str]] = None, query: Optional[str] = None,
+              with_index: bool = True) -> pd.DataFrame:
+         # Get the VolumeElement from the OMF file
+         # volume_element = OMFReader(self.file_path).get_project().elements[self.element]
+
+         # If no columns are specified, load all columns
+         if not columns:
+             columns = self.variables_in_file
+         else:
+             # Check if the columns specified are valid
+             for col in columns:
+                 if col not in self.variables_in_file:
+                     raise ValueError(f"Column '{col}' not found in the VolumeElement: {self.element}")
+
+         # If a query is specified, parse it to find the columns involved
+         if query:
+             query_columns = self._parse_query_columns(query)
+             # Load only the columns involved in the query
+             df_query: pd.DataFrame = self.read_volume_variables(self.element, variables=query_columns)
+             # Apply the query to the DataFrame
+             df_query = df_query.query(query)
+             # Get the indices of the rows that match the query
+             query_indices = df_query.index
+             # Load the remaining columns, but only for the rows that match the query
+             remaining_columns = [col for col in columns if col not in query_columns]
+             if remaining_columns:
+                 chunks = []
+                 for col in remaining_columns:
+                     data_array = self.read_volume_variables(self.element, variables=[col])
+                     # Filter the numpy array using the query indices
+                     filtered_data_array = data_array[query_indices]
+                     # Convert the filtered numpy array to a DataFrame
+                     chunks.append(pd.DataFrame(filtered_data_array, columns=[col]))
+                 # Concatenate the query DataFrame and the remaining DataFrame
+                 df = pd.concat([df_query, *chunks], axis=1)
+             else:
+                 df = df_query
+         else:
+             # If no query is specified, load the specified columns
+             df = self.read_volume_variables(self.element, variables=columns)
+
+         # add the index
+         if with_index:
+             df.set_index(self.get_index(), inplace=True, drop=True)
+
+         return df
+
+     def get_index(self) -> pd.MultiIndex:
+
+         geometry: VolumeGridGeometry = self.element.geometry
+         ox, oy, oz = geometry.origin
+
+         # Make coordinates (points) along each axis, i, j, k
+         i = ox + np.cumsum(geometry.tensor_u)
+         i = np.insert(i, 0, ox)
+         j = oy + np.cumsum(self.element.geometry.tensor_v)
+         j = np.insert(j, 0, oy)
+         k = oz + np.cumsum(self.element.geometry.tensor_w)
+         k = np.insert(k, 0, oz)
+
+         # convert to centroids
+         x, y, z = (i[1:] + i[:-1]) / 2, (j[1:] + j[:-1]) / 2, (k[1:] + k[:-1]) / 2
+         xx, yy, zz = np.meshgrid(x, y, z, indexing="ij")
+
+         # Calculate dx, dy, dz
+         dxx, dyy, dzz = np.meshgrid(geometry.tensor_u, geometry.tensor_v, geometry.tensor_w, indexing="ij")
+
+         # TODO: consider rotation
+
+         index = pd.MultiIndex.from_arrays([xx.ravel("F"), yy.ravel("F"), zz.ravel("F"),
+                                            dxx.ravel("F"), dyy.ravel("F"), dzz.ravel("F")],
+                                           names=['x', 'y', 'z', 'dx', 'dy', 'dz'])
+
+         if len(index) != self.records_in_file:
+             raise ValueError(f"The length of the index ({len(index)}) does not match the number of records"
+                              f" in the VolumeElement ({self.records_in_file})")
+
+         return index
+
+     def read_volume_variables(self, element: str, variables: list[str]) -> pd.DataFrame:
+         # Loop over the variables
+         chunks: list[pd.DataFrame] = []
+         for variable in variables:
+             # Check if the variable exists in the VolumeElement
+             if variable not in self.variables_in_file:
+                 raise ValueError(f"Variable '{variable}' not found in the VolumeElement: {element}")
+             chunks.append(self._get_variable_by_name(variable).ravel())
+
+         # Concatenate all chunks into a single DataFrame
+         return pd.DataFrame(np.vstack(chunks), index=variables).T
+
+     def _get_variable_by_name(self, variable_name: str):
+         # get the index of the variable in order to index into elements
+         variable_index = self.variables_in_file.index(variable_name)
+         return self.element.data[variable_index].array.array
+
+
+ class ParquetFileWriter:
+
+     def __init__(self):
+         pass
+
+     @classmethod
+     def from_column_generator(cls, index: pd.Index, column_generator):
+
+         # Path to the final output file
+         output_file = "final.parquet"
+
+         # Temp directory for storing parquet columns
+         temp_dir = "temp/"
+
+         # Ensure the temp directory exists
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Write the index to a separate Parquet file
+         index_table = pa.Table.from_pandas(index.to_frame('index'))
+         pq.write_table(index_table, temp_dir + "index.parquet")
+         index_pf = pq.ParquetFile(temp_dir + "index.parquet")
+
+         for i, column in enumerate(column_generator):
+             # Write each column to a temporary parquet file
+             table = pa.Table.from_pandas(column.to_frame())
+             pq.write_table(table, temp_dir + f"column_{i}.parquet")
+
+         # Collect paths to the temporary Parquet files
+         paths = [temp_dir + file for file in os.listdir(temp_dir) if file != "index.parquet"]
+
+         # Create a ParquetWriter for the final output file
+         first_pf = pq.ParquetFile(paths[0])
+         writer = pq.ParquetWriter(output_file, first_pf.schema)
+
+         for i in range(index_pf.num_row_groups):
+             # Read index chunk
+             index_chunk = index_pf.read_row_group(i).to_pandas()
+
+             # Dataframe to store chunk data
+             df = pd.DataFrame(index=index_chunk['index'])
+
+             for path in paths:
+                 pf = pq.ParquetFile(path)
+                 # Read data chunk
+                 data_chunk = pf.read_row_group(i).to_pandas()
+
+                 # Concatenate data chunk to the dataframe
+                 df = pd.concat([df, data_chunk], axis=1)
+
+             # Write the chunk to the output file
+             writer.write_table(pa.Table.from_pandas(df))
+
+         # Close the writer and release resources
+         writer.close()
+
+         # Remove temporary files
+         for file in os.listdir(temp_dir):
+             os.remove(temp_dir + file)
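
For orientation, the following is a minimal usage sketch of the reader classes defined in io.py above; it is not part of the package diff. The class and method names (ParquetFileReader, OMFFileReader, read, variables_in_file, records_in_file) come from the file shown, while the file paths, the Fe column, and the query string are hypothetical placeholders, and the 'Block Model' element name simply echoes the docstring example.

    from pathlib import Path

    from elphick.geomet.io import OMFFileReader, ParquetFileReader

    # Hypothetical inputs - substitute paths to your own block model data.
    parquet_path = Path("block_model.parquet")
    omf_path = Path("block_model.omf")

    # Parquet: list the available variables, then read a column subset
    # filtered by a pandas-style query string.
    pq_reader = ParquetFileReader(parquet_path)
    print(pq_reader.variables_in_file, pq_reader.records_in_file)
    df_high_grade = pq_reader.read(columns=["Fe"], query="Fe > 58")

    # OMF: read a VolumeElement ('Block Model' echoes the docstring example)
    # into a DataFrame indexed by block centroids and sizes (x, y, z, dx, dy, dz).
    omf_reader = OMFFileReader(omf_path, element="Block Model")
    df_blocks = omf_reader.read()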