parq-blockmodel 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Greg Elphick
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.3
2
+ Name: parq-blockmodel
3
+ Version: 0.1.1
4
+ Summary: A Python package for efficient storage, manipulation, and analysis of mining block models using Parquet files.
5
+ Author: Greg
6
+ Author-email: 11791585+elphick@users.noreply.github.com
7
+ Requires-Python: >=3.10,<3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Provides-Extra: blockmodel
13
+ Provides-Extra: profiling
14
+ Provides-Extra: progress
15
+ Requires-Dist: lark (>=1.2.2,<2.0.0)
16
+ Requires-Dist: numpy (>=1.25.2)
17
+ Requires-Dist: parq-tools (==0.3.1)
18
+ Requires-Dist: pyarrow (>=16.0)
19
+ Requires-Dist: pyvista (>=0.45.2,<0.46.0) ; extra == "blockmodel"
20
+ Requires-Dist: setuptools ; extra == "profiling"
21
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0) ; extra == "progress"
22
+ Requires-Dist: ydata-profiling (>=4.16.1,<5.0.0) ; extra == "profiling"
23
+ Description-Content-Type: text/markdown
24
+
25
+ # parq-blockmodel
26
+
27
+ [![Run Tests](https://github.com/Elphick/parq-blockmodel/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/Elphick/parq-blockmodel/actions/workflows/build_and_test.yml)
28
+ [![PyPI](https://img.shields.io/pypi/v/parq-blockmodel.svg?logo=python&logoColor=white)](https://pypi.org/project/parq-blockmodel/)
29
+ ![Coverage](https://raw.githubusercontent.com/elphick/parq-blockmodel/main/docs/source/_static/badges/coverage.svg)
30
+ [![Python Versions](https://img.shields.io/pypi/pyversions/parq-blockmodel.svg)](https://pypi.org/project/parq-blockmodel/)
31
+ [![License](https://img.shields.io/github/license/Elphick/parq-blockmodel.svg?logo=apache&logoColor=white)](https://pypi.org/project/parq-blockmodel/)
32
+ [![Publish Docs](https://github.com/Elphick/parq-blockmodel/actions/workflows/docs_to_gh_pages.yml/badge.svg?branch=main)](https://github.com/Elphick/parq-blockmodel/actions/workflows/docs_to_gh_pages.yml)
33
+ [![Open Issues](https://img.shields.io/github/issues/Elphick/parq-blockmodel.svg)](https://github.com/Elphick/parq-blockmodel/issues)
34
+ [![Open PRs](https://img.shields.io/github/issues-pr/Elphick/parq-blockmodel.svg)](https://github.com/Elphick/parq-blockmodel/pulls)
35
+
36
+
37
+ ## Overview
38
+ A Python package for efficient storage, manipulation, and analysis of mining block models using Parquet files.
39
+ parq-blockmodel provides tools for reading, writing, indexing, and transforming large-scale block model datasets,
40
+ leveraging the performance of Apache Arrow and Parquet for scalable geoscience data workflows.
@@ -0,0 +1,16 @@
1
+ # parq-blockmodel
2
+
3
+ [![Run Tests](https://github.com/Elphick/parq-blockmodel/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/Elphick/parq-blockmodel/actions/workflows/build_and_test.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/parq-blockmodel.svg?logo=python&logoColor=white)](https://pypi.org/project/parq-blockmodel/)
5
+ ![Coverage](https://raw.githubusercontent.com/elphick/parq-blockmodel/main/docs/source/_static/badges/coverage.svg)
6
+ [![Python Versions](https://img.shields.io/pypi/pyversions/parq-blockmodel.svg)](https://pypi.org/project/parq-blockmodel/)
7
+ [![License](https://img.shields.io/github/license/Elphick/parq-blockmodel.svg?logo=apache&logoColor=white)](https://pypi.org/project/parq-blockmodel/)
8
+ [![Publish Docs](https://github.com/Elphick/parq-blockmodel/actions/workflows/docs_to_gh_pages.yml/badge.svg?branch=main)](https://github.com/Elphick/parq-blockmodel/actions/workflows/docs_to_gh_pages.yml)
9
+ [![Open Issues](https://img.shields.io/github/issues/Elphick/parq-blockmodel.svg)](https://github.com/Elphick/parq-blockmodel/issues)
10
+ [![Open PRs](https://img.shields.io/github/issues-pr/Elphick/parq-blockmodel.svg)](https://github.com/Elphick/parq-blockmodel/pulls)
11
+
12
+
13
+ ## Overview
14
+ A Python package for efficient storage, manipulation, and analysis of mining block models using Parquet files.
15
+ parq-blockmodel provides tools for reading, writing, indexing, and transforming large-scale block model datasets,
16
+ leveraging the performance of Apache Arrow and Parquet for scalable geoscience data workflows.
@@ -0,0 +1,13 @@
1
+ import os
2
+
3
+ os.environ["YDATA_SUPPRESS_BANNER"] = "1"
4
+
5
+ from importlib import metadata
6
+ from .blockmodel import ParquetBlockModel
7
+ from .geometry import RegularGeometry
8
+
9
try:
    __version__ = metadata.version('parq_blockmodel')
except metadata.PackageNotFoundError:
    # The distribution is not installed (e.g. running from a source checkout).
    # Still define ``__version__`` so attribute access never raises.
    __version__ = "0.0.0+unknown"
@@ -0,0 +1,399 @@
1
+ """
2
+ blockmodel.py
3
+
4
+ This module defines the ParquetBlockModel class, which represents a block model stored in a Parquet file.
5
+
6
+ Main API:
7
+
8
+ - ParquetBlockModel: Class for representing a block model stored in a Parquet file.
9
+
10
+ """
11
+ import logging
12
+ import math
13
+ import shutil
14
+ import warnings
15
+ from pathlib import Path
16
+ from typing import Union, Optional
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import pyarrow as pa
21
+ import pyarrow.parquet as pq
22
+
23
+ from parq_blockmodel.utils import create_demo_blockmodel, rotation_to_axis_orientation
24
+ from parq_blockmodel.utils.pyvista_utils import df_to_pv_structured_grid, df_to_pv_unstructured_grid
25
+ from parq_tools.lazy_parquet import LazyParquetDataFrame
26
+ from pyarrow.parquet import ParquetFile
27
+ from tqdm import tqdm
28
+
29
+ from parq_tools import ParquetProfileReport
30
+ from parq_tools.utils import atomic_output_file
31
+
32
+ from parq_blockmodel.geometry import RegularGeometry
33
+
34
# A 3D coordinate given either as a 3-tuple or as a list of floats.
# ``list`` takes a single type parameter, hence ``list[float]``.
Point = Union[tuple[float, float, float], list[float]]
# Three float values given either as a 3-tuple or as a list of floats.
Triple = Union[tuple[float, float, float], list[float]]
36
+
37
+
38
+ class ParquetBlockModel:
39
+ """
40
+ A class to represent a **regular** Parquet block model.
41
+
42
+ Block ordering is c-style, ordered by x, y, z coordinates.
43
+
44
+ Attributes:
45
+ blockmodel_path (Path): The file path to the blockmodel Parquet file. This file is the source of the
46
+ block model data. Consider a .pbm.parquet extension to imply a ParquetBlockModel file.
47
+ name (str): The name of the block model, derived from the file name.
48
+ block_path (Path): The original file path from which the block model will be created.
49
+ geometry (RegularGeometry): The geometry of the block model, derived from the Parquet file.
50
+ """
51
+
52
def __init__(self, blockmodel_path: Optional[Path] = None, name: Optional[str] = None,
             block_path: Optional[Path] = None,
             geometry: Optional[RegularGeometry] = None):
    """Open a block model stored in a Parquet file.

    Args:
        blockmodel_path: Path to the block model Parquet file. When omitted it is
            derived from ``block_path`` by swapping the suffix for '.pbm.parquet',
            and ``block_path`` is copied to that location.
        name: Name of the block model. Defaults to the file stem without a
            trailing '.pbm'.
        block_path: The original file the block model is created from. Defaults to
            ``blockmodel_path``.
        geometry: Geometry of the block model. When omitted it is read from the
            Parquet file.

    Raises:
        ValueError: If neither path is supplied, or if the model is sparse and its
            centroids are not a subset of the dense grid.
    """
    if blockmodel_path is None:
        if block_path is None:
            raise ValueError("Either 'blockmodel_path' or 'block_path' must be provided.")
        # Derive the .pbm.parquet path from block_path and work on a copy.
        block_path = Path(block_path)
        blockmodel_path = block_path.with_suffix('.pbm.parquet')
        shutil.copy(block_path, blockmodel_path)
    # Accept plain strings as well as Path objects.
    blockmodel_path = Path(blockmodel_path)

    self.blockmodel_path = blockmodel_path
    # removesuffix drops exactly one trailing '.pbm'; str.strip('.pbm') would
    # instead strip any of the characters '.', 'p', 'b', 'm' from both ends.
    self.name = name or blockmodel_path.stem.removesuffix('.pbm')
    self.block_path = block_path or blockmodel_path
    self.pf: ParquetFile = ParquetFile(blockmodel_path)
    self.report_path: Optional[Path] = None
    self.geometry: Optional[RegularGeometry] = geometry
    if self.geometry is None and blockmodel_path.exists():
        self.geometry = RegularGeometry.from_parquet(self.blockmodel_path)
    self.data: LazyParquetDataFrame = LazyParquetDataFrame(self.blockmodel_path)
    self.columns: list[str] = pq.read_schema(self.blockmodel_path).names
    # Populated lazily by the ``centroid_index`` property.
    self._centroid_index: Optional[pd.MultiIndex] = None
    self.attributes: list[str] = [col for col in self.columns if col not in ["x", "y", "z"]]
    self._extract_column_dtypes()
    self._logger = logging.getLogger(__name__)

    if self.is_sparse:
        if not self.validate_sparse():
            raise ValueError("The sparse ParquetBlockModel is invalid. "
                             "Sparse centroids must be a subset of the dense grid.")
80
+
81
def __repr__(self):
    """Return a short description made of the model name and its file path."""
    return "ParquetBlockModel(name={}, path={})".format(self.name, self.blockmodel_path)
83
+
84
def _extract_column_dtypes(self):
    """Record the pandas dtype of every attribute column from the Parquet schema.

    Fills ``self.column_dtypes`` for each column other than x, y, z. Dictionary
    encoded columns are mapped to a categorical dtype and their ``ordered`` flag
    is additionally stored in ``self._column_categorical_ordered``.
    """
    self.column_dtypes: dict[str, np.dtype] = {}
    self._column_categorical_ordered: dict[str, bool] = {}
    dtypes, ordered_flags = self.column_dtypes, self._column_categorical_ordered

    parquet_schema = pq.read_schema(self.blockmodel_path)
    attribute_names = (name for name in self.columns if name not in ("x", "y", "z"))
    for name in attribute_names:
        arrow_type = parquet_schema.field(name).type
        if not pa.types.is_dictionary(arrow_type):
            dtypes[name] = arrow_type.to_pandas_dtype()
            continue
        # Dictionary-encoded Arrow columns correspond to pandas categoricals.
        dtypes[name] = pd.CategoricalDtype(ordered=arrow_type.ordered)
        ordered_flags[name] = arrow_type.ordered
97
+
98
@property
def column_categorical_ordered(self) -> dict[str, bool]:
    """Return a copy of the mapping from categorical column name to its ordered flag."""
    return dict(self._column_categorical_ordered)
101
+
102
@property
def centroid_index(self) -> pd.MultiIndex:
    """
    Get the centroid index of the block model.

    The index is read from the Parquet file on first access and cached.

    Returns:
        pd.MultiIndex: The MultiIndex representing the centroid coordinates (x, y, z).

    Raises:
        ValueError: If the file holds no centroid data, the centroids are not
            unique, or (for non-rotated geometries) they are not sorted ascending.
    """
    if self._centroid_index is not None:
        return self._centroid_index

    xyz = ["x", "y", "z"]
    frame: pd.DataFrame = pq.read_table(self.blockmodel_path, columns=xyz).to_pandas()

    already_indexed = frame.index.names == xyz
    if already_indexed:
        candidate = frame.index
    else:
        if frame.empty:
            raise ValueError("Parquet file is empty or does not contain valid centroid data.")
        candidate = frame.set_index(["x", "y", "z"]).index

    if not candidate.is_unique:
        raise ValueError("The index of the Parquet file is not unique. "
                         "Ensure that the centroid coordinates (x, y, z) are unique.")

    # Only check monotonicity if axes are aligned (not rotated)
    must_be_sorted = not self.geometry.is_rotated
    if must_be_sorted and not candidate.is_monotonic_increasing:
        raise ValueError("The index of the Parquet file is not sorted in ascending order. "
                         "Ensure that the centroid coordinates (x, y, z) are sorted.")

    self._centroid_index = candidate
    return self._centroid_index
131
+
132
@property
def is_sparse(self) -> bool:
    """Return True when the file holds fewer blocks than the dense grid of the geometry."""
    dense_count = len(self.geometry.to_multi_index())
    stored_count = len(self.centroid_index)
    return stored_count < dense_count
136
+
137
@property
def sparsity(self) -> float:
    """Return the fraction of dense-grid blocks that are absent from the file."""
    dense_count = len(self.geometry.to_multi_index())
    stored_count = len(self.centroid_index)
    return 1.0 - (stored_count / dense_count)
141
+
142
@property
def index_c(self) -> np.ndarray:
    """Zero-based C-order (x, y, z) indices for the dense grid."""
    dims = self.geometry.shape
    linear = np.arange(np.prod(dims))
    return np.ravel(np.reshape(linear, dims, order='C'), order='C')
147
+
148
@property
def index_f(self) -> np.ndarray:
    """Zero-based F-order (z, y, x) indices for the dense grid.

    Element ``p`` is the C-order index of the block found at position ``p`` when
    the grid is flattened with the first axis varying fastest.
    """
    dims = self.geometry.shape
    linear = np.arange(np.prod(dims))
    return np.ravel(np.reshape(linear, dims, order='C'), order='F')
153
+
154
def validate_sparse(self) -> bool:
    """Check that every stored centroid is part of the dense grid of the geometry."""
    full_grid = self.geometry.to_multi_index()
    # All sparse centroids must be in the dense grid
    membership = self.centroid_index.isin(full_grid)
    return membership.all()
158
+
159
@classmethod
def from_parquet(cls, parquet_path: Path, overwrite: bool = False) -> "ParquetBlockModel":
    """Create a ParquetBlockModel instance from a Parquet file.

    The source file is copied next to itself (resolved path) with its final
    suffix replaced by '.pbm.parquet'; the instance is opened on that copy.

    Args:
        parquet_path (Path): The path to the Parquet file.
        overwrite (bool): If True, allows a file that already ends in '.pbm.parquet'
            to be used as the source. Defaults to False.

    Returns:
        ParquetBlockModel: The model named after the stem of ``parquet_path``.

    Raises:
        ValueError: If ``parquet_path`` already ends in '.pbm.parquet' and
            ``overwrite`` is False.
    """
    already_compliant = parquet_path.suffixes[-2:] == [".pbm", ".parquet"]
    if already_compliant and not overwrite:
        raise ValueError(
            f"File {parquet_path} appears to be a compliant ParquetBlockModel file. "
            f"Use the constructor directly, or pass overwrite=True to allow mutation."
        )
    target = parquet_path.resolve().with_suffix(".pbm.parquet")
    new_filepath = shutil.copy(parquet_path, target)
    return cls(name=parquet_path.stem, blockmodel_path=new_filepath, block_path=parquet_path)
176
+
177
@classmethod
def create_demo_block_model(cls, filename: Path,
                            shape=(3, 3, 3),
                            block_size=(1, 1, 1),
                            corner=(-0.5, -0.5, -0.5),
                            azimuth: float = 0.0,
                            dip: float = 0.0,
                            plunge: float = 0.0) -> "ParquetBlockModel":
    """
    Write a demo block model to disk and open it.

    Args:
        filename (Path): The file path where the Parquet file will be saved.
        shape (tuple): The shape of the block model.
        block_size (tuple): The size of each block.
        corner (tuple): The coordinates of the corner of the block model.
        azimuth (float): The azimuth angle in degrees for rotation.
        dip (float): The dip angle in degrees for rotation.
        plunge (float): The plunge angle in degrees for rotation.

    Returns:
        ParquetBlockModel: An instance of ParquetBlockModel with demo data.
    """
    rotation = {"azimuth": azimuth, "dip": dip, "plunge": plunge}
    create_demo_blockmodel(shape=shape, block_size=block_size, corner=corner,
                           parquet_filepath=filename, **rotation)

    # The geometry must use the same axis orientation as the generated demo data.
    axis_u, axis_v, axis_w = rotation_to_axis_orientation(**rotation)
    demo_geometry = RegularGeometry(block_size=block_size, corner=corner, shape=shape,
                                    axis_u=axis_u, axis_v=axis_v, axis_w=axis_w)

    return cls(geometry=demo_geometry, block_path=filename)
210
+
211
@classmethod
def from_geometry(cls, geometry: RegularGeometry, path: Path, name: Optional[str] = None) -> "ParquetBlockModel":
    """Write the dataframe of a geometry to ``path`` and open it as a block model.

    Args:
        geometry: The geometry whose ``to_dataframe()`` output is written.
        path: Destination Parquet file; written without a pandas index.
        name: Optional name of the block model.

    Returns:
        ParquetBlockModel: The model opened on ``path`` with the given geometry.
    """
    geometry.to_dataframe().to_parquet(path, index=False)
    return cls(blockmodel_path=path, name=name, geometry=geometry)
216
+
217
def create_report(self, columns: Optional[list[str]] = None,
                  column_batch_size: int = 10,
                  show_progress: bool = True, open_in_browser: bool = False) -> Path:
    """
    Create a ydata-profiling report for the block model.
    The report will be of the same name as the block model, with a '.html' extension.

    Args:
        columns: List of column names to include in the profile. If None, all columns are used.
        column_batch_size: The number of columns to process in each batch.
        show_progress: If True, displays a progress bar during profiling.
        open_in_browser: If True, opens the report in a web browser after generation.

    Returns:
        Path: The value of ``self.report_path``.

    NOTE(review): ``report_path`` is only updated when no column subset is
    requested; with a subset the previously stored value (None on first use) is
    returned. Confirm this is intended.
    """
    profiler = ParquetProfileReport(self.blockmodel_path, columns=columns,
                                    batch_size=column_batch_size,
                                    show_progress=show_progress)
    report: ParquetProfileReport = profiler.profile()

    if open_in_browser:
        report.show(notebook=False)

    if not columns:
        self.report_path = self.blockmodel_path.with_suffix('.html')
    return self.report_path
242
+
243
def plot(self, scalar: str, threshold: bool = True, show_edges: bool = True,
         show_axes: bool = True) -> 'pv.Plotter':
    """Build a PyVista plotter showing one attribute of the block model.

    Args:
        scalar: Name of the attribute column to display.
        threshold: If True, add the mesh with ``add_mesh_threshold``; otherwise
            with ``add_mesh``.
        show_edges: Passed through to the mesh call.
        show_axes: If True, call ``show_axes()`` on the plotter.

    Returns:
        pv.Plotter: The configured plotter; it is returned without being shown.

    Raises:
        ValueError: If ``scalar`` is not one of the model attributes.
    """
    import pyvista as pv
    if scalar not in self.attributes:
        raise ValueError(f"Column '{scalar}' not found in the ParquetBlockModel.")

    plotter = pv.Plotter()
    mesh = self.to_pyvista(attributes=[scalar])

    # Pick the thresholded or the plain mesh call; both take the same arguments.
    add_to_scene = plotter.add_mesh_threshold if threshold else plotter.add_mesh
    add_to_scene(mesh, scalars=scalar, show_edges=show_edges)

    plotter.title = self.name
    if show_axes:
        plotter.show_axes()

    return plotter
265
+
266
def read(self, columns: Optional[list[str]] = None,
         with_index: bool = True, dense: bool = False) -> pd.DataFrame:
    """
    Read the Parquet file and return a DataFrame.

    Args:
        columns: List of column names to read. If None, all columns are read.
        with_index: If True, includes the index ('x', 'y', 'z') in the DataFrame.
        dense: If True, the rows are expanded to the dense grid of the geometry;
            blocks absent from the file are filled with missing values.
            If False, only the stored blocks are returned.

    Returns:
        pd.DataFrame: The DataFrame containing the block model data.

    Raises:
        ValueError: If ``dense`` is True and the file holds as many blocks as the
            dense grid but their centroids differ from it.
    """
    if columns is None:
        columns = self.columns
    df = pq.read_table(self.blockmodel_path, columns=columns).to_pandas()

    # Expanding to the dense grid aligns rows by centroid, so the centroid index
    # is needed for ``dense`` even when the caller does not want it returned.
    if with_index or dense:
        df.index = self.centroid_index

    if dense:
        dense_index = self.geometry.to_multi_index()
        # Explicit check rather than ``assert``, which is stripped under -O.
        if len(df) == len(dense_index) and not df.index.equals(dense_index):
            raise ValueError("The centroids of the block model do not match the dense grid "
                             "of its geometry.")
        df = df.reindex(dense_index)
        if not with_index:
            # Honour with_index=False: keep dense row order, drop the centroids.
            df = df.reset_index(drop=True)
    return df
290
+
291
def to_pyvista(self, attributes: Optional[list[str]] = None) -> 'pv.ImageData':
    """Return the geometry grid with the requested attributes attached as cell data.

    Args:
        attributes: Attribute columns to attach. If None, all attributes are used.

    Returns:
        pv.ImageData: The grid produced by ``geometry.to_pyvista()`` with one
        ``cell_data`` array per attribute.
    """
    if attributes is None:
        attributes = self.attributes

    grid = self.geometry.to_pyvista()
    # Read with the centroid index so that the rows align with the dense grid.
    df = self.read(columns=attributes, with_index=True, dense=True)
    # ``index_f[p]`` is the C-order row that belongs at F-order position ``p``,
    # so the rows are gathered with it. Sorting by it would apply the inverse
    # permutation, which only coincides for shapes such as cubes.
    df = df.iloc[self.index_f]

    for col in attributes:
        grid.cell_data[col] = df[col].values

    return grid
306
+
307
@staticmethod
def _validate_geometry(filepath: Path, geometry: Optional[RegularGeometry] = None) -> None:
    """
    Validate that the centroid columns of a Parquet file describe a regular grid.

    Args:
        filepath (Path): Path to the Parquet file.
        geometry (RegularGeometry, optional): The geometry of the block model. If None, it will be derived from
            the Parquet file.

    Raises:
        ValueError: If an index column is missing, contains nulls, has fewer than two
            unique values, or (for non-rotated geometries) is not evenly spaced.
    """
    axes = ['x', 'y', 'z']
    available = pq.read_schema(filepath).names
    if any(axis not in available for axis in axes):
        raise ValueError(f"Missing index columns in the dataset: {', '.join(axes)}")

    coords = pq.read_table(filepath, columns=axes)
    for axis in axes:
        if coords[axis].null_count > 0:
            raise ValueError(f"Column '{axis}' contains NaN values, which is not allowed in the index columns.")

    # Sorted unique coordinates per axis.
    levels = [np.sort(coords[axis].to_pandas().unique()) for axis in axes]
    if any(len(values) < 2 for values in levels):
        raise ValueError(
            "The geometry is not regular. At least two unique values are required in each index column.")

    # Only check regular spacing if not rotated
    if geometry is None:
        geometry = RegularGeometry.from_parquet(filepath)
    if not geometry.is_rotated:
        def evenly_spaced(values, tol=1e-8):
            steps = np.diff(values)
            return np.all(np.abs(steps - steps[0]) < tol)

        if not all(evenly_spaced(values) for values in levels):
            raise ValueError(
                "The geometry is not regular. The index columns must be evenly spaced (regular grid) in x, y, and z.")

    logging.info(f"Geometry validation completed successfully for {filepath}.")
352
+
353
@staticmethod
def _validate_and_load_data(df, expected_num_blocks):
    """Accept a frame that has x, y, z columns or whose row count matches the model.

    Args:
        df: The data to check.
        expected_num_blocks: Row count required when x, y, z are absent.

    Returns:
        The unchanged ``df``.

    Raises:
        ValueError: If x, y, z are missing and the row count does not match.
    """
    has_coordinates = {'x', 'y', 'z'}.issubset(df.columns)
    if has_coordinates:
        return df
    if len(df) != expected_num_blocks:
        raise ValueError("Data missing x, y, z and row count does not match block model.")
    warnings.warn("Data loaded without x, y, z columns. "
                  "Order is assumed to match the block model geometry.")
    return df
363
+
364
def to_dense_parquet(self, filepath: Path,
                     chunk_size: int = 100_000, show_progress: bool = False) -> None:
    """
    Save the block model to a Parquet file as a dense grid.

    The attribute columns are aligned once with the dense grid of the geometry,
    so blocks absent from the source file become rows of missing values. The
    result is then written in chunks of ``chunk_size`` rows. The alignment is
    done in memory, so the dense table must fit in memory.

    Args:
        filepath (Path): The file path where the Parquet file will be saved.
        chunk_size (int): The number of blocks to save in each chunk. Defaults to 100_000.
        show_progress (bool): If True, show a progress bar. Defaults to False.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer.")

    attributes = [col for col in self.columns if col not in ("x", "y", "z")]
    if attributes:
        df = pq.read_table(self.blockmodel_path, columns=attributes).to_pandas()
        df.index = self.centroid_index
    else:
        # Centroid-only model: nothing to read besides the index itself.
        df = pd.DataFrame(index=self.centroid_index)

    # Reindex ONCE against the dense grid. Doing this per batch would emit a
    # full-size grid for every batch read from the source file.
    table = pa.Table.from_pandas(df.reindex(self.geometry.to_multi_index()))
    total_batches = max(math.ceil(table.num_rows / chunk_size), 1)

    # NOTE(review): atomic_output_file is presumed to move tmp_path into place
    # only when the block exits without error; confirm against parq_tools.
    with atomic_output_file(filepath) as tmp_path:
        with pq.ParquetWriter(tmp_path, table.schema) as writer, \
                tqdm(total=total_batches, desc="Exporting", disable=not show_progress) as progress:
            for start in range(0, table.num_rows, chunk_size):
                # slice() is a zero-copy view, so every chunk shares one schema.
                writer.write_table(table.slice(start, chunk_size))
                progress.update(1)