advanced-excel 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- advanced_excel-2.0.0/LICENSE +21 -0
- advanced_excel-2.0.0/PKG-INFO +201 -0
- advanced_excel-2.0.0/README.md +157 -0
- advanced_excel-2.0.0/pyproject.toml +90 -0
- advanced_excel-2.0.0/setup.cfg +4 -0
- advanced_excel-2.0.0/setup.py +10 -0
- advanced_excel-2.0.0/src/advanced_excel/__init__.py +19 -0
- advanced_excel-2.0.0/src/advanced_excel/blocks.py +171 -0
- advanced_excel-2.0.0/src/advanced_excel/cleaning.py +420 -0
- advanced_excel-2.0.0/src/advanced_excel/columns.py +444 -0
- advanced_excel-2.0.0/src/advanced_excel/core.py +87 -0
- advanced_excel-2.0.0/src/advanced_excel/io.py +167 -0
- advanced_excel-2.0.0/src/advanced_excel/lookup.py +229 -0
- advanced_excel-2.0.0/src/advanced_excel/reshape.py +133 -0
- advanced_excel-2.0.0/src/advanced_excel/rows.py +519 -0
- advanced_excel-2.0.0/src/advanced_excel.egg-info/PKG-INFO +201 -0
- advanced_excel-2.0.0/src/advanced_excel.egg-info/SOURCES.txt +19 -0
- advanced_excel-2.0.0/src/advanced_excel.egg-info/dependency_links.txt +1 -0
- advanced_excel-2.0.0/src/advanced_excel.egg-info/requires.txt +18 -0
- advanced_excel-2.0.0/src/advanced_excel.egg-info/top_level.txt +1 -0
- advanced_excel-2.0.0/tests/test_advanced_excel.py +584 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Antonio Castellon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: advanced-excel
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Advanced Excel data processing and manipulation library for Python
|
|
5
|
+
Author-email: Antonio Castellon <antonio@castellon.ch>
|
|
6
|
+
Maintainer-email: Antonio Castellon <antonio@castellon.ch>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/antonio-castellon/AdvancedExcel
|
|
9
|
+
Project-URL: Repository, https://github.com/antonio-castellon/AdvancedExcel
|
|
10
|
+
Project-URL: Issues, https://github.com/antonio-castellon/AdvancedExcel/issues
|
|
11
|
+
Keywords: excel,pandas,data-processing,xlsx,data-cleaning,openpyxl
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: pandas>=2.1.0
|
|
29
|
+
Requires-Dist: openpyxl>=3.1.2
|
|
30
|
+
Requires-Dist: numpy>=1.25.2
|
|
31
|
+
Requires-Dist: pyxlsb>=1.0.10
|
|
32
|
+
Requires-Dist: xlrd>=2.0.1
|
|
33
|
+
Provides-Extra: sspipe
|
|
34
|
+
Requires-Dist: sspipe>=0.1.17; extra == "sspipe"
|
|
35
|
+
Provides-Extra: legacy
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
39
|
+
Requires-Dist: ruff; extra == "dev"
|
|
40
|
+
Requires-Dist: black; extra == "dev"
|
|
41
|
+
Requires-Dist: mypy; extra == "dev"
|
|
42
|
+
Requires-Dist: pandas-stubs; extra == "dev"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
## AdvancedExcel: Simplifying Excel Data Processing in Python
|
|
46
|
+
|
|
47
|
+
**v2.0+ note:** The public API now uses PEP 8 `snake_case` method names (e.g. `get_sheet`, `remove_empty_rows`, `get_dataframe_blocks_by_key_name`). This is a breaking change from v1.x.
|
|
48
|
+
|
|
49
|
+
The `AdvancedExcel` class provides a powerful and convenient way to interact with Excel files in Python. It streamlines common Excel data processing tasks, from reading and extracting data to cleaning, transforming, and manipulating it using pandas DataFrames. This class aims to reduce the boilerplate code often associated with Excel handling, making your data analysis workflows more efficient and readable.
|
|
50
|
+
|
|
51
|
+
**Key Strengths:**
|
|
52
|
+
|
|
53
|
+
* **Effortless Data Extraction:** Easily read data from Excel sheets, including handling complexities like merged cells, multiple tables within a sheet, and various data types. Target specific tables using row and column indices or header values.
|
|
54
|
+
* **Comprehensive DataFrame Manipulation:** A rich set of methods for cleaning, transforming, and manipulating pandas DataFrames. Perform operations like removing rows/columns based on criteria, filling missing values using various strategies (forward fill, left fill), merging and splitting columns, changing data types, and more.
|
|
55
|
+
* **Flexible Header and Index Management:** Gain fine-grained control over headers and indices. Set rows as headers, add headers as rows, reset indices, and drop unnecessary index levels.
|
|
56
|
+
* **Streamlined String and Value Processing:** Simplify string manipulation with methods for stripping whitespace, replacing values (including using regular expressions), and changing the case of strings (uppercase, lowercase, title case).
|
|
57
|
+
* **Convenient Utility Functions:** Includes helper functions for common tasks such as transposing DataFrames, merging multiple DataFrames, extracting filenames from paths, and getting sheet names without fully opening the Excel file.
|
|
58
|
+
* **Seamless Integration with `sspipe`:** Designed for smooth integration with the `sspipe` library (https://github.com/sspipe/sspipe). Chain method calls in a readable and functional style, creating elegant and concise data pipelines.
|
|
59
|
+
|
|
60
|
+
**Why use `AdvancedExcel`?**
|
|
61
|
+
|
|
62
|
+
Working with Excel files in Python often involves repetitive and verbose code. `AdvancedExcel` encapsulates these common tasks into reusable methods, promoting code reusability, readability, and maintainability. It handles many of the edge cases and complexities of Excel data, allowing you to focus on the core logic of your data analysis. The integration with `sspipe` further enhances code clarity by enabling a functional programming paradigm.
|
|
63
|
+
|
|
64
|
+
# Getting Started
|
|
65
|
+
|
|
66
|
+
Create a Virtual Environment.
|
|
67
|
+
|
|
68
|
+
`py -m venv venv`
|
|
69
|
+
|
|
70
|
+
Activate the virtual environment
|
|
71
|
+
|
|
72
|
+
`venv\Scripts\activate.bat` (in Linux `source ./venv/bin/activate` )
|
|
73
|
+
|
|
74
|
+
Update PIP
|
|
75
|
+
|
|
76
|
+
`py -m pip install -U pip`
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
## Installation
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install advanced-excel
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
For the functional piping examples using `sspipe`:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install "advanced-excel[sspipe]"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For development:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install -e ".[dev,sspipe]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Dependencies
|
|
98
|
+
|
|
99
|
+
Core dependencies (pandas, openpyxl, etc.) are declared in `pyproject.toml`. Optional extras are available for `sspipe` support and development.
|
|
100
|
+
|
|
101
|
+
## Usage
|
|
102
|
+
|
|
103
|
+
### Basic usage
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from advanced_excel import AdvancedExcel
|
|
107
|
+
|
|
108
|
+
data = AdvancedExcel("simple_excel.xlsx")
|
|
109
|
+
sheet = data.get_sheet("Data")
|
|
110
|
+
result = data.get_next_value(sheet, "Study Number")
|
|
111
|
+
print(result)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
> Note: The recommended import is `from advanced_excel import AdvancedExcel` (this is the stable API as of v2.0).
|
|
115
|
+
|
|
116
|
+
### Example using sspipe for complex chaining (requires `pip install "advanced-excel[sspipe]"`):
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from sspipe import p
|
|
120
|
+
import pandas as pd
|
|
121
|
+
from advanced_excel import AdvancedExcel
|
|
122
|
+
|
|
123
|
+
excel_processor = AdvancedExcel("complex_excel.xlsx")
|
|
124
|
+
|
|
125
|
+
raw = excel_processor.get_sheet("Raw data")
|
|
126
|
+
sheet1 = excel_processor.get_sheet("Sheet1")
|
|
127
|
+
sheet2 = excel_processor.get_sheet("Sheet2")
|
|
128
|
+
|
|
129
|
+
final_df_chained = (
|
|
130
|
+
raw
|
|
131
|
+
| p(excel_processor.remove_empty_rows)
|
|
132
|
+
| p(excel_processor.set_column_to_numeric, columnName="Quantity")
|
|
133
|
+
| p(pd.merge,
|
|
134
|
+
sheet1
|
|
135
|
+
| p(excel_processor.remove_empty_rows)
|
|
136
|
+
| p(excel_processor.rename_headers, {"OldName": "CategoryName"}),
|
|
137
|
+
on="ProductID", how="left")
|
|
138
|
+
| p(pd.merge,
|
|
139
|
+
sheet2
|
|
140
|
+
| p(excel_processor.remove_empty_rows)
|
|
141
|
+
| p(excel_processor.strip_all)
|
|
142
|
+
| p(excel_processor.remove_duplicates),
|
|
143
|
+
left_index=True, right_index=True)
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
### Another example of usage on a complex Excel (multiple tables per sheet):
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from advanced_excel import AdvancedExcel
|
|
150
|
+
from sspipe import p # optional, for nice chaining
|
|
151
|
+
|
|
152
|
+
data = AdvancedExcel("studies_excel.xlsx")
|
|
153
|
+
results = data.get_sheet("Raw data") | p(data.get_dataframe_blocks_by_key_name, "Batch Number")
|
|
154
|
+
|
|
155
|
+
all_tables = []
|
|
156
|
+
for dblock in results:
|
|
157
|
+
study_num = data.get_next_value(dblock, "Study No")
|
|
158
|
+
plan_num = data.get_next_value(dblock, "Plan No")
|
|
159
|
+
|
|
160
|
+
table = (
|
|
161
|
+
dblock
|
|
162
|
+
| p(data.get_dataframe_blocks_by_key_name, "Batch Number")
|
|
163
|
+
| (lambda blocks: blocks[0] if blocks else pd.DataFrame())
|
|
164
|
+
| p(data.replace_nan_strings)
|
|
165
|
+
| p(data.remove_empty_rows)
|
|
166
|
+
| p(data.merge_rows, 0, 2)
|
|
167
|
+
| p(data.set_row_as_header, 0)
|
|
168
|
+
| p(data.remove_rows_if_unique_value_on_bottom, 2)
|
|
169
|
+
| p(data.merge_columns, 1, 2, '_', 'SAMPLE_REPLICATE')
|
|
170
|
+
| p(data.remove_column_by_name, "Sample #_Replicat #")
|
|
171
|
+
)
|
|
172
|
+
all_tables.append(table)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from advanced_excel import AdvancedExcel
|
|
177
|
+
from sspipe import p
|
|
178
|
+
import pandas as pd
|
|
179
|
+
|
|
180
|
+
data = AdvancedExcel("studies_excel.xlsx")
|
|
181
|
+
|
|
182
|
+
columns_to_rename = {"PLATEFORM": "PLATFORM"}
|
|
183
|
+
columns_not_to_clean = ["PARAMETER"]
|
|
184
|
+
|
|
185
|
+
results = (
|
|
186
|
+
data.get_sheet("data_raw")
|
|
187
|
+
| p(data.case_headers)
|
|
188
|
+
| p(data.rename_headers, columns_to_rename)
|
|
189
|
+
| p(data.strip_all)
|
|
190
|
+
| p(data.remove_column_by_name, r"^T\d+$")
|
|
191
|
+
| p(data.remove_column_by_name, r"^T\d+\.\d+$")
|
|
192
|
+
| p(data.remove_all_crlf, ";")
|
|
193
|
+
| p(data.replace_nan_strings)
|
|
194
|
+
| p(data.replace_spaces_by_separator, ";", [], columns_not_to_clean)
|
|
195
|
+
| p(data.replace_all, ",;", ";", [], columns_not_to_clean)
|
|
196
|
+
| p(data.replace_all, "tbd", "TBD", [], [])
|
|
197
|
+
# | p(data.replace_by_dictionary, unit_map, "UNIT")
|
|
198
|
+
| p(data.case_column_values, "PARAMETER", title=True)
|
|
199
|
+
# | p(data.replace_by_dictionary, variable_map, "PARAMETER")
|
|
200
|
+
)
|
|
201
|
+
```
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
## AdvancedExcel: Simplifying Excel Data Processing in Python
|
|
2
|
+
|
|
3
|
+
**v2.0+ note:** The public API now uses PEP 8 `snake_case` method names (e.g. `get_sheet`, `remove_empty_rows`, `get_dataframe_blocks_by_key_name`). This is a breaking change from v1.x.
|
|
4
|
+
|
|
5
|
+
The `AdvancedExcel` class provides a powerful and convenient way to interact with Excel files in Python. It streamlines common Excel data processing tasks, from reading and extracting data to cleaning, transforming, and manipulating it using pandas DataFrames. This class aims to reduce the boilerplate code often associated with Excel handling, making your data analysis workflows more efficient and readable.
|
|
6
|
+
|
|
7
|
+
**Key Strengths:**
|
|
8
|
+
|
|
9
|
+
* **Effortless Data Extraction:** Easily read data from Excel sheets, including handling complexities like merged cells, multiple tables within a sheet, and various data types. Target specific tables using row and column indices or header values.
|
|
10
|
+
* **Comprehensive DataFrame Manipulation:** A rich set of methods for cleaning, transforming, and manipulating pandas DataFrames. Perform operations like removing rows/columns based on criteria, filling missing values using various strategies (forward fill, left fill), merging and splitting columns, changing data types, and more.
|
|
11
|
+
* **Flexible Header and Index Management:** Gain fine-grained control over headers and indices. Set rows as headers, add headers as rows, reset indices, and drop unnecessary index levels.
|
|
12
|
+
* **Streamlined String and Value Processing:** Simplify string manipulation with methods for stripping whitespace, replacing values (including using regular expressions), and changing the case of strings (uppercase, lowercase, title case).
|
|
13
|
+
* **Convenient Utility Functions:** Includes helper functions for common tasks such as transposing DataFrames, merging multiple DataFrames, extracting filenames from paths, and getting sheet names without fully opening the Excel file.
|
|
14
|
+
* **Seamless Integration with `sspipe`:** Designed for smooth integration with the `sspipe` library (https://github.com/sspipe/sspipe). Chain method calls in a readable and functional style, creating elegant and concise data pipelines.
|
|
15
|
+
|
|
16
|
+
**Why use `AdvancedExcel`?**
|
|
17
|
+
|
|
18
|
+
Working with Excel files in Python often involves repetitive and verbose code. `AdvancedExcel` encapsulates these common tasks into reusable methods, promoting code reusability, readability, and maintainability. It handles many of the edge cases and complexities of Excel data, allowing you to focus on the core logic of your data analysis. The integration with `sspipe` further enhances code clarity by enabling a functional programming paradigm.
|
|
19
|
+
|
|
20
|
+
# Getting Started
|
|
21
|
+
|
|
22
|
+
Create a Virtual Environment.
|
|
23
|
+
|
|
24
|
+
`py -m venv venv`
|
|
25
|
+
|
|
26
|
+
Activate the virtual environment
|
|
27
|
+
|
|
28
|
+
`venv\Scripts\activate.bat` (on Linux/macOS: `source ./venv/bin/activate`)
|
|
29
|
+
|
|
30
|
+
Update PIP
|
|
31
|
+
|
|
32
|
+
`py -m pip install -U pip`
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install advanced-excel
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For the functional piping examples using `sspipe`:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "advanced-excel[sspipe]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
For development:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e ".[dev,sspipe]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Dependencies
|
|
54
|
+
|
|
55
|
+
Core dependencies (pandas, openpyxl, etc.) are declared in `pyproject.toml`. Optional extras are available for `sspipe` support and development.
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
### Basic usage
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from advanced_excel import AdvancedExcel
|
|
63
|
+
|
|
64
|
+
data = AdvancedExcel("simple_excel.xlsx")
|
|
65
|
+
sheet = data.get_sheet("Data")
|
|
66
|
+
result = data.get_next_value(sheet, "Study Number")
|
|
67
|
+
print(result)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
> Note: The recommended import is `from advanced_excel import AdvancedExcel` (this is the stable API as of v2.0).
|
|
71
|
+
|
|
72
|
+
### Example using sspipe for complex chaining (requires `pip install "advanced-excel[sspipe]"`):
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from sspipe import p
|
|
76
|
+
import pandas as pd
|
|
77
|
+
from advanced_excel import AdvancedExcel
|
|
78
|
+
|
|
79
|
+
excel_processor = AdvancedExcel("complex_excel.xlsx")
|
|
80
|
+
|
|
81
|
+
raw = excel_processor.get_sheet("Raw data")
|
|
82
|
+
sheet1 = excel_processor.get_sheet("Sheet1")
|
|
83
|
+
sheet2 = excel_processor.get_sheet("Sheet2")
|
|
84
|
+
|
|
85
|
+
final_df_chained = (
|
|
86
|
+
raw
|
|
87
|
+
| p(excel_processor.remove_empty_rows)
|
|
88
|
+
| p(excel_processor.set_column_to_numeric, columnName="Quantity")
|
|
89
|
+
| p(pd.merge,
|
|
90
|
+
sheet1
|
|
91
|
+
| p(excel_processor.remove_empty_rows)
|
|
92
|
+
| p(excel_processor.rename_headers, {"OldName": "CategoryName"}),
|
|
93
|
+
on="ProductID", how="left")
|
|
94
|
+
| p(pd.merge,
|
|
95
|
+
sheet2
|
|
96
|
+
| p(excel_processor.remove_empty_rows)
|
|
97
|
+
| p(excel_processor.strip_all)
|
|
98
|
+
| p(excel_processor.remove_duplicates),
|
|
99
|
+
left_index=True, right_index=True)
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
### Another usage example on a complex Excel file (multiple tables per sheet):
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from advanced_excel import AdvancedExcel
|
|
106
|
+
from sspipe import p # optional, for nice chaining
|
|
107
|
+
|
|
108
|
+
data = AdvancedExcel("studies_excel.xlsx")
|
|
109
|
+
results = data.get_sheet("Raw data") | p(data.get_dataframe_blocks_by_key_name, "Batch Number")
|
|
110
|
+
|
|
111
|
+
all_tables = []
|
|
112
|
+
for dblock in results:
|
|
113
|
+
study_num = data.get_next_value(dblock, "Study No")
|
|
114
|
+
plan_num = data.get_next_value(dblock, "Plan No")
|
|
115
|
+
|
|
116
|
+
table = (
|
|
117
|
+
dblock
|
|
118
|
+
| p(data.get_dataframe_blocks_by_key_name, "Batch Number")
|
|
119
|
+
| (lambda blocks: blocks[0] if blocks else pd.DataFrame())
|
|
120
|
+
| p(data.replace_nan_strings)
|
|
121
|
+
| p(data.remove_empty_rows)
|
|
122
|
+
| p(data.merge_rows, 0, 2)
|
|
123
|
+
| p(data.set_row_as_header, 0)
|
|
124
|
+
| p(data.remove_rows_if_unique_value_on_bottom, 2)
|
|
125
|
+
| p(data.merge_columns, 1, 2, '_', 'SAMPLE_REPLICATE')
|
|
126
|
+
| p(data.remove_column_by_name, "Sample #_Replicat #")
|
|
127
|
+
)
|
|
128
|
+
all_tables.append(table)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from advanced_excel import AdvancedExcel
|
|
133
|
+
from sspipe import p
|
|
134
|
+
import pandas as pd
|
|
135
|
+
|
|
136
|
+
data = AdvancedExcel("studies_excel.xlsx")
|
|
137
|
+
|
|
138
|
+
columns_to_rename = {"PLATEFORM": "PLATFORM"}
|
|
139
|
+
columns_not_to_clean = ["PARAMETER"]
|
|
140
|
+
|
|
141
|
+
results = (
|
|
142
|
+
data.get_sheet("data_raw")
|
|
143
|
+
| p(data.case_headers)
|
|
144
|
+
| p(data.rename_headers, columns_to_rename)
|
|
145
|
+
| p(data.strip_all)
|
|
146
|
+
| p(data.remove_column_by_name, r"^T\d+$")
|
|
147
|
+
| p(data.remove_column_by_name, r"^T\d+\.\d+$")
|
|
148
|
+
| p(data.remove_all_crlf, ";")
|
|
149
|
+
| p(data.replace_nan_strings)
|
|
150
|
+
| p(data.replace_spaces_by_separator, ";", [], columns_not_to_clean)
|
|
151
|
+
| p(data.replace_all, ",;", ";", [], columns_not_to_clean)
|
|
152
|
+
| p(data.replace_all, "tbd", "TBD", [], [])
|
|
153
|
+
# | p(data.replace_by_dictionary, unit_map, "UNIT")
|
|
154
|
+
| p(data.case_column_values, "PARAMETER", title=True)
|
|
155
|
+
# | p(data.replace_by_dictionary, variable_map, "PARAMETER")
|
|
156
|
+
)
|
|
157
|
+
```
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "advanced-excel"
|
|
7
|
+
version = "2.0.0"
|
|
8
|
+
description = "Advanced Excel data processing and manipulation library for Python"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Antonio Castellon", email = "antonio@castellon.ch"}
|
|
13
|
+
]
|
|
14
|
+
maintainers = [
|
|
15
|
+
{name = "Antonio Castellon", email = "antonio@castellon.ch"}
|
|
16
|
+
]
|
|
17
|
+
keywords = ["excel", "pandas", "data-processing", "xlsx", "data-cleaning", "openpyxl"]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 4 - Beta",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Intended Audience :: Science/Research",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"Programming Language :: Python :: 3.13",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
30
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
31
|
+
"Topic :: Office/Business :: Financial :: Spreadsheet",
|
|
32
|
+
]
|
|
33
|
+
requires-python = ">=3.10"
|
|
34
|
+
dependencies = [
|
|
35
|
+
"pandas>=2.1.0",
|
|
36
|
+
"openpyxl>=3.1.2",
|
|
37
|
+
"numpy>=1.25.2",
|
|
38
|
+
"pyxlsb>=1.0.10",
|
|
39
|
+
"xlrd>=2.0.1",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
sspipe = ["sspipe>=0.1.17"]
|
|
44
|
+
legacy = [] # xlrd is already in core for .xls support
|
|
45
|
+
dev = [
|
|
46
|
+
"pytest>=7.0",
|
|
47
|
+
"pytest-cov",
|
|
48
|
+
"ruff",
|
|
49
|
+
"black",
|
|
50
|
+
"mypy",
|
|
51
|
+
"pandas-stubs",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
[project.urls]
|
|
55
|
+
Homepage = "https://github.com/antonio-castellon/AdvancedExcel"
|
|
56
|
+
Repository = "https://github.com/antonio-castellon/AdvancedExcel"
|
|
57
|
+
Issues = "https://github.com/antonio-castellon/AdvancedExcel/issues"
|
|
58
|
+
|
|
59
|
+
[tool.setuptools.packages.find]
|
|
60
|
+
where = ["src"]
|
|
61
|
+
include = ["advanced_excel*"] # Standard lowercase package name for Python/PyPI compatibility
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
[tool.setuptools.package-data]
|
|
65
|
+
"*" = ["py.typed"]
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
testpaths = ["tests"]
|
|
69
|
+
python_files = ["test_*.py"]
|
|
70
|
+
addopts = "-q --tb=short"
|
|
71
|
+
|
|
72
|
+
[tool.ruff]
|
|
73
|
+
line-length = 100
|
|
74
|
+
target-version = "py310"
|
|
75
|
+
select = ["E", "F", "I", "UP", "B", "SIM", "C4"]
|
|
76
|
+
ignore = ["E501", "B008"] # E501: allow long lines in some cases; B008: allow function calls in argument defaults
|
|
77
|
+
|
|
78
|
+
[tool.ruff.format]
|
|
79
|
+
quote-style = "double"
|
|
80
|
+
|
|
81
|
+
[tool.black]
|
|
82
|
+
line-length = 100
|
|
83
|
+
target-version = ["py310"]
|
|
84
|
+
|
|
85
|
+
[tool.mypy]
|
|
86
|
+
python_version = "3.10"
|
|
87
|
+
warn_return_any = true
|
|
88
|
+
warn_unused_configs = true
|
|
89
|
+
disallow_untyped_defs = false # start permissive, tighten later
|
|
90
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
advanced_excel package (standard Python package name).
|
|
3
|
+
|
|
4
|
+
This makes the library fully importable and compatible after `pip install -e .`
|
|
5
|
+
or `pip install advanced-excel`.
|
|
6
|
+
|
|
7
|
+
Recommended import:
|
|
8
|
+
from advanced_excel import AdvancedExcel
|
|
9
|
+
|
|
10
|
+
The main class is AdvancedExcel (CamelCase is standard for classes).
|
|
11
|
+
Implementation is in core.py (the thin class + mixin imports) and the various
|
|
12
|
+
*Mixin modules.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .core import AdvancedExcel
|
|
16
|
+
from .core import ROW_INDEX, COL_INDEX, DATA, __version__
|
|
17
|
+
|
|
18
|
+
__all__ = ["AdvancedExcel", "ROW_INDEX", "COL_INDEX", "DATA", "__version__"]
|
|
19
|
+
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
class BlockTableMixin:
    """
    Mixin for detecting and extracting multiple tables / blocks / entities
    inside a single sheet (repeated key sections, "Batch Number" style blocks, etc.).
    This is the core of the "advanced" multi-table Excel handling.

    The host class must provide ``get_all_rows_from_key(df, key_name)``; it is
    called by ``get_dataframe_blocks_by_key_name`` and is not defined here.
    """

    def get_all_tables(self, _sheet, mincol=3):
        """
        Identifies and extracts table-like structures from a sheet.

        A table is a run of consecutive rows that each hold at least `mincol`
        non-NaN values. A row with fewer valid values closes the current run,
        which is then extracted with `_getTable`. A run that reaches the end of
        the sheet is extracted as well.

        Rows are addressed by position, so the sheet does not need a default
        0..n-1 index.

        Args:
            _sheet (pandas.DataFrame): The sheet to search for tables within.
            mincol (int, optional): The minimum number of non-NaN values required
                for a row to be considered part of a table. Defaults to 3.

        Returns:
            list: A list of pandas DataFrames, where each DataFrame represents
                a table-like structure found in the sheet. Returns an empty
                list if no tables are found.
        """
        # Number of non-NaN cells in each row, in positional order.
        valid_counts = _sheet.notna().sum(axis=1).tolist()

        all_tables = []
        table_start = None  # position of the first row of the open run, if any

        for position, valid in enumerate(valid_counts):
            if valid >= mincol:
                if table_start is None:
                    table_start = position
            elif table_start is not None:
                all_tables.append(self._getTable(_sheet, table_start, position))
                table_start = None

        # A run still open at the end of the sheet is a table too.
        if table_start is not None:
            all_tables.append(self._getTable(_sheet, table_start, len(valid_counts)))

        return all_tables

    def get_dataframe_blocks_by_key_name(self, df, key_name):
        """
        Splits a DataFrame into row blocks, one per occurrence of a key name.

        Intended for sheets that stack several blocks of information, each one
        starting with the same repeated `key_name` cell. The start rows are
        obtained from `self.get_all_rows_from_key(df, key_name)`; each returned
        item is read through its "row_index" entry, which is used as a
        positional row offset. Every block runs from its start row up to (but
        not including) the next start row, the last block running to the end
        of `df`. Columns and rows that are entirely NaN are removed from each
        block and its index is reset.

        Args:
            df (pandas.DataFrame): The DataFrame to split.
            key_name (str): The key that marks the beginning of each block.

        Returns:
            list: A list of pandas DataFrames, one per block. Returns an empty
                list if no start row is reported for `key_name`.
        """
        matches = self.get_all_rows_from_key(df, key_name)
        starts = [match["row_index"] for match in matches]
        # Each block ends where the next one starts; the last ends with the frame.
        ends = starts[1:] + [df.shape[0]]

        blocks = []
        for start, end in zip(starts, ends):
            block = df.iloc[start:end].dropna(axis=1, how="all").dropna(axis=0, how="all")
            blocks.append(block.reset_index(drop=True))

        return blocks

    def get_dataframe_blocks_by_key_column(self, df, key_column):
        """
        Splits a DataFrame into column blocks, one per occurrence of a column name.

        Intended for sheets that place several blocks side by side, each one
        starting with a column named `key_column`. Every block runs from one
        matching column up to (but not including) the next matching column, the
        last block running to the final column of `df`. Rows and columns that
        are entirely NaN are removed from each block and its index is reset.

        Args:
            df (pandas.DataFrame): The DataFrame to split.
            key_column (str): The column name that marks the beginning of each block.

        Returns:
            list: A list of pandas DataFrames, one per block. Returns an empty
                list if no column is named `key_column`.
        """
        starts = self._getAllColumnsFromKey(df, key_column)
        # Each block ends where the next one starts; the last ends with the frame.
        ends = starts[1:] + [df.shape[1]]

        blocks = []
        for start, end in zip(starts, ends):
            block = df.iloc[:, start:end].dropna(axis=0, how="all").dropna(axis=1, how="all")
            blocks.append(block.reset_index(drop=True))

        return blocks

    def _getAllColumnsFromKey(self, df, key_column):
        """
        Returns a list of indices for columns matching the specified key column name.

        Args:
            df (pandas.DataFrame): The DataFrame to search.
            key_column (str): The name of the column to search for.

        Returns:
            list: Integer positions of the columns whose name equals
                `key_column`, in left-to-right order. Returns an empty list
                if no matching columns are found.
        """
        return [position for position, column in enumerate(df.columns) if column == key_column]

    def _getTable(self, _sheet, init_table, end_table):
        """
        Extracts a table (DataFrame) from a sheet within specified row boundaries.

        The rows between `init_table` and `end_table` (exclusive) are sliced by
        position, columns and rows that are entirely NaN are removed, and the
        first remaining row becomes the header. Header values are converted to
        `str`, stripped, upper-cased and have spaces replaced by underscores.
        The header row is then removed and the index is reset.

        Args:
            _sheet (pandas.DataFrame): The sheet (DataFrame) to extract the table from.
            init_table (int): The starting row position (inclusive).
            end_table (int): The ending row position (exclusive).

        Returns:
            pandas.DataFrame: A new DataFrame representing the extracted table;
                the input sheet is not modified. If nothing is left after the
                NaN clean-up, the empty frame is returned as-is.
        """
        table = (
            _sheet.iloc[init_table:end_table].dropna(axis=1, how="all").dropna(axis=0, how="all")
        )  # Extract and clean

        # Nothing left to promote to a header.
        if table.empty:
            return table.reset_index(drop=True)

        # Set the header from the first row after cleaning and formatting:
        table.columns = [str(s).strip().upper().replace(" ", "_") for s in table.iloc[0]]

        # Drop the header row by position: the slice keeps the sheet's original
        # labels, so a label-based drop(0) only works for a table at sheet row 0.
        return table.iloc[1:].reset_index(drop=True)

    def _headerColumnsAreEmpty(self, columns):
        """
        Checks if all column names start with "Unnamed:".

        Non-string labels are compared through their `str()` form, so numeric
        labels simply count as not "Unnamed:".

        Args:
            columns (pandas.Series or pandas.Index): The column names to check.

        Returns:
            bool: True if all column names start with "Unnamed:", False otherwise.
                An empty collection yields True.
        """
        return all(str(name).startswith("Unnamed:") for name in columns)
|