WBClean-XUM 1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wbclean_xum-1.0/PKG-INFO +13 -0
- wbclean_xum-1.0/README.md +0 -0
- wbclean_xum-1.0/pyproject.toml +29 -0
- wbclean_xum-1.0/setup.cfg +4 -0
- wbclean_xum-1.0/src/WBClean_XUM/WBClean_XUM.py +255 -0
- wbclean_xum-1.0/src/WBClean_XUM/__init__.py +0 -0
- wbclean_xum-1.0/src/WBClean_XUM.egg-info/PKG-INFO +13 -0
- wbclean_xum-1.0/src/WBClean_XUM.egg-info/SOURCES.txt +9 -0
- wbclean_xum-1.0/src/WBClean_XUM.egg-info/dependency_links.txt +1 -0
- wbclean_xum-1.0/src/WBClean_XUM.egg-info/requires.txt +6 -0
- wbclean_xum-1.0/src/WBClean_XUM.egg-info/top_level.txt +1 -0
wbclean_xum-1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: WBClean_XUM
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community.
|
|
5
|
+
Author: Muthu Krishnan
|
|
6
|
+
License: GNU
|
|
7
|
+
Project-URL: Repository, https://github.com/nmkrishnan-droid/WBClean_XUM.git
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
12
|
+
Requires-Dist: requests>=2.31.0
|
|
13
|
+
Requires-Dist: pywin32>=306; platform_system == "Windows"
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "WBClean_XUM"
|
|
7
|
+
version = "1.0"
|
|
8
|
+
description = "A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "GNU" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Muthu Krishnan" }
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pandas>=2.0",
|
|
17
|
+
"openpyxl>=3.1.0",
|
|
18
|
+
"requests>=2.31.0",
|
|
19
|
+
"pywin32>=306; platform_system=='Windows'"
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Repository = "https://github.com/nmkrishnan-droid/WBClean_XUM.git"
|
|
24
|
+
|
|
25
|
+
[tool.setuptools]
|
|
26
|
+
package-dir = {"" = "src"}
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.packages.find]
|
|
29
|
+
where = ["src"]
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from openpyxl import load_workbook, Workbook
|
|
4
|
+
from collections import Counter
|
|
5
|
+
import requests
|
|
6
|
+
import win32com.client as win32
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WBClean_XUM:
    """Helpers for turning a messy Excel sheet into a clean table.

    The entry point is :meth:`XUM_Clean`; the other methods are the building
    blocks it uses (sheet transposition, row/column deletion, regex matching,
    an optional LLM call, and ``.xls`` -> ``.xlsx`` conversion through Excel).
    """

    def XUM_TransposeSheet(self, src_path, src_sheet_name=None, dst_sheet_name=None):
        """Return the used range of a worksheet, transposed, as a list of lists.

        Args:
            src_path: Path of the workbook to open with openpyxl.
            src_sheet_name: Sheet to read; the first worksheet when falsy.
            dst_sheet_name: Unused. Kept so existing callers that pass it keep
                working; the transposed data is returned, never written.

        Returns:
            One inner list per source column, each holding that column's cell
            values from the first used row to the last (blanks are ``None``).
        """
        # data_only=False: formula cells yield the formula text, not a cached result.
        wb = load_workbook(src_path, data_only=False)
        ws = wb[src_sheet_name] if src_sheet_name else wb.worksheets[0]

        # Bounds of the used rectangle as reported by openpyxl.
        min_row, max_row = ws.min_row, ws.max_row
        min_col, max_col = ws.min_column, ws.max_column

        # Read every cell of the rectangle (including blanks) row by row.
        grid = [
            [ws.cell(row=r, column=c).value for c in range(min_col, max_col + 1)]
            for r in range(min_row, max_row + 1)
        ]

        # zip(*grid) swaps rows and columns; convert the tuples to lists.
        return [list(column) for column in zip(*grid)]

    def XUM_LLMFormat(self, prompt_ReqFeildString, prompt_ReqJSONOutputString, prompt_SampleData, groqModel, Key,
                      APIUrl,
                      temperature=0, maxTokens=512, contentType="application/json", prompt_FullCustom=False,
                      prompt_Full=None):
        """Send a column-mapping prompt to a chat-completions endpoint.

        Args:
            prompt_ReqFeildString: Text inserted under "REQUIRED FIELDS".
            prompt_ReqJSONOutputString: Text inserted under "EXPECTED JSON FORMAT".
            prompt_SampleData: Value inserted under "SAMPLE DATA".
            groqModel: Model name sent in the payload.
            Key: API key, sent as a Bearer token.
            APIUrl: Endpoint the request is POSTed to.
            temperature: Sampling temperature sent in the payload.
            maxTokens: ``max_tokens`` value sent in the payload.
            contentType: Value of the ``Content-Type`` header.
            prompt_FullCustom: When true, ``prompt_Full`` replaces the built-in
                prompt entirely.
            prompt_Full: The custom prompt used when ``prompt_FullCustom`` is true.

        Returns:
            The text of the first choice's message, exactly as returned by the
            API. It is NOT parsed as JSON.

        Raises:
            ValueError: ``prompt_FullCustom`` is true but ``prompt_Full`` is None.
            RuntimeError: The endpoint answered with a status other than 200.
        """
        if prompt_FullCustom:
            if prompt_Full is None:
                raise ValueError("prompt_Full must be provided when prompt_FullCustom is True")
            prompt = prompt_Full
        else:
            prompt = f"""
You are a data-mapping assistant.

The SAMPLE DATA is given in /{{Key:value}} format where Key is the column Name and the value is sample values for that column

Map the SAMPLE DATA KEYS to the REQUIRED fields below.

REQUIRED FIELDS:
{prompt_ReqFeildString}

SAMPLE DATA:
{prompt_SampleData}

RULES:
- Choose best column if multiple match
- Use null if none apply
- alt_cols must be a JSON array
- RETURN ONLY VALID JSON. NO TEXT.

EXPECTED JSON FORMAT:
{prompt_ReqJSONOutputString}

Return ONLY a JSON object. Do not use markdown, do not use code fences, do not add commentary.

"""

        payload = {
            "model": groqModel,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "max_tokens": maxTokens
        }

        r = requests.post(
            APIUrl,
            headers={
                "Authorization": f"Bearer {Key}",
                "Content-Type": contentType,
            },
            json=payload,
            timeout=60,
        )

        if r.status_code != 200:
            raise RuntimeError(
                f"GROQ ERROR {r.status_code}: {r.text}"
            )

        # Returned as raw text: XUM_Clean relies on substring checks against it.
        return r.json()["choices"][0]["message"]["content"]

    def XUM_DeleteRows(self, Table2dArray, rowIndexList):
        """Return a new table without the rows at the given 0-based indices."""
        rows_to_drop = set(rowIndexList)
        return [row for idx, row in enumerate(Table2dArray) if idx not in rows_to_drop]

    def XUM_DeleteColumns(self, Table2dArray, colIndexList):
        """Return a new table without the columns at the given 1-based indices."""
        cols_to_drop = set(colIndexList)
        return [
            [val for idx, val in enumerate(row, start=1) if idx not in cols_to_drop]
            for row in Table2dArray
        ]

    def XUM_TextPresenceRegex(self, x, pattern):
        """Return True when the compiled ``pattern`` is found in ``str(x)``.

        ``None`` never matches.
        """
        if x is None:
            return False
        return pattern.search(str(x)) is not None

    def XUM_XLSConversion(self, xlsPath):
        """Convert a workbook to ``.xlsx`` through the Excel COM interface.

        The converted file is written next to the source, with the extension
        replaced by ``.xlsx``.

        Args:
            xlsPath: Path of the workbook to convert.

        Returns:
            Absolute path of the ``.xlsx`` file that was written.

        Raises:
            TypeError: ``xlsPath`` is None.
        """
        if xlsPath is None:
            raise TypeError("xlsPath must be a path to a workbook, not None")

        xls_path = os.path.abspath(xlsPath)
        root, _ = os.path.splitext(xls_path)
        xlsxPath = root + ".xlsx"

        excel = win32.Dispatch("Excel.Application")
        try:
            excel.Visible = False
            excel.DisplayAlerts = False

            wb = excel.Workbooks.Open(xls_path)
            try:
                wb.SaveAs(xlsxPath, FileFormat=51)  # 51 = xlOpenXMLWorkbook (.xlsx)
            finally:
                wb.Close(False)
        finally:
            # Always release the Excel instance, even when open/save fails.
            excel.Quit()

        return xlsxPath

    def XUM_Clean(self, filePath, pattern, returnDF=True, remove_none=True, getImpFeatures=False,
                  prompt_ReqFeildString=None, prompt_ReqJSONOutputString=None, Key=None,
                  sheetName=None,
                  destinationSheet="WBClean_XUM",
                  APIUrl="https://api.groq.com/openai/v1/chat/completions", groqModel="llama-3.3-70b-versatile",
                  contentType="application/json", temperature=0, maxTokens=512):
        """Strip leading junk rows from a sheet and return it as a DataFrame.

        The sheet is read column by column. In each column, blank cells are
        noted until the first cell matching ``pattern`` (a header cell) is
        reached. Row positions above the last header found that are blank in
        more than one column are removed. Optionally, an LLM is asked which
        columns matter and the others are dropped.

        Args:
            filePath: Workbook to read.
            pattern: Regex alternation identifying header cells; matched
                case-insensitively between word boundaries.
            returnDF: Return the DataFrame when true; otherwise write it to
                ``"{destinationSheet}.xlsx"`` and return None.
            remove_none: Drop rows whose values are all None.
            getImpFeatures: Ask the LLM which columns to keep.
            prompt_ReqFeildString, prompt_ReqJSONOutputString, Key, APIUrl,
            groqModel, contentType, temperature, maxTokens: Forwarded to
                :meth:`XUM_LLMFormat`.
            sheetName: Sheet to read; the first worksheet when None.
            destinationSheet: Forwarded to :meth:`XUM_TransposeSheet` and used
                as the output file stem.

        Returns:
            A pandas DataFrame, or None when ``returnDF`` is false.

        Raises:
            ValueError: No cell in the sheet matches ``pattern``.
        """
        transposedList = self.XUM_TransposeSheet(
            src_path=filePath,
            src_sheet_name=sheetName,
            dst_sheet_name=destinationSheet
        )

        # Compile once instead of once per cell.
        header_re = re.compile(rf'\b(?:{pattern})\b', re.I)

        blank_positions = []   # 1-based positions of blanks seen before each header
        header_positions = []  # 1-based position of the header cell in each column

        for column in transposedList:
            for position, value in enumerate(column, start=1):
                if value is None:
                    blank_positions.append(position)
                elif self.XUM_TextPresenceRegex(value, header_re):
                    header_positions.append(position)
                    break  # anything below the header is data

        if not header_positions:
            raise ValueError(f"No cell in the sheet matches the header pattern {pattern!r}")

        # Only blanks above the last header found are candidates for removal.
        last_header = header_positions[-1]
        blank_counts = Counter(p for p in blank_positions if p < last_header)

        # A position blank in more than one column is treated as a junk row.
        positions_to_drop = [pos for pos, count in blank_counts.items() if count > 1]

        newTable = self.XUM_DeleteColumns(Table2dArray=transposedList, colIndexList=positions_to_drop)

        if getImpFeatures:
            # First two values of every column: header name and one sample value.
            headerColumns = [row[0] for row in newTable if len(row) > 0]
            row1Values = [row[1] for row in newTable if len(row) > 1]
            testSample = dict(zip(headerColumns, row1Values))

            columnMapping = self.XUM_LLMFormat(
                prompt_ReqFeildString=prompt_ReqFeildString,
                prompt_ReqJSONOutputString=prompt_ReqJSONOutputString,
                prompt_SampleData=testSample,
                Key=Key,
                APIUrl=APIUrl,
                groqModel=groqModel,
                temperature=temperature,
                maxTokens=maxTokens,
                contentType=contentType
            )

            # columnMapping is the raw LLM text, so this is a substring test;
            # str() keeps non-string headers (numbers, dates) from raising TypeError.
            rowsWeDontNeed = [
                rowInd for rowInd, row in enumerate(newTable)
                if not row or row[0] is None or str(row[0]) not in columnMapping
            ]

            newTable = self.XUM_DeleteRows(Table2dArray=newTable, rowIndexList=rowsWeDontNeed)

        # Back to the sheet's original orientation: one inner list per row.
        transposeToNormal = [list(r) for r in zip(*newTable)]

        if remove_none:
            rowsToDelete = [idx for idx, row in enumerate(transposeToNormal) if all(v is None for v in row)]
            table = self.XUM_DeleteRows(Table2dArray=transposeToNormal, rowIndexList=rowsToDelete)
        else:
            table = transposeToNormal

        # First remaining row is the header; an empty table yields an empty frame.
        df = pd.DataFrame(table[1:], columns=table[0]) if table else pd.DataFrame()

        return df if returnDF else df.to_excel(f"{destinationSheet}.xlsx")
|
|
254
|
+
|
|
255
|
+
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: WBClean_XUM
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community.
|
|
5
|
+
Author: Muthu Krishnan
|
|
6
|
+
License: GNU
|
|
7
|
+
Project-URL: Repository, https://github.com/nmkrishnan-droid/WBClean_XUM.git
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
12
|
+
Requires-Dist: requests>=2.31.0
|
|
13
|
+
Requires-Dist: pywin32>=306; platform_system == "Windows"
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/WBClean_XUM/WBClean_XUM.py
|
|
4
|
+
src/WBClean_XUM/__init__.py
|
|
5
|
+
src/WBClean_XUM.egg-info/PKG-INFO
|
|
6
|
+
src/WBClean_XUM.egg-info/SOURCES.txt
|
|
7
|
+
src/WBClean_XUM.egg-info/dependency_links.txt
|
|
8
|
+
src/WBClean_XUM.egg-info/requires.txt
|
|
9
|
+
src/WBClean_XUM.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
WBClean_XUM
|