WBClean-XUM 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: WBClean_XUM
3
+ Version: 1.0
4
+ Summary: A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community.
5
+ Author: Muthu Krishnan
6
+ License: GNU
7
+ Project-URL: Repository, https://github.com/nmkrishnan-droid/WBClean_XUM.git
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: openpyxl>=3.1.0
12
+ Requires-Dist: requests>=2.31.0
13
+ Requires-Dist: pywin32>=306; platform_system == "Windows"
File without changes
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "WBClean_XUM"
7
+ version = "1.0"
8
+ description = "A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "GNU" }
12
+ authors = [
13
+ { name = "Muthu Krishnan" }
14
+ ]
15
+ dependencies = [
16
+ "pandas>=2.0",
17
+ "openpyxl>=3.1.0",
18
+ "requests>=2.31.0",
19
+ "pywin32>=306; platform_system=='Windows'"
20
+ ]
21
+
22
+ [project.urls]
23
+ Repository = "https://github.com/nmkrishnan-droid/WBClean_XUM.git"
24
+
25
+ [tool.setuptools]
26
+ package-dir = {"" = "src"}
27
+
28
+ [tool.setuptools.packages.find]
29
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,255 @@
1
import logging
import os
import re
from collections import Counter

import pandas as pd
import requests
from openpyxl import load_workbook, Workbook

try:
    import win32com.client as win32
except ImportError:
    # pywin32 is declared as a Windows-only dependency; keep the module
    # importable on platforms where it is not installed.
    win32 = None
8
+
9
+
10
_LOGGER = logging.getLogger(__name__)


class WBClean_XUM:
    """Helpers for cleaning messy Excel sheets into a tabular DataFrame.

    Tables are plain lists of lists. ``XUM_DeleteRows`` takes 0-based row
    indices while ``XUM_DeleteColumns`` takes 1-based column indices.
    """

    def XUM_TransposeSheet(self, src_path, src_sheet_name=None, dst_sheet_name=None):
        """Read a worksheet's used rectangle and return it transposed.

        Args:
            src_path: Path of the workbook to load (formulas are kept as
                formulas because the workbook is loaded with
                ``data_only=False``).
            src_sheet_name: Name of the sheet to read; the first worksheet
                is used when omitted.
            dst_sheet_name: Accepted for backward compatibility only. No
                output workbook is created any more, so it is unused.

        Returns:
            A list of lists where each inner list is one column of the
            source sheet (blank cells are ``None``).
        """
        wb = load_workbook(src_path, data_only=False)
        ws = wb[src_sheet_name] if src_sheet_name else wb.worksheets[0]

        # "Used range" bounds as Excel sees it (approximated via openpyxl).
        min_row, max_row = ws.min_row, ws.max_row
        min_col, max_col = ws.min_column, ws.max_column

        # Read every cell in the used rectangle (including blanks).
        grid = [
            [ws.cell(row=r, column=c).value for c in range(min_col, max_col + 1)]
            for r in range(min_row, max_row + 1)
        ]

        tableList = [list(column) for column in zip(*grid)]
        _LOGGER.debug("First transposed table: %s", tableList)
        return tableList

    def XUM_LLMFormat(self, prompt_ReqFeildString, prompt_ReqJSONOutputString, prompt_SampleData, groqModel, Key,
                      APIUrl,
                      temperature=0, maxTokens=512, contentType="application/json", prompt_FullCustom=False,
                      prompt_Full=None):
        """Ask a chat-completions endpoint to map sample columns to fields.

        Args:
            prompt_ReqFeildString: Text listing the required fields.
            prompt_ReqJSONOutputString: Text describing the expected JSON.
            prompt_SampleData: Sample data interpolated into the prompt.
            groqModel: Model name sent in the payload.
            Key: Bearer token for the ``Authorization`` header.
            APIUrl: Endpoint that receives the POST request.
            temperature: Sampling temperature sent in the payload.
            maxTokens: ``max_tokens`` value sent in the payload.
            contentType: Value of the ``Content-Type`` header.
            prompt_FullCustom: When true, ``prompt_Full`` replaces the
                built-in prompt entirely.
            prompt_Full: The custom prompt used with ``prompt_FullCustom``.

        Returns:
            The raw message content of the first choice, as returned by
            the endpoint (it is not parsed).

        Raises:
            ValueError: ``prompt_FullCustom`` is set without ``prompt_Full``.
            RuntimeError: The endpoint answered with a non-200 status or a
                body that lacks the expected ``choices`` structure.
        """
        if prompt_FullCustom:
            if prompt_Full is None:
                raise ValueError("prompt_Full must be provided when prompt_FullCustom is True")
            prompt = prompt_Full
        else:
            prompt = f"""
        You are a data-mapping assistant.

        The SAMPLE DATA is given in /{{Key:value}} format where Key is the column Name and the value is sample values for that column

        Map the SAMPLE DATA KEYS to the REQUIRED fields below.

        REQUIRED FIELDS:
        {prompt_ReqFeildString}

        SAMPLE DATA:
        {prompt_SampleData}

        RULES:
        - Choose best column if multiple match
        - Use null if none apply
        - alt_cols must be a JSON array
        - RETURN ONLY VALID JSON. NO TEXT.

        EXPECTED JSON FORMAT:
        {prompt_ReqJSONOutputString}

        Return ONLY a JSON object. Do not use markdown, do not use code fences, do not add commentary.

        """

        payload = {
            "model": groqModel,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "max_tokens": maxTokens,
        }

        r = requests.post(
            APIUrl,
            headers={
                "Authorization": f"Bearer {Key}",
                "Content-Type": contentType,
            },
            json=payload,
            timeout=60,
        )

        if r.status_code != 200:
            raise RuntimeError(
                f"GROQ ERROR {r.status_code}: {r.text}"
            )

        try:
            content = r.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Surface a malformed body with its raw text instead of a bare
            # KeyError/IndexError from the nested lookup.
            raise RuntimeError(
                f"Unexpected response from LLM endpoint. Raw content:\n{r.text}"
            ) from e

        return content

    def XUM_DeleteRows(self, Table2dArray, rowIndexList):
        """Return a copy of the table without the rows at the given 0-based indices."""
        rows_to_drop = set(rowIndexList)
        return [
            row for idx, row in enumerate(Table2dArray)
            if idx not in rows_to_drop
        ]

    def XUM_DeleteColumns(self, Table2dArray, colIndexList):
        """Return a copy of the table without the columns at the given 1-based indices."""
        cols_to_drop = set(colIndexList)
        return [
            [val for idx, val in enumerate(row, start=1) if idx not in cols_to_drop]
            for row in Table2dArray
        ]

    def XUM_TextPresenceRegex(self, x, pattern):
        """Report whether ``pattern`` matches anywhere in ``str(x)``.

        Args:
            x: Value to test; ``None`` never matches.
            pattern: A compiled pattern (anything with a ``search`` method)
                or a regular-expression string.

        Returns:
            ``True`` when the pattern is found, otherwise ``False``.
        """
        if x is None:
            return False
        search = getattr(pattern, "search", None)
        if search is None:
            search = re.compile(pattern).search
        return search(str(x)) is not None

    def XUM_XLSConversion(self, xlsPath, xlsxPath=None):
        """Convert an ``.xls`` workbook to ``.xlsx`` through Excel automation.

        Args:
            xlsPath: Path of the workbook to convert.
            xlsxPath: Destination path. Defaults to ``xlsPath`` with its
                extension replaced by ``.xlsx``.

        Returns:
            The absolute path of the written ``.xlsx`` file.

        Raises:
            ValueError: ``xlsPath`` is ``None``.
            RuntimeError: pywin32 is not available.
        """
        if xlsPath is None:
            raise ValueError("xlsPath must not be None")
        if win32 is None:
            raise RuntimeError("XUM_XLSConversion requires pywin32 (Windows only)")

        xls_path = os.path.abspath(xlsPath)
        if xlsxPath is None:
            root, _ = os.path.splitext(xls_path)
            xlsxPath = root + ".xlsx"
        xlsxPath = os.path.abspath(xlsxPath)

        excel = win32.Dispatch("Excel.Application")
        try:
            excel.Visible = False
            excel.DisplayAlerts = False

            wb = excel.Workbooks.Open(xls_path)
            try:
                wb.SaveAs(xlsxPath, FileFormat=51)  # 51 = xlOpenXMLWorkbook (.xlsx)
            finally:
                wb.Close(False)
        finally:
            # Always quit so a failed conversion does not leave Excel running.
            excel.Quit()

        return xlsxPath

    def XUM_Clean(self, filePath, pattern, returnDF=True, remove_none=True, getImpFeatures=False,
                  prompt_ReqFeildString=None, prompt_ReqJSONOutputString=None, Key=None,
                  sheetName=None,
                  destinationSheet="WBClean_XUM",
                  APIUrl="https://api.groq.com/openai/v1/chat/completions", groqModel="llama-3.3-70b-versatile",
                  contentType="application/json", temperature=0, maxTokens=512):
        """Strip leading junk rows from a sheet and return it as a table.

        The sheet is read column by column. In each column, blank cells
        found before the first cell matching ``pattern`` are recorded; row
        positions that are blank in more than one column and lie above the
        last matched header position are removed. The first remaining row
        becomes the header.

        Args:
            filePath: Workbook to read.
            pattern: Regex alternatives identifying header cells; matched
                case-insensitively as whole words.
            returnDF: Return the DataFrame when true; otherwise write it to
                ``"<destinationSheet>.xlsx"``.
            remove_none: Drop rows whose values are all ``None``.
            getImpFeatures: When true, call ``XUM_LLMFormat`` and keep only
                the columns whose header text occurs in its reply.
            prompt_ReqFeildString, prompt_ReqJSONOutputString, Key, APIUrl,
            groqModel, contentType, temperature, maxTokens: Forwarded to
                ``XUM_LLMFormat``.
            sheetName: Sheet to read; first worksheet when omitted.
            destinationSheet: Forwarded to ``XUM_TransposeSheet`` and used
                as the output file stem when ``returnDF`` is false.

        Returns:
            The DataFrame when ``returnDF`` is true, otherwise ``None``.

        Raises:
            ValueError: No cell in the sheet matches ``pattern``.
        """
        transposedList = self.XUM_TransposeSheet(
            src_path=filePath,
            src_sheet_name=sheetName,
            dst_sheet_name=destinationSheet,
        )

        # Compile once instead of once per inspected cell.
        header_regex = re.compile(rf'\b(?:{pattern})\b', re.I)

        blank_positions = []
        header_positions = []
        for column in transposedList:
            for position, value in enumerate(column, start=1):
                if value is None:
                    blank_positions.append(position)
                elif self.XUM_TextPresenceRegex(value, header_regex):
                    header_positions.append(position)
                    break  # only blanks above the first match matter

        _LOGGER.debug("blank positions: %s", blank_positions)
        _LOGGER.debug("header positions: %s", header_positions)

        if not header_positions:
            raise ValueError(f"No cell matching pattern {pattern!r} was found in the sheet")

        last_header = header_positions[-1]
        freq = Counter(p for p in blank_positions if p < last_header)
        _LOGGER.debug("Frequency of col values %s", freq)
        nums_only = [num for num, cnt in freq.items() if cnt > 1]

        newTable = self.XUM_DeleteColumns(Table2dArray=transposedList, colIndexList=nums_only)
        _LOGGER.debug("table after deleting unwanted columns: %s", newTable)

        if getImpFeatures:
            # Header text -> first data value, one entry per sheet column.
            testSample = {row[0]: row[1] for row in newTable if len(row) > 1}
            columnMapping = self.XUM_LLMFormat(
                prompt_ReqFeildString=prompt_ReqFeildString,
                prompt_ReqJSONOutputString=prompt_ReqJSONOutputString,
                prompt_SampleData=testSample,
                Key=Key,
                APIUrl=APIUrl,
                groqModel=groqModel,
                temperature=temperature,
                maxTokens=maxTokens,
                contentType=contentType,
            )

            # columnMapping is the raw reply text, so this is a substring
            # test; str() keeps non-string headers from raising TypeError.
            rowsWeDontNeed = [
                rowInd for rowInd, row in enumerate(newTable)
                if not row or row[0] is None or str(row[0]) not in columnMapping
            ]
            _LOGGER.debug("rows we don't need: %s", rowsWeDontNeed)

            newTable = self.XUM_DeleteRows(Table2dArray=newTable, rowIndexList=rowsWeDontNeed)

        table = [list(r) for r in zip(*newTable)]

        if remove_none:
            rowsToDelete = [idx for idx, row in enumerate(table) if all(v is None for v in row)]
            table = self.XUM_DeleteRows(Table2dArray=table, rowIndexList=rowsToDelete)

        if table:
            df = pd.DataFrame(table[1:], columns=table[0])
        else:
            # Nothing survived the clean-up; avoid indexing an empty table.
            df = pd.DataFrame()

        if returnDF:
            return df
        return df.to_excel(f"{destinationSheet}.xlsx")
255
+
File without changes
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: WBClean_XUM
3
+ Version: 1.0
4
+ Summary: A Python library that helps developers parse different Excel file formats. From converting between file formats to extracting required information from complex Excel sheets, this library aims to be the go-to tool for the whole developer community.
5
+ Author: Muthu Krishnan
6
+ License: GNU
7
+ Project-URL: Repository, https://github.com/nmkrishnan-droid/WBClean_XUM.git
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: openpyxl>=3.1.0
12
+ Requires-Dist: requests>=2.31.0
13
+ Requires-Dist: pywin32>=306; platform_system == "Windows"
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/WBClean_XUM/WBClean_XUM.py
4
+ src/WBClean_XUM/__init__.py
5
+ src/WBClean_XUM.egg-info/PKG-INFO
6
+ src/WBClean_XUM.egg-info/SOURCES.txt
7
+ src/WBClean_XUM.egg-info/dependency_links.txt
8
+ src/WBClean_XUM.egg-info/requires.txt
9
+ src/WBClean_XUM.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ pandas>=2.0
2
+ openpyxl>=3.1.0
3
+ requests>=2.31.0
4
+
5
+ [:platform_system == "Windows"]
6
+ pywin32>=306
@@ -0,0 +1 @@
1
+ WBClean_XUM