excel-explorer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,359 @@
1
+ import datetime
2
+ import re
3
+ from collections import defaultdict
4
+
5
+ from openpyxl.utils import get_column_letter
6
+
7
+ from excel_explorer.workbook import load_workbook, get_named_ranges
8
+ from excel_explorer.formatters import format_output
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # summarize_assumptions
13
+ # ---------------------------------------------------------------------------
14
+
15
def summarize_assumptions(path: str, limit: int = 50, offset: int = 0) -> str:
    """Summarize a workbook's named ranges and assumption-style sheets.

    Sheets whose lower-cased name is "assumptions", "inputs", "parameters"
    or "drivers" are reported as ``label: value`` pairs taken from their
    first two columns.

    Args:
        path: Workbook path, handed to ``load_workbook``.
        limit: Maximum number of rows read from each assumption sheet.
        offset: Number of rows to skip on each sheet, counted after the
            optional header row has been dropped.

    Returns:
        The string built by ``format_output`` from the metadata and the
        report body.
    """
    wb = load_workbook(path)
    named = get_named_ranges(wb)

    assumption_sheet_names = {"assumptions", "inputs", "parameters", "drivers"}
    found_sheets = [s for s in wb.sheetnames if s.lower() in assumption_sheet_names]

    lines: list[str] = []

    # Named ranges section
    if named:
        lines.append("Named Ranges:")
        lines.extend(f" {name} = {ref}" for name, ref in named.items())
    else:
        lines.append("Named Ranges: none")

    # Assumption sheets section
    for sheet_name in found_sheets:
        lines.append(f"\nSheet: {sheet_name}")
        rows = list(wb[sheet_name].iter_rows(values_only=True))

        # A first row made only of text/empty cells is treated as a header.
        is_header = bool(rows) and all(
            isinstance(v, str) or v is None for v in rows[0]
        )
        entries = rows[1:] if is_header else rows

        for row in entries[offset: offset + limit]:
            label = row[0] if row else None
            value = row[1] if len(row) > 1 else None
            if label is None and value is None:
                # Nothing in the label/value columns: not an assumption row.
                continue
            # Render empty cells explicitly instead of the literal "None".
            label_text = "(unlabeled)" if label is None else str(label)
            value_text = "" if value is None else str(value)
            lines.append(f" {label_text}: {value_text}".rstrip())

    if not found_sheets and not named:
        lines.append("No assumption sheets or named ranges found.")

    meta = {
        "command": "summarize-assumptions",
        "path": path,
        "assumption_sheets": ", ".join(found_sheets) if found_sheets else "none",
        "named_ranges": len(named),
    }
    return format_output(meta, "\n".join(lines))
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # compare_periods
67
+ # ---------------------------------------------------------------------------
68
+
69
def compare_periods(
    path: str, sheet: str, row: int, max_cols: int = 20, col_offset: int = 0
) -> str:
    """Report period-over-period changes for one time-series row.

    Row 1 of the sheet supplies the period labels (date headers are shown
    as ``YYYY-MM``); column A of the target row supplies the series label.

    Args:
        path: Workbook path, handed to ``load_workbook`` with
            ``data_only=True``.
        sheet: Name of the worksheet to read.
        row: 1-based number of the row holding the series.
        max_cols: Maximum number of columns to scan.
        col_offset: Number of data columns to skip, counted from column B.
            Negative values are treated as 0.

    Returns:
        The string built by ``format_output``: a table of value, change and
        growth per period followed by min/max/average/total-growth stats,
        or a short notice when the row has no numeric cells.
    """
    wb = load_workbook(path, data_only=True)
    ws = wb[sheet]

    # next(..., ()) keeps an unexpectedly empty iterator from raising.
    header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), ())
    data_row = next(ws.iter_rows(min_row=row, max_row=row, values_only=True), ())

    # Fall back to "Row N" when column A is missing *or* empty.
    if data_row and data_row[0] is not None:
        label = data_row[0]
    else:
        label = f"Row {row}"

    # Data starts at column B (index 1). Clamp so a negative offset cannot
    # produce negative indices, which would wrap around the row tuple.
    start_idx = 1 + max(col_offset, 0)
    end_idx = min(len(data_row), start_idx + max(max_cols, 0))

    periods: list[str] = []
    values: list[float] = []

    for i in range(start_idx, end_idx):
        val = data_row[i]
        # bool is a subclass of int; TRUE/FALSE cells are not series values.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            continue

        hdr = header_row[i] if i < len(header_row) else None
        if hdr is None:
            period_label = get_column_letter(i + 1)
        elif isinstance(hdr, (datetime.datetime, datetime.date)):
            period_label = hdr.strftime("%Y-%m")
        else:
            period_label = str(hdr)

        periods.append(period_label)
        values.append(float(val))

    if not values:
        meta = {"command": "compare-periods", "path": path, "sheet": sheet, "row": row}
        return format_output(meta, f"No numeric data found in row {row}.")

    # Two-space gutters give every table row the same width as the
    # 52-character rule below (12 + 2 + 12 + 2 + 12 + 2 + 10).
    lines: list[str] = [f"Label: {label}", ""]
    lines.append(f"{'Period':<12}  {'Value':>12}  {'Change':>12}  {'Growth %':>10}")
    lines.append("-" * 52)

    previous: float | None = None
    for period, val in zip(periods, values):
        if previous is None:
            lines.append(f"{period:<12}  {val:>12,.2f}  {'—':>12}  {'—':>10}")
        else:
            change = val - previous
            # Growth is undefined from a zero base; say so instead of "nan%".
            growth = f"{change / previous * 100:.1f}%" if previous != 0 else "n/a"
            lines.append(
                f"{period:<12}  {val:>12,.2f}  {change:>+12,.2f}  {growth:>10}"
            )
        previous = val

    # Summary stats
    lowest, highest = min(values), max(values)
    lines.append("")
    lines.append("Summary:")
    lines.append(f" Min: {lowest:,.2f} ({periods[values.index(lowest)]})")
    lines.append(f" Max: {highest:,.2f} ({periods[values.index(highest)]})")
    lines.append(f" Average: {sum(values)/len(values):,.2f}")
    if len(values) >= 2:
        if values[0] != 0:
            total_growth = f"{(values[-1] - values[0]) / values[0] * 100:+.1f}%"
        else:
            total_growth = "n/a"
        lines.append(f" Total growth ({periods[0]} to {periods[-1]}): {total_growth}")

    meta = {
        "command": "compare-periods",
        "path": path,
        "sheet": sheet,
        "row": row,
        "label": label,
        "periods": len(values),
    }
    return format_output(meta, "\n".join(lines))
153
+
154
+
155
+ # ---------------------------------------------------------------------------
156
+ # find_anomalies
157
+ # ---------------------------------------------------------------------------
158
+
159
+ def _normalize_formula(val: str) -> str:
160
+ """Replace whole cell references with REF to get a structural pattern.
161
+
162
+ Rows are compared cell-to-cell across columns, so column letters must be
163
+ normalized too — otherwise =B2+B3 and =C2+C3 look like different patterns.
164
+ """
165
+ return re.sub(r'\$?[A-Z]{1,3}\$?\d+', 'REF', val)
166
+
167
+
168
def find_anomalies(path: str, sheet: str, limit: int = 50, offset: int = 0) -> str:
    """Report cells that break the formula pattern of their row.

    For each row in the requested window, two kinds of cell are flagged:
    literals in a row where formulas outnumber literals, and formulas whose
    normalized pattern differs from the row's most common pattern (when
    that pattern occurs more than once).

    Args:
        path: Workbook path, handed to ``load_workbook`` with
            ``data_only=False`` so formulas are read rather than values.
        sheet: Name of the worksheet to scan.
        limit: Maximum number of worksheet rows to examine.
        offset: Number of leading worksheet rows to skip.

    Returns:
        The string built by ``format_output`` listing every anomaly found.
    """
    workbook = load_workbook(path, data_only=False)
    worksheet = workbook[sheet]

    findings: list[str] = []

    for cells in list(worksheet.iter_rows())[offset: offset + limit]:
        row_num = cells[0].row

        # Column A is left out; a row needs at least two populated cells
        # beyond it to be compared.
        populated = [c for c in cells if c.column > 1 and c.value is not None]
        if len(populated) < 2:
            continue

        # (column letter, raw value, normalized pattern); pattern is None
        # for anything that is not a formula string.
        classified: list[tuple] = []
        for cell in populated:
            raw = cell.value
            if isinstance(raw, str) and raw.startswith("="):
                shape = _normalize_formula(raw)
            else:
                shape = None
            classified.append((get_column_letter(cell.column), raw, shape))

        shapes = [shape for _, _, shape in classified if shape is not None]
        formula_count = len(shapes)
        literal_count = len(classified) - formula_count

        # A row made only of literals has no pattern to break.
        if formula_count == 0:
            continue

        tally: dict[str, int] = {}
        for shape in shapes:
            tally[shape] = tally.get(shape, 0) + 1

        # On a tie the pattern seen first in the row wins.
        dominant_pattern = max(tally, key=lambda k: tally[k])
        dominant_count = tally[dominant_pattern]
        formulas_dominate = formula_count > literal_count

        for col_letter, raw, shape in classified:
            cell_ref = f"{col_letter}{row_num}"

            if shape is None:
                # Literals are only suspicious where formulas are the norm.
                if formulas_dominate:
                    findings.append(
                        f"{cell_ref}: hardcoded value ({raw!r}) in formula row "
                        f"({formula_count} formula(s), {literal_count} literal(s))"
                    )
            elif shape != dominant_pattern and dominant_count > 1:
                findings.append(
                    f"{cell_ref}: formula pattern differs — got `{shape}`, "
                    f"dominant is `{dominant_pattern}`"
                )

    meta = {
        "command": "find-anomalies",
        "path": path,
        "sheet": sheet,
        "anomalies_found": len(findings),
    }

    if not findings:
        return format_output(meta, "No anomalies detected.")

    bullet_list = "\n".join(f" • {item}" for item in findings)
    body = f"Found {len(findings)} anomaly/anomalies:\n\n" + bullet_list
    return format_output(meta, body)
251
+
252
+
253
+ # ---------------------------------------------------------------------------
254
+ # validate_balance
255
+ # ---------------------------------------------------------------------------
256
+
257
+ _ASSET_KEYWORDS = {"total assets", "assets"}
258
+ _LIABILITY_KEYWORDS = {"total liabilities", "liabilities"}
259
+ _EQUITY_KEYWORDS = {"total equity", "equity", "stockholders equity", "shareholders equity"}
260
+
261
+
262
+ def _match_keyword(label: str, keyword_set: set[str]) -> bool:
263
+ if label is None:
264
+ return False
265
+ return label.strip().lower() in keyword_set
266
+
267
+
268
def _is_balance_amount(value: object) -> bool:
    """Return True for int/float cell values; booleans are not amounts."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _select_balance_row(rows: list, keyword_set: set[str]) -> list | None:
    """Pick the row that best represents one side of the balance equation.

    Among rows whose label matches ``keyword_set``, rows that contain at
    least one amount are preferred over label-only section headings, then
    rows labelled "Total ..." over plain labels; remaining ties go to the
    earliest row. Returns None when no label matches.
    """
    candidates = [
        list(r)
        for r in rows
        # The isinstance guard keeps non-text labels away from _match_keyword.
        if r and isinstance(r[0], str) and _match_keyword(r[0], keyword_set)
    ]
    if not candidates:
        return None

    def rank(candidate: list) -> tuple[bool, bool]:
        has_amounts = any(_is_balance_amount(v) for v in candidate[1:])
        is_total = candidate[0].strip().lower().startswith("total")
        return (not has_amounts, not is_total)

    # min() returns the first of equally ranked rows, i.e. the earliest one.
    return min(candidates, key=rank)


def validate_balance(path: str, sheet: str, tolerance: float = 0.01) -> str:
    """Check that Assets = Liabilities + Equity across all period columns.

    The first row of the sheet supplies the period labels; the assets,
    liabilities and equity rows are located by their column-A label.

    Args:
        path: Workbook path, handed to ``load_workbook`` with
            ``data_only=True``.
        sheet: Name of the worksheet holding the balance sheet.
        tolerance: Largest absolute difference still counted as balanced
            (absorbs floating point rounding).

    Returns:
        The string built by ``format_output``. The overall result is
        "BALANCED", "IMBALANCED", "NO DATA" (no column could be compared)
        or "ERROR" (one of the three rows was not found).
    """
    wb = load_workbook(path, data_only=True)
    ws = wb[sheet]

    rows = list(ws.iter_rows(values_only=True))
    if not rows:
        meta = {"command": "validate-balance", "path": path, "sheet": sheet}
        return format_output(meta, "Sheet is empty.")

    # Row index 0 holds the period labels; data rows follow.
    header_row = rows[0]
    data_rows = rows[1:]

    assets_row = _select_balance_row(data_rows, _ASSET_KEYWORDS)
    liabilities_row = _select_balance_row(data_rows, _LIABILITY_KEYWORDS)
    equity_row = _select_balance_row(data_rows, _EQUITY_KEYWORDS)

    if assets_row is None or liabilities_row is None or equity_row is None:
        missing = [
            name
            for name, found in (
                ("Assets", assets_row),
                ("Liabilities", liabilities_row),
                ("Equity", equity_row),
            )
            if found is None
        ]
        meta = {"command": "validate-balance", "path": path, "sheet": sheet, "result": "ERROR"}
        return format_output(meta, f"Could not find rows for: {', '.join(missing)}.")

    # Check across all period columns (col index 1+)
    balance_rows = (assets_row, liabilities_row, equity_row)
    num_cols = max(len(r) for r in balance_rows)
    all_pass = True
    checked = 0  # columns that produced a PASS or FAIL
    period_results: list[str] = []

    for col_idx in range(1, num_cols):
        cells = [r[col_idx] if col_idx < len(r) else None for r in balance_rows]
        if all(v is None for v in cells):
            continue

        # Build period label from header
        hdr = header_row[col_idx] if col_idx < len(header_row) else None
        if hdr is None:
            period_label = get_column_letter(col_idx + 1)
        elif isinstance(hdr, (datetime.datetime, datetime.date)):
            period_label = hdr.strftime("%Y-%m-%d")
        else:
            period_label = str(hdr)

        if any(v is not None and not _is_balance_amount(v) for v in cells):
            period_results.append(f" {period_label}: SKIP (non-numeric values)")
            continue

        # An empty cell next to populated ones counts as zero.
        a_val, l_val, e_val = (0.0 if v is None else float(v) for v in cells)
        checked += 1

        diff = a_val - (l_val + e_val)
        if abs(diff) <= tolerance:
            period_results.append(f" {period_label}: PASS (Assets={a_val:,.2f} = L+E={l_val+e_val:,.2f})")
        else:
            all_pass = False
            period_results.append(
                f" {period_label}: FAIL (Assets={a_val:,.2f} ≠ L+E={l_val+e_val:,.2f}, diff={diff:+,.2f})"
            )

    if checked == 0:
        # Nothing was compared, so claiming BALANCED would be misleading.
        overall = "NO DATA"
        if not period_results:
            period_results.append(" No numeric period columns found.")
    else:
        overall = "BALANCED" if all_pass else "IMBALANCED"
    lines = [f"Balance Check: {overall}", ""] + period_results

    meta = {
        "command": "validate-balance",
        "path": path,
        "sheet": sheet,
        "result": overall,
    }
    return format_output(meta, "\n".join(lines))