excel-explorer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- excel_explorer/__init__.py +1 -0
- excel_explorer/analysis.py +359 -0
- excel_explorer/cli.py +497 -0
- excel_explorer/dependencies.py +390 -0
- excel_explorer/explorer.py +325 -0
- excel_explorer/formatters.py +15 -0
- excel_explorer/search.py +139 -0
- excel_explorer/workbook.py +29 -0
- excel_explorer-0.1.0.dist-info/METADATA +199 -0
- excel_explorer-0.1.0.dist-info/RECORD +13 -0
- excel_explorer-0.1.0.dist-info/WHEEL +4 -0
- excel_explorer-0.1.0.dist-info/entry_points.txt +2 -0
- excel_explorer-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the excel_explorer package.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import re
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
from openpyxl.utils import get_column_letter
|
|
6
|
+
|
|
7
|
+
from excel_explorer.workbook import load_workbook, get_named_ranges
|
|
8
|
+
from excel_explorer.formatters import format_output
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# summarize_assumptions
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
def summarize_assumptions(path: str, limit: int = 50, offset: int = 0) -> str:
    """List named ranges and label/value pairs from assumption-style sheets.

    A sheet counts as an assumption sheet when its lower-cased name is one of
    ``assumptions``, ``inputs``, ``parameters`` or ``drivers``. For each such
    sheet the first row is dropped when every cell in it is a string or empty,
    then ``offset``/``limit`` select a window of the remaining rows. Column A
    is reported as the label and column B as the value; rows that are entirely
    empty, or whose first two cells are both empty, are omitted.

    Args:
        path: Workbook path handed to ``load_workbook``.
        limit: Maximum number of rows taken per sheet (before blank rows
            are dropped).
        offset: Number of rows to skip per sheet after the optional header.

    Returns:
        The string produced by ``format_output`` for the collected lines.
    """
    workbook = load_workbook(path)
    named_ranges = get_named_ranges(workbook)

    recognised = {"assumptions", "inputs", "parameters", "drivers"}
    matching_sheets = [
        title for title in workbook.sheetnames if title.lower() in recognised
    ]

    # Named ranges come first in the report.
    if named_ranges:
        out: list[str] = ["Named Ranges:"]
        out.extend(f" {name} = {ref}" for name, ref in named_ranges.items())
    else:
        out = ["Named Ranges: none"]

    # Then one section per assumption-style sheet.
    for title in matching_sheets:
        out.append(f"\nSheet: {title}")
        all_rows = list(workbook[title].iter_rows(values_only=True))

        # A first row made only of strings/blanks is treated as a header.
        has_header = bool(all_rows) and all(
            cell is None or isinstance(cell, str) for cell in all_rows[0]
        )
        first = 1 if has_header else 0
        window = all_rows[first:][offset: offset + limit]

        for record in window:
            if all(cell is None for cell in record):
                continue
            label = record[0] if record else ""
            value = record[1] if len(record) > 1 else ""
            if label is None and value is None:
                continue
            out.append(f" {label}: {value}")

    if not (matching_sheets or named_ranges):
        out.append("No assumption sheets or named ranges found.")

    if matching_sheets:
        sheet_summary = ", ".join(matching_sheets)
    else:
        sheet_summary = "none"

    meta = {
        "command": "summarize-assumptions",
        "path": path,
        "assumption_sheets": sheet_summary,
        "named_ranges": len(named_ranges),
    }
    return format_output(meta, "\n".join(out))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# compare_periods
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
def compare_periods(
    path: str, sheet: str, row: int, max_cols: int = 20, col_offset: int = 0
) -> str:
    """Tabulate period-over-period change and growth for one worksheet row.

    The workbook is loaded with ``data_only=True``. Row 1 of ``sheet`` supplies
    the period labels: date/datetime headers are rendered as ``YYYY-MM``, other
    non-empty headers via ``str()``, and a missing header falls back to the
    column letter. Column A of ``row`` is the series label. Cells that are
    empty or not ``int``/``float`` are skipped, so each change is computed
    against the previous *numeric* cell rather than the adjacent column.

    Args:
        path: Workbook path handed to ``load_workbook``.
        sheet: Name of the worksheet to read.
        row: 1-based number of the row holding the series.
        max_cols: Maximum number of columns scanned from the start column.
        col_offset: Columns to skip after column B. Negative values are
            treated as 0 so the label column is never read as data.

    Returns:
        The ``format_output`` rendering of the table plus summary statistics,
        or of a "No numeric data found" message when the row has no numbers.
    """
    wb = load_workbook(path, data_only=True)
    ws = wb[sheet]

    # Tolerate an iterator that yields no row instead of raising IndexError.
    header_rows = list(ws.iter_rows(min_row=1, max_row=1, values_only=True))
    header_row = header_rows[0] if header_rows else ()
    data_rows = list(ws.iter_rows(min_row=row, max_row=row, values_only=True))
    data_row = data_rows[0] if data_rows else ()

    # Column A is the label; fall back to the row number when it is empty so
    # the report never prints "None".
    if data_row and data_row[0] is not None:
        label = data_row[0]
    else:
        label = f"Row {row}"

    # Data starts at column B (index 1). Clamp the offset: a negative value
    # would otherwise read the label column or wrap around to the row's end.
    start_idx = 1 + max(col_offset, 0)
    end_idx = min(len(data_row), start_idx + max_cols)

    periods: list[str] = []
    values: list[float] = []

    for i in range(start_idx, end_idx):
        val = data_row[i]
        # Skips empty cells (None) as well as text and other non-numerics.
        if not isinstance(val, (int, float)):
            continue

        hdr = header_row[i] if i < len(header_row) else None
        if hdr is None:
            period_label = get_column_letter(i + 1)
        elif isinstance(hdr, (datetime.datetime, datetime.date)):
            period_label = hdr.strftime("%Y-%m")
        else:
            period_label = str(hdr)

        periods.append(period_label)
        values.append(float(val))

    if not values:
        meta = {"command": "compare-periods", "path": path, "sheet": sheet, "row": row}
        return format_output(meta, f"No numeric data found in row {row}.")

    lines: list[str] = [f"Label: {label}", ""]
    lines.append(f"{'Period':<12} {'Value':>12} {'Change':>12} {'Growth %':>10}")
    lines.append("-" * 52)

    previous = None
    for period, val in zip(periods, values):
        if previous is None:
            # First period has nothing to compare against.
            lines.append(f"{period:<12} {val:>12,.2f} {'—':>12} {'—':>10}")
        else:
            change = val - previous
            # Growth from a zero base is undefined; report NaN instead of
            # dividing by zero.
            growth = (change / previous * 100) if previous != 0 else float("nan")
            lines.append(
                f"{period:<12} {val:>12,.2f} {change:>+12,.2f} {growth:>9.1f}%"
            )
        previous = val

    # Summary stats
    lowest = min(values)
    highest = max(values)
    lines.append("")
    lines.append("Summary:")
    lines.append(f" Min: {lowest:,.2f} ({periods[values.index(lowest)]})")
    lines.append(f" Max: {highest:,.2f} ({periods[values.index(highest)]})")
    lines.append(f" Average: {sum(values)/len(values):,.2f}")
    if len(values) >= 2:
        if values[0] != 0:
            total_growth = (values[-1] - values[0]) / values[0] * 100
        else:
            total_growth = float("nan")
        lines.append(f" Total growth ({periods[0]} to {periods[-1]}): {total_growth:+.1f}%")

    meta = {
        "command": "compare-periods",
        "path": path,
        "sheet": sheet,
        "row": row,
        "label": label,
        "periods": len(values),
    }
    return format_output(meta, "\n".join(lines))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ---------------------------------------------------------------------------
|
|
156
|
+
# find_anomalies
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
|
|
159
|
+
def _normalize_formula(val: str) -> str:
|
|
160
|
+
"""Replace whole cell references with REF to get a structural pattern.
|
|
161
|
+
|
|
162
|
+
Rows are compared cell-to-cell across columns, so column letters must be
|
|
163
|
+
normalized too — otherwise =B2+B3 and =C2+C3 look like different patterns.
|
|
164
|
+
"""
|
|
165
|
+
return re.sub(r'\$?[A-Z]{1,3}\$?\d+', 'REF', val)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def find_anomalies(path: str, sheet: str, limit: int = 50, offset: int = 0) -> str:
    """Report cells that break the dominant formula pattern of their row.

    The workbook is loaded with ``data_only=False`` and the worksheet rows are
    windowed by ``offset``/``limit``. Within each row only non-empty cells
    right of column A are considered, and rows with fewer than two such cells
    or with no formula at all are ignored. Two kinds of finding are emitted:

    * a literal value in a row where formulas outnumber literals;
    * a formula whose normalized pattern differs from the most common pattern
      of the row, provided that pattern occurs more than once.

    Args:
        path: Workbook path handed to ``load_workbook``.
        sheet: Name of the worksheet to scan.
        limit: Maximum number of worksheet rows to inspect.
        offset: Number of worksheet rows to skip before inspecting.

    Returns:
        The ``format_output`` rendering of the findings, or of
        "No anomalies detected." when there are none.
    """
    worksheet = load_workbook(path, data_only=False)[sheet]
    findings: list[str] = []

    for cells in list(worksheet.iter_rows())[offset: offset + limit]:
        row_number = cells[0].row

        # Column A is skipped (c.column > 1); a single populated cell gives
        # nothing to compare.
        populated = [c for c in cells if c.column > 1 and c.value is not None]
        if len(populated) < 2:
            continue

        # One entry per cell: (column letter, raw value, is_formula, shape).
        classified: list[tuple] = []
        for c in populated:
            raw = c.value
            letter = get_column_letter(c.column)
            if isinstance(raw, str) and raw.startswith("="):
                classified.append((letter, raw, True, _normalize_formula(raw)))
            else:
                classified.append((letter, raw, False, None))

        if not classified:
            continue

        n_formulas = sum(1 for entry in classified if entry[2])
        n_literals = len(classified) - n_formulas
        if n_formulas == 0:
            # Purely literal row: there is no formula pattern to break.
            continue

        # Tally the normalized shapes; on a tie max() keeps the shape that
        # appeared first in the row.
        tally: dict[str, int] = defaultdict(int)
        for _, _, is_formula, shape in classified:
            if is_formula and shape is not None:
                tally[shape] += 1

        if tally:
            dominant = max(tally, key=tally.get)
        else:
            dominant = None
        dominant_hits = tally[dominant] if dominant else 0

        for letter, raw, is_formula, shape in classified:
            ref = f"{letter}{row_number}"

            if not is_formula:
                # Literals are flagged only when formulas dominate the row;
                # otherwise every literal in a mostly-literal row would be a
                # false positive.
                if n_formulas > n_literals:
                    findings.append(
                        f"{ref}: hardcoded value ({raw!r}) in formula row "
                        f"({n_formulas} formula(s), {n_literals} literal(s))"
                    )
                continue

            if dominant and shape != dominant and dominant_hits > 1:
                findings.append(
                    f"{ref}: formula pattern differs — got `{shape}`, "
                    f"dominant is `{dominant}`"
                )

    meta = {
        "command": "find-anomalies",
        "path": path,
        "sheet": sheet,
        "anomalies_found": len(findings),
    }

    if not findings:
        return format_output(meta, "No anomalies detected.")

    bullet_list = "\n".join(f" • {item}" for item in findings)
    body = f"Found {len(findings)} anomaly/anomalies:\n\n" + bullet_list
    return format_output(meta, body)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ---------------------------------------------------------------------------
|
|
254
|
+
# validate_balance
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
|
|
257
|
+
_ASSET_KEYWORDS = {"total assets", "assets"}
|
|
258
|
+
_LIABILITY_KEYWORDS = {"total liabilities", "liabilities"}
|
|
259
|
+
_EQUITY_KEYWORDS = {"total equity", "equity", "stockholders equity", "shareholders equity"}
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _match_keyword(label: str, keyword_set: set[str]) -> bool:
|
|
263
|
+
if label is None:
|
|
264
|
+
return False
|
|
265
|
+
return label.strip().lower() in keyword_set
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def validate_balance(path: str, sheet: str) -> str:
    """Check that Assets = Liabilities + Equity for each period column.

    The workbook is loaded with ``data_only=True``. The first row is the
    header that supplies period labels (dates as ``YYYY-MM-DD``, other values
    via ``str()``, column letter when empty). The remaining rows are searched
    by their column-A label for the first asset, liability and equity row.
    Every column from B onwards is then compared within a tolerance of 0.01;
    an empty cell counts as 0 and a column with any non-numeric value is
    reported as ``SKIP``.

    Args:
        path: Workbook path handed to ``load_workbook``.
        sheet: Name of the worksheet holding the balance sheet.

    Returns:
        The ``format_output`` rendering of the per-period results. The overall
        result is ``BALANCED``, ``IMBALANCED``, ``NO DATA`` when no period
        column could actually be compared, or ``ERROR`` when one of the three
        rows is missing.
    """
    wb = load_workbook(path, data_only=True)
    ws = wb[sheet]

    rows = list(ws.iter_rows(values_only=True))
    if not rows:
        meta = {"command": "validate-balance", "path": path, "sheet": sheet}
        return format_output(meta, "Sheet is empty.")

    # Row index 0 is the header with the period labels.
    header_row = rows[0]

    assets_row: list | None = None
    liabilities_row: list | None = None
    equity_row: list | None = None

    # NOTE(review): the first matching label wins, so a bare "Assets" section
    # heading placed above "Total Assets" is the row that gets used — confirm
    # this is intended.
    for row in rows[1:]:
        if not row:
            continue
        label = row[0]
        # Column A may hold numbers or dates; those cannot be labels and must
        # not reach the string-based keyword match.
        if not isinstance(label, str):
            continue
        if assets_row is None and _match_keyword(label, _ASSET_KEYWORDS):
            assets_row = list(row)
        elif liabilities_row is None and _match_keyword(label, _LIABILITY_KEYWORDS):
            liabilities_row = list(row)
        elif equity_row is None and _match_keyword(label, _EQUITY_KEYWORDS):
            equity_row = list(row)

    if assets_row is None or liabilities_row is None or equity_row is None:
        missing = []
        if assets_row is None:
            missing.append("Assets")
        if liabilities_row is None:
            missing.append("Liabilities")
        if equity_row is None:
            missing.append("Equity")
        meta = {"command": "validate-balance", "path": path, "sheet": sheet, "result": "ERROR"}
        return format_output(meta, f"Could not find rows for: {', '.join(missing)}.")

    # Check across all period columns (col index 1+)
    num_cols = max(len(assets_row), len(liabilities_row), len(equity_row))
    tolerance = 0.01  # allow for floating point rounding
    all_pass = True
    compared = 0  # number of columns that produced a PASS or FAIL
    period_results: list[str] = []

    for col_idx in range(1, num_cols):
        assets = assets_row[col_idx] if col_idx < len(assets_row) else None
        liabilities = liabilities_row[col_idx] if col_idx < len(liabilities_row) else None
        equity = equity_row[col_idx] if col_idx < len(equity_row) else None

        if assets is None and liabilities is None and equity is None:
            continue

        # Build period label from header
        hdr = header_row[col_idx] if col_idx < len(header_row) else None
        if hdr is None:
            period_label = get_column_letter(col_idx + 1)
        elif isinstance(hdr, (datetime.datetime, datetime.date)):
            period_label = hdr.strftime("%Y-%m-%d")
        else:
            period_label = str(hdr)

        present = [v for v in (assets, liabilities, equity) if v is not None]
        if any(not isinstance(v, (int, float)) for v in present):
            period_results.append(f" {period_label}: SKIP (non-numeric values)")
            continue

        # An empty cell alongside populated ones is treated as zero.
        a_val = float(assets) if assets is not None else 0.0
        l_val = float(liabilities) if liabilities is not None else 0.0
        e_val = float(equity) if equity is not None else 0.0

        compared += 1
        diff = a_val - (l_val + e_val)
        if abs(diff) <= tolerance:
            period_results.append(f" {period_label}: PASS (Assets={a_val:,.2f} = L+E={l_val+e_val:,.2f})")
        else:
            all_pass = False
            period_results.append(
                f" {period_label}: FAIL (Assets={a_val:,.2f} ≠ L+E={l_val+e_val:,.2f}, diff={diff:+,.2f})"
            )

    if not all_pass:
        overall = "IMBALANCED"
    elif compared:
        overall = "BALANCED"
    else:
        # Nothing was verified, so claiming the sheet balances would be a
        # false positive.
        overall = "NO DATA"
        period_results.append(" No period columns with numeric values to compare.")

    lines = [f"Balance Check: {overall}", ""] + period_results

    meta = {
        "command": "validate-balance",
        "path": path,
        "sheet": sheet,
        "result": overall,
    }
    return format_output(meta, "\n".join(lines))
|