offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1266 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from copy import copy
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from offagent.domain.locators import parse_locator, to_v2_locator
|
|
9
|
+
from offagent.domain.models import (
|
|
10
|
+
BlockStyle,
|
|
11
|
+
DocumentRef,
|
|
12
|
+
InlineFragment,
|
|
13
|
+
InlineStyle,
|
|
14
|
+
IndexedItem,
|
|
15
|
+
SectionPayload,
|
|
16
|
+
SheetCell,
|
|
17
|
+
SheetSnapshot,
|
|
18
|
+
StructureSection,
|
|
19
|
+
TextContainerSnapshot,
|
|
20
|
+
VisibleTextRange,
|
|
21
|
+
WorkbookStructure,
|
|
22
|
+
WorksheetSummary,
|
|
23
|
+
XlsxSectionCell,
|
|
24
|
+
XlsxRowEmbedding,
|
|
25
|
+
XlsxRowEmbeddingCell,
|
|
26
|
+
)
|
|
27
|
+
from offagent.domain.text_fragments import (
|
|
28
|
+
apply_style_to_range,
|
|
29
|
+
fragment_text,
|
|
30
|
+
normalize_fragments,
|
|
31
|
+
)
|
|
32
|
+
from offagent.errors import InvalidArgumentsError, TargetNotEditableError
|
|
33
|
+
from offagent.errors import TargetNotFoundError
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from openpyxl import Workbook, load_workbook
|
|
37
|
+
from openpyxl.cell.rich_text import CellRichText, TextBlock
|
|
38
|
+
from openpyxl.cell.text import InlineFont
|
|
39
|
+
from openpyxl.styles import Alignment, Font, PatternFill
|
|
40
|
+
from openpyxl.utils.cell import (
|
|
41
|
+
coordinate_to_tuple,
|
|
42
|
+
get_column_letter,
|
|
43
|
+
range_boundaries,
|
|
44
|
+
)
|
|
45
|
+
except ModuleNotFoundError: # pragma: no cover - exercised through dependency checks
|
|
46
|
+
Workbook = None
|
|
47
|
+
CellRichText = None
|
|
48
|
+
Alignment = None
|
|
49
|
+
Font = None
|
|
50
|
+
InlineFont = None
|
|
51
|
+
PatternFill = None
|
|
52
|
+
TextBlock = None
|
|
53
|
+
load_workbook = None
|
|
54
|
+
coordinate_to_tuple = None
|
|
55
|
+
get_column_letter = None
|
|
56
|
+
range_boundaries = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class ResolvedCell:
    """Immutable description of one worksheet cell, as built by ``resolve_cell``."""

    sheet_name: str  # title of the worksheet that owns the cell
    coordinate: str  # coordinate reported by the cell itself
    raw_value: object  # the cell's ``value`` attribute, passed through untouched
    formula: str | None  # formula text when the cell's data type is "f", else None
    display_text: str  # formula text if any, otherwise str(value) ("" for None)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TargetNotAppendableError(TargetNotEditableError):
    """Raised when a requested XLSX target cannot accept append text.

    ``append_cell`` raises it for cells that hold a formula or a non-string
    value.
    """
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Matches a whole string that looks like a number: optional surrounding
# whitespace, an optional leading "(" or sign, an optional $/€/£ symbol,
# digits either plain or in comma-separated groups of three, an optional
# decimal part, an optional "%" and an optional closing ")".
# Compiled with re.VERBOSE, so whitespace inside the pattern is ignored.
NUMERIC_TEXT_PATTERN = re.compile(
    r"""
^\s*
[\(\+\-]?
[$€£]?
(?:
\d{1,3}(?:,\d{3})+ |
\d+
)
(?:\.\d+)?
%?
\)?
\s*$
""",
    re.VERBOSE,
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_document(document_path: Path) -> list[IndexedItem]:
    """Index every indexable cell of the workbook at ``document_path``.

    Each worksheet is scanned once to collect its indexable cells together
    with the display text of the other cells in the same row and column.
    One ``IndexedItem`` of type ``"cell"`` is then emitted per collected
    cell, in scan order.
    """
    book = _open_workbook(document_path)
    collected: list[IndexedItem] = []

    for sheet in book.worksheets:
        pending: list[tuple[object, str | None, str]] = []
        by_row: dict[int, list[tuple[str, str]]] = {}
        by_column: dict[int, list[tuple[str, str]]] = {}

        # First pass: gather the cells and the row/column context entries.
        every_cell = (
            entry for sheet_row in sheet.iter_rows() for entry in sheet_row
        )
        for entry in every_cell:
            if not _is_indexable_cell(entry):
                continue

            formula_text = _formula_text(entry)
            shown = _display_text(entry)
            pending.append((entry, formula_text, shown))
            by_row.setdefault(entry.row, []).append((entry.coordinate, shown))
            by_column.setdefault(entry.column, []).append(
                (entry.coordinate, shown)
            )

        # Second pass: contexts are complete, so items can be materialised.
        for entry, formula_text, shown in pending:
            identifier = make_item_id(sheet.title, entry.coordinate)
            details = {
                "sheet_name": sheet.title,
                "coordinate": entry.coordinate,
                "raw_value": _metadata_raw_value(entry.value),
                "formula": formula_text,
                "display_text": shown,
                "data_type": entry.data_type,
                "row_context": _context_text(
                    by_row[entry.row], exclude=entry.coordinate
                ),
                "column_context": _context_text(
                    by_column[entry.column], exclude=entry.coordinate
                ),
            }
            collected.append(
                IndexedItem(
                    item_id=identifier,
                    item_type="cell",
                    locator=identifier,
                    preview=shown[:120],
                    content_text=shown,
                    metadata=details,
                )
            )

    return collected
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def build_embedding_text(item: IndexedItem, document_path: Path) -> str:
    """Render the newline-separated text used to embed a single cell item.

    Missing metadata keys render as empty strings, except the value line,
    which falls back to ``item.content_text``.
    """
    details = item.metadata
    lines: list[str] = []
    lines.append(f"Workbook: {document_path.name}")
    lines.append(f"Sheet: {details.get('sheet_name', '')}")
    lines.append(f"Cell: {details.get('coordinate', '')}")
    lines.append(f"Row Context: {details.get('row_context', '')}")
    lines.append(f"Column Context: {details.get('column_context', '')}")
    lines.append(f"Value: {details.get('display_text', item.content_text)}")
    return "\n".join(lines)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def build_row_embeddings(
    items: list[IndexedItem], document_path: Path
) -> list[XlsxRowEmbedding]:
    """Group text-like cell items by (sheet, row) and build one embedding per row.

    Rows are emitted in sorted (sheet name, row number) order. Within a row
    the contributing cells are ordered by ``_coordinate_sort_key``, and the
    representative cell is the one with the highest ``_representative_score``.
    """

    def _coordinate_of(entry: IndexedItem) -> str:
        # Coordinate as stored in the item's metadata ("" when absent).
        return str(entry.metadata.get("coordinate", ""))

    rows_by_key: dict[tuple[str, int], list[IndexedItem]] = {}
    for candidate in items:
        if _is_text_like_item(candidate):
            details = candidate.metadata
            cell_ref = str(details.get("coordinate", ""))
            key = (str(details.get("sheet_name", "")), _row_number(cell_ref))
            rows_by_key.setdefault(key, []).append(candidate)

    result: list[XlsxRowEmbedding] = []
    for key in sorted(rows_by_key):
        sheet_name, row_number = key
        in_order = sorted(
            rows_by_key[key],
            key=lambda entry: _coordinate_sort_key(_coordinate_of(entry)),
        )
        cells = tuple(
            XlsxRowEmbeddingCell(
                item_id=entry.item_id,
                coordinate=_coordinate_of(entry),
                display_text=str(
                    entry.metadata.get("display_text", entry.content_text)
                ),
                preview=entry.preview,
            )
            for entry in in_order
        )
        best = max(
            in_order,
            key=lambda entry: _representative_score(_coordinate_of(entry), entry),
        )
        row_text = _build_row_embedding_text(
            document_path.name,
            sheet_name,
            row_number,
            cells,
        )
        result.append(
            XlsxRowEmbedding(
                sheet_name=sheet_name,
                row_number=row_number,
                text=row_text,
                preview=best.preview,
                representative_item_id=best.item_id,
                contributing_cells=cells,
            )
        )

    return result
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def read_cell(document_path: Path, item_id: str) -> str:
    """Return the display text of the cell identified by ``item_id``."""
    return resolve_cell(document_path, item_id).display_text
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def create_xlsx(output_path: Path, initial_sheet_name: str | None = None) -> Path:
    """Create a new workbook at ``output_path`` and return that path.

    The active sheet is titled ``initial_sheet_name``, or ``"Sheet1"`` when
    the name is ``None`` or empty.

    Raises:
        RuntimeError: if openpyxl could not be imported.
    """
    if Workbook is None:
        raise RuntimeError("openpyxl is required for XLSX operations.")

    title = initial_sheet_name if initial_sheet_name else "Sheet1"
    book = Workbook()
    book.active.title = title
    book.save(output_path)
    return output_path
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def add_sheet(
    document_path: Path,
    name: str,
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Add a worksheet called ``name`` and save the workbook.

    Returns the saved path and the ``xlsx:sheet:<name>`` locator of the new
    sheet.

    Raises:
        InvalidArgumentsError: if ``name`` is blank or already used.
    """
    book = _open_workbook(document_path)

    if name.strip() == "":
        raise InvalidArgumentsError("Worksheet name cannot be empty.")
    already_present = name in book.sheetnames
    if already_present:
        raise InvalidArgumentsError(f"Worksheet {name!r} already exists.")

    book.create_sheet(title=name)
    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination, f"xlsx:sheet:{name}"
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def write_cell(
    document_path: Path, item_id: str, value: object, output_path: Path | None = None
) -> Path:
    """Overwrite the cell identified by ``item_id`` and save the workbook.

    String values go through ``_coerce_write_value`` before being stored.
    Returns the path the workbook was saved to.
    """
    book = _open_workbook(document_path)
    target = _resolve_cell(book, item_id)
    target.value = _coerce_write_value(value)

    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def add_row(
    document_path: Path,
    sheet_locator: str,
    values: list[object],
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Write ``values`` into the row after the one ``_last_used_row`` reports.

    Values are written left to right starting at column 1. Returns the saved
    path and the ``xlsx:sheet:<name>:row:<n>`` locator of the new row.
    """
    book = _open_workbook(document_path)
    sheet_name = _sheet_name_from_locator(sheet_locator)
    sheet = _resolve_worksheet(book, sheet_name)
    new_row = _last_used_row(sheet) + 1

    for position, raw in enumerate(values, start=1):
        coerced = _coerce_write_value(raw)
        sheet.cell(row=new_row, column=position).value = coerced

    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination, f"xlsx:sheet:{sheet_name}:row:{new_row}"
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def append_cell(
    document_path: Path, item_id: str, text: str, output_path: Path | None = None
) -> Path:
    """Append ``text`` to a string or empty cell and save the workbook.

    Raises:
        TargetNotAppendableError: if the cell holds a formula or a
            non-string value.
    """
    book = _open_workbook(document_path)
    target = _resolve_cell(book, item_id)

    if _formula_text(target) is not None:
        raise TargetNotAppendableError("target not appendable; use write-cell")
    current = target.value
    if current is not None and not isinstance(current, str):
        raise TargetNotAppendableError("target not appendable; use write-cell")

    # An empty cell simply receives the text; otherwise concatenate.
    target.value = text if current is None else f"{current}{text}"

    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def resolve_cell(document_path: Path, item_id: str) -> ResolvedCell:
    """Open the workbook and describe the cell identified by ``item_id``."""
    target = _resolve_cell(_open_workbook(document_path), item_id)
    owner_title = target.parent.title
    location = target.coordinate
    stored = target.value
    formula_text = _formula_text(target)
    shown = _display_text(target)
    return ResolvedCell(
        sheet_name=owner_title,
        coordinate=location,
        raw_value=stored,
        formula=formula_text,
        display_text=shown,
    )
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def resolve_structure(document_path: Path) -> tuple[StructureSection, ...]:
    """Return one ``"worksheet"`` section per sheet of the workbook.

    Each section is anchored at the sheet's first indexable cell, or at
    ``A1`` with an empty preview when there is none.
    """

    def _describe(sheet) -> StructureSection:
        # Build the section record for a single worksheet.
        anchor = _first_indexable_cell(sheet)
        if anchor is None:
            anchor_ref = "A1"
        else:
            anchor_ref = anchor.coordinate
        locator = make_item_id(sheet.title, anchor_ref)
        if anchor is None:
            preview = ""
        else:
            preview = _display_text(anchor)[:120]
        details = {
            "sheet_name": sheet.title,
            "used_range": _format_range(_used_bounds(sheet)),
            "max_row": sheet.max_row,
            "max_column": sheet.max_column,
            "cell_count": _cell_count(sheet),
        }
        return StructureSection(
            locator=locator,
            section_type="worksheet",
            preview=preview,
            metadata=details,
        )

    book = _open_workbook(document_path)
    return tuple([_describe(sheet) for sheet in book.worksheets])
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def get_section(
    document_path: Path, locator: str, *, cell_range: str | None = None
) -> SectionPayload:
    """Return the worksheet section addressed by a cell ``locator``.

    Only the sheet-name part of the locator is used. The cells come from
    ``get_sheet_snapshot`` (optionally limited to ``cell_range``), and the
    preview is the first non-empty display value among them.
    """
    sheet_name = parse_item_id(locator)[0]
    snapshot = get_sheet_snapshot(document_path, sheet_name, cell_range=cell_range)

    first_text = ""
    for snap_cell in snapshot.cells:
        if snap_cell.display_value:
            first_text = snap_cell.display_value
            break

    section_cells = []
    for snap_cell in snapshot.cells:
        section_cells.append(
            XlsxSectionCell(
                locator=make_item_id(sheet_name, snap_cell.coordinate),
                coordinate=snap_cell.coordinate,
                row=snap_cell.row,
                column=snap_cell.column,
                display_value=snap_cell.display_value,
                formula=snap_cell.metadata.get("formula"),
                metadata=snap_cell.metadata,
            )
        )

    return SectionPayload(
        document=snapshot.document,
        locator=locator,
        section_type="worksheet",
        preview=first_text,
        metadata={**snapshot.metadata, "sheet_name": sheet_name},
        sheet_name=sheet_name,
        cells=tuple(section_cells),
    )
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def read_node(document_path: Path, locator: str) -> tuple[str, str, dict[str, object]]:
    """Read one cell and return ``("cell", display_text, metadata)``.

    The metadata dict carries ``sheet_name``, ``coordinate``, ``formula`` and
    ``raw_value``.
    """
    resolved = resolve_cell(document_path, locator)
    return (
        "cell",
        resolved.display_text,
        {
            "sheet_name": resolved.sheet_name,
            "coordinate": resolved.coordinate,
            "formula": resolved.formula,
            # Workbooks are opened with rich_text=True, so the stored value
            # may be a CellRichText. Flatten it the same way the other
            # metadata producers in this module do.
            "raw_value": _metadata_raw_value(resolved.raw_value),
        },
    )
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def write_node(
    document_path: Path, locator: str, value: str, output_path: Path | None = None
) -> Path:
    """Write ``value`` to the cell at ``locator`` by delegating to ``write_cell``."""
    saved_to = write_cell(document_path, locator, value, output_path)
    return saved_to
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def get_workbook_structure(document_path: Path) -> WorkbookStructure:
    """Summarise every worksheet of the workbook.

    Each summary holds the sheet position, its title, a preview (the first
    non-blank display text inside the used bounds, cut to 120 characters)
    and range metadata.
    """
    book = _open_workbook(document_path)
    summaries: list[WorksheetSummary] = []

    for index, sheet in enumerate(book.worksheets):
        bounds = _used_bounds(sheet)
        first_text = ""
        if bounds is not None:
            top, left, bottom, right = bounds
            # Lazily walk the used area; stop at the first non-blank text.
            stripped_texts = (
                _display_text(entry).strip()
                for sheet_row in sheet.iter_rows(
                    min_row=top,
                    max_row=bottom,
                    min_col=left,
                    max_col=right,
                )
                for entry in sheet_row
            )
            first_text = next((text for text in stripped_texts if text), "")[:120]

        details = {
            "used_range": _format_range(bounds),
            "max_row": sheet.max_row,
            "max_column": sheet.max_column,
        }
        summaries.append(
            WorksheetSummary(
                position=index,
                sheet_name=sheet.title,
                preview=first_text,
                metadata=details,
            )
        )

    return WorkbookStructure(
        document=_document_ref(document_path), sheets=tuple(summaries)
    )
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def get_sheet_snapshot(
    document_path: Path,
    sheet_name: str,
    *,
    cell_range: str | None = None,
    start_cell: str | None = None,
    row_count: int | None = None,
    column_count: int | None = None,
) -> SheetSnapshot:
    """Capture the cells of one worksheet inside the requested window.

    The window is computed by ``_snapshot_bounds``. When it yields ``None``
    the snapshot has no cells and zero row and column counts.

    Raises:
        TargetNotFoundError: if the worksheet does not exist.
    """
    book = _open_workbook(document_path)
    sheet = _resolve_worksheet(book, sheet_name)
    bounds = _snapshot_bounds(
        sheet,
        cell_range=cell_range,
        start_cell=start_cell,
        row_count=row_count,
        column_count=column_count,
    )

    if bounds is None:
        captured: tuple[SheetCell, ...] = ()
        height = 0
        width = 0
    else:
        top, left, bottom, right = bounds
        captured = tuple(
            SheetCell(
                coordinate=entry.coordinate,
                row=entry.row,
                column=entry.column,
                display_value=_display_text(entry),
                metadata={
                    "raw_value": _metadata_raw_value(entry.value),
                    "formula": _formula_text(entry),
                    "data_type": entry.data_type,
                },
            )
            for sheet_row in sheet.iter_rows(
                min_row=top,
                max_row=bottom,
                min_col=left,
                max_col=right,
            )
            for entry in sheet_row
        )
        height = bounds[2] - bounds[0] + 1
        width = bounds[3] - bounds[1] + 1

    return SheetSnapshot(
        document=_document_ref(document_path),
        sheet_name=sheet.title,
        cells=captured,
        metadata={
            "range": _format_range(bounds),
            "row_count": height,
            "column_count": width,
        },
    )
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def append_row(
    document_path: Path,
    sheet_name: str,
    *,
    values: list[object] | None = None,
    record: dict[str, object] | None = None,
    output_path: Path | None = None,
) -> tuple[Path, int, tuple[str, ...]]:
    """Append one row, given either positional ``values`` or a keyed ``record``.

    ``values`` are written left to right from column 1. ``record`` keys are
    matched against the mapping returned by ``_header_map``. Returns the
    saved path, the row number written and the coordinates that were set.

    Raises:
        InvalidArgumentsError: if not exactly one of ``values``/``record``
            is given, if a record is used on a sheet without headers, or if
            a record key is not a known header.
        TargetNotFoundError: if the worksheet does not exist.
    """
    supplied = [payload for payload in (values, record) if payload is not None]
    if len(supplied) != 1:
        raise InvalidArgumentsError(
            "append_row requires exactly one of values or record."
        )

    book = _open_workbook(document_path)
    sheet = _resolve_worksheet(book, sheet_name)
    new_row = _last_used_row(sheet) + 1
    touched: list[str] = []

    def _put(column_number: int, raw: object) -> None:
        # Store one coerced value in the new row and remember where it went.
        cell_ref = f"{get_column_letter(column_number)}{new_row}"
        sheet[cell_ref] = _coerce_write_value(raw)
        touched.append(cell_ref)

    if record is None:
        for position, raw in enumerate(values, start=1):
            _put(position, raw)
    else:
        headers = _header_map(sheet)
        if not headers:
            raise InvalidArgumentsError(
                "append_row record writes require an existing header row in the worksheet."
            )
        for field, raw in record.items():
            if field not in headers:
                raise InvalidArgumentsError(
                    f"Unknown worksheet header for append_row: {field}"
                )
            _put(headers[field], raw)

    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination, new_row, tuple(touched)
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def write_table(
    document_path: Path,
    sheet_name: str,
    *,
    rows: list[list[object]] | None = None,
    records: list[dict[str, object]] | None = None,
    column_mapping: dict[str, str] | None = None,
    output_path: Path | None = None,
) -> tuple[Path, int, int]:
    """Write several rows below the row reported by ``_last_used_row``.

    Exactly one of ``rows`` (positional values) or ``records`` (keyed values,
    placed through ``_resolve_record_mapping``) must be given. Returns the
    saved path plus the first and last row numbers of the written block.

    Raises:
        InvalidArgumentsError: if not exactly one of ``rows``/``records`` is
            given, or if a record field has no column mapping.
        TargetNotFoundError: if the worksheet does not exist.
    """
    supplied = [payload for payload in (rows, records) if payload is not None]
    if len(supplied) != 1:
        raise InvalidArgumentsError(
            "write_table requires exactly one of rows or records."
        )

    book = _open_workbook(document_path)
    sheet = _resolve_worksheet(book, sheet_name)
    first_row = _last_used_row(sheet) + 1

    if records is None:
        for offset, row_values in enumerate(rows):
            for position, raw in enumerate(row_values, start=1):
                coerced = _coerce_write_value(raw)
                sheet.cell(row=first_row + offset, column=position).value = coerced
        written = len(rows)
    else:
        field_columns = _resolve_record_mapping(sheet, column_mapping)
        for offset, entry in enumerate(records):
            for field, raw in entry.items():
                if field not in field_columns:
                    raise InvalidArgumentsError(
                        f"Unknown worksheet mapping for write_table field: {field}"
                    )
                coerced = _coerce_write_value(raw)
                sheet.cell(
                    row=first_row + offset,
                    column=field_columns[field],
                ).value = coerced
        written = len(records)

    last_row = first_row + written - 1
    destination = _target_path(document_path, output_path)
    book.save(destination)
    return destination, first_row, last_row
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def parse_item_id(item_id: str) -> tuple[str, str]:
    """Split a ``sheet:<name>!<coordinate>`` id into (sheet name, coordinate).

    The split happens at the last ``!``, and the coordinate is normalised by
    ``_normalize_coordinate``.

    Raises:
        InvalidArgumentsError: if the prefix, the ``!`` separator or the
            sheet name is missing, or the coordinate is invalid.
    """
    prefix = "sheet:"
    if not item_id.startswith(prefix):
        raise InvalidArgumentsError(f"Unsupported XLSX item id: {item_id}")

    remainder = item_id[len(prefix):]
    sheet_name, separator, coordinate = remainder.rpartition("!")
    if not separator:
        raise InvalidArgumentsError(f"Invalid XLSX item id: {item_id}")
    if not sheet_name:
        raise InvalidArgumentsError(f"Invalid XLSX sheet name in item id: {item_id}")

    return sheet_name, _normalize_coordinate(coordinate)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def make_item_id(sheet_name: str, coordinate: str) -> str:
    """Build the ``sheet:<name>!<coordinate>`` id for a cell."""
    normalized = _normalize_coordinate(coordinate)
    return f"sheet:{sheet_name}!{normalized}"
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def style_cell_inline(
    document_path: Path,
    locator: str,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Apply an inline style to a whole cell and save the workbook.

    Returns the saved path, the canonical locator, and a report dict with
    the ``cleared_fields`` and ``skipped_fields``.
    """
    book = _open_workbook(document_path)
    canonical = to_v2_locator(locator, file_type="xlsx")
    legacy_id = _legacy_item_id_from_v2(canonical)
    target = _resolve_cell(book, legacy_id)

    to_clear = _normalize_clear_fields(clear_fields, _INLINE_STYLE_FIELDS)
    not_applied = _apply_xlsx_inline_style(target, style, to_clear)

    destination = _target_path(document_path, output_path)
    book.save(destination)

    report = {"cleared_fields": to_clear, "skipped_fields": not_applied}
    return destination, canonical, report
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def read_cell_fragments(document_path: Path, locator: str) -> TextContainerSnapshot:
    """Read a cell's text as styled fragments.

    The cell is first checked by ``_ensure_partial_formatting_cell_supported``,
    which raises for cells that cannot be partially formatted.
    """
    book = _open_workbook(document_path)
    canonical = to_v2_locator(locator, file_type="xlsx")
    target = _resolve_cell(book, _legacy_item_id_from_v2(canonical))
    _ensure_partial_formatting_cell_supported(target, canonical)

    pieces = _read_xlsx_fragments(target)
    joined = fragment_text(pieces)
    location = {"sheet_name": target.parent.title, "coordinate": target.coordinate}
    return TextContainerSnapshot(
        locator=canonical,
        object_type="cell",
        text=joined,
        fragments=pieces,
        metadata=location,
    )
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def write_cell_fragments(
    document_path: Path,
    locator: str,
    fragments: list[InlineFragment] | tuple[InlineFragment, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, TextContainerSnapshot]:
    """Replace a cell's content with the given fragments and save.

    The fragments are normalised before writing. Returns the saved path, the
    canonical locator and a snapshot built from the normalised fragments.
    """
    book = _open_workbook(document_path)
    canonical = to_v2_locator(locator, file_type="xlsx")
    target = _resolve_cell(book, _legacy_item_id_from_v2(canonical))
    _ensure_partial_formatting_cell_supported(target, canonical)

    cleaned = normalize_fragments(fragments)
    _write_xlsx_fragments(target, cleaned)

    destination = _target_path(document_path, output_path)
    book.save(destination)

    joined = fragment_text(cleaned)
    location = {"sheet_name": target.parent.title, "coordinate": target.coordinate}
    written = TextContainerSnapshot(
        locator=canonical,
        object_type="cell",
        text=joined,
        fragments=cleaned,
        metadata=location,
    )
    return destination, canonical, written
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def style_cell_range(
    document_path: Path,
    locator: str,
    text_range: VisibleTextRange,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Style a character range inside a cell's text and save.

    Reads the cell's fragments, restyles the requested range, and writes the
    result back. Returns the saved path, the canonical locator and a report
    with the cleared fields, the range and the rewritten text.
    """
    before = read_cell_fragments(document_path, locator)
    to_clear = _normalize_clear_fields(clear_fields, _INLINE_STYLE_FIELDS)
    restyled = apply_style_to_range(
        before.fragments, text_range, style=style, clear_fields=to_clear
    )

    written = write_cell_fragments(
        document_path,
        locator,
        restyled,
        output_path=output_path,
    )
    destination, canonical, after = written

    span = {"start": text_range.start, "end": text_range.end}
    report = {
        "cleared_fields": to_clear,
        "range": span,
        "text": after.text,
    }
    return destination, canonical, report
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def style_cell_block(
    document_path: Path,
    locator: str,
    style: BlockStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Apply a block style to a cell and save the workbook.

    Returns the saved path, the canonical locator, and a report dict with
    the ``cleared_fields`` and ``skipped_fields``.
    """
    book = _open_workbook(document_path)
    canonical = to_v2_locator(locator, file_type="xlsx")
    legacy_id = _legacy_item_id_from_v2(canonical)
    target = _resolve_cell(book, legacy_id)

    to_clear = _normalize_clear_fields(clear_fields, _BLOCK_STYLE_FIELDS)
    not_applied = _apply_xlsx_block_style(target, style, to_clear)

    destination = _target_path(document_path, output_path)
    book.save(destination)

    report = {"cleared_fields": to_clear, "skipped_fields": not_applied}
    return destination, canonical, report
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def _open_workbook(document_path: Path):
    """Load the workbook at ``document_path`` with rich-text support enabled.

    Raises:
        RuntimeError: if openpyxl could not be imported.
    """
    if load_workbook is None:
        raise RuntimeError("openpyxl is required for XLSX operations.")
    location = str(document_path)
    return load_workbook(location, rich_text=True)
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _document_ref(document_path: Path) -> DocumentRef:
    """Build the ``DocumentRef`` for an XLSX file from its resolved path.

    The resolved POSIX path is the document id, and the file's ``st_mtime``
    is recorded as the modification time.
    """
    absolute = document_path.resolve()
    modified = absolute.stat().st_mtime
    return DocumentRef(
        document_id=absolute.as_posix(),
        path=absolute,
        file_type="xlsx",
        display_name=absolute.name,
        modified_time=modified,
    )
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def _resolve_cell(workbook, item_id: str):
    """Return the cell that ``item_id`` points at inside ``workbook``."""
    parsed = parse_item_id(item_id)
    sheet = _resolve_worksheet(workbook, parsed[0])
    return sheet[parsed[1]]
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def _resolve_worksheet(workbook, sheet_name: str):
    """Look up a worksheet by name.

    Raises:
        TargetNotFoundError: if the workbook lookup raises ``KeyError``.
    """
    try:
        sheet = workbook[sheet_name]
    except KeyError as missing:
        message = f"Worksheet {sheet_name!r} does not exist in the workbook."
        raise TargetNotFoundError(message) from missing
    return sheet
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _is_indexable_cell(cell) -> bool:
    """Tell whether a cell carries a formula or any value at all."""
    if _formula_text(cell) is not None:
        return True
    return cell.value is not None
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _formula_text(cell) -> str | None:
    """Return the cell's formula as text, or ``None`` if it has none.

    A cell counts as a formula cell when its ``data_type`` is ``"f"`` and
    its value is not ``None``.
    """
    if getattr(cell, "data_type", None) != "f":
        return None
    if cell.value is None:
        return None
    return str(cell.value)
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def _display_text(cell) -> str:
    """Return the text shown for a cell.

    That is the formula text when present, otherwise ``str(value)``, or
    ``""`` for an empty cell.
    """
    formula_text = _formula_text(cell)
    if formula_text is None:
        stored = cell.value
        if stored is None:
            return ""
        return str(stored)
    return formula_text
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _metadata_raw_value(value: object) -> object:
    """Flatten rich-text values to plain strings; pass everything else through."""
    rich_text_known = CellRichText is not None
    is_rich = rich_text_known and isinstance(value, CellRichText)
    return str(value) if is_rich else value
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def _coerce_value(value: str) -> object:
    """Convert numeric-looking text to ``int`` or ``float``.

    ``int`` is tried first, then ``float``. Text that neither accepts is
    returned unchanged. Text that ``float`` would turn into a non-finite
    number ("nan", "inf", "Infinity", overflowing literals such as "1e999")
    is also returned unchanged, so such words are stored as text rather
    than as NaN or infinity.
    """
    import math

    for converter in (int, float):
        try:
            converted = converter(value)
        except ValueError:
            continue
        # Only float() can produce NaN or infinity; keep those as text.
        if isinstance(converted, float) and not math.isfinite(converted):
            return value
        return converted
    return value
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def _coerce_write_value(value: object) -> object:
    """Coerce string inputs through ``_coerce_value``; leave other values untouched."""
    return _coerce_value(value) if isinstance(value, str) else value
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
def _normalize_coordinate(coordinate: str) -> str:
    """Return ``coordinate`` stripped and upper-cased after validating it.

    Validation delegates to openpyxl's ``coordinate_to_tuple``.

    Raises:
        InvalidArgumentsError: if the coordinate is empty, cannot be parsed,
            or names a row below 1.
        RuntimeError: if openpyxl could not be imported.
    """
    normalized = coordinate.strip().upper()
    if not normalized:
        raise InvalidArgumentsError("Cell coordinate cannot be empty.")
    if coordinate_to_tuple is None:
        raise RuntimeError("openpyxl is required for XLSX operations.")
    try:
        row_number, _column_number = coordinate_to_tuple(normalized)
    except (ValueError, KeyError) as exc:
        # Depending on the openpyxl version, a malformed column part surfaces
        # as KeyError (failed column lookup) rather than ValueError. Both
        # mean the coordinate is invalid.
        raise InvalidArgumentsError(
            f"Invalid XLSX cell coordinate: {coordinate}"
        ) from exc
    if row_number < 1:
        # "A0" parses, but worksheet rows start at 1.
        raise InvalidArgumentsError(
            f"Invalid XLSX cell coordinate: {coordinate}"
        )
    return normalized
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _context_text(entries: list[tuple[str, str]], *, exclude: str) -> str:
    """Join the non-empty texts of ``entries`` with ``" | "``.

    The entry whose coordinate equals ``exclude`` is left out.
    """
    kept: list[str] = []
    for cell_ref, shown in entries:
        if cell_ref == exclude:
            continue
        if not shown:
            continue
        kept.append(shown)
    return " | ".join(kept)
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def _target_path(document_path: Path, output_path: Path | None) -> Path:
|
|
791
|
+
return document_path if output_path is None else output_path
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _sheet_name_from_locator(locator: str) -> str:
    """Extract the sheet name from a worksheet locator.

    The locator is converted with ``to_v2_locator`` and parsed; it must have
    exactly three components of the form ``("xlsx", "sheet", <name>)``.

    Raises:
        InvalidArgumentsError: If the parsed locator has any other shape.
    """
    parts = parse_locator(to_v2_locator(locator, file_type="xlsx")).components
    is_sheet_locator = len(parts) == 3 and parts[:2] == ("xlsx", "sheet")
    if not is_sheet_locator:
        raise InvalidArgumentsError(f"Unsupported worksheet locator: {locator}")
    return parts[2]
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def _legacy_item_id_from_v2(locator: str) -> str:
    """Build an item id (via ``make_item_id``) from a four-part cell locator.

    The parsed locator must have the form ``("xlsx", "sheet", <a>, <b>)``; the
    last two components are passed to ``make_item_id`` in that order.

    Raises:
        InvalidArgumentsError: If the parsed locator has any other shape.
    """
    parts = parse_locator(locator).components
    is_cell_locator = len(parts) == 4 and parts[:2] == ("xlsx", "sheet")
    if not is_cell_locator:
        raise InvalidArgumentsError(f"XLSX cell locator required: {locator}")
    return make_item_id(parts[2], parts[3])
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def _ensure_partial_formatting_cell_supported(cell, locator: str) -> None:
    """Raise unless ``cell`` can take partial (rich-text) formatting.

    Checks run in this order:

    1. formula cells and boolean cells are rejected;
    2. a cell that is the first coordinate of a merged range is accepted
       immediately;
    3. any other cell inside a merged range is rejected;
    4. empty cells, rich-text cells and plain strings are accepted; every
       other value type is rejected.

    NOTE(review): step 2 returns before the value-type check in step 4, so a
    merged-range anchor holding e.g. a number is accepted -- confirm this is
    intended.

    Raises:
        TargetNotEditableError: If the cell is not supported; ``locator`` is
            used in the message.
    """
    if _formula_text(cell) is not None:
        raise TargetNotEditableError(
            f"{locator} does not support partial formatting for formula cells."
        )
    content = cell.value
    if isinstance(content, bool):
        raise TargetNotEditableError(
            f"{locator} does not support partial formatting for boolean cells."
        )

    coordinate = cell.coordinate
    merged_ranges = cell.parent.merged_cells.ranges
    anchors = {str(merged).split(":")[0] for merged in merged_ranges}
    if coordinate in anchors:
        return
    if any(coordinate in merged for merged in merged_ranges):
        raise TargetNotEditableError(
            f"{locator} does not support partial formatting for merged cells."
        )

    if content is None:
        return
    if CellRichText is not None and isinstance(content, CellRichText):
        return
    if isinstance(content, str):
        return
    raise TargetNotEditableError(
        f"{locator} does not support partial formatting for non-string cells."
    )
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def _read_xlsx_fragments(cell) -> tuple[InlineFragment, ...]:
|
|
838
|
+
value = cell.value
|
|
839
|
+
if value is None:
|
|
840
|
+
return ()
|
|
841
|
+
if CellRichText is not None and isinstance(value, CellRichText):
|
|
842
|
+
fragments: list[InlineFragment] = []
|
|
843
|
+
for part in value:
|
|
844
|
+
if isinstance(part, str):
|
|
845
|
+
fragments.append(InlineFragment(text=part, style=InlineStyle()))
|
|
846
|
+
continue
|
|
847
|
+
fragments.append(
|
|
848
|
+
InlineFragment(
|
|
849
|
+
text=part.text,
|
|
850
|
+
style=_inline_style_from_xlsx_font(part.font),
|
|
851
|
+
)
|
|
852
|
+
)
|
|
853
|
+
return normalize_fragments(fragments)
|
|
854
|
+
return (InlineFragment(text=str(value), style=InlineStyle()),)
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _write_xlsx_fragments(
    cell,
    fragments: list[InlineFragment] | tuple[InlineFragment, ...],
) -> None:
    """Store ``fragments`` in ``cell`` as a rich-text value.

    The fragments are first passed through ``normalize_fragments``. If nothing
    remains, the cell value becomes ``""``. Otherwise each fragment whose
    style attributes are all ``None`` is written as a plain string and every
    other fragment as a ``TextBlock`` carrying an inline font.

    Raises:
        RuntimeError: If fragments remain but ``CellRichText``, ``TextBlock``
            or ``InlineFont`` is unavailable (``None``).
    """
    pieces = normalize_fragments(fragments)
    if not pieces:
        cell.value = ""
        return
    if CellRichText is None or TextBlock is None or InlineFont is None:
        raise RuntimeError(
            "openpyxl rich-text support is required for XLSX partial formatting."
        )
    rich_parts: list[object] = [
        piece.text
        if all(setting is None for setting in piece.style.__dict__.values())
        else TextBlock(_xlsx_inline_font(piece.style), piece.text)
        for piece in pieces
    ]
    cell.value = CellRichText(*rich_parts)
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def _inline_style_from_xlsx_font(font) -> InlineStyle:
    """Translate a font object into an ``InlineStyle``.

    ``None`` yields a default ``InlineStyle``. Attributes are read
    defensively with ``getattr`` so both short (``b``, ``i``, ``u``,
    ``rFont``) and long (``bold``, ``italic``, ``underline``, ``name``)
    attribute names are honoured, the short form winning when present.
    The colour is reduced to its last six characters.
    """
    if font is None:
        return InlineStyle()

    # Colour: either a plain string or an object exposing ``rgb``.
    raw_color = getattr(font, "color", None)
    rgb_text = None
    if raw_color not in {None, ""}:
        if isinstance(raw_color, str):
            rgb_text = raw_color[-6:]
        elif getattr(raw_color, "rgb", None):
            rgb_text = str(raw_color.rgb)[-6:]

    # Underline: a set ``u`` is mapped to a bool; otherwise fall back to the
    # raw ``underline`` attribute (if any).
    raw_underline = getattr(font, "u", None)
    if raw_underline is None:
        underline_flag = getattr(font, "underline", None)
    else:
        underline_flag = str(raw_underline).lower() not in {"", "none"}

    return InlineStyle(
        bold=getattr(font, "b", getattr(font, "bold", None)),
        italic=getattr(font, "i", getattr(font, "italic", None)),
        underline=underline_flag,
        strike=getattr(font, "strike", None),
        font_name=getattr(font, "rFont", getattr(font, "name", None)),
        font_size=getattr(font, "sz", None),
        font_color=rgb_text,
    )
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def _xlsx_inline_font(style: InlineStyle):
    """Build an ``InlineFont`` from ``style``.

    A truthy ``style.underline`` becomes ``"single"``; otherwise no underline
    is set. A non-``None`` ``font_color`` is validated and normalized with
    ``_normalize_hex_color``.
    """
    rgb = style.font_color
    if rgb is not None:
        rgb = _normalize_hex_color(rgb)
    underline_kind = "single" if style.underline else None
    return InlineFont(
        b=style.bold,
        i=style.italic,
        strike=style.strike,
        rFont=style.font_name,
        sz=style.font_size,
        color=rgb,
        u=underline_kind,
    )
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
_INLINE_STYLE_FIELDS = frozenset(
|
|
918
|
+
{
|
|
919
|
+
"bold",
|
|
920
|
+
"italic",
|
|
921
|
+
"underline",
|
|
922
|
+
"strike",
|
|
923
|
+
"font_name",
|
|
924
|
+
"font_size",
|
|
925
|
+
"font_color",
|
|
926
|
+
"highlight",
|
|
927
|
+
}
|
|
928
|
+
)
|
|
929
|
+
_BLOCK_STYLE_FIELDS = frozenset(
|
|
930
|
+
{
|
|
931
|
+
"alignment",
|
|
932
|
+
"indent_level",
|
|
933
|
+
"left_indent",
|
|
934
|
+
"right_indent",
|
|
935
|
+
"spacing_before",
|
|
936
|
+
"spacing_after",
|
|
937
|
+
"line_spacing",
|
|
938
|
+
"wrap_text",
|
|
939
|
+
"vertical_alignment",
|
|
940
|
+
"fill_color",
|
|
941
|
+
"number_format",
|
|
942
|
+
}
|
|
943
|
+
)
|
|
944
|
+
_XLSX_ALIGNMENT_MAP = {
|
|
945
|
+
"left": "left",
|
|
946
|
+
"center": "center",
|
|
947
|
+
"right": "right",
|
|
948
|
+
"justify": "justify",
|
|
949
|
+
}
|
|
950
|
+
_XLSX_VERTICAL_ALIGNMENT_MAP = {
|
|
951
|
+
"top": "top",
|
|
952
|
+
"center": "center",
|
|
953
|
+
"bottom": "bottom",
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
def _normalize_clear_fields(
|
|
958
|
+
clear_fields: list[str] | tuple[str, ...],
|
|
959
|
+
allowed: frozenset[str],
|
|
960
|
+
) -> tuple[str, ...]:
|
|
961
|
+
normalized: list[str] = []
|
|
962
|
+
seen: set[str] = set()
|
|
963
|
+
for field_name in clear_fields:
|
|
964
|
+
if field_name not in allowed:
|
|
965
|
+
raise InvalidArgumentsError(
|
|
966
|
+
f"Unknown style field in clear_fields: {field_name}"
|
|
967
|
+
)
|
|
968
|
+
if field_name not in seen:
|
|
969
|
+
normalized.append(field_name)
|
|
970
|
+
seen.add(field_name)
|
|
971
|
+
return tuple(normalized)
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def _apply_xlsx_inline_style(
|
|
975
|
+
cell, style: InlineStyle, clear_fields: tuple[str, ...]
|
|
976
|
+
) -> list[str]:
|
|
977
|
+
font = copy(cell.font)
|
|
978
|
+
clear_set = set(clear_fields)
|
|
979
|
+
skipped_fields: list[str] = []
|
|
980
|
+
|
|
981
|
+
if "bold" in clear_set:
|
|
982
|
+
font.bold = None
|
|
983
|
+
elif style.bold is not None:
|
|
984
|
+
font.bold = style.bold
|
|
985
|
+
|
|
986
|
+
if "italic" in clear_set:
|
|
987
|
+
font.italic = None
|
|
988
|
+
elif style.italic is not None:
|
|
989
|
+
font.italic = style.italic
|
|
990
|
+
|
|
991
|
+
if "underline" in clear_set:
|
|
992
|
+
font.underline = None
|
|
993
|
+
elif style.underline is not None:
|
|
994
|
+
font.underline = "single" if style.underline else None
|
|
995
|
+
|
|
996
|
+
if "strike" in clear_set:
|
|
997
|
+
font.strike = None
|
|
998
|
+
elif style.strike is not None:
|
|
999
|
+
font.strike = style.strike
|
|
1000
|
+
|
|
1001
|
+
if "font_name" in clear_set:
|
|
1002
|
+
font.name = None
|
|
1003
|
+
elif style.font_name is not None:
|
|
1004
|
+
font.name = style.font_name
|
|
1005
|
+
|
|
1006
|
+
if "font_size" in clear_set:
|
|
1007
|
+
font.sz = None
|
|
1008
|
+
elif style.font_size is not None:
|
|
1009
|
+
font.sz = style.font_size
|
|
1010
|
+
|
|
1011
|
+
if "font_color" in clear_set:
|
|
1012
|
+
font.color = None
|
|
1013
|
+
elif style.font_color is not None:
|
|
1014
|
+
font.color = _normalize_hex_color(style.font_color)
|
|
1015
|
+
|
|
1016
|
+
if style.highlight is not None or "highlight" in clear_set:
|
|
1017
|
+
skipped_fields.append("highlight")
|
|
1018
|
+
|
|
1019
|
+
cell.font = font
|
|
1020
|
+
return skipped_fields
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
def _apply_xlsx_block_style(
|
|
1024
|
+
cell, style: BlockStyle, clear_fields: tuple[str, ...]
|
|
1025
|
+
) -> list[str]:
|
|
1026
|
+
alignment = copy(cell.alignment)
|
|
1027
|
+
clear_set = set(clear_fields)
|
|
1028
|
+
skipped_fields: list[str] = []
|
|
1029
|
+
|
|
1030
|
+
if "alignment" in clear_set:
|
|
1031
|
+
alignment.horizontal = None
|
|
1032
|
+
elif style.alignment is not None:
|
|
1033
|
+
alignment.horizontal = _xlsx_alignment_value(style.alignment)
|
|
1034
|
+
|
|
1035
|
+
if "wrap_text" in clear_set:
|
|
1036
|
+
alignment.wrap_text = None
|
|
1037
|
+
elif style.wrap_text is not None:
|
|
1038
|
+
alignment.wrap_text = style.wrap_text
|
|
1039
|
+
|
|
1040
|
+
if "vertical_alignment" in clear_set:
|
|
1041
|
+
alignment.vertical = None
|
|
1042
|
+
elif style.vertical_alignment is not None:
|
|
1043
|
+
alignment.vertical = _xlsx_vertical_alignment_value(style.vertical_alignment)
|
|
1044
|
+
|
|
1045
|
+
if "indent_level" in clear_set:
|
|
1046
|
+
alignment.indent = 0
|
|
1047
|
+
elif style.indent_level is not None:
|
|
1048
|
+
alignment.indent = style.indent_level
|
|
1049
|
+
|
|
1050
|
+
for field_name in (
|
|
1051
|
+
"left_indent",
|
|
1052
|
+
"right_indent",
|
|
1053
|
+
"spacing_before",
|
|
1054
|
+
"spacing_after",
|
|
1055
|
+
"line_spacing",
|
|
1056
|
+
):
|
|
1057
|
+
if getattr(style, field_name) is not None or field_name in clear_set:
|
|
1058
|
+
skipped_fields.append(field_name)
|
|
1059
|
+
|
|
1060
|
+
if "fill_color" in clear_set:
|
|
1061
|
+
cell.fill = PatternFill(fill_type=None)
|
|
1062
|
+
elif style.fill_color is not None:
|
|
1063
|
+
color = _normalize_hex_color(style.fill_color)
|
|
1064
|
+
cell.fill = PatternFill(fill_type="solid", start_color=color, end_color=color)
|
|
1065
|
+
|
|
1066
|
+
if "number_format" in clear_set:
|
|
1067
|
+
cell.number_format = "General"
|
|
1068
|
+
elif style.number_format is not None:
|
|
1069
|
+
cell.number_format = style.number_format
|
|
1070
|
+
|
|
1071
|
+
cell.alignment = alignment
|
|
1072
|
+
return skipped_fields
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
def _normalize_hex_color(value: str) -> str:
|
|
1076
|
+
normalized = value.strip().lstrip("#").upper()
|
|
1077
|
+
if len(normalized) != 6 or any(
|
|
1078
|
+
character not in "0123456789ABCDEF" for character in normalized
|
|
1079
|
+
):
|
|
1080
|
+
raise InvalidArgumentsError(f"Invalid RGB hex color: {value}")
|
|
1081
|
+
return normalized
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
def _xlsx_alignment_value(raw: str) -> str:
    """Map a horizontal alignment name (case/whitespace-insensitive) to its XLSX value.

    Raises:
        InvalidArgumentsError: If the name is not in ``_XLSX_ALIGNMENT_MAP``.
    """
    mapped = _XLSX_ALIGNMENT_MAP.get(raw.strip().lower())
    if mapped is None:
        raise InvalidArgumentsError(f"Unsupported XLSX alignment: {raw}")
    return mapped
|
|
1089
|
+
|
|
1090
|
+
|
|
1091
|
+
def _xlsx_vertical_alignment_value(raw: str) -> str:
    """Map a vertical alignment name (case/whitespace-insensitive) to its XLSX value.

    Raises:
        InvalidArgumentsError: If the name is not in
            ``_XLSX_VERTICAL_ALIGNMENT_MAP``.
    """
    mapped = _XLSX_VERTICAL_ALIGNMENT_MAP.get(raw.strip().lower())
    if mapped is None:
        raise InvalidArgumentsError(f"Unsupported XLSX vertical alignment: {raw}")
    return mapped
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
def _is_text_like_item(item: IndexedItem) -> bool:
|
|
1099
|
+
metadata = item.metadata
|
|
1100
|
+
display_text = str(metadata.get("display_text", item.content_text)).strip()
|
|
1101
|
+
if not display_text:
|
|
1102
|
+
return False
|
|
1103
|
+
|
|
1104
|
+
raw_value = metadata.get("raw_value")
|
|
1105
|
+
if metadata.get("formula") is not None:
|
|
1106
|
+
return False
|
|
1107
|
+
if isinstance(raw_value, bool):
|
|
1108
|
+
return False
|
|
1109
|
+
if isinstance(raw_value, (int, float)):
|
|
1110
|
+
return False
|
|
1111
|
+
if NUMERIC_TEXT_PATTERN.match(display_text):
|
|
1112
|
+
return False
|
|
1113
|
+
return any(character.isalpha() for character in display_text)
|
|
1114
|
+
|
|
1115
|
+
|
|
1116
|
+
def _row_number(coordinate: str) -> int:
    """Return the row component of ``coordinate``'s ``(row, column)`` sort key."""
    row_index, _column_index = _coordinate_sort_key(coordinate)
    return row_index
|
|
1119
|
+
|
|
1120
|
+
|
|
1121
|
+
def _coordinate_sort_key(coordinate: str) -> tuple[int, int]:
    """Return ``coordinate_to_tuple`` of the normalized coordinate.

    Callers in this module unpack the result as ``(row, column)``.

    Raises:
        InvalidArgumentsError: Propagated from ``_normalize_coordinate``.
        RuntimeError: If ``coordinate_to_tuple`` is unavailable (``None``).
    """
    cleaned = _normalize_coordinate(coordinate)
    if coordinate_to_tuple is not None:
        return coordinate_to_tuple(cleaned)
    raise RuntimeError("openpyxl is required for XLSX operations.")
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
def _representative_score(coordinate: str, item: IndexedItem) -> tuple[int, int, int]:
    """Return a ranking tuple for ``item``.

    The tuple is ``(alphabetic character count, text length, -column)``, so
    when compared, more letters, then longer text, then a smaller column
    number rank higher.
    """
    text = str(item.metadata.get("display_text", item.content_text))
    column_index = _coordinate_sort_key(coordinate)[1]
    letter_count = sum(map(str.isalpha, text))
    return (letter_count, len(text), -column_index)
|
|
1133
|
+
|
|
1134
|
+
|
|
1135
|
+
def _build_row_embedding_text(
|
|
1136
|
+
workbook_name: str,
|
|
1137
|
+
sheet_name: str,
|
|
1138
|
+
row_number: int,
|
|
1139
|
+
contributing_cells: tuple[XlsxRowEmbeddingCell, ...],
|
|
1140
|
+
) -> str:
|
|
1141
|
+
lines = [
|
|
1142
|
+
f"Workbook: {workbook_name}",
|
|
1143
|
+
f"Sheet: {sheet_name}",
|
|
1144
|
+
f"Row: {row_number}",
|
|
1145
|
+
"Cells:",
|
|
1146
|
+
]
|
|
1147
|
+
lines.extend(
|
|
1148
|
+
f"- {cell.coordinate}: {cell.display_text}" for cell in contributing_cells
|
|
1149
|
+
)
|
|
1150
|
+
return "\n".join(lines)
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def _used_bounds(worksheet) -> tuple[int, int, int, int] | None:
    """Return the bounding box of the worksheet's indexable cells.

    Cells are streamed from ``worksheet.iter_rows()`` and reduced in a single
    pass, so no intermediate list of cells is built.

    Returns:
        ``(min_row, min_col, max_row, max_col)``, or ``None`` when no cell
        satisfies ``_is_indexable_cell``.
    """
    min_row = min_col = max_row = max_col = None
    for row in worksheet.iter_rows():
        for cell in row:
            if not _is_indexable_cell(cell):
                continue
            cell_row, cell_col = cell.row, cell.column
            if min_row is None:
                # First indexable cell seeds all four bounds.
                min_row = max_row = cell_row
                min_col = max_col = cell_col
                continue
            min_row = min(min_row, cell_row)
            max_row = max(max_row, cell_row)
            min_col = min(min_col, cell_col)
            max_col = max(max_col, cell_col)
    if min_row is None:
        return None
    return (min_row, min_col, max_row, max_col)
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
def _snapshot_bounds(
    worksheet,
    *,
    cell_range: str | None,
    start_cell: str | None,
    row_count: int | None,
    column_count: int | None,
) -> tuple[int, int, int, int] | None:
    """Resolve the ``(min_row, min_col, max_row, max_col)`` area of a sheet snapshot.

    Exactly one of three modes applies:

    * ``cell_range`` given: its boundaries are used (window inputs must all
      be ``None``).
    * window inputs given: ``start_cell``, ``row_count`` and ``column_count``
      must all be present; the window starts at ``start_cell``.
    * nothing given: the worksheet's used bounds (possibly ``None``).

    Raises:
        InvalidArgumentsError: If range and window inputs are mixed, the
            window inputs are incomplete, a count is not positive, or
            ``cell_range`` is malformed.
        RuntimeError: If ``range_boundaries`` is unavailable (``None``).
    """
    window_inputs = (start_cell, row_count, column_count)
    has_window_input = any(value is not None for value in window_inputs)

    if cell_range is not None:
        if has_window_input:
            raise InvalidArgumentsError(
                "sheet snapshot range and window inputs are mutually exclusive."
            )
        if range_boundaries is None:
            raise RuntimeError("openpyxl is required for XLSX operations.")
        try:
            min_col, min_row, max_col, max_row = range_boundaries(cell_range)
        except ValueError as exc:
            # Surface malformed ranges as argument errors, matching how
            # single-cell coordinates are validated elsewhere in this module.
            raise InvalidArgumentsError(
                f"Invalid sheet snapshot range: {cell_range}"
            ) from exc
        # NOTE(review): for open-ended ranges (e.g. "A:C") range_boundaries may
        # return None for the missing rows/columns -- confirm callers cope.
        return (min_row, min_col, max_row, max_col)

    if has_window_input:
        if start_cell is None or row_count is None or column_count is None:
            raise InvalidArgumentsError(
                "sheet snapshot windows require start_cell, row_count, and column_count together."
            )
        if row_count < 1 or column_count < 1:
            # A zero or negative size would produce max < min bounds.
            raise InvalidArgumentsError(
                "sheet snapshot windows require positive row_count and column_count."
            )
        start_row, start_column = _coordinate_sort_key(start_cell)
        return (
            start_row,
            start_column,
            start_row + row_count - 1,
            start_column + column_count - 1,
        )

    return _used_bounds(worksheet)
|
|
1201
|
+
|
|
1202
|
+
|
|
1203
|
+
def _format_range(bounds: tuple[int, int, int, int] | None) -> str | None:
|
|
1204
|
+
if bounds is None or get_column_letter is None:
|
|
1205
|
+
return None
|
|
1206
|
+
min_row, min_col, max_row, max_col = bounds
|
|
1207
|
+
return (
|
|
1208
|
+
f"{get_column_letter(min_col)}{min_row}:{get_column_letter(max_col)}{max_row}"
|
|
1209
|
+
)
|
|
1210
|
+
|
|
1211
|
+
|
|
1212
|
+
def _last_used_row(worksheet) -> int:
    """Return the last row of the worksheet's used bounds, or ``0`` if there are none."""
    bounds = _used_bounds(worksheet)
    if bounds is None:
        return 0
    return bounds[2]
|
|
1215
|
+
|
|
1216
|
+
|
|
1217
|
+
def _header_map(worksheet) -> dict[str, int]:
|
|
1218
|
+
header_map: dict[str, int] = {}
|
|
1219
|
+
for cell in worksheet[1]:
|
|
1220
|
+
header = _display_text(cell).strip()
|
|
1221
|
+
if header:
|
|
1222
|
+
header_map[header] = cell.column
|
|
1223
|
+
return header_map
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def _resolve_record_mapping(
    worksheet, column_mapping: dict[str, str] | None
) -> dict[str, int]:
    """Resolve record field names to worksheet column numbers.

    Without a ``column_mapping`` the row-1 header map is returned as is.
    Otherwise each mapping target (stripped) is resolved first as a header
    text and, failing that, as a column letter reference such as ``"C"``.

    Raises:
        InvalidArgumentsError: If a target is neither a known header nor a
            valid column reference.
        RuntimeError: If ``coordinate_to_tuple`` is unavailable (``None``).
    """
    header_map = _header_map(worksheet)
    if column_mapping is None:
        return header_map

    resolved: dict[str, int] = {}
    for field_name, target in column_mapping.items():
        normalized_target = target.strip()
        if normalized_target in header_map:
            resolved[field_name] = header_map[normalized_target]
            continue
        if _is_column_reference(normalized_target):
            if coordinate_to_tuple is None:
                raise RuntimeError("openpyxl is required for XLSX operations.")
            try:
                _, column_number = coordinate_to_tuple(
                    f"{normalized_target.upper()}1"
                )
            except (KeyError, ValueError) as exc:
                # An alphabetic target that is not a real column (e.g. a
                # misspelled header like "Email") must be reported as an
                # argument error, not leak a lookup error.
                raise InvalidArgumentsError(
                    f"Unknown worksheet header in column_mapping: {target}"
                ) from exc
            resolved[field_name] = column_number
            continue
        raise InvalidArgumentsError(
            f"Unknown worksheet header in column_mapping: {target}"
        )
    return resolved
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
def _is_column_reference(value: str) -> bool:
|
|
1252
|
+
return bool(re.fullmatch(r"[A-Za-z]+", value))
|
|
1253
|
+
|
|
1254
|
+
|
|
1255
|
+
def _first_indexable_cell(worksheet):
|
|
1256
|
+
for row in worksheet.iter_rows():
|
|
1257
|
+
for cell in row:
|
|
1258
|
+
if _is_indexable_cell(cell):
|
|
1259
|
+
return cell
|
|
1260
|
+
return None
|
|
1261
|
+
|
|
1262
|
+
|
|
1263
|
+
def _cell_count(worksheet) -> int:
|
|
1264
|
+
return sum(
|
|
1265
|
+
1 for row in worksheet.iter_rows() for cell in row if _is_indexable_cell(cell)
|
|
1266
|
+
)
|