arcade-google-docs 4.0.0__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- arcade_google_docs/docmd.py +534 -0
- arcade_google_docs/enum.py +1 -0
- arcade_google_docs/models/document.py +953 -0
- arcade_google_docs/models/document_writables.py +735 -0
- arcade_google_docs/models/requests.py +1315 -0
- arcade_google_docs/tools/__init__.py +2 -0
- arcade_google_docs/tools/edit_agent/edit_agent.py +56 -0
- arcade_google_docs/tools/edit_agent/executor.py +103 -0
- arcade_google_docs/tools/edit_agent/models/planning.py +89 -0
- arcade_google_docs/tools/edit_agent/planner.py +130 -0
- arcade_google_docs/tools/edit_agent/progress_tracker.py +32 -0
- arcade_google_docs/tools/edit_agent/prompts.py +204 -0
- arcade_google_docs/tools/edit_agent/request_generator.py +150 -0
- arcade_google_docs/tools/edit_agent/utils.py +21 -0
- arcade_google_docs/tools/get.py +26 -0
- arcade_google_docs/tools/search.py +5 -1
- arcade_google_docs/tools/system_context.py +36 -0
- arcade_google_docs/who_am_i_util.py +86 -0
- {arcade_google_docs-4.0.0.dist-info → arcade_google_docs-4.2.0.dist-info}/METADATA +2 -1
- arcade_google_docs-4.2.0.dist-info/RECORD +30 -0
- arcade_google_docs-4.0.0.dist-info/RECORD +0 -16
- {arcade_google_docs-4.0.0.dist-info → arcade_google_docs-4.2.0.dist-info}/WHEEL +0 -0
arcade_google_docs/docmd.py
@@ -0,0 +1,534 @@
+"""
+DocMD models and helpers.
+
+This module defines a compact, index-aware DocMD representation for Google Docs
+documents, plus helpers to build DocMD from a Document and render/parse the
+string format expected for LLM consumption.
+
+Example DocMD as a string:
+@document_id: 1t9igNb2XSo_1FOkFXy3bI9bIQKvQgk_JUWpoomADkX4
+@revision_id: ALBJ4LtLdNr30MBQxnybwwss4gpEhWixhJrjhCy29BVprpmcjurkGCqQOPyW2w9RibJcFdvchJqJ5bd-V_-K4g
+
+[H1 1-18 HEADING_1 headingId=h.5wd8jf8y8o2n styles=italic:1-18,fontSize=23.0pt:1-18] Project Lightning
+[P1 19-51 PARAGRAPH styles=bold:19-51] Confidential - Internal Use Only
+[P2 52-73 PARAGRAPH styles=bold:52-56] Date: August 23, 2025
+[P3 74-104 PARAGRAPH styles=bold:74-86] Prepared by: [Bob F, Alice H.]
+[P4 105-115 PARAGRAPH styles=bold:105-113] Version: 1
+[P5 116-117 PARAGRAPH]
+[H2 118-138 HEADING_2 headingId=h.775rila7csjc styles=color=rgb(0.20784314,0.21568628,0.26666668):118-137,fontSize=17.0pt:118-137] 1. Executive Summary
+[P6 139-731 PARAGRAPH] Project Lightning is a strategic initiative aimed at developing an advanced, AI-enhanced platform capable of processing and responding to intricate data requests with near real-time efficiency. The primary goals of this project are to achieve a 60% reduction in latency, including meeting p95 latency of 100 ms and p99 latency of 170 ms (measured from ingress receive to first byte to client and encompassing authentication, routing, model call, and post-processing), enhance system reliability to 99.99%, and introduce adaptive scaling mechanisms to effectively manage fluctuating workloads.
+[P7 732-1373 PARAGRAPH] The team is targeting a 60% reduction in latency, aiming for a p95 latency of ≤100 ms and a p99 latency of ≤170 ms, covering end-to-end processes from ingress to client delivery. Reliability is set at 99.99%, equating to a 13-minute error budget per quarter. Adaptive scaling should handle up to 1 million concurrent sessions within five minutes, maintaining an error rate below 0.1%. The architecture involves a comprehensive flow from ingress to egress, with specific latency budgets and cold-start mitigations in place. Observability and data security are emphasized through advanced logging, monitoring, and stringent security protocols.
+[P8 1374-1374 PARAGRAPH]
+[H3 1375-1388 HEADING_2 headingId=h.54wtcb9egpyb styles=color=rgb(0.20784314,0.21568628,0.26666668):1375-1388,fontSize=17.0pt:1375-1388] 2. Objectives
+[UL1 1389-1469 UL_ITEM listId=kix.xqqibvne3ovq styles=bold:1389-1399,italic:1389-1399] Performance - We will optimize algorithms to enhance data processing efficiency.
+[UL2 1470-1558 UL_ITEM listId=kix.xqqibvne3ovq styles=bold:1470-1480,italic:1470-1480,italic:1483-1483] Reliability - Build redundancy into all critical components to ensure continuous uptime.
+[UL3 1559-1645 UL_ITEM listId=kix.xqqibvne3ovq styles=bold:1559-1569,italic:1559-1569] Scalability - Implement adaptive scaling to increase capacity from 10,000 to 1,000,000
+[UL4 1646-1746 UL_ITEM listId=kix.xqqibvne3ovq styles=bold:1646-1658,italic:1646-1658] Observability - Integrate advanced logging, monitoring, and alerting for proactive issue resolution.
+[P9 1747-1747 PARAGRAPH]
+[P10 1748-1748 PARAGRAPH]
+[TABLE1 1749-1831 TABLE rows=2 cols=4]
+[TR1 1750-1805 TABLE_ROW row=0]
+[TC1 1751-1764 TABLE_CELL row=0 col=0] Performance
+[TC2 1764-1777 TABLE_CELL row=0 col=1] Reliability
+[TC3 1777-1790 TABLE_CELL row=0 col=2] Scalability
+[TC4 1790-1805 TABLE_CELL row=0 col=3] Observability
+[TR2 1805-1830 TABLE_ROW row=1]
+[TC5 1806-1812 TABLE_CELL row=1 col=0] DONE
+[TC6 1812-1818 TABLE_CELL row=1 col=1] DONE
+[TC7 1818-1824 TABLE_CELL row=1 col=2] DONE
+[TC8 1824-1830 TABLE_CELL row=1 col=3] DONE
+[H4 1831-1831 HEADING_2 headingId=h.7h4kp6390h67]
+[H5 1832-1840 HEADING_2 headingId=h.cx3le136ppi styles=color=rgb(0.20784314,0.21568628,0.26666668):1832-1840,fontSize=17.0pt:1832-1840] 3. Scope
+[P11 1841-1850 PARAGRAPH styles=bold:1841-1850] In Scope:
+[UL5 1851-1880 UL_ITEM listId=kix.s0qmo6yotpsp] Backend architecture redesign
+[UL6 1881-1915 UL_ITEM listId=kix.s0qmo6yotpsp] AI model selection and fine-tuning
+[UL7 1916-1953 UL_ITEM listId=kix.s0qmo6yotpsp] Cloud-based deployment infrastructure
+[UL8 1954-1995 UL_ITEM listId=kix.s0qmo6yotpsp] Load testing and performance benchmarking
+[UL9 1996-2026 UL_ITEM listId=kix.s0qmo6yotpsp] Altering the document's style.
+[P12 2027-2040 PARAGRAPH styles=bold:2027-2040] Out of Scope:
+[UL10 2041-2062 UL_ITEM listId=kix.3940kw992qy5] End-user UI/UX design
+[UL11 2063-2087 UL_ITEM listId=kix.3940kw992qy5] Public launch activities
+[UL12 2088-2139 UL_ITEM listId=kix.3940kw992qy5] Purchasing billboards that show off our new feature
+"""  # noqa: E501
+
+from collections.abc import Callable
+from enum import Enum
+
+from pydantic import BaseModel
+
+from arcade_google_docs.models.document import (
+    Document,
+    NamedStyleType,
+    Paragraph,
+    Table,
+    TextStyle,
+)
+
+
+class DocMDBlockType(Enum):
+    PARAGRAPH = "PARAGRAPH"
+    HR = "HR"
+    HEADING_1 = "HEADING_1"
+    HEADING_2 = "HEADING_2"
+    HEADING_3 = "HEADING_3"
+    HEADING_4 = "HEADING_4"
+    HEADING_5 = "HEADING_5"
+    HEADING_6 = "HEADING_6"
+    UL_ITEM = "UL_ITEM"
+    OL_ITEM = "OL_ITEM"  # Reserved for future use if ordering is inferred
+    TABLE = "TABLE"
+    TABLE_ROW = "TABLE_ROW"
+    TABLE_CELL = "TABLE_CELL"
+
+
+class DocMDBlock(BaseModel):
+    id: str
+    startIndex: int
+    endIndex: int
+    type: str
+    attrs: dict[str, str] | None = None
+    text: str
+
+    def to_string(self) -> str:
+        """Return a string representation of the block."""
+        attr_parts: list[str] = []
+        if self.attrs:
+            for k, v in self.attrs.items():
+                if v is None:
+                    continue
+                # Skip tab attribute if it's empty (default tab)
+                if k == "tab" and not v:
+                    continue
+                attr_parts.append(f"{k}={v}")
+        attr_str = " ".join(attr_parts)
+        if attr_str:
+            return (
+                f"[{self.id} {self.startIndex}-{self.endIndex} {self.type} {attr_str}] {self.text}"
+            )
+        else:
+            return f"[{self.id} {self.startIndex}-{self.endIndex} {self.type}] {self.text}"
+
+
+class DocMD(BaseModel):
+    documentId: str
+    revisionId: str | None = None
+    tab: str = ""
+    blocks: list[DocMDBlock]
+
+    @property
+    def block_ids(self) -> list[str]:
+        return [b.id for b in self.blocks]
+
+    def get_block_from_id(self, block_id: str) -> DocMDBlock:
+        return self.blocks[self.block_ids.index(block_id)]
+
+    def to_string(self) -> str:
+        lines: list[str] = []
+        lines.append(f"@document_id: {self.documentId}")
+        if self.revisionId:
+            lines.append(f"@revision_id: {self.revisionId}")
+        if self.tab:  # Only include @tab line if tab is not empty
+            lines.append(f"@tab: {self.tab}")
+        lines.append("")
+        for b in self.blocks:
+            lines.append(b.to_string())
+        return "\n".join(lines)
+
+    def get_docmd_with_annotated_block(self, block_id: str) -> "DocMD":
+        """
+        Get a new DocMD with the provided block id's text
+        annotated with location tags on each word.
+        """
+
+        block = self.get_block_from_id(block_id)
+        text = block.text
+
+        annotated_text = ""
+        i = 0
+
+        while i < len(text):
+            if text[i].isspace():
+                # Preserve whitespace
+                annotated_text += text[i]
+                i += 1
+            else:
+                # We're at the start of a word, find where it ends
+                word_start = i
+                while i < len(text) and not text[i].isspace():
+                    i += 1
+
+                word = text[word_start:i]
+                word_length = len(word)
+                start_pos = block.startIndex + word_start
+                end_pos = start_pos + word_length
+
+                annotated_text += f"<@{start_pos}>{word}</@{end_pos}>"
+
+        annotated_block = DocMDBlock(
+            id=block.id,
+            startIndex=block.startIndex,
+            endIndex=block.endIndex,
+            type=block.type,
+            attrs=block.attrs,
+            text=annotated_text,
+        )
+
+        new_blocks = []
+        for b in self.blocks:
+            if b.id == block_id:
+                new_blocks.append(annotated_block)
+            else:
+                new_blocks.append(b)
+
+        return DocMD(
+            documentId=self.documentId,
+            revisionId=self.revisionId,
+            tab=self.tab,
+            blocks=new_blocks,
+        )
+
+
+def build_docmd(document: Document) -> DocMD:  # noqa: C901
+    doc_id = document.documentId or ""
+    rev = document.revisionId
+    tab = ""
+
+    counters: dict[str, int] = {
+        "H": 0,
+        "P": 0,
+        "UL": 0,
+        "OL": 0,
+        "HR": 0,
+        "TABLE": 0,
+        "TR": 0,
+        "TC": 0,
+    }
+
+    def next_id(prefix: str) -> str:
+        counters[prefix] += 1
+        return f"{prefix}{counters[prefix]}"
+
+    blocks: list[DocMDBlock] = []
+
+    for se in document.body.content or [] if document.body else []:
+        if se.paragraph is not None:
+            p: Paragraph = se.paragraph
+            named = p.paragraphStyle.namedStyleType if p.paragraphStyle else None
+            is_heading = named in (
+                NamedStyleType.HEADING_1,
+                NamedStyleType.HEADING_2,
+                NamedStyleType.HEADING_3,
+                NamedStyleType.HEADING_4,
+                NamedStyleType.HEADING_5,
+                NamedStyleType.HEADING_6,
+            )
+
+            block_type: str
+            block_id: str
+            attrs: dict[str, str] = {}
+            # Only add tab attribute if it's not empty (not the default tab)
+            if tab:
+                attrs["tab"] = tab
+
+            if is_heading:
+                level = int(str(named).split("_")[-1])
+                block_type = f"HEADING_{level}"
+                block_id = next_id("H")
+                if p.paragraphStyle and p.paragraphStyle.headingId:
+                    attrs["headingId"] = p.paragraphStyle.headingId
+            else:
+                if p.bullet and p.bullet.listId:
+                    block_type = DocMDBlockType.UL_ITEM.value
+                    block_id = next_id("UL")
+                    attrs["listId"] = p.bullet.listId
+                    if p.bullet.nestingLevel is not None:
+                        attrs["level"] = str(p.bullet.nestingLevel)
+                else:
+                    block_type = DocMDBlockType.PARAGRAPH.value
+                    block_id = next_id("P")
+
+            vis_start, vis_end, text, style_runs = _visible_span_and_text(p)
+            start = vis_start if vis_start is not None else se.startIndex or 0
+            end = vis_end if vis_end is not None else se.endIndex or start
+            text_line = (text or "").rstrip("\n")
+
+            # Add style ranges to attrs if any styles are present
+            if style_runs:
+                style_ranges = _format_style_ranges(style_runs, start)
+                if style_ranges:
+                    attrs["styles"] = style_ranges
+
+            blocks.append(
+                DocMDBlock(
+                    id=block_id,
+                    startIndex=start,
+                    endIndex=end,
+                    type=block_type,
+                    attrs=attrs if attrs else None,
+                    text=text_line,
+                )
+            )
+
+        elif se.table is not None:
+            _process_table(se.table, se, next_id, tab, blocks)
+
+    return DocMD(documentId=doc_id, revisionId=rev, tab=tab, blocks=blocks)
+
+
+def _process_table(  # type: ignore[no-untyped-def]
+    table: Table,
+    se,
+    next_id_func: Callable[[str], str],
+    tab: str,
+    blocks: list[DocMDBlock],
+) -> None:
+    """Process a table structural element and add table/row/cell blocks."""
+    table_id = next_id_func("TABLE")
+    table_attrs: dict[str, str] = {}
+
+    # Only add tab attribute if it's not empty (not the default tab)
+    if tab:
+        table_attrs["tab"] = tab
+
+    if table.rows is not None:
+        table_attrs["rows"] = str(table.rows)
+    if table.columns is not None:
+        table_attrs["cols"] = str(table.columns)
+
+    table_start = se.startIndex or 0
+    table_end = se.endIndex or table_start
+
+    blocks.append(
+        DocMDBlock(
+            id=table_id,
+            startIndex=table_start,
+            endIndex=table_end,
+            type=DocMDBlockType.TABLE.value,
+            attrs=table_attrs if table_attrs else None,
+            text="",
+        )
+    )
+
+    for row_idx, table_row in enumerate(table.tableRows or []):
+        _process_table_row(table_row, row_idx, table_start, next_id_func, tab, blocks)
+
+
+def _process_table_row(  # type: ignore[no-untyped-def]
+    table_row,
+    row_idx: int,
+    table_start: int,
+    next_id_func: Callable[[str], str],
+    tab: str,
+    blocks: list[DocMDBlock],
+) -> None:
+    """Process a table row and add row/cell blocks."""
+    row_id = next_id_func("TR")
+    row_attrs: dict[str, str] = {"row": str(row_idx)}
+
+    if tab:
+        row_attrs["tab"] = tab
+
+    row_start = table_row.startIndex or table_start
+    row_end = table_row.endIndex or row_start
+
+    blocks.append(
+        DocMDBlock(
+            id=row_id,
+            startIndex=row_start,
+            endIndex=row_end,
+            type=DocMDBlockType.TABLE_ROW.value,
+            attrs=row_attrs,
+            text="",
+        )
+    )
+
+    for cell_idx, table_cell in enumerate(table_row.tableCells or []):
+        _process_table_cell(table_cell, row_idx, cell_idx, row_start, next_id_func, tab, blocks)
+
+
+def _process_table_cell(  # type: ignore[no-untyped-def]  # noqa: C901
+    table_cell,
+    row_idx: int,
+    cell_idx: int,
+    row_start: int,
+    next_id_func: Callable[[str], str],
+    tab: str,
+    blocks: list[DocMDBlock],
+) -> None:
+    """Process a table cell and add cell block."""
+    cell_id = next_id_func("TC")
+    cell_attrs: dict[str, str] = {
+        "row": str(row_idx),
+        "col": str(cell_idx),
+    }
+
+    if tab:
+        cell_attrs["tab"] = tab
+
+    # Add cell styling attributes if present
+    if (
+        table_cell.tableCellStyle
+        and table_cell.tableCellStyle.rowSpan is not None
+        and table_cell.tableCellStyle.rowSpan > 1
+    ):
+        cell_attrs["rowspan"] = str(table_cell.tableCellStyle.rowSpan)
+
+    if (
+        table_cell.tableCellStyle
+        and table_cell.tableCellStyle.columnSpan is not None
+        and table_cell.tableCellStyle.columnSpan > 1
+    ):
+        cell_attrs["colspan"] = str(table_cell.tableCellStyle.columnSpan)
+
+    cell_start = table_cell.startIndex or row_start
+    cell_end = table_cell.endIndex or cell_start
+
+    # Extract text content from cell
+    cell_text_parts: list[str] = []
+    cell_style_runs: list[dict] = []
+    if table_cell.content:
+        for cell_se in table_cell.content:
+            if cell_se.paragraph:
+                _, _, text, style_runs = _visible_span_and_text(cell_se.paragraph)
+                if text:
+                    cell_text_parts.append(text.rstrip("\n"))
+                # Collect style runs for the cell (we'll merge them if needed)
+                if style_runs:
+                    cell_style_runs.extend(style_runs)
+
+    cell_text = " ".join(cell_text_parts)
+
+    # Add style ranges to cell attrs if any styles are present
+    if cell_style_runs:
+        style_ranges = _format_style_ranges(cell_style_runs, cell_start)
+        if style_ranges:
+            cell_attrs["styles"] = style_ranges
+
+    blocks.append(
+        DocMDBlock(
+            id=cell_id,
+            startIndex=cell_start,
+            endIndex=cell_end,
+            type=DocMDBlockType.TABLE_CELL.value,
+            attrs=cell_attrs,
+            text=cell_text,
+        )
+    )
+
+
+def _visible_span_and_text(p: Paragraph) -> tuple[int | None, int | None, str, list[dict]]:
+    """Extract visible text and style information from a paragraph.
+
+    Returns:
+        tuple of (start_index, end_index, text_content, style_runs)
+        where style_runs is a list of dicts with style info and relative positions
+    """
+    start: int | None = None
+    end: int | None = None
+    parts: list[str] = []
+    style_runs: list[dict] = []
+
+    for el in p.elements or []:
+        if el.textRun and el.textRun.content is not None:
+            if start is None and el.startIndex is not None:
+                start = el.startIndex
+            if el.endIndex is not None:
+                end = el.endIndex - 1
+
+            # Track style information for this text run
+            if el.textRun.textStyle and el.startIndex is not None and el.endIndex is not None:
+                style_info = _extract_text_style(el.textRun.textStyle)
+                if style_info:  # Only add if there are actual styles
+                    style_runs.append({
+                        "start": el.startIndex,
+                        "end": el.endIndex - 1,
+                        "styles": style_info,
+                    })
+
+            parts.append(el.textRun.content)
+        elif el.horizontalRule is not None:
+            if start is None and el.startIndex is not None:
+                start = el.startIndex
+            if el.endIndex is not None:
+                end = el.endIndex - 1
+
+    return start, end, "".join(parts), style_runs
+
+
+def _extract_text_style(text_style: TextStyle) -> dict[str, bool | str | int]:  # noqa: C901
+    """Extract relevant style properties from a TextStyle object."""
+    styles: dict[str, bool | str | int] = {}
+
+    # Boolean styles
+    if text_style.bold:
+        styles["bold"] = True
+    if text_style.italic:
+        styles["italic"] = True
+    if text_style.underline:
+        styles["underline"] = True
+    if text_style.strikethrough:
+        styles["strikethrough"] = True
+
+    # Color styles
+    if text_style.backgroundColor and text_style.backgroundColor.color:
+        rgb = text_style.backgroundColor.color.rgbColor
+        if rgb:
+            styles["bgColor"] = f"rgb({rgb.red or 0},{rgb.green or 0},{rgb.blue or 0})"
+
+    if text_style.foregroundColor and text_style.foregroundColor.color:
+        rgb = text_style.foregroundColor.color.rgbColor
+        if rgb:
+            styles["color"] = f"rgb({rgb.red or 0},{rgb.green or 0},{rgb.blue or 0})"
+
+    # Font size
+    if text_style.fontSize and text_style.fontSize.magnitude:
+        styles["fontSize"] = f"{text_style.fontSize.magnitude}pt"
+
+    # Font family
+    if text_style.weightedFontFamily:
+        if text_style.weightedFontFamily.fontFamily:
+            styles["font"] = text_style.weightedFontFamily.fontFamily
+        if text_style.weightedFontFamily.weight and text_style.weightedFontFamily.weight != 400:
+            styles["fontWeight"] = text_style.weightedFontFamily.weight
+
+    # Baseline offset
+    if text_style.baselineOffset and text_style.baselineOffset != "NONE":
+        styles["baseline"] = text_style.baselineOffset.lower()
+
+    return styles
+
+
+def _format_style_ranges(style_runs: list[dict], block_start: int) -> str:
+    """Format style runs into a compact string representation for attrs.
+
+    Args:
+        style_runs: List of style run dictionaries with absolute positions
+        block_start: The start index of the block (unused, kept for compatibility)
+
+    Returns:
+        Formatted string like "bold:10-21,italic:15-20,color=red:25-30"
+        with absolute document positions
+    """
+    if not style_runs:
+        return ""
+
+    # Consolidate overlapping ranges with the same styles
+    consolidated = []
+
+    for run in style_runs:
+        # Use absolute document positions (not relative to block)
+        abs_start = run["start"]
+        abs_end = run["end"]
+
+        for style_name, style_value in run["styles"].items():
+            # Format the style entry
+            if isinstance(style_value, bool):
+                # For boolean styles, just use the name
+                style_str = style_name
+            else:
+                # For valued styles, include the value
+                style_str = f"{style_name}={style_value}"
+
+            consolidated.append(f"{style_str}:{abs_start}-{abs_end}")
+
+    return ",".join(consolidated) if consolidated else ""