raw-docx 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- raw_docx/__info__.py +1 -1
- raw_docx/docx/docx_paragraph.py +174 -32
- raw_docx/raw_bookmark_end.py +19 -0
- raw_docx/raw_bookmark_start.py +20 -0
- raw_docx/raw_document.py +4 -0
- raw_docx/raw_docx.py +9 -64
- raw_docx/raw_paragraph.py +69 -11
- raw_docx/raw_run.py +36 -2
- raw_docx/raw_section.py +3 -0
- raw_docx/raw_simple_field.py +24 -0
- {raw_docx-0.9.0.dist-info → raw_docx-0.10.0.dist-info}/METADATA +7 -4
- raw_docx-0.10.0.dist-info/RECORD +24 -0
- raw_docx-0.9.0.dist-info/RECORD +0 -21
- {raw_docx-0.9.0.dist-info → raw_docx-0.10.0.dist-info}/WHEEL +0 -0
- {raw_docx-0.9.0.dist-info → raw_docx-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {raw_docx-0.9.0.dist-info → raw_docx-0.10.0.dist-info}/top_level.txt +0 -0
raw_docx/__info__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__package_version__ = "0.
|
1
|
+
__package_version__ = "0.10.0"
|
raw_docx/docx/docx_paragraph.py
CHANGED
@@ -1,51 +1,193 @@
|
|
1
|
+
import re
|
1
2
|
from docx.text.paragraph import Paragraph
|
2
3
|
from docx.styles.style import ParagraphStyle
|
3
4
|
from docx.text.run import Run
|
4
5
|
from simple_error_log import Errors
|
6
|
+
from simple_error_log.error_location import KlassMethodLocation
|
5
7
|
from raw_docx.raw_run import RawRun
|
8
|
+
from raw_docx.raw_simple_field import RawSimpleField
|
9
|
+
from raw_docx.raw_bookmark_start import RawBookmarkStart
|
10
|
+
from raw_docx.raw_bookmark_end import RawBookmarkEnd
|
11
|
+
from docx.oxml.text.run import CT_R
|
12
|
+
from lxml import etree
|
13
|
+
|
14
|
+
|
15
|
+
# WordprocessingML main namespace in Clark notation; every tag and attribute
# name below is qualified with it.
_W_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

# Element tags that are handled explicitly.
BOOKMARK_START = _W_NS + "bookmarkStart"
BOOKMARK_END = _W_NS + "bookmarkEnd"
FIELD_SIMPLE = _W_NS + "fldSimple"
FIELD_CHAR = _W_NS + "fldChar"
INSTRUCTION_TEXT = _W_NS + "instrText"

# Element tags that are recognised but skipped.
PARA_PROPERTIES = _W_NS + "pPr"
HYPERLINK = _W_NS + "hyperlink"
PROOF_ERROR = _W_NS + "proofErr"

# Skipped without a warning at paragraph level / inside a simple field.
ELEMENT_IGNORE_TAGS = [PARA_PROPERTIES, HYPERLINK, PROOF_ERROR]
SIMPLE_FIELD_IGNORE_TAGS = [BOOKMARK_START, BOOKMARK_END]

# Attribute names read from the elements above.
ID_ATTRIBUTE = _W_NS + "id"
NAME_ATTRIBUTE = _W_NS + "name"
FIELD_CHAR_TYPE_ATTRIBUTE = _W_NS + "fldCharType"
INSTR_ATTRIBUTE = _W_NS + "instr"

# Module name reported in error/warning locations.
MODULE = "raw_docx.docx.docx_paragraph"
|
6
42
|
|
7
43
|
|
8
44
|
def install():
    """Attach `extract_content` to the `Paragraph` class as a method."""
    Paragraph.extract_content = extract_content
|
46
|
+
|
47
|
+
|
48
|
+
def extract_content(
    paragraph: Paragraph, errors: Errors
) -> list[RawRun | RawBookmarkStart | RawBookmarkEnd | RawSimpleField]:
    """Return the raw items built from the children of the paragraph's XML element.

    Thin entry point: hands the paragraph's underlying element to
    `process_element`, with the paragraph itself as the parent.
    """
    root = paragraph._element
    return process_element(root, paragraph, errors)
|
52
|
+
|
53
|
+
|
54
|
+
def process_element(
    element: etree._Element | CT_R, parent: Paragraph, errors: Errors
) -> list[RawRun | RawBookmarkStart | RawBookmarkEnd | RawSimpleField]:
    """Convert the direct children of `element` into raw items.

    Runs become run dictionaries (via `build_run`), bookmark start/end
    elements become `RawBookmarkStart` / `RawBookmarkEnd`, and simple fields
    become `RawSimpleField`. Tags listed in `ELEMENT_IGNORE_TAGS` are skipped;
    anything else is reported as a warning. The collected list is passed
    through `create_runs` before being returned.

    Only one bookmark is tracked as open at a time: a new bookmark start
    discards a previous start that has not been closed, and a bookmark end is
    kept only when its id matches the open bookmark.
    """
    data = []
    open_bookmark_index = None  # position in `data` of the unclosed start
    open_bookmark_id = None
    for child in element:
        if isinstance(child, CT_R):
            data.append(build_run(child, parent, errors))
        elif child.tag == BOOKMARK_START:
            # Compare against None: index 0 is a valid position and must not
            # be mistaken for "no open bookmark".
            if open_bookmark_index is not None:
                del data[open_bookmark_index]
            start_id = child.get(ID_ATTRIBUTE)
            data.append(RawBookmarkStart(start_id, child.get(NAME_ATTRIBUTE)))
            open_bookmark_id = start_id
            open_bookmark_index = len(data) - 1
        elif child.tag == BOOKMARK_END:
            end_id = child.get(ID_ATTRIBUTE)
            if end_id == open_bookmark_id:
                data.append(RawBookmarkEnd(end_id, child.get(NAME_ATTRIBUTE)))
                open_bookmark_index = None
                open_bookmark_id = None
        elif child.tag == FIELD_SIMPLE:
            # The field's id is kept in its own variable so it does not
            # overwrite the id of a bookmark that is still open. A missing
            # instruction attribute is treated as an empty instruction.
            field_id = _extract_instruction_id(child.get(INSTR_ATTRIBUTE, ""))
            items = process_simple_field(child, parent, errors)
            data.append(RawSimpleField(field_id, create_runs(items, errors)))
        elif child.tag in ELEMENT_IGNORE_TAGS:
            pass
        else:
            errors.warning(
                f"Element other instance/tag detected:: '{child.tag}'",
                KlassMethodLocation(MODULE, "process_element"),
            )
    return create_runs(data, errors)
|
30
94
|
|
31
95
|
|
32
|
-
def
|
96
|
+
def process_simple_field(element, parent: Paragraph, errors: Errors) -> list[dict]:
    """Collect the runs directly inside a simple field element.

    Each run child is converted with `build_run`, so the result is a list of
    run dictionaries (not yet `RawRun` instances). Tags listed in
    `SIMPLE_FIELD_IGNORE_TAGS` are skipped; any other child is reported as a
    warning located at this function.
    """
    data = []
    for child in element:
        if isinstance(child, CT_R):
            data.append(build_run(child, parent, errors))
        elif child.tag in SIMPLE_FIELD_IGNORE_TAGS:
            pass
        else:
            errors.warning(
                f"Simple field other instance/tag detected: '{child.tag}'",
                KlassMethodLocation(MODULE, "process_simple_field"),
            )
    return data
|
111
|
+
|
112
|
+
|
113
|
+
def _extract_instruction_id(text: str) -> str:
|
114
|
+
pattern = r"_TN[A-F0-9]+"
|
115
|
+
match = re.search(pattern, text, re.IGNORECASE)
|
116
|
+
return match.group(0) if match else ""
|
117
|
+
|
118
|
+
|
119
|
+
def create_runs(data: list[dict], errors: Errors) -> list[RawRun]:
    """Turn run dictionaries into `RawRun` objects after merging like runs.

    The list is first passed through `_tidy_runs`. Dictionary entries are then
    converted to `RawRun`; entries that are not dictionaries are passed
    through unchanged, in their original position.
    """
    # Positional argument order used when constructing RawRun.
    argument_keys = (
        "text",
        "color",
        "highlight",
        "style",
        "superscript",
        "subscript",
        "field_char_type",
        "instruction",
    )
    return [
        RawRun(*(entry[key] for key in argument_keys))
        if isinstance(entry, dict)
        else entry
        for entry in _tidy_runs(data, errors)
    ]
|
139
|
+
|
140
|
+
|
141
|
+
def build_run(element, paragraph: Paragraph, errors: Errors) -> dict:
    """Describe a run element as a dictionary of its text and formatting.

    The dictionary carries the run text, colour, highlight, paragraph style
    name, sub/superscript flags, and two field-related values taken from the
    run's children: the field character type (from a field-character child)
    and the instruction id (from an instruction-text child). `keep` starts as
    True and is used by `_tidy_runs` to mark merged entries for removal.
    """
    run = Run(element, paragraph)
    field_char_type = None
    instruction = None
    for child in element:
        if child.tag == FIELD_CHAR:
            field_char_type = child.get(FIELD_CHAR_TYPE_ATTRIBUTE)
        elif child.tag == INSTRUCTION_TEXT:
            # An empty instruction element has text None; treat it as "".
            instruction = _extract_instruction_id(child.text or "")
    return {
        "text": run.text,
        "color": _get_run_color(paragraph.style, run, errors),
        "highlight": _get_highlight_color(run, errors),
        "keep": True,
        "style": paragraph.style.name,
        "subscript": run.font.subscript,
        "superscript": run.font.superscript,
        "field_char_type": field_char_type,
        "instruction": instruction,
    }
|
161
|
+
|
162
|
+
|
163
|
+
def _tidy_runs(data: list, errors: Errors) -> list:
    """Merge neighbouring run dictionaries that differ only in their text.

    When two adjacent entries are both dictionaries and are equal apart from
    `text` and `keep`, the earlier text is prepended to the later entry and
    the earlier entry is marked `keep = False`. Entries marked that way are
    dropped; non-dictionary entries are always retained. The input
    dictionaries are modified in place. If anything was merged the filtered
    list is tidied again.
    """
    merged_any = False
    previous = None
    for current in data:
        mergeable = (
            isinstance(previous, dict)
            and isinstance(current, dict)
            and _equal_with_ignore(current, previous, ["text", "keep"])
        )
        if mergeable:
            current["text"] = previous["text"] + current["text"]
            previous["keep"] = False
            merged_any = True
        previous = current
    survivors = [
        entry for entry in data if not isinstance(entry, dict) or entry["keep"]
    ]
    if merged_any:
        survivors = _tidy_runs(survivors, errors)
    return survivors
|
47
182
|
|
48
183
|
|
184
|
+
def _equal_with_ignore(a: dict, b: dict, ignore_keys: list) -> bool:
    """Return True when `a` and `b` are equal once `ignore_keys` are dropped."""

    def _without_ignored(source: dict) -> dict:
        return {
            key: value for key, value in source.items() if key not in ignore_keys
        }

    return _without_ignored(a) == _without_ignored(b)
|
189
|
+
|
190
|
+
|
49
191
|
def _get_run_color(paragraph: Paragraph, run: Run, errors: Errors) -> str | None:
|
50
192
|
paragraph_color = _get_font_colour(paragraph, errors)
|
51
193
|
font_color = _get_font_colour(run, errors)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class RawBookmarkEnd:
    """Marks the end of a bookmark; contributes no text and no HTML."""

    def __init__(self, id: str, name: str):
        self.id = id
        self.name = name

    @property
    def text(self) -> str:
        """Plain-text contribution of the marker (always empty)."""
        return ""

    def to_html(self) -> str:
        """HTML contribution of the marker (always empty)."""
        return ""

    def to_dict(self) -> dict:
        """Convert the bookmark end to a dictionary representation"""
        return dict(type="bookmark_end", id=self.id, name=self.name)
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Note: Bookmark is a target, the destination of any link
class RawBookmarkStart:
    """Marks the start of a bookmark; renders as an empty `<span>` with an id."""

    def __init__(self, id: str, name: str):
        self.id = id
        self.name = name

    @property
    def text(self) -> str:
        """Plain-text contribution of the marker (always empty)."""
        return ""

    def to_html(self) -> str:
        """Return an empty span whose id is the bookmark name."""
        return '<span id="{}"></span>'.format(self.name)

    def to_dict(self) -> dict:
        """Convert the bookmark start to a dictionary representation"""
        return dict(type="bookmark_start", id=self.id, name=self.name)
|
raw_docx/raw_document.py
CHANGED
raw_docx/raw_docx.py
CHANGED
@@ -76,7 +76,8 @@ class RawDocx:
|
|
76
76
|
self._process_table(block_item, target_section)
|
77
77
|
else:
|
78
78
|
self._errors.warning(
|
79
|
-
"Ignoring element",
|
79
|
+
f"Ignoring element {block_item}",
|
80
|
+
KlassMethodLocation(self.MODULE, "_process"),
|
80
81
|
)
|
81
82
|
raise ValueError
|
82
83
|
except Exception as e:
|
@@ -133,17 +134,13 @@ class RawDocx:
|
|
133
134
|
pass
|
134
135
|
else:
|
135
136
|
self._errors.warning(
|
136
|
-
f"Ignoring eTree element {
|
137
|
+
f"Ignoring eTree element '{child.tag}'",
|
137
138
|
KlassMethodLocation(self.MODULE, "_iter_block_items"),
|
138
139
|
)
|
139
140
|
|
140
141
|
else:
|
141
142
|
raise ValueError(f"something's not right with a child {type(child)}")
|
142
143
|
|
143
|
-
def _tree(self, node, tab=1):
|
144
|
-
for child in node:
|
145
|
-
self._tree(child, tab + 1)
|
146
|
-
|
147
144
|
def _process_table(self, table, target: RawSection | RawTableCell):
|
148
145
|
target_table = RawTable()
|
149
146
|
target.add(target_table)
|
@@ -168,7 +165,7 @@ class RawDocx:
|
|
168
165
|
pass
|
169
166
|
else:
|
170
167
|
self._errors.warning(
|
171
|
-
f"Ignoring eTree element {block_item.tag}",
|
168
|
+
f"Ignoring eTree element '{block_item.tag}'",
|
172
169
|
KlassMethodLocation(self.MODULE, "_process_table"),
|
173
170
|
)
|
174
171
|
else:
|
@@ -176,63 +173,10 @@ class RawDocx:
|
|
176
173
|
f"Something's not right with a child {type(block_item)}"
|
177
174
|
)
|
178
175
|
|
179
|
-
# def _process_table(self, table, target: RawSection | RawTableCell):
|
180
|
-
# target_table = RawTable()
|
181
|
-
# target.add(target_table)
|
182
|
-
# for r_index, row in enumerate(table.rows):
|
183
|
-
# target_row = RawTableRow()
|
184
|
-
# target_table.add(target_row)
|
185
|
-
# cells = row.cells
|
186
|
-
# for c_index, cell in enumerate(cells):
|
187
|
-
# if cell._tc is not None:
|
188
|
-
# x = cell._tc
|
189
|
-
# right = x.right
|
190
|
-
# left = x.left
|
191
|
-
# top = x.top
|
192
|
-
# try:
|
193
|
-
# # Bottom method seems to have a bug.
|
194
|
-
# # See https://github.com/python-openxml/python-docx/issues/1433
|
195
|
-
# bottom = x.bottom
|
196
|
-
# except Exception as e:
|
197
|
-
# self._errors.exception(
|
198
|
-
# f"Row span exception! {x.xml}",
|
199
|
-
# e,
|
200
|
-
# KlassMethodLocation(self.MODULE, "_process_table"),
|
201
|
-
# )
|
202
|
-
# bottom = top + 1
|
203
|
-
# h_span = right - left
|
204
|
-
# v_span = bottom - top
|
205
|
-
# else:
|
206
|
-
# h_span = 1
|
207
|
-
# v_span = 1
|
208
|
-
# if cell._tc is not None:
|
209
|
-
# first = r_index == cell._tc.top and c_index == cell._tc.left
|
210
|
-
# else:
|
211
|
-
# first = r_index == 0 and c_index == 0
|
212
|
-
# target_cell = RawTableCell(h_span, v_span, first)
|
213
|
-
# target_row.add(target_cell)
|
214
|
-
# for block_item in self._iter_block_items(cell):
|
215
|
-
# if isinstance(block_item, Paragraph):
|
216
|
-
# self._process_cell(block_item, target_cell)
|
217
|
-
# elif isinstance(block_item, Table):
|
218
|
-
# raise self.LogicError("Table within table detected")
|
219
|
-
# elif isinstance(block_item, etree._Element):
|
220
|
-
# if block_item.tag == CT_TcPr:
|
221
|
-
# pass
|
222
|
-
# else:
|
223
|
-
# self._errors.warning(
|
224
|
-
# f"Ignoring eTree element {block_item.tag}",
|
225
|
-
# KlassMethodLocation(self.MODULE, "_process_table"),
|
226
|
-
# )
|
227
|
-
# else:
|
228
|
-
# raise self.LogicError(
|
229
|
-
# f"Something's not right with a child {type(block_item)}"
|
230
|
-
# )
|
231
|
-
|
232
176
|
def _process_cell(self, paragraph, target_cell: RawTableCell):
|
233
177
|
if self._is_list(paragraph):
|
234
178
|
list_level = self.get_list_level(paragraph)
|
235
|
-
item = RawListItem(paragraph.
|
179
|
+
item = RawListItem(paragraph.extract_content(self._errors), list_level)
|
236
180
|
if target_cell.is_in_list():
|
237
181
|
list = target_cell.current_list()
|
238
182
|
else:
|
@@ -240,7 +184,7 @@ class RawDocx:
|
|
240
184
|
target_cell.add(list)
|
241
185
|
list.add(item)
|
242
186
|
else:
|
243
|
-
target_paragraph = RawParagraph(paragraph.
|
187
|
+
target_paragraph = RawParagraph(paragraph.extract_content(self._errors))
|
244
188
|
target_cell.add(target_paragraph)
|
245
189
|
|
246
190
|
def _process_paragraph(
|
@@ -252,7 +196,7 @@ class RawDocx:
|
|
252
196
|
self.target_document.add(target_section)
|
253
197
|
elif self._is_list(paragraph):
|
254
198
|
list_level = self.get_list_level(paragraph)
|
255
|
-
item = RawListItem(paragraph.
|
199
|
+
item = RawListItem(paragraph.extract_content(self._errors), list_level)
|
256
200
|
if target_section.is_in_list():
|
257
201
|
list = target_section.current_list()
|
258
202
|
else:
|
@@ -265,7 +209,8 @@ class RawDocx:
|
|
265
209
|
target_image = RawImage(image_rels[rId], self._errors)
|
266
210
|
target_section.add(target_image)
|
267
211
|
else:
|
268
|
-
|
212
|
+
# print("===== Raw Para =====")
|
213
|
+
target_paragraph = RawParagraph(paragraph.extract_content(self._errors))
|
269
214
|
target_section.add(target_paragraph)
|
270
215
|
|
271
216
|
def get_list_level(self, paragraph):
|
raw_docx/raw_paragraph.py
CHANGED
@@ -1,16 +1,13 @@
|
|
1
1
|
from .raw_run import RawRun
|
2
|
+
from .raw_bookmark_start import RawBookmarkStart
|
3
|
+
from .raw_bookmark_end import RawBookmarkEnd
|
2
4
|
|
3
5
|
|
4
6
|
class RawParagraph:
|
5
|
-
def __init__(self,
|
6
|
-
self.
|
7
|
+
def __init__(self, items: list[RawRun | RawBookmarkStart | RawBookmarkEnd]):
    """Store the paragraph's items and derive its plain text from them."""
    self.klasses = []
    self.items = items
    # Computed once here from the items; see _item_text.
    self.text = self._item_text()
|
8
|
-
self.text = self._run_text()
|
9
|
-
|
10
|
-
def to_html(self) -> str:
|
11
|
-
klass_list = " ".join(self.klasses)
|
12
|
-
open_tag = f'<p class="{klass_list}">' if self.klasses else "<p>"
|
13
|
-
return f"{open_tag}{self.text}</p>"
|
14
11
|
|
15
12
|
def find(self, text: str) -> bool:
    """Return True when `text` occurs anywhere in the paragraph text."""
    return text in self.text
|
@@ -18,6 +15,49 @@ class RawParagraph:
|
|
18
15
|
def find_at_start(self, text: str) -> bool:
    """Return True when the paragraph text starts with `text`, ignoring case."""
    haystack = self.text.upper()
    return haystack.startswith(text.upper())
|
20
17
|
|
18
|
+
def to_html(self) -> str:
    """Render the paragraph as a `<p>` element.

    Items are rendered in order. Field character runs drive cross-reference
    anchors and bookmark spans:

    - On a "begin" or "separate" field character, if the next run carries an
      instruction id, an `<a class="raw-docx-cross-ref">` linking to that id
      is opened.
    - On "separate", if no anchor was opened and a bookmark is active, a
      `<span class="raw-docx-bookmark">` with the bookmark name is opened.
    - On "end", an open anchor is closed; otherwise an open span is closed
      and the bookmark state is reset.

    A closing `</span>` is only written when a span was actually opened, and
    any anchor or span still open at the end of the paragraph is closed so
    the markup stays balanced.
    """
    klass_list = " ".join(self.klasses)
    open_tag = f'<p class="{klass_list}">' if self.klasses else "<p>"
    close_tag = "</p>"
    parts = []
    in_anchor = False  # an <a> has been written and not yet closed
    span_open = False  # a bookmark <span> has been written and not yet closed
    in_bookmark = False
    bookmark_id = None
    for index, item in enumerate(self.items):
        if isinstance(item, RawRun):
            kind = item.field_char_type
            if kind in ("begin", "separate"):
                next_item = self._next_run_item(index)
                if next_item is not None:
                    if next_item.instruction:
                        parts.append(
                            f'<a class="raw-docx-cross-ref" href="#{next_item.instruction}">'
                        )
                        in_anchor = True
                    elif kind == "separate" and in_bookmark:
                        parts.append(
                            f'<span class="raw-docx-bookmark" id="{bookmark_id}">'
                        )
                        span_open = True
            elif kind == "end" and in_anchor:
                parts.append("</a>")
                in_anchor = False
            elif kind == "end" and (in_bookmark or span_open):
                # Only close a span that was really opened; the bookmark
                # state is reset either way (BookmarkEnd resets it too).
                if span_open:
                    parts.append("</span>")
                    span_open = False
                in_bookmark = False
                bookmark_id = None
            else:
                parts.append(item.to_html())
        elif isinstance(item, RawBookmarkStart):
            in_bookmark = True
            bookmark_id = item.name
        elif isinstance(item, RawBookmarkEnd):
            in_bookmark = False  # Will also be caught by field_char_type = "end"
            bookmark_id = None
        else:
            parts.append(item.to_html())
    # Balance anything left open (e.g. a field with no "end" character).
    if span_open:
        parts.append("</span>")
    if in_anchor:
        parts.append("</a>")
    body = "".join(parts)
    return f"{open_tag}{body}{close_tag}"
|
60
|
+
|
21
61
|
def add_class(self, klass) -> None:
    """Append `klass` to the paragraph's list of classes."""
    classes = self.klasses
    classes.append(klass)
|
23
63
|
|
@@ -26,7 +66,7 @@ class RawParagraph:
|
|
26
66
|
return {
|
27
67
|
"type": "paragraph",
|
28
68
|
"text": self.text,
|
29
|
-
"
|
69
|
+
"items": [item.to_dict() for item in self.items],
|
30
70
|
"classes": self.klasses,
|
31
71
|
}
|
32
72
|
|
@@ -34,5 +74,23 @@ class RawParagraph:
|
|
34
74
|
new_str = f'<span class="{klass}">{text}</span>'
|
35
75
|
self.text = new_str + self.text[len(text) :]
|
36
76
|
|
37
|
-
def
|
38
|
-
|
77
|
+
def _next_run_item(self, start: int) -> RawRun | None:
    """Return the first `RawRun` positioned after index `start`, or None."""
    later_positions = range(start + 1, len(self.items))
    return next(
        (
            self.items[position]
            for position in later_positions
            if isinstance(self.items[position], RawRun)
        ),
        None,
    )
|
82
|
+
|
83
|
+
def _item_text(self) -> str:
    """Build the paragraph's plain text from its items.

    Text of non-run items is always included. For runs, a "separate" field
    character starts a stretch whose run text is left out, and the following
    "end" field character finishes it; the field-character runs themselves
    contribute nothing in those two cases.
    """
    pieces = []
    skipping = False
    for item in self.items:
        if not isinstance(item, RawRun):
            pieces.append(item.text)
            continue
        kind = item.field_char_type
        if kind == "separate":
            skipping = True
        elif skipping and kind == "end":
            skipping = False
        elif not skipping:
            pieces.append(item.text)
    return "".join(pieces)
|
raw_docx/raw_run.py
CHANGED
@@ -1,15 +1,49 @@
|
|
1
1
|
class RawRun:
|
2
|
-
def __init__(
|
3
|
-
self
|
2
|
+
def __init__(
|
3
|
+
self,
|
4
|
+
text: str,
|
5
|
+
color: str | None,
|
6
|
+
highlight: str | None,
|
7
|
+
style: str,
|
8
|
+
superscript: bool,
|
9
|
+
subscript: bool,
|
10
|
+
field_char_type: str,
|
11
|
+
instruction: str,
|
12
|
+
):
|
13
|
+
self._text = text
|
4
14
|
self.color = color
|
5
15
|
self.highlight = highlight
|
6
16
|
self.style = style
|
17
|
+
self.subscript = subscript
|
18
|
+
self.superscript = superscript
|
19
|
+
self.field_char_type = field_char_type
|
20
|
+
self.instruction = instruction
|
21
|
+
|
22
|
+
@property
|
23
|
+
def text(self) -> str:
|
24
|
+
return "" if self.subscript or self.superscript else self._text
|
25
|
+
|
26
|
+
def to_html(self) -> str:
|
27
|
+
# Note: no support for colours as yet
|
28
|
+
if self.field_char_type:
|
29
|
+
return ""
|
30
|
+
elif self.subscript:
|
31
|
+
return f"<sub>{self.text}</sub>" if self.text else ""
|
32
|
+
elif self.superscript:
|
33
|
+
return f"<sup>{self.text}</sup>" if self.text else ""
|
34
|
+
else:
|
35
|
+
return f"{self.text}"
|
7
36
|
|
8
37
|
def to_dict(self) -> dict:
|
9
38
|
"""Convert the instace to a dictionary representation"""
|
10
39
|
return {
|
40
|
+
"type": "run",
|
11
41
|
"text": self.text,
|
12
42
|
"color": self.color,
|
13
43
|
"highlight": self.highlight,
|
14
44
|
"style": self.style,
|
45
|
+
"superscript": self.superscript,
|
46
|
+
"subscript": self.subscript,
|
47
|
+
"field_char_type": self.field_char_type,
|
48
|
+
"instruction": self.instruction,
|
15
49
|
}
|
raw_docx/raw_section.py
CHANGED
@@ -94,6 +94,9 @@ class RawSection:
|
|
94
94
|
def next(self, index: int):
    """Return the item after position `index`, or None when there is none."""
    following = index + 1
    if following < len(self.items):
        return self.items[following]
    return None
|
96
96
|
|
97
|
+
def index(self, item: RawParagraph | RawList | RawTable | RawImage) -> int | None:
    """Return the position of `item` in the section, or None if absent.

    Items are compared by identity, not equality.
    """
    for position, candidate in enumerate(self.items):
        if candidate is item:
            return position
    return None
|
99
|
+
|
97
100
|
def next_paragraph(self, start_index: int) -> RawParagraph:
|
98
101
|
for index, item in enumerate(self.items):
|
99
102
|
if index >= start_index:
|
@@ -0,0 +1,24 @@
|
|
1
|
+
from .raw_run import RawRun
|
2
|
+
|
3
|
+
|
4
|
+
class RawSimpleField:
    """A simple field: an id plus the runs it contains.

    Rendered as a cross-reference anchor that links to `#<id>`.
    """

    def __init__(self, id: str, items: list[RawRun]):
        self.id = id
        self.items = items
        # Computed once here from the items; see _item_text.
        self.text = self._item_text()

    def to_html(self) -> str:
        """Return the items' HTML wrapped in a cross-reference anchor."""
        opening = f'<a class="raw-docx-cross-ref" href="#{self.id}">'
        inner = "".join(item.to_html() for item in self.items)
        return opening + inner + "</a>"

    def to_dict(self) -> dict:
        """Convert the simple field to a dictionary representation"""
        return dict(
            type="simple_field",
            id=self.id,
            text=self.text,
            items=[item.to_dict() for item in self.items],
        )

    def _item_text(self) -> str:
        """Concatenate the text of every item."""
        return "".join(item.text for item in self.items)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: raw_docx
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.10.0
|
4
4
|
Summary: A package for processing and analyzing raw document formats
|
5
5
|
Home-page: https://github.com/daveih/raw_docx
|
6
6
|
Author: Dave Iberson-Hurst
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
17
17
|
Requires-Python: >=3.8
|
18
18
|
Description-Content-Type: text/markdown
|
19
19
|
License-File: LICENSE
|
20
|
-
Requires-Dist: python-docx
|
20
|
+
Requires-Dist: python-docx>=1.1.2
|
21
21
|
Requires-Dist: simple_error_log>=0.6.0
|
22
22
|
Dynamic: author
|
23
23
|
Dynamic: classifier
|
@@ -33,9 +33,12 @@ Dynamic: summary
|
|
33
33
|
|
34
34
|
Simple package to build on top of python-docx to assist in the handling of word documents
|
35
35
|
|
36
|
-
# Build
|
36
|
+
# Build Package
|
37
37
|
|
38
|
-
Build
|
38
|
+
Build steps for deployment to pypi.org
|
39
39
|
|
40
|
+
- Run `pytest`, ensure coverage and all tests pass
|
41
|
+
- Run `ruff format`
|
42
|
+
- Run `ruff check`, ensure no errors
|
40
43
|
- Build with `python3 -m build --sdist --wheel`
|
41
44
|
- Upload to pypi.org using `twine upload dist/*`
|
@@ -0,0 +1,24 @@
|
|
1
|
+
raw_docx/__info__.py,sha256=h9PyxJrMiLzLjjHyl_X5nf8x8m1g2dhpuTyPcSoFF4Y,31
|
2
|
+
raw_docx/__init__.py,sha256=FE5cpoCK1EVhpz3LiOOs43l027PcuJN5RljdW0UWON0,591
|
3
|
+
raw_docx/raw_bookmark_end.py,sha256=D9SpnM3HKOvICUn7HJEEkph6v782udUQD5e6KceOutM,441
|
4
|
+
raw_docx/raw_bookmark_start.py,sha256=_2H3DVaPQtn2Bu8upFd0pfqMtMNOE6LylIkq-6H9EpA,533
|
5
|
+
raw_docx/raw_document.py,sha256=Lq28CE3f-S_gfVxMC6FpykTKLr1Tviqyx3WzfHOAOPo,2474
|
6
|
+
raw_docx/raw_docx.py,sha256=mXK2YB0JPTkf3GsYMxN4r3CLVEQop399pjSvPD6YGTc,11247
|
7
|
+
raw_docx/raw_image.py,sha256=IUUETwW73-guaa_v-cHpfw0_z69u9wfvEk7adm9hHJQ,1506
|
8
|
+
raw_docx/raw_list.py,sha256=bhssQX_oVf8uBmUbcrCIzIJ8pCvdEtdHOAQBNH0EEQQ,2282
|
9
|
+
raw_docx/raw_list_item.py,sha256=I65FDqU4YE6TJ615qH1GYDcXTPKnlk173xXPr3MCMcQ,599
|
10
|
+
raw_docx/raw_paragraph.py,sha256=rXBTeiBg-eIEK0NEFhUbGO2I_N-NsyRX1dGPZoaGYFg,3841
|
11
|
+
raw_docx/raw_run.py,sha256=Kl1HmZ5R_z0S9R3u5RC_e6HcfcPYMidzSb9Q6FN1TXI,1492
|
12
|
+
raw_docx/raw_section.py,sha256=4WNKveU4wDCCwLrB797woPLBzj2PCnqU3gV0pNRQeG0,4108
|
13
|
+
raw_docx/raw_simple_field.py,sha256=jPcZhqzN6F3uCb6eXEWBkKOBBjoxw4so6EX9a00wjo4,711
|
14
|
+
raw_docx/raw_table.py,sha256=Sc0vV7g-Gonmo_EbdWyYmQy_rvgf4A-PYLuIrX_FTfc,1622
|
15
|
+
raw_docx/raw_table_cell.py,sha256=3gKm3m5JlkFVuEy9EHVhC2viNDUKfggny_bblY1TC48,1967
|
16
|
+
raw_docx/raw_table_row.py,sha256=m8SoLyVlKLjd_Vqa_U79A2wi8Wout8spgyusqJm79Kc,1297
|
17
|
+
raw_docx/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
raw_docx/docx/docx_paragraph.py,sha256=A1bBXWt2qVWAihX1dvYuWXV3QmDSuijWv2Of_qw6IMk,8185
|
19
|
+
raw_docx/docx/docx_table.py,sha256=dG8jjkOnxGwN-6KtqdoYq8znO2XLYuIYhjbr4ZjIs2Y,5404
|
20
|
+
raw_docx-0.10.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
21
|
+
raw_docx-0.10.0.dist-info/METADATA,sha256=sBCucC0z6oMmrfj7Mxppul2KNbu6Q6tyvFhb8WBIwl4,1367
|
22
|
+
raw_docx-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
23
|
+
raw_docx-0.10.0.dist-info/top_level.txt,sha256=Xl3dspPM9DBVj8clfdkHG7N4nNjNXeUmB4HcXAwOe60,9
|
24
|
+
raw_docx-0.10.0.dist-info/RECORD,,
|
raw_docx-0.9.0.dist-info/RECORD
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
raw_docx/__info__.py,sha256=XoLgeFOZMbpB10DzZCc-wDTTRgx_zTVO3xTtwZnF5Dw,30
|
2
|
-
raw_docx/__init__.py,sha256=FE5cpoCK1EVhpz3LiOOs43l027PcuJN5RljdW0UWON0,591
|
3
|
-
raw_docx/raw_document.py,sha256=hUrnf6QZs9-yysnz1UmYZCYvhqdyPi3v2i-t5mu5KsI,2340
|
4
|
-
raw_docx/raw_docx.py,sha256=bp4No6XbnwPoWkNsnutbxARgV5RzgNa7jalr8IMjUbQ,13843
|
5
|
-
raw_docx/raw_image.py,sha256=IUUETwW73-guaa_v-cHpfw0_z69u9wfvEk7adm9hHJQ,1506
|
6
|
-
raw_docx/raw_list.py,sha256=bhssQX_oVf8uBmUbcrCIzIJ8pCvdEtdHOAQBNH0EEQQ,2282
|
7
|
-
raw_docx/raw_list_item.py,sha256=I65FDqU4YE6TJ615qH1GYDcXTPKnlk173xXPr3MCMcQ,599
|
8
|
-
raw_docx/raw_paragraph.py,sha256=GDEU89MFVgzD52t0e7Tau8tm6V0uTtaHMawfpU2RbT0,1209
|
9
|
-
raw_docx/raw_run.py,sha256=0PJHiZIm1QclZfjdsrPPLSL7_GYoX8jSa6JvcfcOcWc,479
|
10
|
-
raw_docx/raw_section.py,sha256=_ONvR5Fyuif4vZs1LnE7Y67pX29JKWM13YB8Wy8di9o,3942
|
11
|
-
raw_docx/raw_table.py,sha256=Sc0vV7g-Gonmo_EbdWyYmQy_rvgf4A-PYLuIrX_FTfc,1622
|
12
|
-
raw_docx/raw_table_cell.py,sha256=3gKm3m5JlkFVuEy9EHVhC2viNDUKfggny_bblY1TC48,1967
|
13
|
-
raw_docx/raw_table_row.py,sha256=m8SoLyVlKLjd_Vqa_U79A2wi8Wout8spgyusqJm79Kc,1297
|
14
|
-
raw_docx/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
raw_docx/docx/docx_paragraph.py,sha256=DPFzCG26y-6teL3KDnC_Ihmbs48OsHfD4fCD5Tj1O4A,2938
|
16
|
-
raw_docx/docx/docx_table.py,sha256=dG8jjkOnxGwN-6KtqdoYq8znO2XLYuIYhjbr4ZjIs2Y,5404
|
17
|
-
raw_docx-0.9.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
18
|
-
raw_docx-0.9.0.dist-info/METADATA,sha256=TfLahTHKUgTE_bcb9Bsel3zz4j8vjOlU8fssYQBdGZk,1237
|
19
|
-
raw_docx-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
20
|
-
raw_docx-0.9.0.dist-info/top_level.txt,sha256=Xl3dspPM9DBVj8clfdkHG7N4nNjNXeUmB4HcXAwOe60,9
|
21
|
-
raw_docx-0.9.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|