raw-docx 0.8.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {raw_docx-0.8.0 → raw_docx-0.9.0}/PKG-INFO +3 -3
- {raw_docx-0.8.0 → raw_docx-0.9.0}/setup.py +2 -2
- raw_docx-0.9.0/src/raw_docx/__info__.py +1 -0
- raw_docx-0.9.0/src/raw_docx/docx/docx_table.py +170 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_docx.py +117 -55
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_list_item.py +0 -1
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_paragraph.py +0 -1
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_table.py +3 -2
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_table_cell.py +2 -1
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/PKG-INFO +3 -3
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/SOURCES.txt +1 -13
- raw_docx-0.9.0/src/raw_docx.egg-info/requires.txt +2 -0
- raw_docx-0.8.0/src/raw_docx/__info__.py +0 -1
- raw_docx-0.8.0/src/raw_docx.egg-info/requires.txt +0 -2
- raw_docx-0.8.0/tests/test_docx_paragraph.py +0 -244
- raw_docx-0.8.0/tests/test_integration.py +0 -36
- raw_docx-0.8.0/tests/test_raw_document.py +0 -132
- raw_docx-0.8.0/tests/test_raw_docx.py +0 -628
- raw_docx-0.8.0/tests/test_raw_image.py +0 -118
- raw_docx-0.8.0/tests/test_raw_list.py +0 -193
- raw_docx-0.8.0/tests/test_raw_list_item.py +0 -37
- raw_docx-0.8.0/tests/test_raw_paragraph.py +0 -134
- raw_docx-0.8.0/tests/test_raw_run.py +0 -92
- raw_docx-0.8.0/tests/test_raw_section.py +0 -478
- raw_docx-0.8.0/tests/test_raw_table.py +0 -155
- raw_docx-0.8.0/tests/test_raw_table_cell.py +0 -103
- raw_docx-0.8.0/tests/test_raw_table_row.py +0 -133
- {raw_docx-0.8.0 → raw_docx-0.9.0}/LICENSE +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/README.md +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/setup.cfg +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/__init__.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/docx/__init__.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/docx/docx_paragraph.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_document.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_image.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_list.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_run.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_section.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx/raw_table_row.py +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/dependency_links.txt +0 -0
- {raw_docx-0.8.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: raw_docx
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.0
|
4
4
|
Summary: A package for processing and analyzing raw document formats
|
5
5
|
Home-page: https://github.com/daveih/raw_docx
|
6
6
|
Author: Dave Iberson-Hurst
|
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
|
|
17
17
|
Requires-Python: >=3.8
|
18
18
|
Description-Content-Type: text/markdown
|
19
19
|
License-File: LICENSE
|
20
|
-
Requires-Dist: python-docx==1.
|
21
|
-
Requires-Dist: simple_error_log
|
20
|
+
Requires-Dist: python-docx==1.2.0
|
21
|
+
Requires-Dist: simple_error_log>=0.6.0
|
22
22
|
Dynamic: author
|
23
23
|
Dynamic: classifier
|
24
24
|
Dynamic: description
|
@@ -19,8 +19,8 @@ setup(
|
|
19
19
|
packages=find_packages(where="src"),
|
20
20
|
package_dir={"": "src"},
|
21
21
|
package_data={},
|
22
|
-
install_requires=["python-docx==1.
|
23
|
-
tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
|
22
|
+
install_requires=["python-docx==1.2.0", "simple_error_log>=0.6.0"],
|
23
|
+
tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv", "pyyaml"],
|
24
24
|
classifiers=[
|
25
25
|
"Development Status :: 3 - Alpha",
|
26
26
|
"Intended Audience :: Developers",
|
@@ -0,0 +1 @@
|
|
1
|
+
__package_version__ = "0.9.0"
|
@@ -0,0 +1,170 @@
|
|
1
|
+
from simple_error_log import Errors
|
2
|
+
from simple_error_log.error_location import KlassMethodLocation
|
3
|
+
from raw_docx.raw_table import RawTable
|
4
|
+
from docx.table import _Cell
|
5
|
+
|
6
|
+
|
7
|
+
class TableCell:
    """A document table cell together with the grid rectangle it covers.

    ``top``/``left`` are the row/column index at which the cell starts and
    ``bottom``/``right`` are the last row/column index it covers (inclusive),
    so an unmerged cell has ``top == bottom`` and ``left == right``.

    Attributes:
        cell: The wrapped cell object; ``__str__`` reads its ``paragraphs``.
        top: Row index where the cell starts.
        bottom: Last row index covered by the cell (inclusive).
        left: Column index where the cell starts.
        right: Last column index covered by the cell (inclusive).
        h_merge: True when the cell is merged horizontally.
        v_merge: True when the cell is merged vertically.
    """

    def __init__(
        self,
        row: int,
        col: int,
        bottom: int,
        right: int,
        cell: "_Cell",  # quoted: no need to resolve the docx type at definition time
        h_merge: bool,
        v_merge: bool,
    ):
        """Store the cell and its extent.

        Args:
            row: Row index where the cell starts (stored as ``top``).
            col: Column index where the cell starts (stored as ``left``).
            bottom: Last row index covered by the cell (inclusive).
            right: Last column index covered by the cell (inclusive).
            cell: The underlying cell object.
            h_merge: Whether the cell is part of a horizontal merge.
            v_merge: Whether the cell is part of a vertical merge.
        """
        self.cell = cell
        self.top = row
        self.bottom = bottom
        self.left = col
        self.right = right
        self.v_merge = v_merge
        self.h_merge = h_merge

    def __str__(self):
        """Return a debug string: extent, merge flags and concatenated paragraph text."""
        # join() avoids rebuilding the string once per paragraph.
        text = "".join(paragraph.text for paragraph in self.cell.paragraphs)
        return f"[{self.top}, {self.left}] --> [{self.bottom}, {self.right}] (H: {self.h_merge}, V: {self.v_merge}) {text}"
|
31
|
+
|
32
|
+
|
33
|
+
class TableRow:
    """One row of a table matrix: a list of cell slots indexed by column.

    A slot holds the ``TableCell`` that starts at that column, or ``None``
    when no cell starts there (for example a column covered by a merge).
    """

    def __init__(self, row: int):
        """Create an empty row.

        Args:
            row: Index of this row within the table.
        """
        self._row = row
        self._data = []

    def cell(self, col: int) -> "TableCell":
        """Return the slot at ``col`` (may be ``None``).

        Raises:
            IndexError: If ``col`` is beyond the current row length.
        """
        return self._data[col]

    def add(self, col: int, cell: "TableCell"):
        """Store ``cell`` at column ``col``, growing the row with ``None`` as needed.

        A negative ``col`` is ignored. Previously a negative index that
        happened to be in range wrapped round (Python negative indexing) and
        silently overwrote an existing slot counted from the end of the row.
        """
        if col < 0:
            return
        shortfall = (col + 1) - len(self._data)
        if shortfall > 0:
            self._data.extend([None] * shortfall)
        self._data[col] = cell

    def pad(self, width: int):
        """Extend the row with ``None`` slots until it is ``width`` long.

        Rows already at least ``width`` long are left unchanged.
        """
        shortfall = width - len(self._data)
        if shortfall > 0:
            self._data.extend([None] * shortfall)

    def __iter__(self):
        """Iterate over the slots in column order."""
        return iter(self._data)
|
55
|
+
|
56
|
+
|
57
|
+
class TableMatrix:
    """Row/column matrix of the distinct cells of a table, with merge extents.

    Each merged region is represented once, by a ``TableCell`` placed at the
    region's top-left position; every other position is ``None``.  Iterating
    the matrix yields ``TableRow`` objects in row order.

    NOTE(review): ``table`` is annotated ``RawTable`` but the code reads
    ``table.rows[i].cells[j]._tc`` and the visible caller passes a
    python-docx table -- confirm and correct the annotation.
    """

    MODULE = "raw_docx.docx.docx_table.TableMatrix"

    class LogicError(Exception):
        pass

    def __init__(self, table: RawTable, errors: Errors):
        """Build the matrix from ``table``.

        Any exception raised while building is recorded on ``errors`` rather
        than propagated; the matrix then holds whatever was built so far.

        Args:
            table: Source table exposing ``rows`` -> ``cells`` -> ``_tc``.
            errors: Error log that receives build failures.
        """
        # Plain attribute setup stays outside the try block so the object is
        # always iterable, even if building fails.
        self._errors = errors
        self._table = table
        self._grid = None  # lazily filled snapshot of the table's cells
        self._height = 0
        self._width = 0
        self._matrix: list[TableRow] = []
        try:
            for cell in self._iter_cells():
                self._add(cell)
                # Use the right-hand extent: a horizontally merged cell
                # occupies columns beyond the one it starts in.
                self._width = max(self._width, cell.right + 1)
            # Lengths, not indices; both stay 0 for a table with no cells.
            self._height = len(self._matrix)
            self._pad()
        except Exception as e:
            self._errors.exception(
                "Exception raised building table matrix",
                e,
                KlassMethodLocation(self.MODULE, "__init__"),
            )

    def _cell_grid(self):
        """Return the table's cells as a list of per-row lists, read once and cached."""
        if self._grid is None:
            self._grid = [row.cells for row in self._table.rows]
        return self._grid

    def _pad(self):
        """Pad every row with ``None`` up to the matrix width."""
        row: TableRow
        for row in self._matrix:
            row.pad(self._width)

    def _add(self, cell: TableCell):
        """Place ``cell`` at its top-left position, creating rows as needed.

        Cells with a negative row index are ignored.
        """
        row = cell.top
        if row < 0:
            return  # negative row!
        # Fill gaps with real (empty) rows.  A row in which every cell is a
        # merge continuation contributes no cells of its own; leaving it as
        # None made _pad() and any iteration over that row fail.
        while len(self._matrix) <= row:
            self._matrix.append(TableRow(len(self._matrix)))
        self._matrix[row].add(cell.left, cell)

    def _iter_cells(self):
        """Yield a ``TableCell`` for each distinct cell, in row-major order.

        Merge continuations are skipped: positions whose ``_tc`` is the same
        object as the position directly above or directly to the left.
        """
        grid = self._cell_grid()
        for r, cells in enumerate(grid):
            above = grid[r - 1] if r > 0 else None
            below = grid[r + 1] if r + 1 < len(grid) else None
            for c, cell in enumerate(cells):
                tc = cell._tc
                # Check if the cell equals the previous cell either horizontally
                # or vertically so it can be ignored (part of a merge)
                if (above is not None and c < len(above) and tc is above[c]._tc) or (
                    c > 0 and tc is cells[c - 1]._tc
                ):
                    continue
                bottom = r
                right = c
                # Vertical merge check
                v_merge = below is not None and c < len(below) and tc is below[c]._tc
                if v_merge:
                    bottom = self._v_extent(r, c) - 1
                # Horizontal merge check
                h_merge = c + 1 < len(cells) and tc is cells[c + 1]._tc
                if h_merge:
                    right = self._h_extent(r, c) - 1
                yield TableCell(r, c, bottom, right, cell, h_merge, v_merge)

    def _v_extent(self, row: int, col: int) -> int:
        """Return the first row index below ``row`` holding a different cell at ``col``.

        Returns the number of rows when the cell runs to the bottom of the
        table.  Rows too short to have a column ``col`` are skipped over.
        NOTE(review): a short row arguably ends the merge -- confirm.
        """
        grid = self._cell_grid()
        for next_row in range(max(row + 1, 0), len(grid)):
            cells = grid[next_row]
            if col < len(cells) and grid[row][col]._tc is not cells[col]._tc:
                return next_row
        return len(grid)

    def _h_extent(self, row: int, col: int) -> int:
        """Return the first column index right of ``col`` holding a different cell.

        Returns the row length when the cell runs to the end of the row.
        """
        cells = self._cell_grid()[row]
        for next_col in range(max(col + 1, 0), len(cells)):
            if cells[col]._tc is not cells[next_col]._tc:
                return next_col
        return len(cells)

    def __iter__(self):
        """Iterate over the matrix rows (``TableRow`` objects) in row order."""
        return iter(self._matrix)
|
@@ -13,6 +13,7 @@ from raw_docx.raw_table_cell import RawTableCell
|
|
13
13
|
from raw_docx.raw_list import RawList
|
14
14
|
from raw_docx.raw_list_item import RawListItem
|
15
15
|
from raw_docx.docx.docx_paragraph import install
|
16
|
+
from raw_docx.docx.docx_table import TableMatrix
|
16
17
|
from docx import Document as DocXProcessor
|
17
18
|
from docx.document import Document
|
18
19
|
from docx.oxml.table import CT_Tbl, CT_TcPr
|
@@ -21,23 +22,26 @@ from docx.table import Table, _Cell
|
|
21
22
|
from docx.text.paragraph import Paragraph
|
22
23
|
from lxml import etree
|
23
24
|
from simple_error_log import Errors
|
25
|
+
from simple_error_log.error_location import KlassMethodLocation
|
24
26
|
|
25
27
|
|
26
28
|
class RawDocx:
|
29
|
+
MODULE = "raw_docx.raw_docx.RawDocx"
|
30
|
+
|
27
31
|
class LogicError(Exception):
|
28
32
|
pass
|
29
33
|
|
30
34
|
def __init__(self, full_path: str):
|
31
35
|
install()
|
32
|
-
self.
|
36
|
+
self._errors = Errors()
|
33
37
|
path = Path(full_path)
|
34
|
-
# path.stem, path.suffix[1:]
|
35
38
|
self.full_path = full_path
|
36
39
|
self.dir = path.parent
|
37
40
|
self.filename = path.name
|
38
41
|
self.image_path = os.path.join(self.dir, "images")
|
39
|
-
self.
|
40
|
-
f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}"
|
42
|
+
self._errors.debug(
|
43
|
+
f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}",
|
44
|
+
KlassMethodLocation(self.MODULE, "__init__"),
|
41
45
|
)
|
42
46
|
self.image_rels = {}
|
43
47
|
self._organise_dir()
|
@@ -45,13 +49,21 @@ class RawDocx:
|
|
45
49
|
self.target_document = RawDocument()
|
46
50
|
self._process()
|
47
51
|
|
52
|
+
@property
|
53
|
+
def errors(self) -> Errors:
|
54
|
+
return self._errors
|
55
|
+
|
48
56
|
def _organise_dir(self):
|
49
57
|
try:
|
50
58
|
os.mkdir(self.image_path)
|
51
59
|
except FileExistsError:
|
52
60
|
pass
|
53
61
|
except Exception as e:
|
54
|
-
self.
|
62
|
+
self._errors.exception(
|
63
|
+
"Failed to create image directory",
|
64
|
+
e,
|
65
|
+
KlassMethodLocation(self.MODULE, "_organise_dir"),
|
66
|
+
)
|
55
67
|
|
56
68
|
def _process(self):
|
57
69
|
try:
|
@@ -63,10 +75,16 @@ class RawDocx:
|
|
63
75
|
elif isinstance(block_item, Table):
|
64
76
|
self._process_table(block_item, target_section)
|
65
77
|
else:
|
66
|
-
self.
|
78
|
+
self._errors.warning(
|
79
|
+
"Ignoring element", KlassMethodLocation(self.MODULE, "_process")
|
80
|
+
)
|
67
81
|
raise ValueError
|
68
82
|
except Exception as e:
|
69
|
-
self.
|
83
|
+
self._errors.exception(
|
84
|
+
"Exception raised processing document",
|
85
|
+
e,
|
86
|
+
KlassMethodLocation(self.MODULE, "_process"),
|
87
|
+
)
|
70
88
|
|
71
89
|
def _process_images(self):
|
72
90
|
# Extract images to image dir
|
@@ -94,7 +112,10 @@ class RawDocx:
|
|
94
112
|
|
95
113
|
for child in parent_elm.iterchildren():
|
96
114
|
if isinstance(child, str):
|
97
|
-
self.
|
115
|
+
self._errors.warning(
|
116
|
+
f"Ignoring eTree element {child}",
|
117
|
+
KlassMethodLocation(self.MODULE, "_iter_block_items"),
|
118
|
+
)
|
98
119
|
elif isinstance(child, CT_P):
|
99
120
|
yield Paragraph(child, parent)
|
100
121
|
elif isinstance(child, CT_Tbl):
|
@@ -111,7 +132,10 @@ class RawDocx:
|
|
111
132
|
):
|
112
133
|
pass
|
113
134
|
else:
|
114
|
-
self.
|
135
|
+
self._errors.warning(
|
136
|
+
f"Ignoring eTree element {self._tree(child)}",
|
137
|
+
KlassMethodLocation(self.MODULE, "_iter_block_items"),
|
138
|
+
)
|
115
139
|
|
116
140
|
else:
|
117
141
|
raise ValueError(f"something's not right with a child {type(child)}")
|
@@ -123,62 +147,100 @@ class RawDocx:
|
|
123
147
|
def _process_table(self, table, target: RawSection | RawTableCell):
|
124
148
|
target_table = RawTable()
|
125
149
|
target.add(target_table)
|
126
|
-
|
150
|
+
matrix = TableMatrix(table, self._errors)
|
151
|
+
for r_index, row in enumerate(matrix):
|
127
152
|
target_row = RawTableRow()
|
128
153
|
target_table.add(target_row)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
else:
|
150
|
-
first = r_index == 0 and c_index == 0
|
151
|
-
target_cell = RawTableCell(h_span, v_span, first)
|
152
|
-
target_row.add(target_cell)
|
153
|
-
for block_item in self._iter_block_items(cell):
|
154
|
-
if isinstance(block_item, Paragraph):
|
155
|
-
self._process_cell(block_item, target_cell)
|
156
|
-
elif isinstance(block_item, Table):
|
157
|
-
raise self.LogicError("Table within table detected")
|
158
|
-
elif isinstance(block_item, etree._Element):
|
159
|
-
if block_item.tag == CT_TcPr:
|
160
|
-
pass
|
154
|
+
for c_index, row_cell in enumerate(row):
|
155
|
+
if row_cell:
|
156
|
+
h_span = row_cell.right - row_cell.left + 1
|
157
|
+
v_span = row_cell.bottom - row_cell.top + 1
|
158
|
+
first = r_index == row_cell.top and c_index == row_cell.left
|
159
|
+
target_cell = RawTableCell(h_span, v_span, first)
|
160
|
+
target_row.add(target_cell)
|
161
|
+
for block_item in self._iter_block_items(row_cell.cell):
|
162
|
+
if isinstance(block_item, Paragraph):
|
163
|
+
self._process_cell(block_item, target_cell)
|
164
|
+
elif isinstance(block_item, Table):
|
165
|
+
raise self.LogicError("Table within table detected")
|
166
|
+
elif isinstance(block_item, etree._Element):
|
167
|
+
if block_item.tag == CT_TcPr:
|
168
|
+
pass
|
169
|
+
else:
|
170
|
+
self._errors.warning(
|
171
|
+
f"Ignoring eTree element {block_item.tag}",
|
172
|
+
KlassMethodLocation(self.MODULE, "_process_table"),
|
173
|
+
)
|
161
174
|
else:
|
162
|
-
self.
|
163
|
-
f"
|
175
|
+
raise self.LogicError(
|
176
|
+
f"Something's not right with a child {type(block_item)}"
|
164
177
|
)
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
178
|
+
|
179
|
+
# def _process_table(self, table, target: RawSection | RawTableCell):
|
180
|
+
# target_table = RawTable()
|
181
|
+
# target.add(target_table)
|
182
|
+
# for r_index, row in enumerate(table.rows):
|
183
|
+
# target_row = RawTableRow()
|
184
|
+
# target_table.add(target_row)
|
185
|
+
# cells = row.cells
|
186
|
+
# for c_index, cell in enumerate(cells):
|
187
|
+
# if cell._tc is not None:
|
188
|
+
# x = cell._tc
|
189
|
+
# right = x.right
|
190
|
+
# left = x.left
|
191
|
+
# top = x.top
|
192
|
+
# try:
|
193
|
+
# # Bottom method seems to have a bug.
|
194
|
+
# # See https://github.com/python-openxml/python-docx/issues/1433
|
195
|
+
# bottom = x.bottom
|
196
|
+
# except Exception as e:
|
197
|
+
# self._errors.exception(
|
198
|
+
# f"Row span exception! {x.xml}",
|
199
|
+
# e,
|
200
|
+
# KlassMethodLocation(self.MODULE, "_process_table"),
|
201
|
+
# )
|
202
|
+
# bottom = top + 1
|
203
|
+
# h_span = right - left
|
204
|
+
# v_span = bottom - top
|
205
|
+
# else:
|
206
|
+
# h_span = 1
|
207
|
+
# v_span = 1
|
208
|
+
# if cell._tc is not None:
|
209
|
+
# first = r_index == cell._tc.top and c_index == cell._tc.left
|
210
|
+
# else:
|
211
|
+
# first = r_index == 0 and c_index == 0
|
212
|
+
# target_cell = RawTableCell(h_span, v_span, first)
|
213
|
+
# target_row.add(target_cell)
|
214
|
+
# for block_item in self._iter_block_items(cell):
|
215
|
+
# if isinstance(block_item, Paragraph):
|
216
|
+
# self._process_cell(block_item, target_cell)
|
217
|
+
# elif isinstance(block_item, Table):
|
218
|
+
# raise self.LogicError("Table within table detected")
|
219
|
+
# elif isinstance(block_item, etree._Element):
|
220
|
+
# if block_item.tag == CT_TcPr:
|
221
|
+
# pass
|
222
|
+
# else:
|
223
|
+
# self._errors.warning(
|
224
|
+
# f"Ignoring eTree element {block_item.tag}",
|
225
|
+
# KlassMethodLocation(self.MODULE, "_process_table"),
|
226
|
+
# )
|
227
|
+
# else:
|
228
|
+
# raise self.LogicError(
|
229
|
+
# f"Something's not right with a child {type(block_item)}"
|
230
|
+
# )
|
169
231
|
|
170
232
|
def _process_cell(self, paragraph, target_cell: RawTableCell):
|
171
233
|
if self._is_list(paragraph):
|
172
234
|
list_level = self.get_list_level(paragraph)
|
173
|
-
item = RawListItem(paragraph.extract_runs(self.
|
235
|
+
item = RawListItem(paragraph.extract_runs(self._errors), list_level)
|
174
236
|
if target_cell.is_in_list():
|
175
237
|
list = target_cell.current_list()
|
176
238
|
else:
|
177
|
-
list = RawList(self.
|
239
|
+
list = RawList(self._errors)
|
178
240
|
target_cell.add(list)
|
179
241
|
list.add(item)
|
180
242
|
else:
|
181
|
-
target_paragraph = RawParagraph(paragraph.extract_runs(self.
|
243
|
+
target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
|
182
244
|
target_cell.add(target_paragraph)
|
183
245
|
|
184
246
|
def _process_paragraph(
|
@@ -190,20 +252,20 @@ class RawDocx:
|
|
190
252
|
self.target_document.add(target_section)
|
191
253
|
elif self._is_list(paragraph):
|
192
254
|
list_level = self.get_list_level(paragraph)
|
193
|
-
item = RawListItem(paragraph.extract_runs(self.
|
255
|
+
item = RawListItem(paragraph.extract_runs(self._errors), list_level)
|
194
256
|
if target_section.is_in_list():
|
195
257
|
list = target_section.current_list()
|
196
258
|
else:
|
197
|
-
list = RawList(self.
|
259
|
+
list = RawList(self._errors)
|
198
260
|
target_section.add(list)
|
199
261
|
list.add(item)
|
200
262
|
elif "Graphic" in paragraph._p.xml:
|
201
263
|
for rId in image_rels:
|
202
264
|
if rId in paragraph._p.xml:
|
203
|
-
target_image = RawImage(image_rels[rId], self.
|
265
|
+
target_image = RawImage(image_rels[rId], self._errors)
|
204
266
|
target_section.add(target_image)
|
205
267
|
else:
|
206
|
-
target_paragraph = RawParagraph(paragraph.extract_runs(self.
|
268
|
+
target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
|
207
269
|
target_section.add(target_paragraph)
|
208
270
|
|
209
271
|
def get_list_level(self, paragraph):
|
@@ -3,7 +3,7 @@ class RawTable:
|
|
3
3
|
from .raw_table_row import RawTableRow
|
4
4
|
|
5
5
|
self.rows: list[RawTableRow] = []
|
6
|
-
self.klasses = ["
|
6
|
+
self.klasses = ["raw-docx-table"]
|
7
7
|
|
8
8
|
# @ToDo Would like RawTableRow here but gets a circular import
|
9
9
|
def add(self, item):
|
@@ -40,7 +40,8 @@ class RawTable:
|
|
40
40
|
self.klasses.append(klass)
|
41
41
|
|
42
42
|
def replace_class(self, old_klass, new_klass):
|
43
|
-
self.klasses
|
43
|
+
if old_klass in self.klasses:
|
44
|
+
self.klasses.remove(old_klass)
|
44
45
|
self.klasses.append(new_klass)
|
45
46
|
|
46
47
|
def to_dict(self) -> dict:
|
@@ -42,7 +42,8 @@ class RawTableCell:
|
|
42
42
|
return ""
|
43
43
|
lines = []
|
44
44
|
colspan = f' colspan="{self.h_span}"' if self.h_merged else ""
|
45
|
-
|
45
|
+
rowspan = f' rowspan="{self.v_span}"' if self.v_merged else ""
|
46
|
+
lines.append(f"<td{colspan}{rowspan}>")
|
46
47
|
for item in self.items:
|
47
48
|
lines.append(item.to_html())
|
48
49
|
lines.append("</td>")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: raw_docx
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.0
|
4
4
|
Summary: A package for processing and analyzing raw document formats
|
5
5
|
Home-page: https://github.com/daveih/raw_docx
|
6
6
|
Author: Dave Iberson-Hurst
|
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
|
|
17
17
|
Requires-Python: >=3.8
|
18
18
|
Description-Content-Type: text/markdown
|
19
19
|
License-File: LICENSE
|
20
|
-
Requires-Dist: python-docx==1.
|
21
|
-
Requires-Dist: simple_error_log
|
20
|
+
Requires-Dist: python-docx==1.2.0
|
21
|
+
Requires-Dist: simple_error_log>=0.6.0
|
22
22
|
Dynamic: author
|
23
23
|
Dynamic: classifier
|
24
24
|
Dynamic: description
|
@@ -21,16 +21,4 @@ src/raw_docx.egg-info/requires.txt
|
|
21
21
|
src/raw_docx.egg-info/top_level.txt
|
22
22
|
src/raw_docx/docx/__init__.py
|
23
23
|
src/raw_docx/docx/docx_paragraph.py
|
24
|
-
|
25
|
-
tests/test_integration.py
|
26
|
-
tests/test_raw_document.py
|
27
|
-
tests/test_raw_docx.py
|
28
|
-
tests/test_raw_image.py
|
29
|
-
tests/test_raw_list.py
|
30
|
-
tests/test_raw_list_item.py
|
31
|
-
tests/test_raw_paragraph.py
|
32
|
-
tests/test_raw_run.py
|
33
|
-
tests/test_raw_section.py
|
34
|
-
tests/test_raw_table.py
|
35
|
-
tests/test_raw_table_cell.py
|
36
|
-
tests/test_raw_table_row.py
|
24
|
+
src/raw_docx/docx/docx_table.py
|
@@ -1 +0,0 @@
|
|
1
|
-
__package_version__ = "0.8.0"
|