raw-docx 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {raw_docx-0.7.0 → raw_docx-0.9.0}/PKG-INFO +4 -4
  2. {raw_docx-0.7.0 → raw_docx-0.9.0}/README.md +1 -1
  3. {raw_docx-0.7.0 → raw_docx-0.9.0}/setup.py +2 -2
  4. raw_docx-0.9.0/src/raw_docx/__info__.py +1 -0
  5. raw_docx-0.9.0/src/raw_docx/docx/docx_table.py +170 -0
  6. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_document.py +1 -1
  7. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_docx.py +136 -62
  8. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_list_item.py +0 -1
  9. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_paragraph.py +0 -1
  10. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_table.py +3 -2
  11. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_table_cell.py +2 -1
  12. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/PKG-INFO +4 -4
  13. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/SOURCES.txt +1 -13
  14. raw_docx-0.9.0/src/raw_docx.egg-info/requires.txt +2 -0
  15. raw_docx-0.7.0/src/raw_docx/__info__.py +0 -1
  16. raw_docx-0.7.0/src/raw_docx.egg-info/requires.txt +0 -2
  17. raw_docx-0.7.0/tests/test_docx_paragraph.py +0 -196
  18. raw_docx-0.7.0/tests/test_integration.py +0 -36
  19. raw_docx-0.7.0/tests/test_raw_document.py +0 -76
  20. raw_docx-0.7.0/tests/test_raw_docx.py +0 -107
  21. raw_docx-0.7.0/tests/test_raw_image.py +0 -118
  22. raw_docx-0.7.0/tests/test_raw_list.py +0 -168
  23. raw_docx-0.7.0/tests/test_raw_list_item.py +0 -37
  24. raw_docx-0.7.0/tests/test_raw_paragraph.py +0 -97
  25. raw_docx-0.7.0/tests/test_raw_run.py +0 -92
  26. raw_docx-0.7.0/tests/test_raw_section.py +0 -478
  27. raw_docx-0.7.0/tests/test_raw_table.py +0 -155
  28. raw_docx-0.7.0/tests/test_raw_table_cell.py +0 -103
  29. raw_docx-0.7.0/tests/test_raw_table_row.py +0 -133
  30. {raw_docx-0.7.0 → raw_docx-0.9.0}/LICENSE +0 -0
  31. {raw_docx-0.7.0 → raw_docx-0.9.0}/setup.cfg +0 -0
  32. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/__init__.py +0 -0
  33. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/docx/__init__.py +0 -0
  34. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/docx/docx_paragraph.py +0 -0
  35. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_image.py +0 -0
  36. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_list.py +0 -0
  37. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_run.py +0 -0
  38. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_section.py +0 -0
  39. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_table_row.py +0 -0
  40. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/dependency_links.txt +0 -0
  41. {raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raw_docx
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: A package for processing and analyzing raw document formats
5
5
  Home-page: https://github.com/daveih/raw_docx
6
6
  Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Requires-Python: >=3.8
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
- Requires-Dist: python-docx
21
- Requires-Dist: simple_error_log
20
+ Requires-Dist: python-docx==1.2.0
21
+ Requires-Dist: simple_error_log>=0.6.0
22
22
  Dynamic: author
23
23
  Dynamic: classifier
24
24
  Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
38
38
  Build as a normal package
39
39
 
40
40
  - Build with `python3 -m build --sdist --wheel`
41
- - Upload to pypi.org using `twine upload dist/* `
41
+ - Upload to pypi.org using `twine upload dist/*`
@@ -7,4 +7,4 @@ Simple package to build on top of python-docx to assist in the handling of word
7
7
  Build as a normal package
8
8
 
9
9
  - Build with `python3 -m build --sdist --wheel`
10
- - Upload to pypi.org using `twine upload dist/* `
10
+ - Upload to pypi.org using `twine upload dist/*`
@@ -19,8 +19,8 @@ setup(
19
19
  packages=find_packages(where="src"),
20
20
  package_dir={"": "src"},
21
21
  package_data={},
22
- install_requires=["python-docx", "simple_error_log"],
23
- tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
22
+ install_requires=["python-docx==1.2.0", "simple_error_log>=0.6.0"],
23
+ tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv", "pyyaml"],
24
24
  classifiers=[
25
25
  "Development Status :: 3 - Alpha",
26
26
  "Intended Audience :: Developers",
@@ -0,0 +1 @@
1
+ __package_version__ = "0.9.0"
@@ -0,0 +1,170 @@
1
+ from simple_error_log import Errors
2
+ from simple_error_log.error_location import KlassMethodLocation
3
+ from raw_docx.raw_table import RawTable
4
+ from docx.table import _Cell
5
+
6
+
7
+ class TableCell:
8
+ def __init__(
9
+ self,
10
+ row: int,
11
+ col: int,
12
+ bottom: int,
13
+ right: int,
14
+ cell: _Cell,
15
+ h_merge: bool,
16
+ v_merge: bool,
17
+ ):
18
+ self.cell = cell
19
+ self.top = row
20
+ self.bottom = bottom
21
+ self.left = col
22
+ self.right = right
23
+ self.v_merge = v_merge
24
+ self.h_merge = h_merge
25
+
26
+ def __str__(self):
27
+ text = ""
28
+ for paragraph in self.cell.paragraphs:
29
+ text += paragraph.text
30
+ return f"[{self.top}, {self.left}] --> [{self.bottom}, {self.right}] (H: {self.h_merge}, V: {self.v_merge}) {text}"
31
+
32
+
33
+ class TableRow:
34
+ def __init__(self, row: int):
35
+ self._row = row
36
+ self._data = []
37
+
38
+ def cell(self, col: int) -> TableCell:
39
+ return self._data[col]
40
+
41
+ def add(self, col: int, cell: TableCell):
42
+ try:
43
+ self._data[col] = cell
44
+ except IndexError:
45
+ if col >= 0:
46
+ self._data.extend(((col + 1) - len(self._data)) * [None])
47
+ self._data[col] = cell
48
+
49
+ def pad(self, width: int):
50
+ if len(self._data) < width:
51
+ self._data.extend((width - len(self._data)) * [None])
52
+
53
+ def __iter__(self):
54
+ return iter(self._data)
55
+
56
+
57
+ class TableMatrix:
58
+ MODULE = "raw_docx.docx.docx_table.TableMatrix"
59
+
60
+ class LogicError(Exception):
61
+ pass
62
+
63
+ def __init__(self, table: RawTable, errors: Errors):
64
+ try:
65
+ self._errors = errors
66
+ self._table = table
67
+ self._height = 0
68
+ self._width = 0
69
+ self._matrix: list[list[TableCell]] = []
70
+ for cell in self._iter_cells():
71
+ self._add(cell)
72
+ self._width = cell.left if cell.left > self._width else self._width
73
+ self._height = cell.top
74
+ self._height += 1 # Set length not index
75
+ self._width += 1 # Set length not index
76
+ self._pad()
77
+ except Exception as e:
78
+ self._errors.exception(
79
+ "Exception raised building table matrix",
80
+ e,
81
+ KlassMethodLocation(self.MODULE, "__init__"),
82
+ )
83
+
84
+ def _pad(self):
85
+ row: TableRow
86
+ for row in self._matrix:
87
+ row.pad(self._width)
88
+
89
+ def _add(self, cell: TableCell):
90
+ row = cell.top
91
+ col = cell.left
92
+ if row >= 0 and row < len(self._matrix):
93
+ row_data: TableRow = self._matrix[row]
94
+ row_data.add(col, cell)
95
+ elif row >= 0:
96
+ self._matrix.extend(((row + 1) - len(self._matrix)) * [None])
97
+ row_data = TableRow(row)
98
+ self._matrix[row] = row_data
99
+ row_data.add(col, cell)
100
+ else:
101
+ pass # negative row!
102
+
103
+ def _iter_cells(self):
104
+ table = self._table
105
+ for r, row in enumerate(table.rows):
106
+ for c, cell in enumerate(row.cells):
107
+ right = c
108
+ bottom = r
109
+ v_merge = False
110
+ h_merge = False
111
+ # Check if the cell equals the previous cell either horizontally or vertically
112
+ # so it can be ignored (part of a merge)
113
+ if (
114
+ r > 0
115
+ and c < len(table.rows[r - 1].cells)
116
+ and cell._tc is table.rows[r - 1].cells[c]._tc
117
+ ) or (c > 0 and cell._tc is row.cells[c - 1]._tc):
118
+ continue
119
+ # Vertical merge check
120
+ if (
121
+ r >= 0
122
+ and r + 1 < len(table.rows)
123
+ and c < len(table.rows[r + 1].cells)
124
+ and cell._tc is table.rows[r + 1].cells[c]._tc
125
+ ):
126
+ v_merge = True
127
+ bottom = self._v_extent(r, c) - 1
128
+ # Horizontal merge check
129
+ if (
130
+ c >= 0
131
+ and c + 1 < len(table.rows[r].cells)
132
+ and cell._tc is row.cells[c + 1]._tc
133
+ ):
134
+ h_merge = True
135
+ right = self._h_extent(r, c) - 1
136
+ yield TableCell(r, c, bottom, right, cell, h_merge, v_merge)
137
+
138
+ def _v_extent(self, row: int, col: int) -> int:
139
+ table = self._table
140
+ next_row = row + 1
141
+ height = len(table.rows)
142
+ while next_row < height:
143
+ if (
144
+ next_row >= 0
145
+ and col < len(table.rows[next_row].cells)
146
+ and table.rows[row].cells[col]._tc
147
+ is not table.rows[next_row].cells[col]._tc
148
+ ):
149
+ return next_row
150
+ else:
151
+ next_row += 1
152
+ return height
153
+
154
+ def _h_extent(self, row: int, col: int) -> int:
155
+ table = self._table
156
+ next_col = col + 1
157
+ width = len(table.rows[row].cells)
158
+ while next_col < width:
159
+ if (
160
+ next_col >= 0
161
+ and table.rows[row].cells[col]._tc
162
+ is not table.rows[row].cells[next_col]._tc
163
+ ):
164
+ return next_col
165
+ else:
166
+ next_col += 1
167
+ return width
168
+
169
+ def __iter__(self):
170
+ return iter(self._matrix)
@@ -22,7 +22,7 @@ class RawDocument:
22
22
  return self.sections[-1]
23
23
 
24
24
  def section_by_ordinal(self, ordinal: int) -> RawSection:
25
- if 1 >= ordinal <= len(self.sections):
25
+ if 1 <= ordinal <= len(self.sections):
26
26
  return self.sections[ordinal - 1]
27
27
  else:
28
28
  return None
@@ -13,6 +13,7 @@ from raw_docx.raw_table_cell import RawTableCell
13
13
  from raw_docx.raw_list import RawList
14
14
  from raw_docx.raw_list_item import RawListItem
15
15
  from raw_docx.docx.docx_paragraph import install
16
+ from raw_docx.docx.docx_table import TableMatrix
16
17
  from docx import Document as DocXProcessor
17
18
  from docx.document import Document
18
19
  from docx.oxml.table import CT_Tbl, CT_TcPr
@@ -21,23 +22,26 @@ from docx.table import Table, _Cell
21
22
  from docx.text.paragraph import Paragraph
22
23
  from lxml import etree
23
24
  from simple_error_log import Errors
25
+ from simple_error_log.error_location import KlassMethodLocation
24
26
 
25
27
 
26
28
  class RawDocx:
29
+ MODULE = "raw_docx.raw_docx.RawDocx"
30
+
27
31
  class LogicError(Exception):
28
32
  pass
29
33
 
30
34
  def __init__(self, full_path: str):
31
35
  install()
32
- self.errors = Errors()
36
+ self._errors = Errors()
33
37
  path = Path(full_path)
34
- # path.stem, path.suffix[1:]
35
38
  self.full_path = full_path
36
39
  self.dir = path.parent
37
40
  self.filename = path.name
38
41
  self.image_path = os.path.join(self.dir, "images")
39
- self.errors.debug(
40
- f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}"
42
+ self._errors.debug(
43
+ f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}",
44
+ KlassMethodLocation(self.MODULE, "__init__"),
41
45
  )
42
46
  self.image_rels = {}
43
47
  self._organise_dir()
@@ -45,13 +49,21 @@ class RawDocx:
45
49
  self.target_document = RawDocument()
46
50
  self._process()
47
51
 
52
+ @property
53
+ def errors(self) -> Errors:
54
+ return self._errors
55
+
48
56
  def _organise_dir(self):
49
57
  try:
50
58
  os.mkdir(self.image_path)
51
59
  except FileExistsError:
52
60
  pass
53
61
  except Exception as e:
54
- self.errors.exception("Failed to create image directory", e)
62
+ self._errors.exception(
63
+ "Failed to create image directory",
64
+ e,
65
+ KlassMethodLocation(self.MODULE, "_organise_dir"),
66
+ )
55
67
 
56
68
  def _process(self):
57
69
  try:
@@ -63,10 +75,16 @@ class RawDocx:
63
75
  elif isinstance(block_item, Table):
64
76
  self._process_table(block_item, target_section)
65
77
  else:
66
- self.errors.warning("Ignoring element")
78
+ self._errors.warning(
79
+ "Ignoring element", KlassMethodLocation(self.MODULE, "_process")
80
+ )
67
81
  raise ValueError
68
82
  except Exception as e:
69
- self.errors.exception("Exception raised processing document", e)
83
+ self._errors.exception(
84
+ "Exception raised processing document",
85
+ e,
86
+ KlassMethodLocation(self.MODULE, "_process"),
87
+ )
70
88
 
71
89
  def _process_images(self):
72
90
  # Extract images to image dir
@@ -94,7 +112,10 @@ class RawDocx:
94
112
 
95
113
  for child in parent_elm.iterchildren():
96
114
  if isinstance(child, str):
97
- self.errors.warning(f"Ignoring eTree element {child}")
115
+ self._errors.warning(
116
+ f"Ignoring eTree element {child}",
117
+ KlassMethodLocation(self.MODULE, "_iter_block_items"),
118
+ )
98
119
  elif isinstance(child, CT_P):
99
120
  yield Paragraph(child, parent)
100
121
  elif isinstance(child, CT_Tbl):
@@ -111,7 +132,10 @@ class RawDocx:
111
132
  ):
112
133
  pass
113
134
  else:
114
- self.errors.warning(f"Ignoring eTree element {self._tree(child)}")
135
+ self._errors.warning(
136
+ f"Ignoring eTree element {self._tree(child)}",
137
+ KlassMethodLocation(self.MODULE, "_iter_block_items"),
138
+ )
115
139
 
116
140
  else:
117
141
  raise ValueError(f"something's not right with a child {type(child)}")
@@ -123,59 +147,100 @@ class RawDocx:
123
147
  def _process_table(self, table, target: RawSection | RawTableCell):
124
148
  target_table = RawTable()
125
149
  target.add(target_table)
126
- for r_index, row in enumerate(table.rows):
150
+ matrix = TableMatrix(table, self._errors)
151
+ for r_index, row in enumerate(matrix):
127
152
  target_row = RawTableRow()
128
153
  target_table.add(target_row)
129
- cells = row.cells
130
- for c_index, cell in enumerate(cells):
131
- if cell._tc is not None:
132
- x = cell._tc
133
- right = x.right
134
- left = x.left
135
- top = x.top
136
- try:
137
- # Bottom method seems to have a bug.
138
- # See https://github.com/python-openxml/python-docx/issues/1433
139
- bottom = x.bottom
140
- except Exception:
141
- bottom = top + 1
142
- h_span = right - left
143
- v_span = bottom - top
144
- else:
145
- h_span = 1
146
- v_span = 1
147
- first = r_index == cell._tc.top and c_index == cell._tc.left
148
- target_cell = RawTableCell(h_span, v_span, first)
149
- target_row.add(target_cell)
150
- for block_item in self._iter_block_items(cell):
151
- if isinstance(block_item, Paragraph):
152
- self._process_cell(block_item, target_cell)
153
- elif isinstance(block_item, Table):
154
- raise self.LogicError("Table within table detected")
155
- elif isinstance(block_item, etree._Element):
156
- if block_item.tag == CT_TcPr:
157
- pass
154
+ for c_index, row_cell in enumerate(row):
155
+ if row_cell:
156
+ h_span = row_cell.right - row_cell.left + 1
157
+ v_span = row_cell.bottom - row_cell.top + 1
158
+ first = r_index == row_cell.top and c_index == row_cell.left
159
+ target_cell = RawTableCell(h_span, v_span, first)
160
+ target_row.add(target_cell)
161
+ for block_item in self._iter_block_items(row_cell.cell):
162
+ if isinstance(block_item, Paragraph):
163
+ self._process_cell(block_item, target_cell)
164
+ elif isinstance(block_item, Table):
165
+ raise self.LogicError("Table within table detected")
166
+ elif isinstance(block_item, etree._Element):
167
+ if block_item.tag == CT_TcPr:
168
+ pass
169
+ else:
170
+ self._errors.warning(
171
+ f"Ignoring eTree element {block_item.tag}",
172
+ KlassMethodLocation(self.MODULE, "_process_table"),
173
+ )
158
174
  else:
159
- self.errors.warning(
160
- f"Ignoring eTree element {block_item.tag}"
175
+ raise self.LogicError(
176
+ f"Something's not right with a child {type(block_item)}"
161
177
  )
162
- else:
163
- raise self.LogicError(
164
- f"something's not right with a child {type(block_item)}"
165
- )
178
+
179
+ # def _process_table(self, table, target: RawSection | RawTableCell):
180
+ # target_table = RawTable()
181
+ # target.add(target_table)
182
+ # for r_index, row in enumerate(table.rows):
183
+ # target_row = RawTableRow()
184
+ # target_table.add(target_row)
185
+ # cells = row.cells
186
+ # for c_index, cell in enumerate(cells):
187
+ # if cell._tc is not None:
188
+ # x = cell._tc
189
+ # right = x.right
190
+ # left = x.left
191
+ # top = x.top
192
+ # try:
193
+ # # Bottom method seems to have a bug.
194
+ # # See https://github.com/python-openxml/python-docx/issues/1433
195
+ # bottom = x.bottom
196
+ # except Exception as e:
197
+ # self._errors.exception(
198
+ # f"Row span exception! {x.xml}",
199
+ # e,
200
+ # KlassMethodLocation(self.MODULE, "_process_table"),
201
+ # )
202
+ # bottom = top + 1
203
+ # h_span = right - left
204
+ # v_span = bottom - top
205
+ # else:
206
+ # h_span = 1
207
+ # v_span = 1
208
+ # if cell._tc is not None:
209
+ # first = r_index == cell._tc.top and c_index == cell._tc.left
210
+ # else:
211
+ # first = r_index == 0 and c_index == 0
212
+ # target_cell = RawTableCell(h_span, v_span, first)
213
+ # target_row.add(target_cell)
214
+ # for block_item in self._iter_block_items(cell):
215
+ # if isinstance(block_item, Paragraph):
216
+ # self._process_cell(block_item, target_cell)
217
+ # elif isinstance(block_item, Table):
218
+ # raise self.LogicError("Table within table detected")
219
+ # elif isinstance(block_item, etree._Element):
220
+ # if block_item.tag == CT_TcPr:
221
+ # pass
222
+ # else:
223
+ # self._errors.warning(
224
+ # f"Ignoring eTree element {block_item.tag}",
225
+ # KlassMethodLocation(self.MODULE, "_process_table"),
226
+ # )
227
+ # else:
228
+ # raise self.LogicError(
229
+ # f"Something's not right with a child {type(block_item)}"
230
+ # )
166
231
 
167
232
  def _process_cell(self, paragraph, target_cell: RawTableCell):
168
233
  if self._is_list(paragraph):
169
234
  list_level = self.get_list_level(paragraph)
170
- item = RawListItem(paragraph.extract_runs(self.errors), list_level)
235
+ item = RawListItem(paragraph.extract_runs(self._errors), list_level)
171
236
  if target_cell.is_in_list():
172
237
  list = target_cell.current_list()
173
238
  else:
174
- list = RawList(self.errors)
239
+ list = RawList(self._errors)
175
240
  target_cell.add(list)
176
241
  list.add(item)
177
242
  else:
178
- target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
243
+ target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
179
244
  target_cell.add(target_paragraph)
180
245
 
181
246
  def _process_paragraph(
@@ -187,38 +252,47 @@ class RawDocx:
187
252
  self.target_document.add(target_section)
188
253
  elif self._is_list(paragraph):
189
254
  list_level = self.get_list_level(paragraph)
190
- item = RawListItem(paragraph.extract_runs(self.errors), list_level)
255
+ item = RawListItem(paragraph.extract_runs(self._errors), list_level)
191
256
  if target_section.is_in_list():
192
257
  list = target_section.current_list()
193
258
  else:
194
- list = RawList(self.errors)
259
+ list = RawList(self._errors)
195
260
  target_section.add(list)
196
261
  list.add(item)
197
262
  elif "Graphic" in paragraph._p.xml:
198
263
  for rId in image_rels:
199
264
  if rId in paragraph._p.xml:
200
- target_image = RawImage(image_rels[rId], self.errors)
265
+ target_image = RawImage(image_rels[rId], self._errors)
201
266
  target_section.add(target_image)
202
267
  else:
203
- target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
268
+ target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
204
269
  target_section.add(target_paragraph)
205
270
 
206
271
  def get_list_level(self, paragraph):
207
272
  list_level = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
208
273
  return int(str(list_level[0])) if list_level else 0
209
274
 
210
- def _is_heading(self, text):
211
- if re.match(r"^\d\dHeading \d", text):
212
- try:
213
- level = int(text[0:2])
214
- return True, level
215
- except Exception:
216
- return True, 0
217
- if re.match(r"^Heading \d", text):
275
+ def _is_heading(self, text) -> tuple[bool, int]:
276
+ """
277
+ Extract heading level from text containing "Heading <N>" pattern.
278
+
279
+ Args:
280
+ text: Text to analyze for heading pattern
281
+
282
+ Returns:
283
+ tuple[bool, int]: (success, level) where success indicates if heading
284
+ pattern was found and level is the extracted integer value
285
+ """
286
+ if not text:
287
+ return False, 0
288
+
289
+ # Look for "Heading <N>" pattern where <N> is one or more digits
290
+ match = re.search(r"Heading\s+(\d+)", text, re.IGNORECASE)
291
+ if match:
218
292
  try:
219
- level = int(text[8])
293
+ level = int(match.group(1))
220
294
  return True, level
221
- except Exception:
295
+ except (ValueError, IndexError):
222
296
  return True, 0
223
297
  return False, 0
224
298
 
@@ -1,4 +1,3 @@
1
- from html import escape
2
1
  from .raw_paragraph import RawParagraph
3
2
  from .raw_run import RawRun
4
3
 
@@ -1,5 +1,4 @@
1
1
  from .raw_run import RawRun
2
- from html import escape
3
2
 
4
3
 
5
4
  class RawParagraph:
@@ -3,7 +3,7 @@ class RawTable:
3
3
  from .raw_table_row import RawTableRow
4
4
 
5
5
  self.rows: list[RawTableRow] = []
6
- self.klasses = ["ich-m11-table"]
6
+ self.klasses = ["raw-docx-table"]
7
7
 
8
8
  # @ToDo Would like RawTableRow here but gets a circular import
9
9
  def add(self, item):
@@ -40,7 +40,8 @@ class RawTable:
40
40
  self.klasses.append(klass)
41
41
 
42
42
  def replace_class(self, old_klass, new_klass):
43
- self.klasses.remove(old_klass)
43
+ if old_klass in self.klasses:
44
+ self.klasses.remove(old_klass)
44
45
  self.klasses.append(new_klass)
45
46
 
46
47
  def to_dict(self) -> dict:
@@ -42,7 +42,8 @@ class RawTableCell:
42
42
  return ""
43
43
  lines = []
44
44
  colspan = f' colspan="{self.h_span}"' if self.h_merged else ""
45
- lines.append(f"<td{colspan}>")
45
+ rowspan = f' rowspan="{self.v_span}"' if self.v_merged else ""
46
+ lines.append(f"<td{colspan}{rowspan}>")
46
47
  for item in self.items:
47
48
  lines.append(item.to_html())
48
49
  lines.append("</td>")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raw_docx
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: A package for processing and analyzing raw document formats
5
5
  Home-page: https://github.com/daveih/raw_docx
6
6
  Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Requires-Python: >=3.8
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
- Requires-Dist: python-docx
21
- Requires-Dist: simple_error_log
20
+ Requires-Dist: python-docx==1.2.0
21
+ Requires-Dist: simple_error_log>=0.6.0
22
22
  Dynamic: author
23
23
  Dynamic: classifier
24
24
  Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
38
38
  Build as a normal package
39
39
 
40
40
  - Build with `python3 -m build --sdist --wheel`
41
- - Upload to pypi.org using `twine upload dist/* `
41
+ - Upload to pypi.org using `twine upload dist/*`
@@ -21,16 +21,4 @@ src/raw_docx.egg-info/requires.txt
21
21
  src/raw_docx.egg-info/top_level.txt
22
22
  src/raw_docx/docx/__init__.py
23
23
  src/raw_docx/docx/docx_paragraph.py
24
- tests/test_docx_paragraph.py
25
- tests/test_integration.py
26
- tests/test_raw_document.py
27
- tests/test_raw_docx.py
28
- tests/test_raw_image.py
29
- tests/test_raw_list.py
30
- tests/test_raw_list_item.py
31
- tests/test_raw_paragraph.py
32
- tests/test_raw_run.py
33
- tests/test_raw_section.py
34
- tests/test_raw_table.py
35
- tests/test_raw_table_cell.py
36
- tests/test_raw_table_row.py
24
+ src/raw_docx/docx/docx_table.py
@@ -0,0 +1,2 @@
1
+ python-docx==1.2.0
2
+ simple_error_log>=0.6.0
@@ -1 +0,0 @@
1
- __package_version__ = "0.7.0"
@@ -1,2 +0,0 @@
1
- python-docx
2
- simple_error_log