raw-docx 0.7.0__tar.gz → 0.8.0__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {raw_docx-0.7.0 → raw_docx-0.8.0}/PKG-INFO +4 -4
  2. {raw_docx-0.7.0 → raw_docx-0.8.0}/README.md +1 -1
  3. {raw_docx-0.7.0 → raw_docx-0.8.0}/setup.py +1 -1
  4. raw_docx-0.8.0/src/raw_docx/__info__.py +1 -0
  5. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_document.py +1 -1
  6. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_docx.py +23 -11
  7. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx.egg-info/PKG-INFO +4 -4
  8. raw_docx-0.8.0/src/raw_docx.egg-info/requires.txt +2 -0
  9. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_docx_paragraph.py +48 -0
  10. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_document.py +56 -0
  11. raw_docx-0.8.0/tests/test_raw_docx.py +628 -0
  12. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_list.py +26 -1
  13. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_paragraph.py +37 -0
  14. raw_docx-0.7.0/src/raw_docx/__info__.py +0 -1
  15. raw_docx-0.7.0/src/raw_docx.egg-info/requires.txt +0 -2
  16. raw_docx-0.7.0/tests/test_raw_docx.py +0 -107
  17. {raw_docx-0.7.0 → raw_docx-0.8.0}/LICENSE +0 -0
  18. {raw_docx-0.7.0 → raw_docx-0.8.0}/setup.cfg +0 -0
  19. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/__init__.py +0 -0
  20. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/docx/__init__.py +0 -0
  21. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/docx/docx_paragraph.py +0 -0
  22. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_image.py +0 -0
  23. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_list.py +0 -0
  24. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_list_item.py +0 -0
  25. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_paragraph.py +0 -0
  26. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_run.py +0 -0
  27. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_section.py +0 -0
  28. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_table.py +0 -0
  29. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_table_cell.py +0 -0
  30. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx/raw_table_row.py +0 -0
  31. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx.egg-info/SOURCES.txt +0 -0
  32. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx.egg-info/dependency_links.txt +0 -0
  33. {raw_docx-0.7.0 → raw_docx-0.8.0}/src/raw_docx.egg-info/top_level.txt +0 -0
  34. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_integration.py +0 -0
  35. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_image.py +0 -0
  36. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_list_item.py +0 -0
  37. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_run.py +0 -0
  38. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_section.py +0 -0
  39. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_table.py +0 -0
  40. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_table_cell.py +0 -0
  41. {raw_docx-0.7.0 → raw_docx-0.8.0}/tests/test_raw_table_row.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raw_docx
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A package for processing and analyzing raw document formats
5
5
  Home-page: https://github.com/daveih/raw_docx
6
6
  Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Requires-Python: >=3.8
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
- Requires-Dist: python-docx
21
- Requires-Dist: simple_error_log
20
+ Requires-Dist: python-docx==1.1.2
21
+ Requires-Dist: simple_error_log==0.6.0
22
22
  Dynamic: author
23
23
  Dynamic: classifier
24
24
  Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
38
38
  Build as a normal package
39
39
 
40
40
  - Build with `python3 -m build --sdist --wheel`
41
- - Upload to pypi.org using `twine upload dist/* `
41
+ - Upload to pypi.org using `twine upload dist/*`
@@ -7,4 +7,4 @@ Simple package to build on top of python-docx to assist in the handling of word
7
7
  Build as a normal package
8
8
 
9
9
  - Build with `python3 -m build --sdist --wheel`
10
- - Upload to pypi.org using `twine upload dist/* `
10
+ - Upload to pypi.org using `twine upload dist/*`
@@ -19,7 +19,7 @@ setup(
19
19
  packages=find_packages(where="src"),
20
20
  package_dir={"": "src"},
21
21
  package_data={},
22
- install_requires=["python-docx", "simple_error_log"],
22
+ install_requires=["python-docx==1.1.2", "simple_error_log==0.6.0"],
23
23
  tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
24
24
  classifiers=[
25
25
  "Development Status :: 3 - Alpha",
@@ -0,0 +1 @@
1
+ __package_version__ = "0.8.0"
@@ -22,7 +22,7 @@ class RawDocument:
22
22
  return self.sections[-1]
23
23
 
24
24
  def section_by_ordinal(self, ordinal: int) -> RawSection:
25
- if 1 >= ordinal <= len(self.sections):
25
+ if 1 <= ordinal <= len(self.sections):
26
26
  return self.sections[ordinal - 1]
27
27
  else:
28
28
  return None
@@ -144,7 +144,10 @@ class RawDocx:
144
144
  else:
145
145
  h_span = 1
146
146
  v_span = 1
147
- first = r_index == cell._tc.top and c_index == cell._tc.left
147
+ if cell._tc is not None:
148
+ first = r_index == cell._tc.top and c_index == cell._tc.left
149
+ else:
150
+ first = r_index == 0 and c_index == 0
148
151
  target_cell = RawTableCell(h_span, v_span, first)
149
152
  target_row.add(target_cell)
150
153
  for block_item in self._iter_block_items(cell):
@@ -207,18 +210,27 @@ class RawDocx:
207
210
  list_level = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
208
211
  return int(str(list_level[0])) if list_level else 0
209
212
 
210
- def _is_heading(self, text):
211
- if re.match(r"^\d\dHeading \d", text):
212
- try:
213
- level = int(text[0:2])
214
- return True, level
215
- except Exception:
216
- return True, 0
217
- if re.match(r"^Heading \d", text):
213
+ def _is_heading(self, text) -> tuple[bool, int]:
214
+ """
215
+ Extract heading level from text containing "Heading <N>" pattern.
216
+
217
+ Args:
218
+ text: Text to analyze for heading pattern
219
+
220
+ Returns:
221
+ tuple[bool, int]: (success, level) where success indicates if heading
222
+ pattern was found and level is the extracted integer value
223
+ """
224
+ if not text:
225
+ return False, 0
226
+
227
+ # Look for "Heading <N>" pattern where <N> is one or more digits
228
+ match = re.search(r"Heading\s+(\d+)", text, re.IGNORECASE)
229
+ if match:
218
230
  try:
219
- level = int(text[8])
231
+ level = int(match.group(1))
220
232
  return True, level
221
- except Exception:
233
+ except (ValueError, IndexError):
222
234
  return True, 0
223
235
  return False, 0
224
236
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raw_docx
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A package for processing and analyzing raw document formats
5
5
  Home-page: https://github.com/daveih/raw_docx
6
6
  Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Requires-Python: >=3.8
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
- Requires-Dist: python-docx
21
- Requires-Dist: simple_error_log
20
+ Requires-Dist: python-docx==1.1.2
21
+ Requires-Dist: simple_error_log==0.6.0
22
22
  Dynamic: author
23
23
  Dynamic: classifier
24
24
  Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
38
38
  Build as a normal package
39
39
 
40
40
  - Build with `python3 -m build --sdist --wheel`
41
- - Upload to pypi.org using `twine upload dist/* `
41
+ - Upload to pypi.org using `twine upload dist/*`
@@ -0,0 +1,2 @@
1
+ python-docx==1.1.2
2
+ simple_error_log==0.6.0
@@ -194,3 +194,51 @@ def test_extract_runs_with_mixed_colors():
194
194
  result = extract_runs(paragraph, errors)
195
195
  assert len(result) == 3
196
196
  assert [r.color for r in result] == ["FF0000", "0000FF", "FF0000"]
197
+
198
+
199
+ def test_run_style_color_exception_handling():
200
+ """Test _run_style_color exception handling (lines 80-82)"""
201
+ errors = Errors()
202
+
203
+ # Create a run that will cause an exception when accessing style properties
204
+ run = Mock(spec=Run)
205
+
206
+ # Create a style that raises an exception when trying to access font.color.rgb
207
+ def side_effect_func(*args, **kwargs):
208
+ raise AttributeError("Font color access error")
209
+
210
+ style_mock = Mock()
211
+ style_mock.font.color.rgb = Mock(side_effect=side_effect_func)
212
+ # Make the rgb property falsy in boolean context but raise when called
213
+ style_mock.font.color.rgb.__bool__ = Mock(side_effect=side_effect_func)
214
+ run.style = style_mock
215
+
216
+ result = _run_style_color(run, errors)
217
+ assert result is None
218
+ # Should have logged the exception
219
+
220
+
221
+ def test_get_font_colour_exception_handling():
222
+ """Test _get_font_colour exception handling"""
223
+ errors = Errors()
224
+
225
+ # Create an item that will cause an exception when accessing font properties
226
+ item = Mock()
227
+ item.font = None # This will cause AttributeError when trying to access font.color
228
+
229
+ result = _get_font_colour(item, errors)
230
+ assert result is None
231
+ # Should have logged the exception
232
+
233
+
234
+ def test_get_highlight_color_exception_handling():
235
+ """Test _get_highlight_color exception handling"""
236
+ errors = Errors()
237
+
238
+ # Create a run that will cause an exception when accessing highlight color
239
+ run = Mock(spec=Run)
240
+ run.font = None # This will cause AttributeError when trying to access font.highlight_color
241
+
242
+ result = _get_highlight_color(run, errors)
243
+ assert result is None
244
+ # Should have logged the exception
@@ -42,6 +42,62 @@ def test_current_section_with_sections(document, sample_section):
42
42
  assert current == sample_section
43
43
 
44
44
 
45
+ def test_section_by_ordinal(document):
46
+ """Test getting section by ordinal position"""
47
+ # Add some sections
48
+ section1 = RawSection("Section 1", "1", 1)
49
+ document.add(section1)
50
+
51
+ section2 = RawSection("Section 2", "2", 1)
52
+ document.add(section2)
53
+
54
+ # Test valid ordinals
55
+ assert document.section_by_ordinal(1) == document.sections[0] # Initial section
56
+ assert document.section_by_ordinal(2) == section1
57
+ assert document.section_by_ordinal(3) == section2
58
+
59
+ # Test invalid ordinals
60
+ assert document.section_by_ordinal(0) is None # Below range
61
+ assert document.section_by_ordinal(4) is None # Above range
62
+ assert document.section_by_ordinal(-1) is None # Negative
63
+
64
+
65
+ def test_section_by_number(document):
66
+ """Test getting section by section number"""
67
+ # Add sections with specific numbers
68
+ section1 = RawSection("Section 1", "1", 1)
69
+ document.add(section1)
70
+
71
+ section2 = RawSection("Section 2", "2", 1)
72
+ document.add(section2)
73
+
74
+ # Test valid section numbers
75
+ assert document.section_by_number("1") == section1
76
+ assert document.section_by_number("2") == section2
77
+
78
+ # Test invalid section number
79
+ assert document.section_by_number("999") is None
80
+ assert document.section_by_number("nonexistent") is None
81
+
82
+
83
+ def test_section_by_title(document):
84
+ """Test getting section by section title"""
85
+ # Add sections with specific titles
86
+ section1 = RawSection("First Section", "1", 1)
87
+ document.add(section1)
88
+
89
+ section2 = RawSection("Second Section", "2", 1)
90
+ document.add(section2)
91
+
92
+ # Test valid section titles
93
+ assert document.section_by_title("First Section") == section1
94
+ assert document.section_by_title("Second Section") == section2
95
+
96
+ # Test invalid section title
97
+ assert document.section_by_title("Nonexistent Section") is None
98
+ assert document.section_by_title("") is None
99
+
100
+
45
101
  def test_to_dict(document):
46
102
  """Test converting document to dictionary"""
47
103
  # Add sections with content
@@ -0,0 +1,628 @@
1
+ import pytest
2
+ import os
3
+ import tempfile
4
+ import zipfile
5
+ from unittest.mock import Mock, patch, PropertyMock
6
+ from pathlib import Path
7
+ from docx import Document as DocxDocument
8
+ from docx.shared import Inches
9
+ from docx.text.paragraph import Paragraph
10
+ from docx.table import Table, _Cell
11
+ from docx.oxml.text.paragraph import CT_P
12
+ from docx.oxml.table import CT_Tbl, CT_TcPr
13
+ from lxml import etree
14
+ import docx.parts.image
15
+ from src.raw_docx.raw_docx import RawDocx
16
+ from src.raw_docx.raw_document import RawDocument
17
+
18
+
19
+ @pytest.fixture
20
+ def raw_docx():
21
+ return RawDocx("tests/test_files/example_1.docx")
22
+
23
+
24
+ @pytest.fixture
25
+ def temp_docx(tmp_path):
26
+ """Create a test docx file with various content"""
27
+ doc_path = tmp_path / "test.docx"
28
+ doc = DocxDocument()
29
+
30
+ # Add regular paragraph
31
+ doc.add_paragraph("Regular paragraph")
32
+
33
+ # Add list items using standard list style
34
+ doc.add_paragraph("First bullet point", style="List Bullet")
35
+ doc.add_paragraph("Second bullet point", style="List Bullet")
36
+
37
+ # Add table with merged cells
38
+ table = doc.add_table(rows=2, cols=2)
39
+ table.cell(0, 0).merge(table.cell(0, 1)) # Merge first row
40
+ table.cell(0, 0).text = "Merged cell"
41
+ table.cell(1, 0).text = "Cell 1"
42
+ table.cell(1, 1).text = "Cell 2"
43
+
44
+ # Add image if test file exists
45
+ image_path = os.path.join(os.path.dirname(__file__), "test_files", "test_image.png")
46
+ if os.path.exists(image_path):
47
+ doc.add_picture(image_path, width=Inches(1.0))
48
+
49
+ # Save the document
50
+ doc.save(doc_path)
51
+ return str(doc_path)
52
+
53
+
54
+ @pytest.fixture
55
+ def simple_docx(tmp_path):
56
+ """Create a simple test docx file"""
57
+ doc_path = tmp_path / "simple_test.docx"
58
+ doc = DocxDocument()
59
+ doc.add_paragraph("Simple paragraph")
60
+ doc.save(doc_path)
61
+ return str(doc_path)
62
+
63
+
64
+ # Integration Tests
65
+ def test_to_dict_with_document(raw_docx):
66
+ """Test converting RawDocx to dictionary with loaded document"""
67
+ result = raw_docx.to_dict()
68
+ assert result["type"] == "raw_docx"
69
+ assert result["document"] is not None
70
+ assert result["document"]["type"] == "document"
71
+ assert isinstance(result["document"]["sections"], list)
72
+
73
+
74
+ def test_initialization_and_processing(temp_docx):
75
+ """Test document initialization and processing"""
76
+ from raw_docx.raw_document import RawDocument
77
+ docx = RawDocx(temp_docx)
78
+ assert os.path.exists(docx.image_path)
79
+ assert isinstance(docx.target_document, RawDocument)
80
+ assert len(docx.target_document.sections) > 0
81
+
82
+
83
+ def test_table_processing(temp_docx):
84
+ """Test table processing with merged cells"""
85
+ docx = RawDocx(temp_docx)
86
+ tables = docx.target_document.sections[0].tables()
87
+ assert len(tables) > 0
88
+
89
+ # Check merged cells
90
+ first_table = tables[0]
91
+ first_row = first_table.rows[0]
92
+ assert len(first_row.cells) == 2
93
+ assert first_row.cells[0].h_span == 2 # Horizontally merged
94
+ assert first_row.cells[0].first is True
95
+ assert first_row.cells[1].first is False
96
+
97
+
98
+ def test_image_processing(temp_docx):
99
+ """Test image extraction and processing"""
100
+ docx = RawDocx(temp_docx)
101
+
102
+ # Check if image directory was created
103
+ assert os.path.exists(docx.image_path)
104
+
105
+ # Check if image is referenced in document
106
+ image_path = os.path.join(os.path.dirname(__file__), "test_files", "test_image.png")
107
+ if os.path.exists(image_path):
108
+ found_image = False
109
+ for section in docx.target_document.sections:
110
+ for item in section.items:
111
+ if hasattr(item, "filepath") and item.filepath.endswith(
112
+ (".png", ".jpg", ".jpeg")
113
+ ):
114
+ found_image = True
115
+ break
116
+ assert found_image, "Image not found in document"
117
+
118
+
119
+ def test_error_handling(tmp_path):
120
+ """Test error handling for invalid files and directories"""
121
+ # Test with non-existent file
122
+ with pytest.raises(Exception):
123
+ RawDocx(str(tmp_path / "nonexistent.docx"))
124
+
125
+ # Test with invalid file format
126
+ invalid_file = tmp_path / "invalid.txt"
127
+ invalid_file.write_text("Not a docx file")
128
+ with pytest.raises(Exception):
129
+ RawDocx(str(invalid_file))
130
+
131
+
132
+ # Coverage Tests - Error Handling and Edge Cases
133
+ def test_organise_dir_permission_error(simple_docx):
134
+ """Test _organise_dir with permission error"""
135
+ with patch('os.mkdir', side_effect=PermissionError("Permission denied")):
136
+ with patch.object(RawDocx, '_process'):
137
+ docx = RawDocx.__new__(RawDocx)
138
+ docx.errors = Mock()
139
+ docx.image_path = "/some/path"
140
+ docx._organise_dir()
141
+ docx.errors.exception.assert_called_once()
142
+
143
+
144
+ def test_process_exception_handling(simple_docx):
145
+ """Test _process with exception in processing"""
146
+ with patch.object(RawDocx, '_organise_dir'):
147
+ with patch.object(RawDocx, '_process_images'):
148
+ docx = RawDocx.__new__(RawDocx)
149
+ docx.errors = Mock()
150
+ docx.target_document = Mock()
151
+ docx.target_document.current_section.return_value = Mock()
152
+
153
+ with patch.object(docx, '_iter_block_items', side_effect=Exception("Test error")):
154
+ docx._process()
155
+ docx.errors.exception.assert_called()
156
+
157
+
158
+ # Coverage Tests - _is_heading Method
159
+ def test_is_heading_comprehensive(simple_docx):
160
+ """Test _is_heading method comprehensively"""
161
+ docx = RawDocx(simple_docx)
162
+
163
+ # Test valid patterns
164
+ assert docx._is_heading("Heading 1") == (True, 1)
165
+ assert docx._is_heading("Heading 2") == (True, 2)
166
+ assert docx._is_heading("heading 3") == (True, 3) # Case insensitive
167
+ assert docx._is_heading("HEADING 10") == (True, 10)
168
+ assert docx._is_heading("Some text with Heading 5 in it") == (True, 5)
169
+ assert docx._is_heading("Heading 7") == (True, 7) # Multiple spaces
170
+
171
+ # Test invalid patterns
172
+ assert docx._is_heading("Not a heading") == (False, 0)
173
+ assert docx._is_heading("Header 1") == (False, 0)
174
+ assert docx._is_heading("Heading") == (False, 0)
175
+ assert docx._is_heading("Heading abc") == (False, 0)
176
+ assert docx._is_heading("1 Heading") == (False, 0)
177
+
178
+ # Test edge cases
179
+ assert docx._is_heading("") == (False, 0)
180
+ assert docx._is_heading(None) == (False, 0)
181
+
182
+
183
+ # Coverage Tests - List Processing
184
+ def test_get_list_level_with_mock(simple_docx):
185
+ """Test get_list_level with properly mocked paragraph"""
186
+ docx = RawDocx(simple_docx)
187
+
188
+ # Create mock paragraph with _p attribute
189
+ mock_paragraph = Mock()
190
+ mock_p = Mock()
191
+ mock_p.xpath.return_value = ['2']
192
+ mock_paragraph._p = mock_p
193
+
194
+ level = docx.get_list_level(mock_paragraph)
195
+ assert level == 2
196
+
197
+ # Test with no level
198
+ mock_p.xpath.return_value = []
199
+ level = docx.get_list_level(mock_paragraph)
200
+ assert level == 0
201
+
202
+
203
+ def test_is_list_comprehensive(simple_docx):
204
+ """Test _is_list method comprehensively"""
205
+ docx = RawDocx(simple_docx)
206
+
207
+ # Test with xpath level
208
+ mock_paragraph = Mock()
209
+ mock_p = Mock()
210
+ mock_p.xpath.return_value = ['1']
211
+ mock_paragraph._p = mock_p
212
+ mock_paragraph.style = Mock()
213
+ mock_paragraph.style.name = "Normal"
214
+ mock_paragraph.text = "Regular text"
215
+
216
+ result = docx._is_list(mock_paragraph)
217
+ assert result is True
218
+
219
+ # Test with bullet styles
220
+ mock_p.xpath.return_value = []
221
+ mock_paragraph.style.name = "CPT_List Bullet"
222
+ result = docx._is_list(mock_paragraph)
223
+ assert result is True
224
+
225
+ mock_paragraph.style.name = "List Bullet"
226
+ result = docx._is_list(mock_paragraph)
227
+ assert result is True
228
+
229
+ # Test with bullet character
230
+ mock_paragraph.style.name = "Normal"
231
+ mock_paragraph.text = "• Bullet point"
232
+ result = docx._is_list(mock_paragraph)
233
+ assert result is True
234
+
235
+ # Test false cases
236
+ mock_paragraph.text = "Regular text"
237
+ result = docx._is_list(mock_paragraph)
238
+ assert result is False
239
+
240
+ mock_paragraph.text = ""
241
+ result = docx._is_list(mock_paragraph)
242
+ assert result is False
243
+
244
+ mock_paragraph.text = None
245
+ result = docx._is_list(mock_paragraph)
246
+ assert result is False
247
+
248
+
249
+ # Coverage Tests - Image Processing
250
+ def test_extract_images_with_media(simple_docx):
251
+ """Test _extract_images when zipfile contains media files"""
252
+ docx = RawDocx.__new__(RawDocx)
253
+ docx.full_path = simple_docx
254
+ docx.image_path = "/tmp/test_images"
255
+
256
+ # Create mock zipfile with media files
257
+ mock_file = Mock()
258
+ mock_file.filename = "word/media/image1.png"
259
+
260
+ mock_archive = Mock()
261
+ mock_archive.filelist = [mock_file]
262
+
263
+ # Mock the context manager for archive.open
264
+ mock_source = Mock()
265
+ mock_source.read.return_value = b"fake image data"
266
+ mock_archive.open.return_value.__enter__ = Mock(return_value=mock_source)
267
+ mock_archive.open.return_value.__exit__ = Mock(return_value=None)
268
+
269
+ # Mock the file writing
270
+ mock_target = Mock()
271
+ mock_target.__enter__ = Mock(return_value=mock_target)
272
+ mock_target.__exit__ = Mock(return_value=None)
273
+ mock_target.write = Mock()
274
+
275
+ with patch('zipfile.ZipFile', return_value=mock_archive):
276
+ with patch('builtins.open', return_value=mock_target):
277
+ docx._extract_images()
278
+ # Should have tried to extract the image
279
+ mock_archive.open.assert_called()
280
+
281
+
282
+ # Coverage Tests - Utility Methods
283
+ def test_tree_method(simple_docx):
284
+ """Test _tree method with nested children"""
285
+ docx = RawDocx(simple_docx)
286
+
287
+ # Create nested mock structure
288
+ grandchild = Mock()
289
+ grandchild.__iter__ = Mock(return_value=iter([]))
290
+
291
+ child = Mock()
292
+ child.__iter__ = Mock(return_value=iter([grandchild]))
293
+
294
+ root = Mock()
295
+ root.__iter__ = Mock(return_value=iter([child]))
296
+
297
+ # Should traverse the tree without errors
298
+ docx._tree(root)
299
+
300
+
301
+ def test_to_dict_edge_cases(simple_docx):
302
+ """Test to_dict method edge cases"""
303
+ # Test without target_document attribute
304
+ docx = RawDocx.__new__(RawDocx)
305
+ result = docx.to_dict()
306
+ assert result == {"type": "raw_docx", "document": None}
307
+
308
+ # Test with target_document that doesn't have to_dict method
309
+ docx.target_document = "not a proper document"
310
+ result = docx.to_dict()
311
+ assert result == {"type": "raw_docx", "document": None}
312
+
313
+ # Test with proper target_document
314
+ docx = RawDocx(simple_docx)
315
+ result = docx.to_dict()
316
+ assert result["type"] == "raw_docx"
317
+ assert result["document"] is not None
318
+ assert isinstance(result["document"], dict)
319
+
320
+
321
+ def test_iter_block_items_string_handling(simple_docx):
322
+ """Test _iter_block_items handles string children gracefully"""
323
+ docx = RawDocx(simple_docx)
324
+
325
+ # Mock iterchildren to return a string (should be handled gracefully)
326
+ with patch.object(docx.source_document.element.body, 'iterchildren', return_value=["string_child"]):
327
+ items = list(docx._iter_block_items(docx.source_document))
328
+ # Should handle gracefully and continue without error
329
+
330
+
331
+ def test_process_table_bottom_exception(simple_docx):
332
+ """Test _process_table when accessing bottom raises exception"""
333
+ docx = RawDocx(simple_docx)
334
+
335
+ # Create a mock table structure
336
+ mock_table = Mock()
337
+ mock_row = Mock()
338
+ mock_cell = Mock()
339
+ mock_tc = Mock()
340
+ mock_tc.right = 2
341
+ mock_tc.left = 1
342
+ mock_tc.top = 1
343
+
344
+ # Make bottom property raise an exception
345
+ type(mock_tc).bottom = PropertyMock(side_effect=Exception("Bottom error"))
346
+ mock_cell._tc = mock_tc
347
+ mock_row.cells = [mock_cell]
348
+ mock_table.rows = [mock_row]
349
+
350
+ with patch.object(docx, '_iter_block_items', return_value=[]):
351
+ target_section = docx.target_document.current_section()
352
+ # Should handle the exception and set bottom = top + 1
353
+ docx._process_table(mock_table, target_section)
354
+
355
+
356
+ def test_logic_error_exception():
357
+ """Test LogicError exception can be raised"""
358
+ with pytest.raises(RawDocx.LogicError):
359
+ raise RawDocx.LogicError("Test error")
360
+
361
+
362
+ # Additional Coverage Tests for Missing Lines
363
+ def test_process_with_unknown_block_item(simple_docx):
364
+ """Test _process with unknown block item type (lines 66-67)"""
365
+ with patch.object(RawDocx, '_organise_dir'):
366
+ with patch.object(RawDocx, '_process_images'):
367
+ docx = RawDocx.__new__(RawDocx)
368
+ docx.errors = Mock()
369
+ docx.target_document = Mock()
370
+ docx.target_document.current_section.return_value = Mock()
371
+
372
+ # Mock _iter_block_items to return unknown type that will trigger warning and ValueError
373
+ unknown_item = 123 # Not a Paragraph or Table
374
+ with patch.object(docx, '_iter_block_items', return_value=[unknown_item]):
375
+ docx._process()
376
+ # Should catch the exception and log it
377
+ docx.errors.exception.assert_called()
378
+
379
+
380
+
381
+
382
+ def test_process_table_with_invalid_child_type(simple_docx):
383
+ """Test _process_table with invalid child type (lines 153-163)"""
384
+ docx = RawDocx(simple_docx)
385
+
386
+ mock_table = Mock()
387
+ mock_row = Mock()
388
+ mock_cell = Mock()
389
+ mock_tc = Mock()
390
+ mock_tc.right = 2
391
+ mock_tc.left = 1
392
+ mock_tc.top = 1
393
+ mock_tc.bottom = 2
394
+ mock_cell._tc = mock_tc
395
+ mock_row.cells = [mock_cell]
396
+ mock_table.rows = [mock_row]
397
+
398
+ # Return invalid child type
399
+ invalid_child = "invalid_type"
400
+
401
+ with patch.object(docx, '_iter_block_items', return_value=[invalid_child]):
402
+ target_section = docx.target_document.current_section()
403
+ with pytest.raises(RawDocx.LogicError, match="something's not right with a child"):
404
+ docx._process_table(mock_table, target_section)
405
+
406
+
407
+ def test_process_paragraph_with_graphic_and_matching_rid(simple_docx):
408
+ """Test _process_paragraph with graphic containing matching rId (lines 198-201)"""
409
+ docx = RawDocx(simple_docx)
410
+
411
+ # Create mock paragraph with graphic
412
+ mock_paragraph = Mock(spec=Paragraph)
413
+ mock_paragraph.style = Mock()
414
+ mock_paragraph.style.name = "Normal"
415
+ mock_paragraph._p = Mock()
416
+ mock_paragraph._p.xml = "Some xml content with rId1 reference and Graphic"
417
+ mock_paragraph.extract_runs.return_value = []
418
+
419
+ target_section = Mock()
420
+ image_rels = {"rId1": "/path/to/image.png"}
421
+
422
+ with patch.object(docx, '_is_heading', return_value=(False, 0)):
423
+ with patch.object(docx, '_is_list', return_value=False):
424
+ with patch('src.raw_docx.raw_docx.RawImage') as mock_image_class:
425
+ mock_image = Mock()
426
+ mock_image_class.return_value = mock_image
427
+
428
+ docx._process_paragraph(mock_paragraph, target_section, image_rels)
429
+ # Should have created and added image
430
+ mock_image_class.assert_called_with("/path/to/image.png", docx.errors)
431
+ target_section.add.assert_called_with(mock_image)
432
+
433
+
434
+ def test_is_heading_with_exception_handling(simple_docx):
435
+ """Test _is_heading exception handling (line 230-231)"""
436
+ docx = RawDocx(simple_docx)
437
+
438
+ # Mock re.search to return a match but int() raises ValueError
439
+ with patch('src.raw_docx.raw_docx.re.search') as mock_search:
440
+ mock_match = Mock()
441
+ mock_match.group.side_effect = ValueError("Invalid conversion")
442
+ mock_search.return_value = mock_match
443
+
444
+ result = docx._is_heading("Heading 1")
445
+ # Should return (True, 0) when exception occurs during conversion
446
+ assert result == (True, 0)
447
+
448
+
449
+ def test_is_heading_with_index_error(simple_docx):
450
+ """Test _is_heading with IndexError (line 230-231)"""
451
+ docx = RawDocx(simple_docx)
452
+
453
+ with patch('src.raw_docx.raw_docx.re.search') as mock_search:
454
+ mock_match = Mock()
455
+ mock_match.group.side_effect = IndexError("Index error")
456
+ mock_search.return_value = mock_match
457
+
458
+ result = docx._is_heading("Heading 1")
459
+ # Should return (True, 0) when IndexError occurs
460
+ assert result == (True, 0)
461
+
462
+
463
+ def test_process_images_with_image_part(simple_docx):
464
+ """Test _process_images when document has image parts (line 76)"""
465
+ raw_docx_instance = RawDocx.__new__(RawDocx)
466
+ raw_docx_instance.errors = Mock()
467
+ raw_docx_instance.image_path = "/tmp/test"
468
+ raw_docx_instance.image_rels = {}
469
+
470
+ # Mock source document with image parts
471
+ mock_source_doc = Mock()
472
+ mock_part = Mock()
473
+ mock_rels = Mock()
474
+
475
+ # Create mock image part
476
+ import docx.parts.image
477
+ mock_image_part = Mock(spec=docx.parts.image.ImagePart)
478
+ mock_image_part.partname = "/word/media/image1.png"
479
+
480
+ mock_rel = Mock()
481
+ mock_rel._target = mock_image_part
482
+ mock_rel.rId = "rId1"
483
+
484
+ mock_rels.values.return_value = [mock_rel]
485
+ mock_part.rels = mock_rels
486
+ mock_source_doc.part = mock_part
487
+ raw_docx_instance.source_document = mock_source_doc
488
+
489
+ with patch.object(raw_docx_instance, '_extract_images'):
490
+ raw_docx_instance._process_images()
491
+ # Should have added image to image_rels
492
+ assert "rId1" in raw_docx_instance.image_rels
493
+ assert raw_docx_instance.image_rels["rId1"].endswith("image1.png")
494
+
495
+
496
+ def test_iter_block_items_invalid_parent(simple_docx):
497
+ """Test _iter_block_items with invalid parent type (line 93)"""
498
+ docx = RawDocx(simple_docx)
499
+
500
+ # Test with invalid parent type
501
+ invalid_parent = "not_a_document_or_cell"
502
+
503
+ with pytest.raises(ValueError, match="something's not right with the parent"):
504
+ list(docx._iter_block_items(invalid_parent))
505
+
506
+
507
+ def test_iter_block_items_unknown_child_type(simple_docx):
508
+ """Test _iter_block_items with unknown child type (line 117)"""
509
+ from docx.document import Document
510
+ docx = RawDocx(simple_docx)
511
+
512
+ # Mock document with unknown child type
513
+ mock_doc = Mock(spec=Document)
514
+ mock_body = Mock()
515
+
516
+ # Create unknown child type
517
+ unknown_child = Mock()
518
+ unknown_child.__class__ = type('UnknownType', (), {})
519
+
520
+ mock_body.iterchildren.return_value = [unknown_child]
521
+ mock_doc.element = Mock()
522
+ mock_doc.element.body = mock_body
523
+
524
+ with pytest.raises(ValueError, match="something's not right with a child"):
525
+ list(docx._iter_block_items(mock_doc))
526
+
527
+
528
+ def test_process_table_cell_none(simple_docx):
529
+ """Test _process_table when cell._tc is None (lines 145-146)"""
530
+ docx = RawDocx(simple_docx)
531
+
532
+ # Create mock table with cell._tc = None
533
+ mock_table = Mock()
534
+ mock_row = Mock()
535
+ mock_cell = Mock()
536
+ mock_cell._tc = None # This triggers lines 145-146
537
+ # Need to mock the attributes that are accessed when _tc is None
538
+ mock_cell.top = 0
539
+ mock_cell.left = 0
540
+ mock_row.cells = [mock_cell]
541
+ mock_table.rows = [mock_row]
542
+
543
+ with patch.object(docx, '_iter_block_items', return_value=[]):
544
+ target_section = docx.target_document.current_section()
545
+ docx._process_table(mock_table, target_section)
546
+ # Should handle None _tc and set default spans
547
+
548
+
549
+ def test_process_table_nested_table(simple_docx):
550
+ """Test _process_table with nested table (line 154)"""
551
+ docx = RawDocx(simple_docx)
552
+
553
+ # Create mock table structure
554
+ mock_table = Mock()
555
+ mock_row = Mock()
556
+ mock_cell = Mock()
557
+ mock_tc = Mock()
558
+ mock_tc.right = 2
559
+ mock_tc.left = 1
560
+ mock_tc.top = 1
561
+ mock_tc.bottom = 2
562
+ mock_cell._tc = mock_tc
563
+ mock_row.cells = [mock_cell]
564
+ mock_table.rows = [mock_row]
565
+
566
+ # Create nested table
567
+ nested_table = Mock(spec=Table)
568
+
569
+ with patch.object(docx, '_iter_block_items', return_value=[nested_table]):
570
+ target_section = docx.target_document.current_section()
571
+ with pytest.raises(RawDocx.LogicError, match="Table within table detected"):
572
+ docx._process_table(mock_table, target_section)
573
+
574
+
575
+ def test_process_table_with_etree_element_ct_tcpr(simple_docx):
576
+ """Test _process_table with CT_TcPr etree element (lines 156-157)"""
577
+ docx = RawDocx(simple_docx)
578
+
579
+ # Create mock table structure
580
+ mock_table = Mock()
581
+ mock_row = Mock()
582
+ mock_cell = Mock()
583
+ mock_tc = Mock()
584
+ mock_tc.right = 2
585
+ mock_tc.left = 1
586
+ mock_tc.top = 1
587
+ mock_tc.bottom = 2
588
+ mock_cell._tc = mock_tc
589
+ mock_row.cells = [mock_cell]
590
+ mock_table.rows = [mock_row]
591
+
592
+ # Create etree element with CT_TcPr tag
593
+ etree_element = Mock(spec=etree._Element)
594
+ etree_element.tag = CT_TcPr
595
+
596
+ with patch.object(docx, '_iter_block_items', return_value=[etree_element]):
597
+ target_section = docx.target_document.current_section()
598
+ # Should pass through CT_TcPr elements without error
599
+ docx._process_table(mock_table, target_section)
600
+
601
+
602
+ def test_process_table_with_unknown_etree_element(simple_docx):
603
+ """Test _process_table with unknown etree element (lines 158-159)"""
604
+ docx = RawDocx(simple_docx)
605
+
606
+ # Create mock table structure
607
+ mock_table = Mock()
608
+ mock_row = Mock()
609
+ mock_cell = Mock()
610
+ mock_tc = Mock()
611
+ mock_tc.right = 2
612
+ mock_tc.left = 1
613
+ mock_tc.top = 1
614
+ mock_tc.bottom = 2
615
+ mock_cell._tc = mock_tc
616
+ mock_row.cells = [mock_cell]
617
+ mock_table.rows = [mock_row]
618
+
619
+ # Create etree element with unknown tag
620
+ etree_element = Mock(spec=etree._Element)
621
+ etree_element.tag = "unknown_tag"
622
+
623
+ with patch.object(docx, '_iter_block_items', return_value=[etree_element]):
624
+ target_section = docx.target_document.current_section()
625
+ with patch.object(docx.errors, 'warning') as mock_warning:
626
+ docx._process_table(mock_table, target_section)
627
+ # Should log warning for unknown etree element
628
+ mock_warning.assert_called()
@@ -42,7 +42,32 @@ def test_to_text():
42
42
  ]
43
43
  for item in items:
44
44
  list.add(item)
45
- assert item.to_text() == " Item 2"
45
+ # Test the RawList.to_text() method to cover lines 30-33
46
+ result = list.to_text()
47
+ expected_lines = [
48
+ " Item 1", # Level 1 item (2 spaces for level 1)
49
+ " Item 1.1", # Level 2 item (4 spaces for level 2)
50
+ " Item 2" # Level 1 item (2 spaces for level 1)
51
+ ]
52
+ assert result == "\n".join(expected_lines)
53
+
54
+
55
+ def test_to_text_empty_list():
56
+ """Test to_text method with empty list"""
57
+ errors = Errors()
58
+ list = RawList(errors, 0)
59
+ result = list.to_text()
60
+ assert result == ""
61
+
62
+
63
+ def test_to_text_single_item():
64
+ """Test to_text method with single item"""
65
+ errors = Errors()
66
+ list = RawList(errors, 0)
67
+ item = RawListItem([RawRun("Single item", "", None, "Normal")], 0)
68
+ list.add(item)
69
+ result = list.to_text()
70
+ assert result == "Single item"
46
71
 
47
72
 
48
73
  def test_add_multiple_items():
@@ -95,3 +95,40 @@ def test_empty_paragraph():
95
95
  paragraph = RawParagraph([])
96
96
  assert paragraph.text == ""
97
97
  assert len(paragraph.runs) == 0
98
+
99
+
100
+ def test_add_span():
101
+ """Test adding span to paragraph text"""
102
+ run = RawRun("Hello World", "", None, "Normal")
103
+ paragraph = RawParagraph([run])
104
+
105
+ # Add span to the beginning of "Hello"
106
+ paragraph.add_span("Hello", "highlight")
107
+
108
+ # The text should now have the span wrapped around "Hello"
109
+ expected_text = '<span class="highlight">Hello</span> World'
110
+ assert paragraph.text == expected_text
111
+
112
+
113
+ def test_add_span_partial_match():
114
+ """Test adding span with partial text match"""
115
+ run = RawRun("Testing paragraph", "", None, "Normal")
116
+ paragraph = RawParagraph([run])
117
+
118
+ # Add span to "Test" at the beginning
119
+ paragraph.add_span("Test", "emphasis")
120
+
121
+ expected_text = '<span class="emphasis">Test</span>ing paragraph'
122
+ assert paragraph.text == expected_text
123
+
124
+
125
+ def test_add_span_exact_match():
126
+ """Test adding span when text matches exactly"""
127
+ run = RawRun("Test", "", None, "Normal")
128
+ paragraph = RawParagraph([run])
129
+
130
+ # Add span to entire text
131
+ paragraph.add_span("Test", "bold")
132
+
133
+ expected_text = '<span class="bold">Test</span>'
134
+ assert paragraph.text == expected_text
@@ -1 +0,0 @@
1
- __package_version__ = "0.7.0"
@@ -1,2 +0,0 @@
1
- python-docx
2
- simple_error_log
@@ -1,107 +0,0 @@
1
- import pytest
2
- import os
3
- from docx import Document as DocxDocument
4
- from docx.shared import Inches
5
- from src.raw_docx.raw_docx import RawDocx
6
- from src.raw_docx.raw_document import RawDocument
7
-
8
-
9
- @pytest.fixture
10
- def raw_docx():
11
- return RawDocx("tests/test_files/example_1.docx")
12
-
13
-
14
- @pytest.fixture
15
- def temp_docx(tmp_path):
16
- """Create a test docx file with various content"""
17
- doc_path = tmp_path / "test.docx"
18
- doc = DocxDocument()
19
-
20
- # Add regular paragraph
21
- doc.add_paragraph("Regular paragraph")
22
-
23
- # Add list items using standard list style
24
- doc.add_paragraph("First bullet point", style="List Bullet")
25
- doc.add_paragraph("Second bullet point", style="List Bullet")
26
-
27
- # Add table with merged cells
28
- table = doc.add_table(rows=2, cols=2)
29
- table.cell(0, 0).merge(table.cell(0, 1)) # Merge first row
30
- table.cell(0, 0).text = "Merged cell"
31
- table.cell(1, 0).text = "Cell 1"
32
- table.cell(1, 1).text = "Cell 2"
33
-
34
- # Add image if test file exists
35
- image_path = os.path.join(os.path.dirname(__file__), "test_files", "test_image.png")
36
- if os.path.exists(image_path):
37
- doc.add_picture(image_path, width=Inches(1.0))
38
-
39
- # Save the document
40
- doc.save(doc_path)
41
- return str(doc_path)
42
-
43
-
44
- def test_to_dict_with_document(raw_docx):
45
- """Test converting RawDocx to dictionary with loaded document"""
46
- result = raw_docx.to_dict()
47
- assert result["type"] == "raw_docx"
48
- assert result["document"] is not None
49
- assert result["document"]["type"] == "document"
50
- assert isinstance(result["document"]["sections"], list)
51
-
52
-
53
- def test_initialization_and_processing(temp_docx):
54
- """Test document initialization and processing"""
55
- docx = RawDocx(temp_docx)
56
- assert os.path.exists(docx.image_path)
57
- assert isinstance(docx.target_document, RawDocument)
58
- assert len(docx.target_document.sections) > 0
59
-
60
-
61
- def test_table_processing(temp_docx):
62
- """Test table processing with merged cells"""
63
- docx = RawDocx(temp_docx)
64
- tables = docx.target_document.sections[0].tables()
65
- assert len(tables) > 0
66
-
67
- # Check merged cells
68
- first_table = tables[0]
69
- first_row = first_table.rows[0]
70
- assert len(first_row.cells) == 2
71
- assert first_row.cells[0].h_span == 2 # Horizontally merged
72
- assert first_row.cells[0].first is True
73
- assert first_row.cells[1].first is False
74
-
75
-
76
- def test_image_processing(temp_docx):
77
- """Test image extraction and processing"""
78
- docx = RawDocx(temp_docx)
79
-
80
- # Check if image directory was created
81
- assert os.path.exists(docx.image_path)
82
-
83
- # Check if image is referenced in document
84
- image_path = os.path.join(os.path.dirname(__file__), "test_files", "test_image.png")
85
- if os.path.exists(image_path):
86
- found_image = False
87
- for section in docx.target_document.sections:
88
- for item in section.items:
89
- if hasattr(item, "filepath") and item.filepath.endswith(
90
- (".png", ".jpg", ".jpeg")
91
- ):
92
- found_image = True
93
- break
94
- assert found_image, "Image not found in document"
95
-
96
-
97
- def test_error_handling(tmp_path):
98
- """Test error handling for invalid files and directories"""
99
- # Test with non-existent file
100
- with pytest.raises(Exception):
101
- RawDocx(str(tmp_path / "nonexistent.docx"))
102
-
103
- # Test with invalid file format
104
- invalid_file = tmp_path / "invalid.txt"
105
- invalid_file.write_text("Not a docx file")
106
- with pytest.raises(Exception):
107
- RawDocx(str(invalid_file))
File without changes
File without changes
File without changes