raw-docx 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {raw_docx-0.5.0 → raw_docx-0.7.0}/PKG-INFO +4 -3
- {raw_docx-0.5.0 → raw_docx-0.7.0}/setup.py +2 -2
- raw_docx-0.7.0/src/raw_docx/__info__.py +1 -0
- raw_docx-0.7.0/src/raw_docx/__init__.py +25 -0
- {raw_docx-0.5.0/src/raw_docx → raw_docx-0.7.0/src/raw_docx/docx}/docx_paragraph.py +23 -22
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_docx.py +33 -32
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_image.py +4 -3
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_list.py +6 -5
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_list_item.py +2 -1
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_paragraph.py +5 -1
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx.egg-info/PKG-INFO +4 -3
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx.egg-info/SOURCES.txt +3 -4
- raw_docx-0.7.0/src/raw_docx.egg-info/requires.txt +2 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_docx_paragraph.py +20 -13
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_image.py +19 -9
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_list.py +40 -36
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_section.py +23 -9
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_table_cell.py +3 -1
- raw_docx-0.5.0/src/raw_docx/__version__.py +0 -1
- raw_docx-0.5.0/src/raw_docx/raw_logger.py +0 -67
- raw_docx-0.5.0/src/raw_docx.egg-info/requires.txt +0 -2
- raw_docx-0.5.0/tests/test_raw_logger.py +0 -112
- {raw_docx-0.5.0 → raw_docx-0.7.0}/LICENSE +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/README.md +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/setup.cfg +0 -0
- {raw_docx-0.5.0/src/raw_docx → raw_docx-0.7.0/src/raw_docx/docx}/__init__.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_document.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_run.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_section.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_table.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_table_cell.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx/raw_table_row.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx.egg-info/dependency_links.txt +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/src/raw_docx.egg-info/top_level.txt +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_integration.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_document.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_docx.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_list_item.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_paragraph.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_run.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_table.py +0 -0
- {raw_docx-0.5.0 → raw_docx-0.7.0}/tests/test_raw_table_row.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: raw_docx
|
3
|
-
Version: 0.5.0
|
3
|
+
Version: 0.7.0
|
4
4
|
Summary: A package for processing and analyzing raw document formats
|
5
5
|
Home-page: https://github.com/daveih/raw_docx
|
6
6
|
Author: Dave Iberson-Hurst
|
@@ -18,12 +18,13 @@ Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
19
19
|
License-File: LICENSE
|
20
20
|
Requires-Dist: python-docx
|
21
|
-
Requires-Dist:
|
21
|
+
Requires-Dist: simple_error_log
|
22
22
|
Dynamic: author
|
23
23
|
Dynamic: classifier
|
24
24
|
Dynamic: description
|
25
25
|
Dynamic: description-content-type
|
26
26
|
Dynamic: home-page
|
27
|
+
Dynamic: license-file
|
27
28
|
Dynamic: requires-dist
|
28
29
|
Dynamic: requires-python
|
29
30
|
Dynamic: summary
|
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
4
4
|
long_description = fh.read()
|
5
5
|
|
6
6
|
package_info = {}
|
7
|
-
with open("src/raw_docx/__version__.py") as fp:
|
7
|
+
with open("src/raw_docx/__info__.py") as fp:
|
8
8
|
exec(fp.read(), package_info)
|
9
9
|
|
10
10
|
setup(
|
@@ -19,7 +19,7 @@ setup(
|
|
19
19
|
packages=find_packages(where="src"),
|
20
20
|
package_dir={"": "src"},
|
21
21
|
package_data={},
|
22
|
-
install_requires=["python-docx", "
|
22
|
+
install_requires=["python-docx", "simple_error_log"],
|
23
23
|
tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
|
24
24
|
classifiers=[
|
25
25
|
"Development Status :: 3 - Alpha",
|
@@ -0,0 +1 @@
|
|
1
|
+
__package_version__ = "0.7.0"
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from .raw_docx import RawDocx
|
2
|
+
from .raw_document import RawDocument
|
3
|
+
from .raw_image import RawImage
|
4
|
+
from .raw_list_item import RawListItem
|
5
|
+
from .raw_list import RawList
|
6
|
+
from .raw_paragraph import RawParagraph
|
7
|
+
from .raw_run import RawRun
|
8
|
+
from .raw_section import RawSection
|
9
|
+
from .raw_table_cell import RawTableCell
|
10
|
+
from .raw_table_row import RawTableRow
|
11
|
+
from .raw_table import RawTable
|
12
|
+
|
13
|
+
__all__ = [
|
14
|
+
"RawDocx",
|
15
|
+
"RawDocument",
|
16
|
+
"RawImage",
|
17
|
+
"RawList",
|
18
|
+
"RawListItem",
|
19
|
+
"RawParagraph",
|
20
|
+
"RawRun",
|
21
|
+
"RawSection",
|
22
|
+
"RawTableCell",
|
23
|
+
"RawTableRow",
|
24
|
+
"RawTable",
|
25
|
+
]
|
@@ -1,31 +1,35 @@
|
|
1
1
|
from docx.text.paragraph import Paragraph
|
2
2
|
from docx.styles.style import ParagraphStyle
|
3
3
|
from docx.text.run import Run
|
4
|
-
from
|
5
|
-
from .raw_run import RawRun
|
4
|
+
from simple_error_log import Errors
|
5
|
+
from raw_docx.raw_run import RawRun
|
6
6
|
|
7
7
|
|
8
|
-
def extract_runs(paragraph: Paragraph) -> list[RawRun]:
|
8
|
+
def install():
|
9
|
+
setattr(Paragraph, "extract_runs", extract_runs)
|
10
|
+
|
11
|
+
|
12
|
+
def extract_runs(paragraph: Paragraph, errors: Errors) -> list[RawRun]:
|
9
13
|
if paragraph.text.startswith(
|
10
14
|
"This template is intended for interventional clinical trials. The template is suitable"
|
11
15
|
):
|
12
|
-
|
16
|
+
errors.info(f"Paragraph style {paragraph.style.name}")
|
13
17
|
data = [
|
14
18
|
{
|
15
19
|
"text": run.text,
|
16
|
-
"color": _get_run_color(paragraph.style, run),
|
17
|
-
"highlight": _get_highlight_color(run),
|
20
|
+
"color": _get_run_color(paragraph.style, run, errors),
|
21
|
+
"highlight": _get_highlight_color(run, errors),
|
18
22
|
"keep": True,
|
19
23
|
# "style": run.style.name if run.style else paragraph.style.name
|
20
24
|
"style": paragraph.style.name,
|
21
25
|
}
|
22
26
|
for run in paragraph.runs
|
23
27
|
]
|
24
|
-
data = _tidy_runs_color(data)
|
28
|
+
data = _tidy_runs_color(data, errors)
|
25
29
|
return [RawRun(x["text"], x["color"], x["highlight"], x["style"]) for x in data]
|
26
30
|
|
27
31
|
|
28
|
-
def _tidy_runs_color(data: list[dict]) -> list[dict]:
|
32
|
+
def _tidy_runs_color(data: list[dict], errors: Errors) -> list[dict]:
|
29
33
|
more = False
|
30
34
|
for index, run in enumerate(data):
|
31
35
|
if (
|
@@ -38,14 +42,14 @@ def _tidy_runs_color(data: list[dict]) -> list[dict]:
|
|
38
42
|
more = True
|
39
43
|
new_data = [x for x in data if x["keep"]]
|
40
44
|
if more:
|
41
|
-
new_data = _tidy_runs_color(new_data)
|
45
|
+
new_data = _tidy_runs_color(new_data, errors)
|
42
46
|
return new_data
|
43
47
|
|
44
48
|
|
45
|
-
def _get_run_color(paragraph: Paragraph, run: Run) -> str | None:
|
46
|
-
paragraph_color = _get_font_colour(paragraph)
|
47
|
-
font_color = _get_font_colour(run)
|
48
|
-
style_color = _run_style_color(run)
|
49
|
+
def _get_run_color(paragraph: Paragraph, run: Run, errors: Errors) -> str | None:
|
50
|
+
paragraph_color = _get_font_colour(paragraph, errors)
|
51
|
+
font_color = _get_font_colour(run, errors)
|
52
|
+
style_color = _run_style_color(run, errors)
|
49
53
|
if font_color:
|
50
54
|
result = str(font_color)
|
51
55
|
elif style_color:
|
@@ -55,15 +59,15 @@ def _get_run_color(paragraph: Paragraph, run: Run) -> str | None:
|
|
55
59
|
return result
|
56
60
|
|
57
61
|
|
58
|
-
def _get_highlight_color(run: Run) -> str | None:
|
62
|
+
def _get_highlight_color(run: Run, errors: Errors) -> str | None:
|
59
63
|
try:
|
60
64
|
return str(run.font.highlight_color)
|
61
65
|
except Exception as e:
|
62
|
-
|
66
|
+
errors.exception("Failed to get run highlight color", e)
|
63
67
|
return None
|
64
68
|
|
65
69
|
|
66
|
-
def _run_style_color(run: Run) -> str | None:
|
70
|
+
def _run_style_color(run: Run, errors: Errors) -> str | None:
|
67
71
|
try:
|
68
72
|
run_color = None
|
69
73
|
run_style = run.style
|
@@ -74,16 +78,13 @@ def _run_style_color(run: Run) -> str | None:
|
|
74
78
|
run_style = run_style.base_style
|
75
79
|
return run_color
|
76
80
|
except Exception as e:
|
77
|
-
|
81
|
+
errors.exception("Failed to get run style color", e)
|
78
82
|
return None
|
79
83
|
|
80
84
|
|
81
|
-
def _get_font_colour(item: Run | ParagraphStyle) -> str | None:
|
85
|
+
def _get_font_colour(item: Run | ParagraphStyle, errors: Errors) -> str | None:
|
82
86
|
try:
|
83
87
|
return item.font.color.rgb
|
84
88
|
except Exception as e:
|
85
|
-
|
89
|
+
errors.exception("Failed to get font color", e)
|
86
90
|
return None
|
87
|
-
|
88
|
-
|
89
|
-
setattr(Paragraph, "extract_runs", extract_runs)
|
@@ -3,15 +3,16 @@ import re
|
|
3
3
|
import docx
|
4
4
|
import zipfile
|
5
5
|
from pathlib import Path
|
6
|
-
from .raw_document import RawDocument
|
7
|
-
from .raw_section import RawSection
|
8
|
-
from .raw_paragraph import RawParagraph
|
9
|
-
from .raw_image import RawImage
|
10
|
-
from .raw_table import RawTable
|
11
|
-
from .raw_table_row import RawTableRow
|
12
|
-
from .raw_table_cell import RawTableCell
|
13
|
-
from .raw_list import RawList
|
14
|
-
from .raw_list_item import RawListItem
|
6
|
+
from raw_docx.raw_document import RawDocument
|
7
|
+
from raw_docx.raw_section import RawSection
|
8
|
+
from raw_docx.raw_paragraph import RawParagraph
|
9
|
+
from raw_docx.raw_image import RawImage
|
10
|
+
from raw_docx.raw_table import RawTable
|
11
|
+
from raw_docx.raw_table_row import RawTableRow
|
12
|
+
from raw_docx.raw_table_cell import RawTableCell
|
13
|
+
from raw_docx.raw_list import RawList
|
14
|
+
from raw_docx.raw_list_item import RawListItem
|
15
|
+
from raw_docx.docx.docx_paragraph import install
|
15
16
|
from docx import Document as DocXProcessor
|
16
17
|
from docx.document import Document
|
17
18
|
from docx.oxml.table import CT_Tbl, CT_TcPr
|
@@ -19,8 +20,7 @@ from docx.oxml.text.paragraph import CT_P
|
|
19
20
|
from docx.table import Table, _Cell
|
20
21
|
from docx.text.paragraph import Paragraph
|
21
22
|
from lxml import etree
|
22
|
-
from
|
23
|
-
from .docx_paragraph import extract_runs # Needed such that method inserted into class
|
23
|
+
from simple_error_log import Errors
|
24
24
|
|
25
25
|
|
26
26
|
class RawDocx:
|
@@ -28,12 +28,17 @@ class RawDocx:
|
|
28
28
|
pass
|
29
29
|
|
30
30
|
def __init__(self, full_path: str):
|
31
|
+
install()
|
32
|
+
self.errors = Errors()
|
31
33
|
path = Path(full_path)
|
32
34
|
# path.stem, path.suffix[1:]
|
33
35
|
self.full_path = full_path
|
34
36
|
self.dir = path.parent
|
35
37
|
self.filename = path.name
|
36
38
|
self.image_path = os.path.join(self.dir, "images")
|
39
|
+
self.errors.debug(
|
40
|
+
f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}"
|
41
|
+
)
|
37
42
|
self.image_rels = {}
|
38
43
|
self._organise_dir()
|
39
44
|
self.source_document = DocXProcessor(self.full_path)
|
@@ -46,28 +51,26 @@ class RawDocx:
|
|
46
51
|
except FileExistsError:
|
47
52
|
pass
|
48
53
|
except Exception as e:
|
49
|
-
|
54
|
+
self.errors.exception("Failed to create image directory", e)
|
50
55
|
|
51
56
|
def _process(self):
|
52
57
|
try:
|
53
|
-
self.
|
58
|
+
self._process_images()
|
54
59
|
for block_item in self._iter_block_items(self.source_document):
|
55
60
|
target_section = self.target_document.current_section()
|
56
61
|
if isinstance(block_item, Paragraph):
|
57
|
-
# print(f"PARA BLOCK: {block_item.text}")
|
58
62
|
self._process_paragraph(block_item, target_section, self.image_rels)
|
59
63
|
elif isinstance(block_item, Table):
|
60
64
|
self._process_table(block_item, target_section)
|
61
65
|
else:
|
62
|
-
|
66
|
+
self.errors.warning("Ignoring element")
|
63
67
|
raise ValueError
|
64
68
|
except Exception as e:
|
65
|
-
|
69
|
+
self.errors.exception("Exception raised processing document", e)
|
66
70
|
|
67
|
-
def
|
71
|
+
def _process_images(self):
|
68
72
|
# Extract images to image dir
|
69
73
|
self._extract_images()
|
70
|
-
# Save all 'rId:filenames' as references
|
71
74
|
for r in self.source_document.part.rels.values():
|
72
75
|
if isinstance(r._target, docx.parts.image.ImagePart):
|
73
76
|
self.image_rels[r.rId] = os.path.join(
|
@@ -91,9 +94,8 @@ class RawDocx:
|
|
91
94
|
|
92
95
|
for child in parent_elm.iterchildren():
|
93
96
|
if isinstance(child, str):
|
94
|
-
|
97
|
+
self.errors.warning(f"Ignoring eTree element {child}")
|
95
98
|
elif isinstance(child, CT_P):
|
96
|
-
# print(f"PARA: {child.text}")
|
97
99
|
yield Paragraph(child, parent)
|
98
100
|
elif isinstance(child, CT_Tbl):
|
99
101
|
yield Table(child, parent)
|
@@ -109,13 +111,12 @@ class RawDocx:
|
|
109
111
|
):
|
110
112
|
pass
|
111
113
|
else:
|
112
|
-
|
114
|
+
self.errors.warning(f"Ignoring eTree element {self._tree(child)}")
|
113
115
|
|
114
116
|
else:
|
115
117
|
raise ValueError(f"something's not right with a child {type(child)}")
|
116
118
|
|
117
119
|
def _tree(self, node, tab=1):
|
118
|
-
# print(f"{' ' * tab}{node.tag} {node.text}")
|
119
120
|
for child in node:
|
120
121
|
self._tree(child, tab + 1)
|
121
122
|
|
@@ -155,7 +156,9 @@ class RawDocx:
|
|
155
156
|
if block_item.tag == CT_TcPr:
|
156
157
|
pass
|
157
158
|
else:
|
158
|
-
|
159
|
+
self.errors.warning(
|
160
|
+
f"Ignoring eTree element {block_item.tag}"
|
161
|
+
)
|
159
162
|
else:
|
160
163
|
raise self.LogicError(
|
161
164
|
f"something's not right with a child {type(block_item)}"
|
@@ -164,15 +167,15 @@ class RawDocx:
|
|
164
167
|
def _process_cell(self, paragraph, target_cell: RawTableCell):
|
165
168
|
if self._is_list(paragraph):
|
166
169
|
list_level = self.get_list_level(paragraph)
|
167
|
-
item = RawListItem(paragraph.extract_runs(), list_level)
|
170
|
+
item = RawListItem(paragraph.extract_runs(self.errors), list_level)
|
168
171
|
if target_cell.is_in_list():
|
169
172
|
list = target_cell.current_list()
|
170
173
|
else:
|
171
|
-
list = RawList()
|
174
|
+
list = RawList(self.errors)
|
172
175
|
target_cell.add(list)
|
173
176
|
list.add(item)
|
174
177
|
else:
|
175
|
-
target_paragraph = RawParagraph(paragraph.extract_runs())
|
178
|
+
target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
|
176
179
|
target_cell.add(target_paragraph)
|
177
180
|
|
178
181
|
def _process_paragraph(
|
@@ -183,23 +186,21 @@ class RawDocx:
|
|
183
186
|
target_section = RawSection(paragraph.text, paragraph.text, level)
|
184
187
|
self.target_document.add(target_section)
|
185
188
|
elif self._is_list(paragraph):
|
186
|
-
# print(f"START LIST: {paragraph.text}")
|
187
189
|
list_level = self.get_list_level(paragraph)
|
188
|
-
item = RawListItem(paragraph.extract_runs(), list_level)
|
190
|
+
item = RawListItem(paragraph.extract_runs(self.errors), list_level)
|
189
191
|
if target_section.is_in_list():
|
190
192
|
list = target_section.current_list()
|
191
193
|
else:
|
192
|
-
list = RawList()
|
194
|
+
list = RawList(self.errors)
|
193
195
|
target_section.add(list)
|
194
196
|
list.add(item)
|
195
197
|
elif "Graphic" in paragraph._p.xml:
|
196
198
|
for rId in image_rels:
|
197
199
|
if rId in paragraph._p.xml:
|
198
|
-
target_image = RawImage(image_rels[rId])
|
200
|
+
target_image = RawImage(image_rels[rId], self.errors)
|
199
201
|
target_section.add(target_image)
|
200
202
|
else:
|
201
|
-
|
202
|
-
target_paragraph = RawParagraph(paragraph.extract_runs())
|
203
|
+
target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
|
203
204
|
target_section.add(target_paragraph)
|
204
205
|
|
205
206
|
def get_list_level(self, paragraph):
|
@@ -1,12 +1,13 @@
|
|
1
1
|
import os
|
2
2
|
import base64
|
3
|
-
from
|
3
|
+
from simple_error_log import Errors
|
4
4
|
|
5
5
|
|
6
6
|
class RawImage:
|
7
7
|
FILE_TYPE_MAP = {".png": "png", ".jpg": "jpg", ".jpeg": "jpg"}
|
8
8
|
|
9
|
-
def __init__(self, filepath: str):
|
9
|
+
def __init__(self, filepath: str, errors: Errors):
|
10
|
+
self.errors = errors
|
10
11
|
self.filepath = filepath
|
11
12
|
|
12
13
|
def to_html(self):
|
@@ -21,7 +22,7 @@ class RawImage:
|
|
21
22
|
else:
|
22
23
|
return f"""<p style="color:red">Note: Unable to process embedded image of type '{file_extension}', image ignored.</p>"""
|
23
24
|
except Exception as e:
|
24
|
-
|
25
|
+
self.errors.exception("Exception converting image", e)
|
25
26
|
return (
|
26
27
|
"""<p style="color:red">Note: Error encountered processing image.</p>"""
|
27
28
|
)
|
@@ -1,9 +1,10 @@
|
|
1
1
|
from .raw_list_item import RawListItem
|
2
|
-
from
|
2
|
+
from simple_error_log import Errors
|
3
3
|
|
4
4
|
|
5
5
|
class RawList:
|
6
|
-
def __init__(self, level=0):
|
6
|
+
def __init__(self, errors: Errors, level=0):
|
7
|
+
self.errors = errors
|
7
8
|
self.items = [] # List to store RawListItems and nested RawLists
|
8
9
|
self.level = level
|
9
10
|
|
@@ -13,15 +14,15 @@ class RawList:
|
|
13
14
|
elif item.level > self.level:
|
14
15
|
list = self.items[-1] if self.items else None
|
15
16
|
if not isinstance(list, RawList):
|
16
|
-
list = RawList(item.level)
|
17
|
+
list = RawList(self.errors, item.level)
|
17
18
|
self.items.append(list)
|
18
19
|
list.add(item)
|
19
20
|
if item.level > self.level + 1:
|
20
|
-
|
21
|
+
self.errors.warning(
|
21
22
|
f"Adding list item '{item}' to item but level jump greater than 1"
|
22
23
|
)
|
23
24
|
else:
|
24
|
-
|
25
|
+
self.errors.error(
|
25
26
|
f"Failed to add list item '{item}' to list '{self}', levels are in error"
|
26
27
|
)
|
27
28
|
|
@@ -12,7 +12,8 @@ class RawListItem(RawParagraph):
|
|
12
12
|
return f"{' ' * self.level}{self.text}"
|
13
13
|
|
14
14
|
def to_html(self) -> str:
|
15
|
-
return f"{
|
15
|
+
return f"{self.text}"
|
16
|
+
# return f"{escape(self.text)}"
|
16
17
|
|
17
18
|
def to_dict(self) -> dict:
|
18
19
|
return {"type": "list_item", "text": self.text, "level": self.level}
|
@@ -11,7 +11,7 @@ class RawParagraph:
|
|
11
11
|
def to_html(self) -> str:
|
12
12
|
klass_list = " ".join(self.klasses)
|
13
13
|
open_tag = f'<p class="{klass_list}">' if self.klasses else "<p>"
|
14
|
-
return f"{open_tag}{
|
14
|
+
return f"{open_tag}{self.text}</p>"
|
15
15
|
|
16
16
|
def find(self, text: str) -> bool:
|
17
17
|
return True if text in self.text else False
|
@@ -31,5 +31,9 @@ class RawParagraph:
|
|
31
31
|
"classes": self.klasses,
|
32
32
|
}
|
33
33
|
|
34
|
+
def add_span(self, text: str, klass: str) -> None:
|
35
|
+
new_str = f'<span class="{klass}">{text}</span>'
|
36
|
+
self.text = new_str + self.text[len(text) :]
|
37
|
+
|
34
38
|
def _run_text(self) -> str:
|
35
39
|
return "".join([run.text for run in self.runs])
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: raw_docx
|
3
|
-
Version: 0.5.0
|
3
|
+
Version: 0.7.0
|
4
4
|
Summary: A package for processing and analyzing raw document formats
|
5
5
|
Home-page: https://github.com/daveih/raw_docx
|
6
6
|
Author: Dave Iberson-Hurst
|
@@ -18,12 +18,13 @@ Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
19
19
|
License-File: LICENSE
|
20
20
|
Requires-Dist: python-docx
|
21
|
-
Requires-Dist:
|
21
|
+
Requires-Dist: simple_error_log
|
22
22
|
Dynamic: author
|
23
23
|
Dynamic: classifier
|
24
24
|
Dynamic: description
|
25
25
|
Dynamic: description-content-type
|
26
26
|
Dynamic: home-page
|
27
|
+
Dynamic: license-file
|
27
28
|
Dynamic: requires-dist
|
28
29
|
Dynamic: requires-python
|
29
30
|
Dynamic: summary
|
@@ -1,15 +1,13 @@
|
|
1
1
|
LICENSE
|
2
2
|
README.md
|
3
3
|
setup.py
|
4
|
+
src/raw_docx/__info__.py
|
4
5
|
src/raw_docx/__init__.py
|
5
|
-
src/raw_docx/__version__.py
|
6
|
-
src/raw_docx/docx_paragraph.py
|
7
6
|
src/raw_docx/raw_document.py
|
8
7
|
src/raw_docx/raw_docx.py
|
9
8
|
src/raw_docx/raw_image.py
|
10
9
|
src/raw_docx/raw_list.py
|
11
10
|
src/raw_docx/raw_list_item.py
|
12
|
-
src/raw_docx/raw_logger.py
|
13
11
|
src/raw_docx/raw_paragraph.py
|
14
12
|
src/raw_docx/raw_run.py
|
15
13
|
src/raw_docx/raw_section.py
|
@@ -21,6 +19,8 @@ src/raw_docx.egg-info/SOURCES.txt
|
|
21
19
|
src/raw_docx.egg-info/dependency_links.txt
|
22
20
|
src/raw_docx.egg-info/requires.txt
|
23
21
|
src/raw_docx.egg-info/top_level.txt
|
22
|
+
src/raw_docx/docx/__init__.py
|
23
|
+
src/raw_docx/docx/docx_paragraph.py
|
24
24
|
tests/test_docx_paragraph.py
|
25
25
|
tests/test_integration.py
|
26
26
|
tests/test_raw_document.py
|
@@ -28,7 +28,6 @@ tests/test_raw_docx.py
|
|
28
28
|
tests/test_raw_image.py
|
29
29
|
tests/test_raw_list.py
|
30
30
|
tests/test_raw_list_item.py
|
31
|
-
tests/test_raw_logger.py
|
32
31
|
tests/test_raw_paragraph.py
|
33
32
|
tests/test_raw_run.py
|
34
33
|
tests/test_raw_section.py
|
@@ -1,13 +1,14 @@
|
|
1
1
|
from unittest.mock import Mock, PropertyMock
|
2
2
|
from docx.text.paragraph import Paragraph
|
3
3
|
from docx.text.run import Run
|
4
|
-
from src.raw_docx.docx_paragraph import (
|
4
|
+
from src.raw_docx.docx.docx_paragraph import (
|
5
5
|
extract_runs,
|
6
6
|
_tidy_runs_color,
|
7
7
|
_get_highlight_color,
|
8
8
|
_run_style_color,
|
9
9
|
_get_font_colour,
|
10
10
|
)
|
11
|
+
from simple_error_log import Errors
|
11
12
|
|
12
13
|
|
13
14
|
def create_mock_run(text="", color=None, highlight=None, style=None):
|
@@ -56,45 +57,48 @@ def create_mock_paragraph(text="", style_name="Normal"):
|
|
56
57
|
|
57
58
|
|
58
59
|
def test_get_font_colour():
|
60
|
+
errors = Errors()
|
59
61
|
"""Test getting font color from a run"""
|
60
62
|
# Test with no color
|
61
63
|
run = create_mock_run()
|
62
|
-
assert _get_font_colour(run) is None
|
64
|
+
assert _get_font_colour(run, errors) is None
|
63
65
|
|
64
66
|
# Test with color
|
65
67
|
run = create_mock_run(color="FF0000")
|
66
|
-
assert _get_font_colour(run) == "FF0000"
|
68
|
+
assert _get_font_colour(run, errors) == "FF0000"
|
67
69
|
|
68
70
|
# Test with exception
|
69
71
|
run = Mock(spec=Run)
|
70
72
|
run.font = (
|
71
73
|
None # This should cause an AttributeError when code tries to access color
|
72
74
|
)
|
73
|
-
assert _get_font_colour(run) is None
|
75
|
+
assert _get_font_colour(run, errors) is None
|
74
76
|
|
75
77
|
|
76
78
|
def test_get_highlight_color():
|
79
|
+
errors = Errors()
|
77
80
|
"""Test getting highlight color from a run"""
|
78
81
|
# Test with no highlight
|
79
82
|
run = Mock(spec=Run)
|
80
83
|
run.font = None
|
81
|
-
assert _get_highlight_color(run) is None
|
84
|
+
assert _get_highlight_color(run, errors) is None
|
82
85
|
|
83
86
|
# Test with highlight
|
84
87
|
run = create_mock_run(highlight="yellow")
|
85
|
-
assert _get_highlight_color(run) == "yellow"
|
88
|
+
assert _get_highlight_color(run, errors) == "yellow"
|
86
89
|
|
87
90
|
|
88
91
|
def test_run_style_color():
|
92
|
+
errors = Errors()
|
89
93
|
"""Test getting color from run style"""
|
90
94
|
# Test with no style
|
91
95
|
run = create_mock_run()
|
92
|
-
assert _run_style_color(run) is None
|
96
|
+
assert _run_style_color(run, errors) is None
|
93
97
|
|
94
98
|
# Test with direct style color
|
95
99
|
run = create_mock_run(style="Normal")
|
96
100
|
type(run.style.font.color).rgb = PropertyMock(return_value="FF0000")
|
97
|
-
assert _run_style_color(run) == "FF0000"
|
101
|
+
assert _run_style_color(run, errors) == "FF0000"
|
98
102
|
|
99
103
|
# Test with base style color
|
100
104
|
run = create_mock_run(style="Normal")
|
@@ -104,10 +108,11 @@ def test_run_style_color():
|
|
104
108
|
type(base_style.font.color).rgb = PropertyMock(return_value="0000FF")
|
105
109
|
base_style.base_style = None
|
106
110
|
run.style.base_style = base_style
|
107
|
-
assert _run_style_color(run) == "0000FF"
|
111
|
+
assert _run_style_color(run, errors) == "0000FF"
|
108
112
|
|
109
113
|
|
110
114
|
def test_tidy_runs_color():
|
115
|
+
errors = Errors()
|
111
116
|
"""Test tidying up runs with colors"""
|
112
117
|
# Test with different colors - should not merge
|
113
118
|
data = [
|
@@ -126,7 +131,7 @@ def test_tidy_runs_color():
|
|
126
131
|
"keep": True,
|
127
132
|
},
|
128
133
|
]
|
129
|
-
result = _tidy_runs_color(data)
|
134
|
+
result = _tidy_runs_color(data, errors)
|
130
135
|
assert len(result) == 2
|
131
136
|
assert all(item["keep"] for item in result)
|
132
137
|
|
@@ -154,12 +159,13 @@ def test_tidy_runs_color():
|
|
154
159
|
"keep": True,
|
155
160
|
},
|
156
161
|
]
|
157
|
-
result = _tidy_runs_color(data)
|
162
|
+
result = _tidy_runs_color(data, errors)
|
158
163
|
assert len(result) == 1
|
159
164
|
assert result[0]["text"] == "Test More"
|
160
165
|
|
161
166
|
|
162
167
|
def test_extract_runs_mixed_styles():
|
168
|
+
errors = Errors()
|
163
169
|
"""Test extracting runs with different styles"""
|
164
170
|
paragraph = create_mock_paragraph()
|
165
171
|
runs = [
|
@@ -169,12 +175,13 @@ def test_extract_runs_mixed_styles():
|
|
169
175
|
]
|
170
176
|
paragraph.runs = runs
|
171
177
|
|
172
|
-
result = extract_runs(paragraph)
|
178
|
+
result = extract_runs(paragraph, errors)
|
173
179
|
assert len(result) == 3
|
174
180
|
assert [r.style for r in result] == ["Normal", "Normal", "Normal"]
|
175
181
|
|
176
182
|
|
177
183
|
def test_extract_runs_with_mixed_colors():
|
184
|
+
errors = Errors()
|
178
185
|
"""Test extracting runs with different colors and highlights"""
|
179
186
|
paragraph = create_mock_paragraph()
|
180
187
|
runs = [
|
@@ -184,6 +191,6 @@ def test_extract_runs_with_mixed_colors():
|
|
184
191
|
]
|
185
192
|
paragraph.runs = runs
|
186
193
|
|
187
|
-
result = extract_runs(paragraph)
|
194
|
+
result = extract_runs(paragraph, errors)
|
188
195
|
assert len(result) == 3
|
189
196
|
assert [r.color for r in result] == ["FF0000", "0000FF", "FF0000"]
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import pytest
|
2
2
|
from src.raw_docx.raw_image import RawImage
|
3
|
+
from simple_error_log import Errors
|
3
4
|
|
4
5
|
|
5
6
|
@pytest.fixture
|
@@ -27,45 +28,51 @@ def temp_image_unsupported(tmp_path):
|
|
27
28
|
|
28
29
|
|
29
30
|
def test_image_initialization_jpg(temp_image_jpg):
|
31
|
+
errors = Errors()
|
30
32
|
"""Test image initialization with JPG"""
|
31
|
-
image = RawImage(temp_image_jpg)
|
33
|
+
image = RawImage(temp_image_jpg, errors)
|
32
34
|
assert image.filepath == temp_image_jpg
|
33
35
|
|
34
36
|
|
35
37
|
def test_image_initialization_png(temp_image_png):
|
38
|
+
errors = Errors()
|
36
39
|
"""Test image initialization with PNG"""
|
37
|
-
image = RawImage(temp_image_png)
|
40
|
+
image = RawImage(temp_image_png, errors)
|
38
41
|
assert image.filepath == temp_image_png
|
39
42
|
|
40
43
|
|
41
44
|
def test_to_html_jpg(temp_image_jpg):
|
45
|
+
errors = Errors()
|
42
46
|
"""Test getting HTML for JPG image"""
|
43
|
-
image = RawImage(temp_image_jpg)
|
47
|
+
image = RawImage(temp_image_jpg, errors)
|
44
48
|
html = image.to_html()
|
45
49
|
assert "data:image/jpg;base64," in html
|
46
50
|
assert '<img alt="alt text" src=' in html
|
47
51
|
|
48
52
|
|
49
53
|
def test_to_html_png(temp_image_png):
|
54
|
+
errors = Errors()
|
50
55
|
"""Test getting HTML for PNG image"""
|
51
|
-
image = RawImage(temp_image_png)
|
56
|
+
image = RawImage(temp_image_png, errors)
|
52
57
|
html = image.to_html()
|
53
58
|
assert "data:image/png;base64," in html
|
54
59
|
assert '<img alt="alt text" src=' in html
|
55
60
|
|
56
61
|
|
57
62
|
def test_to_html_unsupported_format(temp_image_unsupported):
|
63
|
+
errors = Errors()
|
58
64
|
"""Test getting HTML for unsupported image format"""
|
59
|
-
image = RawImage(temp_image_unsupported)
|
65
|
+
image = RawImage(temp_image_unsupported, errors)
|
60
66
|
html = image.to_html()
|
61
67
|
assert "Unable to process embedded image" in html
|
62
68
|
assert "color:red" in html
|
63
69
|
|
64
70
|
|
65
71
|
def test_to_html_missing_file(tmp_path):
|
72
|
+
errors = Errors()
|
66
73
|
"""Test getting HTML for missing image file"""
|
67
74
|
missing_file = str(tmp_path / "missing.jpg")
|
68
|
-
image = RawImage(missing_file)
|
75
|
+
image = RawImage(missing_file, errors)
|
69
76
|
html = image.to_html()
|
70
77
|
assert "Error encountered processing image" in html
|
71
78
|
assert "color:red" in html
|
@@ -79,8 +86,9 @@ def test_supported_file_types():
|
|
79
86
|
|
80
87
|
|
81
88
|
def test_to_dict_jpg(temp_image_jpg):
|
89
|
+
errors = Errors()
|
82
90
|
"""Test converting JPG image to dictionary"""
|
83
|
-
image = RawImage(temp_image_jpg)
|
91
|
+
image = RawImage(temp_image_jpg, errors)
|
84
92
|
result = image.to_dict()
|
85
93
|
assert result["type"] == "image"
|
86
94
|
assert result["filepath"] == temp_image_jpg
|
@@ -89,8 +97,9 @@ def test_to_dict_jpg(temp_image_jpg):
|
|
89
97
|
|
90
98
|
|
91
99
|
def test_to_dict_png(temp_image_png):
|
100
|
+
errors = Errors()
|
92
101
|
"""Test converting PNG image to dictionary"""
|
93
|
-
image = RawImage(temp_image_png)
|
102
|
+
image = RawImage(temp_image_png, errors)
|
94
103
|
result = image.to_dict()
|
95
104
|
assert result["type"] == "image"
|
96
105
|
assert result["filepath"] == temp_image_png
|
@@ -99,8 +108,9 @@ def test_to_dict_png(temp_image_png):
|
|
99
108
|
|
100
109
|
|
101
110
|
def test_to_dict_unsupported(temp_image_unsupported):
|
111
|
+
errors = Errors()
|
102
112
|
"""Test converting unsupported image to dictionary"""
|
103
|
-
image = RawImage(temp_image_unsupported)
|
113
|
+
image = RawImage(temp_image_unsupported, errors)
|
104
114
|
result = image.to_dict()
|
105
115
|
assert result["type"] == "image"
|
106
116
|
assert result["filepath"] == temp_image_unsupported
|
@@ -1,13 +1,14 @@
|
|
1
1
|
import pytest
|
2
|
-
from unittest.mock import patch
|
3
2
|
from src.raw_docx.raw_list import RawList
|
4
3
|
from src.raw_docx.raw_list_item import RawListItem
|
5
4
|
from src.raw_docx.raw_run import RawRun
|
5
|
+
from simple_error_log import Errors
|
6
6
|
|
7
7
|
|
8
8
|
@pytest.fixture
|
9
9
|
def raw_list():
|
10
|
-
|
10
|
+
errors = Errors()
|
11
|
+
return RawList(errors)
|
11
12
|
|
12
13
|
|
13
14
|
@pytest.fixture
|
@@ -22,16 +23,18 @@ def test_list_initialization(raw_list):
|
|
22
23
|
|
23
24
|
|
24
25
|
def test_add_item(list_item):
|
26
|
+
errors = Errors()
|
25
27
|
"""Test adding an item to the list"""
|
26
|
-
list = RawList(1)
|
28
|
+
list = RawList(errors, 1)
|
27
29
|
list.add(list_item)
|
28
30
|
assert len(list.items) == 1
|
29
31
|
assert list.items[0] == list_item
|
30
32
|
|
31
33
|
|
32
34
|
def test_to_text():
|
35
|
+
errors = Errors()
|
33
36
|
"""Test to text"""
|
34
|
-
list = RawList(1)
|
37
|
+
list = RawList(errors, 1)
|
35
38
|
items = [
|
36
39
|
RawListItem([RawRun("Item 1", "", None, "Normal")], 1),
|
37
40
|
RawListItem([RawRun("Item 1.1", "", None, "Normal")], 2),
|
@@ -43,8 +46,9 @@ def test_to_text():
|
|
43
46
|
|
44
47
|
|
45
48
|
def test_add_multiple_items():
|
49
|
+
errors = Errors()
|
46
50
|
"""Test adding multiple items with different levels"""
|
47
|
-
list = RawList(1)
|
51
|
+
list = RawList(errors, 1)
|
48
52
|
items = [
|
49
53
|
RawListItem([RawRun("Item 1", "", None, "Normal")], 1),
|
50
54
|
RawListItem([RawRun("Item 1.1", "", None, "Normal")], 2),
|
@@ -57,22 +61,21 @@ def test_add_multiple_items():
|
|
57
61
|
|
58
62
|
|
59
63
|
def test_add_multiple_items_level_error():
|
64
|
+
errors = Errors()
|
60
65
|
"""Test adding multiple items with different levels with level error"""
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
assert "Adding list item" in error_msg
|
75
|
-
assert "to item but level jump greater than 1" in error_msg
|
66
|
+
list = RawList(errors, 1)
|
67
|
+
items = [
|
68
|
+
RawListItem([RawRun("Item 1", "", None, "Normal")], 1),
|
69
|
+
RawListItem([RawRun("Item 1.1.1", "", None, "Normal")], 3),
|
70
|
+
RawListItem([RawRun("Item 2", "", None, "Normal")], 1),
|
71
|
+
]
|
72
|
+
for item in items:
|
73
|
+
list.add(item)
|
74
|
+
assert len(list.items) == 3
|
75
|
+
assert [item.level for item in list.items] == [1, 3, 1]
|
76
|
+
assert errors.count() == 1
|
77
|
+
assert "Adding list item" in errors._items[0].message
|
78
|
+
assert "to item but level jump greater than 1" in errors._items[0].message
|
76
79
|
|
77
80
|
|
78
81
|
def test_to_html(raw_list):
|
@@ -89,8 +92,9 @@ def test_to_html(raw_list):
|
|
89
92
|
|
90
93
|
|
91
94
|
def test_nested_list_to_html():
|
95
|
+
errors = Errors()
|
92
96
|
"""Test converting nested list to HTML format"""
|
93
|
-
root_list = RawList(0)
|
97
|
+
root_list = RawList(errors, 0)
|
94
98
|
items = [
|
95
99
|
RawListItem([RawRun("Item 1", "", None, "Normal")], 1),
|
96
100
|
RawListItem([RawRun("Subitem 1.1", "", None, "Normal")], 2),
|
@@ -104,26 +108,25 @@ def test_nested_list_to_html():
|
|
104
108
|
|
105
109
|
|
106
110
|
def test_add_item_lower_level_logs_error():
|
111
|
+
errors = Errors()
|
107
112
|
"""Test that adding an item with lower level than list level logs an error"""
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
error_msg = mock_logger.error.call_args[0][0]
|
117
|
-
assert "Failed to add list item" in error_msg
|
118
|
-
assert "levels are in error" in error_msg
|
113
|
+
list_obj = RawList(errors, 2) # List with level 2
|
114
|
+
item = RawListItem(
|
115
|
+
[RawRun("Test Item", "", None, "Normal")], 1
|
116
|
+
) # Item with lower level 1
|
117
|
+
list_obj.add(item)
|
118
|
+
assert errors.count() == 1
|
119
|
+
assert "Failed to add list item" in errors._items[0].message
|
120
|
+
assert "levels are in error" in errors._items[0].message
|
119
121
|
|
120
122
|
|
121
123
|
def test_to_dict():
|
124
|
+
errors = Errors()
|
122
125
|
"""Test converting list to dictionary"""
|
123
|
-
list_obj = RawList(1)
|
126
|
+
list_obj = RawList(errors, 1)
|
124
127
|
item1 = RawListItem([RawRun("Item 1", "", None, "Normal")], 1)
|
125
128
|
item2 = RawListItem([RawRun("Item 2", "", None, "Normal")], 1)
|
126
|
-
sublist = RawList(2)
|
129
|
+
sublist = RawList(errors, 2)
|
127
130
|
sublist.add(RawListItem([RawRun("Subitem 1", "", None, "Normal")], 2))
|
128
131
|
|
129
132
|
list_obj.add(item1)
|
@@ -151,10 +154,11 @@ def test_to_dict():
|
|
151
154
|
|
152
155
|
|
153
156
|
def test_all_items():
|
154
|
-
|
157
|
+
errors = Errors()
|
158
|
+
list_obj = RawList(errors, 1)
|
155
159
|
item1 = RawListItem([RawRun("Item 1", "", None, "Normal")], 1)
|
156
160
|
item2 = RawListItem([RawRun("Item 2", "", None, "Normal")], 1)
|
157
|
-
sublist = RawList(2)
|
161
|
+
sublist = RawList(errors, 2)
|
158
162
|
item3 = RawListItem([RawRun("Subitem 1", "", None, "Normal")], 2)
|
159
163
|
sublist.add(item3)
|
160
164
|
list_obj.add(item1)
|
@@ -8,6 +8,7 @@ from src.raw_docx.raw_table_row import RawTableRow
|
|
8
8
|
from src.raw_docx.raw_table_cell import RawTableCell
|
9
9
|
from src.raw_docx.raw_list_item import RawListItem
|
10
10
|
from src.raw_docx.raw_run import RawRun
|
11
|
+
from simple_error_log import Errors
|
11
12
|
|
12
13
|
|
13
14
|
@pytest.fixture
|
@@ -23,9 +24,10 @@ def paragraph():
|
|
23
24
|
|
24
25
|
@pytest.fixture
|
25
26
|
def image(tmp_path):
|
27
|
+
errors = Errors()
|
26
28
|
image_path = tmp_path / "test.jpg"
|
27
29
|
image_path.write_bytes(b"dummy image content")
|
28
|
-
return RawImage(str(image_path))
|
30
|
+
return RawImage(str(image_path), errors)
|
29
31
|
|
30
32
|
|
31
33
|
def test_section_initialization(section):
|
@@ -37,6 +39,7 @@ def test_section_initialization(section):
|
|
37
39
|
|
38
40
|
|
39
41
|
def test_section_initialization_strip(section):
|
42
|
+
# errors = Errors()
|
40
43
|
"""Test section initialization"""
|
41
44
|
section = RawSection(" Test Section ", " Test Content ", 1)
|
42
45
|
assert section.title == "Test Section"
|
@@ -46,6 +49,7 @@ def test_section_initialization_strip(section):
|
|
46
49
|
|
47
50
|
|
48
51
|
def test_add_paragraph(section, paragraph):
|
52
|
+
# errors = Errors()
|
49
53
|
"""Test adding a paragraph to section"""
|
50
54
|
section.add(paragraph)
|
51
55
|
assert len(section.items) == 1
|
@@ -53,6 +57,7 @@ def test_add_paragraph(section, paragraph):
|
|
53
57
|
|
54
58
|
|
55
59
|
def test_add_image(section, image):
|
60
|
+
# errors = Errors()
|
56
61
|
"""Test adding an image to section"""
|
57
62
|
section.add(image)
|
58
63
|
assert len(section.items) == 1
|
@@ -60,13 +65,15 @@ def test_add_image(section, image):
|
|
60
65
|
|
61
66
|
|
62
67
|
def test_is_in_list_empty_section(section):
|
68
|
+
# errors = Errors()
|
63
69
|
"""Test is_in_list with empty section"""
|
64
70
|
assert not section.is_in_list()
|
65
71
|
|
66
72
|
|
67
73
|
def test_is_in_list_with_list(section):
|
74
|
+
errors = Errors()
|
68
75
|
"""Test is_in_list with a list"""
|
69
|
-
section.add(RawList())
|
76
|
+
section.add(RawList(errors))
|
70
77
|
assert section.is_in_list()
|
71
78
|
|
72
79
|
|
@@ -76,8 +83,9 @@ def test_current_list_no_list(section):
|
|
76
83
|
|
77
84
|
|
78
85
|
def test_current_list_with_list(section):
|
86
|
+
errors = Errors()
|
79
87
|
"""Test current_list with existing list"""
|
80
|
-
test_list = RawList()
|
88
|
+
test_list = RawList(errors)
|
81
89
|
section.add(test_list)
|
82
90
|
assert section.current_list() == test_list
|
83
91
|
|
@@ -105,6 +113,7 @@ def test_to_html_between(section):
|
|
105
113
|
|
106
114
|
|
107
115
|
def test_paragraphs(section):
|
116
|
+
errors = Errors()
|
108
117
|
"""Test getting all paragraphs"""
|
109
118
|
run1 = RawRun("First", "", "", "Normal")
|
110
119
|
run2 = RawRun("Second", "", "", "Normal")
|
@@ -113,7 +122,7 @@ def test_paragraphs(section):
|
|
113
122
|
p2 = RawParagraph([run2])
|
114
123
|
|
115
124
|
section.add(p1)
|
116
|
-
section.add(RawList()) # Add non-paragraph item
|
125
|
+
section.add(RawList(errors)) # Add non-paragraph item
|
117
126
|
section.add(p2)
|
118
127
|
|
119
128
|
paragraphs = section.paragraphs()
|
@@ -133,9 +142,10 @@ def test_tables(section):
|
|
133
142
|
|
134
143
|
|
135
144
|
def test_lists(section):
|
145
|
+
errors = Errors()
|
136
146
|
"""Test getting all lists"""
|
137
147
|
run = RawRun("Test", "", "", "Normal")
|
138
|
-
list1 = RawList()
|
148
|
+
list1 = RawList(errors)
|
139
149
|
section.add(RawParagraph([run]))
|
140
150
|
section.add(list1)
|
141
151
|
lists = section.lists()
|
@@ -209,9 +219,10 @@ def test_find_first_at_start_not_found(section):
|
|
209
219
|
|
210
220
|
|
211
221
|
def test_has_lists(section):
|
222
|
+
errors = Errors()
|
212
223
|
"""Test checking if section has lists"""
|
213
224
|
assert not section.has_lists()
|
214
|
-
section.add(RawList())
|
225
|
+
section.add(RawList(errors))
|
215
226
|
assert section.has_lists()
|
216
227
|
|
217
228
|
|
@@ -245,6 +256,7 @@ def test_next(section):
|
|
245
256
|
|
246
257
|
|
247
258
|
def test_next_paragraph(section):
|
259
|
+
errors = Errors()
|
248
260
|
"""Test getting next paragraph"""
|
249
261
|
run1 = RawRun("First", "", "", "Normal")
|
250
262
|
run2 = RawRun("Second", "", "", "Normal")
|
@@ -253,7 +265,7 @@ def test_next_paragraph(section):
|
|
253
265
|
p2 = RawParagraph([run2])
|
254
266
|
|
255
267
|
section.add(p1)
|
256
|
-
section.add(RawList()) # Add non-paragraph item
|
268
|
+
section.add(RawList(errors)) # Add non-paragraph item
|
257
269
|
section.add(p2)
|
258
270
|
|
259
271
|
assert section.next_paragraph(0) == p1
|
@@ -293,12 +305,13 @@ def test_format_heading(section):
|
|
293
305
|
|
294
306
|
|
295
307
|
def test_to_dict(section):
|
308
|
+
errors = Errors()
|
296
309
|
"""Test converting section to dictionary"""
|
297
310
|
# Add various types of content
|
298
311
|
run = RawRun("Test paragraph", "", "", "Normal")
|
299
312
|
section.add(RawParagraph([run]))
|
300
313
|
|
301
|
-
list_obj = RawList(1)
|
314
|
+
list_obj = RawList(errors, 1)
|
302
315
|
list_obj.add(RawListItem([RawRun("Test item", "", None, "Normal")], 1))
|
303
316
|
section.add(list_obj)
|
304
317
|
|
@@ -407,6 +420,7 @@ def test_section_search(section):
|
|
407
420
|
|
408
421
|
|
409
422
|
def test_section_list_operations(section):
|
423
|
+
errors = Errors()
|
410
424
|
"""Test list-related operations"""
|
411
425
|
# Test empty section
|
412
426
|
assert not section.is_in_list()
|
@@ -414,7 +428,7 @@ def test_section_list_operations(section):
|
|
414
428
|
assert not section.has_lists()
|
415
429
|
|
416
430
|
# Add a list
|
417
|
-
list1 = RawList(1)
|
431
|
+
list1 = RawList(errors, 1)
|
418
432
|
section.add(list1)
|
419
433
|
|
420
434
|
# Test with list
|
@@ -4,11 +4,13 @@ from src.raw_docx.raw_paragraph import RawParagraph
|
|
4
4
|
from src.raw_docx.raw_list import RawList
|
5
5
|
from src.raw_docx.raw_table import RawTable
|
6
6
|
from src.raw_docx.raw_run import RawRun
|
7
|
+
from simple_error_log import Errors
|
7
8
|
|
8
9
|
|
9
10
|
@pytest.fixture
|
10
11
|
def list():
|
11
|
-
|
12
|
+
errors = Errors()
|
13
|
+
return RawList(errors)
|
12
14
|
|
13
15
|
|
14
16
|
@pytest.fixture
|
@@ -1 +0,0 @@
|
|
1
|
-
__package_version__ = "0.5.0"
|
@@ -1,67 +0,0 @@
|
|
1
|
-
import sys
|
2
|
-
import logging
|
3
|
-
from pathlib import Path
|
4
|
-
from typing import Optional
|
5
|
-
from pythonjsonlogger import jsonlogger
|
6
|
-
|
7
|
-
|
8
|
-
class RawLogger:
|
9
|
-
_instance = None
|
10
|
-
_initialized = False
|
11
|
-
|
12
|
-
def __new__(cls):
|
13
|
-
if cls._instance is None:
|
14
|
-
cls._instance = super().__new__(cls)
|
15
|
-
return cls._instance
|
16
|
-
|
17
|
-
def __init__(self):
|
18
|
-
if not RawLogger._initialized:
|
19
|
-
self.logger = logging.getLogger("raw_docx")
|
20
|
-
self.logger.setLevel(logging.INFO)
|
21
|
-
|
22
|
-
# Create JSON formatter
|
23
|
-
formatter = jsonlogger.JsonFormatter(
|
24
|
-
fmt="%(asctime)s %(name)s %(levelname)s %(message)s",
|
25
|
-
datefmt="%Y-%m-%d %H:%M:%S",
|
26
|
-
)
|
27
|
-
|
28
|
-
# Console handler
|
29
|
-
console_handler = logging.StreamHandler(sys.stdout)
|
30
|
-
console_handler.setFormatter(formatter)
|
31
|
-
self.logger.addHandler(console_handler)
|
32
|
-
|
33
|
-
RawLogger._initialized = True
|
34
|
-
|
35
|
-
def setup_file_logging(self, log_dir: Optional[str] = None):
|
36
|
-
"""Setup file logging in addition to console logging"""
|
37
|
-
if log_dir:
|
38
|
-
log_path = Path(log_dir)
|
39
|
-
log_path.mkdir(parents=True, exist_ok=True)
|
40
|
-
file_handler = logging.FileHandler(log_path / "raw_docx.log")
|
41
|
-
file_handler.setFormatter(
|
42
|
-
jsonlogger.JsonFormatter(
|
43
|
-
fmt="%(asctime)s %(name)s %(levelname)s %(message)s",
|
44
|
-
datefmt="%Y-%m-%d %H:%M:%S",
|
45
|
-
)
|
46
|
-
)
|
47
|
-
self.logger.addHandler(file_handler)
|
48
|
-
|
49
|
-
def info(self, message: str):
|
50
|
-
"""Log info message"""
|
51
|
-
self.logger.info(message)
|
52
|
-
|
53
|
-
def warning(self, message: str):
|
54
|
-
"""Log warning message"""
|
55
|
-
self.logger.warning(message)
|
56
|
-
|
57
|
-
def error(self, message: str):
|
58
|
-
"""Log error message"""
|
59
|
-
self.logger.error(message)
|
60
|
-
|
61
|
-
def exception(self, message: str, exc: Exception):
|
62
|
-
"""Log exception with message"""
|
63
|
-
self.logger.exception(message, exc_info=exc)
|
64
|
-
|
65
|
-
|
66
|
-
# Create singleton instance
|
67
|
-
logger = RawLogger()
|
@@ -1,112 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import logging
|
3
|
-
import pytest
|
4
|
-
from pathlib import Path
|
5
|
-
from src.raw_docx.raw_logger import RawLogger
|
6
|
-
|
7
|
-
|
8
|
-
@pytest.fixture
|
9
|
-
def logger_instance():
|
10
|
-
"""Fixture to provide a fresh logger instance for each test"""
|
11
|
-
# Reset the singleton state
|
12
|
-
RawLogger._instance = None
|
13
|
-
RawLogger._initialized = False
|
14
|
-
|
15
|
-
# Clear any existing handlers
|
16
|
-
logger = logging.getLogger("raw_docx")
|
17
|
-
logger.handlers.clear()
|
18
|
-
|
19
|
-
return RawLogger()
|
20
|
-
|
21
|
-
|
22
|
-
@pytest.fixture
|
23
|
-
def temp_log_dir(tmp_path):
|
24
|
-
"""Fixture to provide a temporary directory for log files"""
|
25
|
-
log_dir = tmp_path / "logs"
|
26
|
-
log_dir.mkdir()
|
27
|
-
return str(log_dir)
|
28
|
-
|
29
|
-
|
30
|
-
def test_singleton_pattern():
|
31
|
-
"""Test that RawLogger implements singleton pattern correctly"""
|
32
|
-
logger1 = RawLogger()
|
33
|
-
logger2 = RawLogger()
|
34
|
-
assert logger1 is logger2
|
35
|
-
|
36
|
-
|
37
|
-
def test_default_initialization(logger_instance):
|
38
|
-
"""Test default logger initialization"""
|
39
|
-
assert logger_instance.logger.level == logging.INFO
|
40
|
-
assert len(logger_instance.logger.handlers) == 1
|
41
|
-
assert isinstance(logger_instance.logger.handlers[0], logging.StreamHandler)
|
42
|
-
|
43
|
-
|
44
|
-
def test_file_logging_setup(logger_instance, temp_log_dir):
|
45
|
-
"""Test setting up file logging"""
|
46
|
-
logger_instance.setup_file_logging(temp_log_dir)
|
47
|
-
|
48
|
-
# Check that a file handler was added
|
49
|
-
assert len(logger_instance.logger.handlers) == 2
|
50
|
-
assert any(
|
51
|
-
isinstance(h, logging.FileHandler) for h in logger_instance.logger.handlers
|
52
|
-
)
|
53
|
-
|
54
|
-
# Check that log file was created
|
55
|
-
log_file = Path(temp_log_dir) / "raw_docx.log"
|
56
|
-
assert log_file.exists()
|
57
|
-
|
58
|
-
|
59
|
-
def test_log_message_format(logger_instance, temp_log_dir, caplog):
|
60
|
-
"""Test that log messages are properly formatted as JSON"""
|
61
|
-
logger_instance.setup_file_logging(temp_log_dir)
|
62
|
-
|
63
|
-
test_message = "Test log message"
|
64
|
-
logger_instance.info(test_message)
|
65
|
-
|
66
|
-
# Read the log file
|
67
|
-
log_file = Path(temp_log_dir) / "raw_docx.log"
|
68
|
-
with open(log_file) as f:
|
69
|
-
log_entry = json.loads(f.readline())
|
70
|
-
|
71
|
-
# Check JSON structure
|
72
|
-
assert "asctime" in log_entry
|
73
|
-
assert "name" in log_entry
|
74
|
-
assert "levelname" in log_entry
|
75
|
-
assert "message" in log_entry
|
76
|
-
assert log_entry["message"] == test_message
|
77
|
-
assert log_entry["levelname"] == "INFO"
|
78
|
-
|
79
|
-
|
80
|
-
def test_log_levels(logger_instance, caplog):
|
81
|
-
"""Test different log levels"""
|
82
|
-
test_message = "Test message"
|
83
|
-
|
84
|
-
logger_instance.info(test_message)
|
85
|
-
assert "INFO" in caplog.text
|
86
|
-
|
87
|
-
logger_instance.warning(test_message)
|
88
|
-
assert "WARNING" in caplog.text
|
89
|
-
|
90
|
-
logger_instance.error(test_message)
|
91
|
-
assert "ERROR" in caplog.text
|
92
|
-
|
93
|
-
|
94
|
-
def test_exception_logging(logger_instance, caplog):
|
95
|
-
"""Test exception logging"""
|
96
|
-
try:
|
97
|
-
raise ValueError("Test exception")
|
98
|
-
except ValueError as e:
|
99
|
-
logger_instance.exception("Error occurred", e)
|
100
|
-
|
101
|
-
assert "ERROR" in caplog.text
|
102
|
-
assert "Test exception" in caplog.text
|
103
|
-
|
104
|
-
|
105
|
-
def test_invalid_log_directory(logger_instance, tmp_path):
|
106
|
-
"""Test handling of invalid log directory"""
|
107
|
-
invalid_dir = tmp_path / "nonexistent" / "logs"
|
108
|
-
logger_instance.setup_file_logging(str(invalid_dir))
|
109
|
-
|
110
|
-
# Check that the directory was created
|
111
|
-
assert invalid_dir.exists()
|
112
|
-
assert invalid_dir.is_dir()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|