raw-docx 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {raw_docx-0.4.0 → raw_docx-0.6.0}/PKG-INFO +4 -7
  2. {raw_docx-0.4.0 → raw_docx-0.6.0}/setup.py +8 -11
  3. raw_docx-0.6.0/src/raw_docx/__init__.py +28 -0
  4. raw_docx-0.6.0/src/raw_docx/__version__.py +1 -0
  5. raw_docx-0.6.0/src/raw_docx/docx_paragraph.py +89 -0
  6. raw_docx-0.6.0/src/raw_docx/raw_document.py +64 -0
  7. raw_docx-0.6.0/src/raw_docx/raw_docx.py +256 -0
  8. raw_docx-0.6.0/src/raw_docx/raw_image.py +37 -0
  9. raw_docx-0.6.0/src/raw_docx/raw_list.py +69 -0
  10. raw_docx-0.6.0/src/raw_docx/raw_list_item.py +21 -0
  11. raw_docx-0.6.0/src/raw_docx/raw_logger.py +67 -0
  12. raw_docx-0.6.0/src/raw_docx/raw_paragraph.py +35 -0
  13. raw_docx-0.6.0/src/raw_docx/raw_run.py +15 -0
  14. raw_docx-0.6.0/src/raw_docx/raw_section.py +119 -0
  15. raw_docx-0.6.0/src/raw_docx/raw_table.py +48 -0
  16. raw_docx-0.6.0/src/raw_docx/raw_table_cell.py +62 -0
  17. raw_docx-0.6.0/src/raw_docx/raw_table_row.py +41 -0
  18. {raw_docx-0.4.0 → raw_docx-0.6.0}/src/raw_docx.egg-info/PKG-INFO +4 -7
  19. {raw_docx-0.4.0 → raw_docx-0.6.0}/src/raw_docx.egg-info/SOURCES.txt +15 -0
  20. raw_docx-0.6.0/src/raw_docx.egg-info/requires.txt +2 -0
  21. raw_docx-0.6.0/src/raw_docx.egg-info/top_level.txt +1 -0
  22. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_docx_paragraph.py +1 -1
  23. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_integration.py +1 -1
  24. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_document.py +4 -4
  25. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_docx.py +2 -2
  26. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_image.py +1 -1
  27. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_list.py +5 -5
  28. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_list_item.py +2 -2
  29. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_logger.py +1 -1
  30. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_paragraph.py +2 -2
  31. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_run.py +1 -1
  32. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_section.py +9 -9
  33. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_table.py +5 -5
  34. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_table_cell.py +5 -5
  35. {raw_docx-0.4.0 → raw_docx-0.6.0}/tests/test_raw_table_row.py +4 -4
  36. raw_docx-0.4.0/src/raw_docx.egg-info/requires.txt +0 -5
  37. raw_docx-0.4.0/src/raw_docx.egg-info/top_level.txt +0 -1
  38. {raw_docx-0.4.0 → raw_docx-0.6.0}/LICENSE +0 -0
  39. {raw_docx-0.4.0 → raw_docx-0.6.0}/README.md +0 -0
  40. {raw_docx-0.4.0 → raw_docx-0.6.0}/setup.cfg +0 -0
  41. {raw_docx-0.4.0 → raw_docx-0.6.0}/src/raw_docx.egg-info/dependency_links.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: raw_docx
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: A package for processing and analyzing raw document formats
5
5
  Home-page: https://github.com/daveih/raw_docx
6
- Author: Dave Berson-Hurst
6
+ Author: Dave Iberson-Hurst
7
7
  Author-email:
8
8
  Classifier: Development Status :: 3 - Alpha
9
9
  Classifier: Intended Audience :: Developers
@@ -17,11 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Requires-Python: >=3.8
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
- Requires-Dist: python-docx==1.1.2
21
- Requires-Dist: ruff==0.8.6
22
- Requires-Dist: python-json-logger==3.2.1
23
- Requires-Dist: pytest==7.4.4
24
- Requires-Dist: pytest-cov==4.1.0
20
+ Requires-Dist: python-docx
21
+ Requires-Dist: python-json-logger
25
22
  Dynamic: author
26
23
  Dynamic: classifier
27
24
  Dynamic: description
@@ -3,19 +3,14 @@ from setuptools import setup, find_packages
3
3
  with open("README.md", "r", encoding="utf-8") as fh:
4
4
  long_description = fh.read()
5
5
 
6
- with open("requirements.txt", "r", encoding="utf-8") as fh:
7
- requirements = [
8
- line.strip() for line in fh if line.strip() and not line.startswith("#")
9
- ]
10
-
11
- version = {}
12
- with open("src/__init__.py") as fp:
13
- exec(fp.read(), version)
6
+ package_info = {}
7
+ with open("src/raw_docx/__version__.py") as fp:
8
+ exec(fp.read(), package_info)
14
9
 
15
10
  setup(
16
11
  name="raw_docx",
17
- version=version["__package_version__"],
18
- author="Dave Berson-Hurst",
12
+ version=package_info["__package_version__"],
13
+ author="Dave Iberson-Hurst",
19
14
  author_email="",
20
15
  description="A package for processing and analyzing raw document formats",
21
16
  long_description=long_description,
@@ -23,6 +18,9 @@ setup(
23
18
  url="https://github.com/daveih/raw_docx",
24
19
  packages=find_packages(where="src"),
25
20
  package_dir={"": "src"},
21
+ package_data={},
22
+ install_requires=["python-docx", "python-json-logger"],
23
+ tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
26
24
  classifiers=[
27
25
  "Development Status :: 3 - Alpha",
28
26
  "Intended Audience :: Developers",
@@ -35,5 +33,4 @@ setup(
35
33
  "Programming Language :: Python :: 3.11",
36
34
  ],
37
35
  python_requires=">=3.8",
38
- install_requires=requirements,
39
36
  )
@@ -0,0 +1,28 @@
1
+ from .raw_docx import RawDocx
2
+ from .raw_document import RawDocument
3
+ from .raw_image import RawImage
4
+ from .raw_list_item import RawListItem
5
+ from .raw_list import RawList
6
+ from .raw_logger import RawLogger
7
+ from .raw_paragraph import RawParagraph
8
+ from .raw_run import RawRun
9
+ from .raw_section import RawSection
10
+ from .raw_table_cell import RawTableCell
11
+ from .raw_table_row import RawTableRow
12
+ from .raw_table import RawTable
13
+
14
+ __all__ = [
15
+ "RawDocx",
16
+ "RawDocument",
17
+ "RawImage",
18
+ "RawList",
19
+ "RawListItem",
20
+ "RawLogger",
21
+ "RawParagraph",
22
+ "RawRun",
23
+ "RawSection",
24
+ "RawTableCell",
25
+ "RawTableRow",
26
+ "RawTable"
27
+ ]
28
+
@@ -0,0 +1 @@
1
+ __package_version__ = "0.6.0"
@@ -0,0 +1,89 @@
1
+ from docx.text.paragraph import Paragraph
2
+ from docx.styles.style import ParagraphStyle
3
+ from docx.text.run import Run
4
+ from .raw_logger import logger
5
+ from .raw_run import RawRun
6
+
7
+
8
def extract_runs(paragraph: Paragraph) -> list[RawRun]:
    """Return the paragraph's runs as ``RawRun`` objects, merging like neighbours.

    Each run contributes its text, its resolved colour and its highlight
    colour; the style recorded for every run is the paragraph's style name.
    Adjacent runs with the same colour and highlight are merged by
    ``_tidy_runs_color`` before the ``RawRun`` objects are built.

    Args:
        paragraph: The python-docx paragraph to read runs from.

    Returns:
        One ``RawRun`` per group of adjacent, identically coloured runs.
    """
    style = paragraph.style
    # Diagnostic: log the style of one specific, known template paragraph.
    if paragraph.text.startswith(
        "This template is intended for interventional clinical trials. The template is suitable"
    ):
        logger.info(f"Paragraph style {style.name}")
    data = [
        {
            "text": run.text,
            "color": _get_run_color(style, run),
            "highlight": _get_highlight_color(run),
            "keep": True,
            "style": style.name,
        }
        for run in paragraph.runs
    ]
    return [
        RawRun(x["text"], x["color"], x["highlight"], x["style"])
        for x in _tidy_runs_color(data)
    ]
26
+
27
+
28
+ def _tidy_runs_color(data: list[dict]) -> list[dict]:
29
+ more = False
30
+ for index, run in enumerate(data):
31
+ if (
32
+ index > 0
33
+ and run["color"] == data[index - 1]["color"]
34
+ and run["highlight"] == data[index - 1]["highlight"]
35
+ ):
36
+ run["text"] = data[index - 1]["text"] + run["text"]
37
+ data[index - 1]["keep"] = False
38
+ more = True
39
+ new_data = [x for x in data if x["keep"]]
40
+ if more:
41
+ new_data = _tidy_runs_color(new_data)
42
+ return new_data
43
+
44
+
45
def _get_run_color(paragraph: Paragraph, run: Run) -> str | None:
    """Resolve the colour of a run as a string.

    Precedence is the run's own font colour, then a colour found on the
    run's style chain, then the font colour of ``paragraph`` (the caller in
    this file passes the paragraph's style object). The chosen value is
    always passed through ``str``, so a missing colour yields ``"None"``.
    """
    paragraph_color = _get_font_colour(paragraph)
    font_color = _get_font_colour(run)
    style_color = _run_style_color(run)
    for candidate in (font_color, style_color):
        if candidate:
            return str(candidate)
    return str(paragraph_color)
56
+
57
+
58
+ def _get_highlight_color(run: Run) -> str | None:
59
+ try:
60
+ return str(run.font.highlight_color)
61
+ except Exception as e:
62
+ logger.exception("Failed to get run highlight color", e)
63
+ return None
64
+
65
+
66
+ def _run_style_color(run: Run) -> str | None:
67
+ try:
68
+ run_color = None
69
+ run_style = run.style
70
+ while run_style and not run_color:
71
+ if run_style.font.color.rgb:
72
+ run_color = run_style.font.color.rgb
73
+ else:
74
+ run_style = run_style.base_style
75
+ return run_color
76
+ except Exception as e:
77
+ logger.exception("Failed to get run style color", e)
78
+ return None
79
+
80
+
81
+ def _get_font_colour(item: Run | ParagraphStyle) -> str | None:
82
+ try:
83
+ return item.font.color.rgb
84
+ except Exception as e:
85
+ logger.exception("Failed to get font color", e)
86
+ return None
87
+
88
+
89
+ setattr(Paragraph, "extract_runs", extract_runs)
@@ -0,0 +1,64 @@
1
+ from .raw_section import RawSection
2
+
3
+
4
class RawDocument:
    """Ordered collection of sections with hierarchical section numbering.

    A new document starts with one un-numbered-by-increment section so that
    content found before the first heading has somewhere to go. Sections can
    be looked up by 1-based ordinal, by section number or by title.
    """

    def __init__(self):
        self.sections = []
        # One counter per heading level; index 0 is never part of a number.
        self._levels = [0, 0, 0, 0, 0, 0]
        self._section_number_mapping = {}
        self._section_title_mapping = {}
        section = RawSection(None, None, 1)
        self.add(section, False)  # No section number increment

    def add(self, section: RawSection, increment=True):
        """Append a section, assign its number and index it by number and title.

        Args:
            section: The section to add; its ``level`` drives the numbering.
            increment: When False the level counters are left unchanged.
        """
        if increment:
            self._inc_section_number(section.level)
        section.number = self._get_section_number(section.level)
        self._section_number_mapping[section.number] = section
        self._section_title_mapping[section.title] = section
        self.sections.append(section)

    def current_section(self) -> RawSection:
        """Return the most recently added section."""
        return self.sections[-1]

    def section_by_ordinal(self, ordinal: int) -> RawSection | None:
        """Return the section at 1-based position ``ordinal``, or None if out of range."""
        if 1 <= ordinal <= len(self.sections):
            return self.sections[ordinal - 1]
        return None

    def section_by_number(self, section_number: str) -> RawSection | None:
        """Return the section registered under ``section_number``, or None."""
        return self._section_number_mapping.get(section_number)

    def section_by_title(self, section_title: str) -> RawSection | None:
        """Return the section registered under ``section_title``, or None."""
        return self._section_title_mapping.get(section_title)

    def _inc_section_number(self, level: int) -> None:
        """Bump the counter for ``level`` and reset every deeper counter."""
        # Grow the counter list on demand so deep heading levels do not
        # raise IndexError.
        if level >= len(self._levels):
            self._levels.extend([0] * (level + 1 - len(self._levels)))
        self._levels[level] += 1
        for index in range(level + 1, len(self._levels)):
            self._levels[index] = 0

    def _get_section_number(self, level: int) -> str:
        """Return the dotted counters for levels 1 through ``level``."""
        return ".".join(str(x) for x in self._levels[1 : level + 1])

    def to_dict(self) -> dict:
        """Convert the document to a dictionary representation"""
        return {
            "type": "document",
            "sections": [section.to_dict() for section in self.sections],
            "levels": self._levels,
            "section_number_mapping": {
                num: section.to_dict()
                for num, section in self._section_number_mapping.items()
            },
            "section_title_mapping": {
                title: section.to_dict()
                for title, section in self._section_title_mapping.items()
            },
        }
@@ -0,0 +1,256 @@
1
+ import os
2
+ import re
3
+ import docx
4
+ import zipfile
5
+ from pathlib import Path
6
+ from .raw_document import RawDocument
7
+ from .raw_section import RawSection
8
+ from .raw_paragraph import RawParagraph
9
+ from .raw_image import RawImage
10
+ from .raw_table import RawTable
11
+ from .raw_table_row import RawTableRow
12
+ from .raw_table_cell import RawTableCell
13
+ from .raw_list import RawList
14
+ from .raw_list_item import RawListItem
15
+ from docx import Document as DocXProcessor
16
+ from docx.document import Document
17
+ from docx.oxml.table import CT_Tbl, CT_TcPr
18
+ from docx.oxml.text.paragraph import CT_P
19
+ from docx.table import Table, _Cell
20
+ from docx.text.paragraph import Paragraph
21
+ from lxml import etree
22
+ from .raw_logger import logger
23
+ from .docx_paragraph import extract_runs # Needed such that method inserted into class
24
+
25
+
26
class RawDocx:
    """Read a ``.docx`` file and convert it into a ``RawDocument`` tree.

    Construction does all the work: an ``images`` directory is created next
    to the source file, embedded media are written into it, and the document
    body is walked block by block into ``target_document``.
    """

    class LogicError(Exception):
        """Raised when the document has a structure the parser does not handle."""

    def __init__(self, full_path: str):
        path = Path(full_path)
        self.full_path = full_path
        self.dir = path.parent
        self.filename = path.name
        self.image_path = os.path.join(self.dir, "images")
        self.image_rels = {}  # relationship id -> extracted image file path
        self._organise_dir()
        self.source_document = DocXProcessor(self.full_path)
        self.target_document = RawDocument()
        self._process()

    def _organise_dir(self):
        """Create the image directory; an existing directory is not an error."""
        try:
            os.mkdir(self.image_path)
        except FileExistsError:
            pass
        except Exception as e:
            logger.exception("Failed to create image directory", e)

    def _process(self):
        """Extract images, then convert every body block into the target document.

        Any exception is logged rather than propagated, so a failure leaves
        a partially populated ``target_document``.
        """
        try:
            self._extract_images()
            for block_item in self._iter_block_items(self.source_document):
                target_section = self.target_document.current_section()
                if isinstance(block_item, Paragraph):
                    self._process_paragraph(block_item, target_section, self.image_rels)
                elif isinstance(block_item, Table):
                    self._process_table(block_item, target_section)
                else:
                    logger.warning("Ignoring element")
                    raise ValueError
        except Exception as e:
            logger.exception("Exception raised processing document", e)

    def _extract_images(self):
        """Write embedded media to the image directory and index them by rId."""
        self._extract_image_files()
        # Save all 'rId:filenames' as references
        for rel in self.source_document.part.rels.values():
            if isinstance(rel._target, docx.parts.image.ImagePart):
                self.image_rels[rel.rId] = os.path.join(
                    self.image_path, os.path.basename(rel._target.partname)
                )

    def _extract_image_files(self):
        """Copy every ``word/media/`` archive member into the image directory."""
        with zipfile.ZipFile(self.full_path) as archive:
            for file in archive.filelist:
                if file.is_dir() or not file.filename.startswith("word/media/"):
                    continue
                target_path = os.path.join(self.image_path, Path(file.filename).name)
                with archive.open(file) as source, open(target_path, "wb") as target:
                    target.write(source.read())

    def _iter_block_items(self, parent):
        """
        Yield each paragraph and table child within *parent*, in document
        order. Each returned value is an instance of either Table or
        Paragraph. *parent* would most commonly be a reference to a main
        Document object, but also works for a _Cell object, which itself can
        contain paragraphs and tables.
        """
        if isinstance(parent, Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right with the parent")

        for child in parent_elm.iterchildren():
            if isinstance(child, str):
                logger.warning(f"Ignoring eTree element {child}")
            elif isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)
            elif isinstance(child, etree._Element):
                # Cell properties and structured document tags are skipped
                # silently; anything else is reported.
                if (
                    child.tag
                    == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}tcPr"
                ):
                    pass
                elif (
                    child.tag
                    == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdt"
                ):
                    pass
                else:
                    logger.warning(f"Ignoring eTree element {self._tree(child)}")
            else:
                raise ValueError(f"something's not right with a child {type(child)}")

    def _tree(self, node, tab=1):
        """Return an indented outline (tag and text per line) of ``node`` and its descendants."""
        lines = [f"{' ' * tab}{node.tag} {node.text}"]
        lines.extend(self._tree(child, tab + 1) for child in node)
        return "\n".join(lines)

    def _process_table(self, table, target: RawSection | RawTableCell):
        """Convert a table into ``RawTable``/``RawTableRow``/``RawTableCell`` objects under ``target``.

        Raises:
            RawDocx.LogicError: If a cell contains a nested table.
        """
        target_table = RawTable()
        target.add(target_table)
        for r_index, row in enumerate(table.rows):
            target_row = RawTableRow()
            target_table.add(target_row)
            for c_index, cell in enumerate(row.cells):
                tc = cell._tc
                if tc is not None:
                    right = tc.right
                    left = tc.left
                    top = tc.top
                    try:
                        # Bottom method seems to have a bug.
                        # See https://github.com/python-openxml/python-docx/issues/1433
                        bottom = tc.bottom
                    except Exception:
                        bottom = top + 1
                    h_span = right - left
                    v_span = bottom - top
                    # A merged cell is reported at every grid position it
                    # covers; only its top-left position counts as first.
                    first = r_index == top and c_index == left
                else:
                    h_span = 1
                    v_span = 1
                    # NOTE(review): no cell element to inspect, so the cell is
                    # treated as a plain unmerged one - confirm this is intended.
                    first = True
                target_cell = RawTableCell(h_span, v_span, first)
                target_row.add(target_cell)
                for block_item in self._iter_block_items(cell):
                    if isinstance(block_item, Paragraph):
                        self._process_cell(block_item, target_cell)
                    elif isinstance(block_item, Table):
                        raise self.LogicError("Table within table detected")
                    elif isinstance(block_item, etree._Element):
                        if block_item.tag == CT_TcPr:
                            pass
                        else:
                            logger.warning(f"Ignoring eTree element {block_item.tag}")
                    else:
                        raise self.LogicError(
                            f"something's not right with a child {type(block_item)}"
                        )

    def _add_list_item(self, paragraph, target):
        """Append ``paragraph`` as a list item to ``target``'s open list, starting one if needed."""
        item = RawListItem(paragraph.extract_runs(), self.get_list_level(paragraph))
        if target.is_in_list():
            raw_list = target.current_list()
        else:
            raw_list = RawList()
            target.add(raw_list)
        raw_list.add(item)

    def _process_cell(self, paragraph, target_cell: RawTableCell):
        """Add a cell paragraph to ``target_cell`` as a list item or a plain paragraph."""
        if self._is_list(paragraph):
            self._add_list_item(paragraph, target_cell)
        else:
            target_cell.add(RawParagraph(paragraph.extract_runs()))

    def _process_paragraph(
        self, paragraph, target_section: RawSection, image_rels: dict
    ):
        """Convert a body paragraph into a section, list item, image or paragraph.

        A heading starts a new section in the target document; everything
        else is added to ``target_section``.
        """
        is_heading, level = self._is_heading(paragraph.style.name)
        if is_heading:
            target_section = RawSection(paragraph.text, paragraph.text, level)
            self.target_document.add(target_section)
        elif self._is_list(paragraph):
            self._add_list_item(paragraph, target_section)
        elif "Graphic" in paragraph._p.xml:
            xml = paragraph._p.xml
            for rId in image_rels:
                # Match the quoted attribute value so that "rId1" does not
                # also match "rId10".
                if f'"{rId}"' in xml:
                    target_section.add(RawImage(image_rels[rId]))
        else:
            target_section.add(RawParagraph(paragraph.extract_runs()))

    def get_list_level(self, paragraph):
        """Return the paragraph's numbering level (``w:ilvl``), or 0 when absent."""
        list_level = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
        return int(str(list_level[0])) if list_level else 0

    def _is_heading(self, text):
        """Classify a style name as a heading.

        Returns:
            ``(True, level)`` for names starting "Heading <digit>" (level is
            that digit) or "<two digits>Heading <digit>" (level is the
            two-digit prefix); otherwise ``(False, 0)``.
        """
        if re.match(r"^\d\dHeading \d", text):
            try:
                level = int(text[0:2])
                return True, level
            except Exception:
                return True, 0
        if re.match(r"^Heading \d", text):
            try:
                level = int(text[8])
                return True, level
            except Exception:
                return True, 0
        return False, 0

    def _is_list(self, paragraph):
        """Return True if the paragraph is numbered, bullet-styled or starts with a bullet character."""
        if paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val"):
            return True
        if paragraph.style.name in ["CPT_List Bullet", "List Bullet"]:
            return True
        if paragraph.text:
            if paragraph.text[0] == "\u2022":
                return True
        return False

    def to_dict(self) -> dict:
        """Convert the RawDocx instance to a dictionary representation"""
        if hasattr(self, "target_document"):
            return {
                "type": "raw_docx",
                "document": self.target_document.to_dict()
                if hasattr(self.target_document, "to_dict")
                else None,
            }
        return {"type": "raw_docx", "document": None}
@@ -0,0 +1,37 @@
1
+ import os
2
+ import base64
3
+ from .raw_logger import logger
4
+
5
+
6
class RawImage:
    """An image file extracted from a document, renderable as inline HTML."""

    # Supported file extensions (lower case) and the type name used in output.
    FILE_TYPE_MAP = {".png": "png", ".jpg": "jpg", ".jpeg": "jpg"}

    def __init__(self, filepath: str):
        self.filepath = filepath

    def _file_type(self, file_extension: str) -> str | None:
        """Return the mapped type for an extension, ignoring case, or None."""
        return self.FILE_TYPE_MAP.get(file_extension.lower())

    def to_html(self):
        """Return an ``<img>`` tag with the file embedded as a base64 data URI.

        An unsupported extension or a failure to read the file yields a red
        note paragraph instead; read failures are also logged.
        """
        try:
            _, file_extension = os.path.splitext(self.filepath)
            file_type = self._file_type(file_extension)
            if file_type is None:
                return f"""<p style="color:red">Note: Unable to process embedded image of type '{file_extension}', image ignored.</p>"""
            with open(self.filepath, "rb") as image_file:
                decoded = base64.b64encode(image_file.read()).decode("ascii")
            return f'<img alt="alt text" src="data:image/{file_type};base64,{decoded}"/>'
        except Exception as e:
            logger.exception("Exception converting image", e)
            return (
                """<p style="color:red">Note: Error encountered processing image.</p>"""
            )

    def to_dict(self) -> dict:
        """Convert the image to a dictionary representation"""
        _, file_extension = os.path.splitext(self.filepath)
        return {
            "type": "image",
            "filepath": self.filepath,
            "extension": file_extension,
            "file_type": self._file_type(file_extension) or "unknown",
        }
@@ -0,0 +1,69 @@
1
+ from .raw_list_item import RawListItem
2
+ from .raw_logger import logger
3
+
4
+
5
class RawList:
    """A list at one nesting level holding items and nested sub-lists."""

    def __init__(self, level=0):
        self.items = []  # List to store RawListItems and nested RawLists
        self.level = level

    def add(self, item: RawListItem) -> None:
        """Add an item at this level, or route it into a nested list.

        An item at this list's level is appended directly. A deeper item goes
        into the trailing nested list, which is created at the item's level
        if the last entry is not already a list; a jump of more than one
        level is accepted with a warning. A shallower item is not added and
        an error is logged.
        """
        if item.level == self.level:
            self.items.append(item)
        elif item.level > self.level:
            nested = self.items[-1] if self.items else None
            if not isinstance(nested, RawList):
                nested = RawList(item.level)
                self.items.append(nested)
            nested.add(item)
            if item.level > self.level + 1:
                logger.warning(
                    f"Adding list item '{item}' to item but level jump greater than 1"
                )
        else:
            logger.error(
                f"Failed to add list item '{item}' to list '{self}', levels are in error"
            )

    def to_text(self) -> str:
        """Return the text of every entry, one per line."""
        return "\n".join(f"{item.to_text()}" for item in self.items)

    def all_items(self) -> list[RawListItem]:
        """Return every list item, including those in nested lists, in order."""
        result = []
        for item in self.items:
            if isinstance(item, RawListItem):
                result.append(item)
            elif isinstance(item, RawList):
                result += item.all_items()
        return result

    def to_html(self) -> str:
        """Render the list as a ``<ul>`` with one ``<li>`` per entry."""
        lines = ["<ul>"]
        lines.extend(f"<li>{item.to_html()}</li>" for item in self.items)
        lines.append("</ul>")
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Convert the list to a dictionary representation."""
        return {
            "type": "list",
            "level": self.level,
            "items": [
                item.to_dict() if hasattr(item, "to_dict") else str(item)
                for item in self.items
            ],
        }

    def __str__(self) -> str:
        """Return a string representation of the list showing its level and item count.

        Returns:
            str: String representation of the list
        """
        return f"[level='{self.level}', item_count='{len(self.items)}']"
@@ -0,0 +1,21 @@
1
+ from html import escape
2
+ from .raw_paragraph import RawParagraph
3
+ from .raw_run import RawRun
4
+
5
+
6
class RawListItem(RawParagraph):
    """A paragraph that is an entry in a list, tagged with its nesting level."""

    def __init__(self, runs: list[RawRun], level: int):
        # The level is stored before the base class is initialised.
        self.level = level
        super().__init__(runs)

    def to_text(self) -> str:
        """Return the item's text prefixed with one pad string per nesting level."""
        indent = ' ' * self.level
        return f"{indent}{self.text}"

    def to_html(self) -> str:
        """Return the item's text escaped for inclusion in HTML."""
        return escape(self.text)

    def to_dict(self) -> dict:
        """Convert the list item to a dictionary representation."""
        return dict(type="list_item", text=self.text, level=self.level)

    def __str__(self) -> str:
        """Return a short description showing the item's text and level."""
        return f"[text='{self.text}', level='{self.level}']"