PyPI - raw-docx - Versions diffs - 0.7.0__tar.gz → 0.9.0__tar.gz - Mend

raw-docx 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{raw_docx-0.7.0 → raw_docx-0.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: raw_docx
-Version: 0.7.0
+Version: 0.9.0
 Summary: A package for processing and analyzing raw document formats
 Home-page: https://github.com/daveih/raw_docx
 Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: python-docx
-Requires-Dist: simple_error_log
+Requires-Dist: python-docx==1.2.0
+Requires-Dist: simple_error_log>=0.6.0
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
 Build as a normal package
 - Build with `python3 -m build --sdist --wheel`
-- Upload to pypi.org using `twine upload dist/* `
+- Upload to pypi.org using `twine upload dist/*`

{raw_docx-0.7.0 → raw_docx-0.9.0}/README.md RENAMED Viewed

@@ -7,4 +7,4 @@ Simple package to build on top of python-docx to assist in the handling of word
 Build as a normal package
 - Build with `python3 -m build --sdist --wheel`
-- Upload to pypi.org using `twine upload dist/* `
+- Upload to pypi.org using `twine upload dist/*`

{raw_docx-0.7.0 → raw_docx-0.9.0}/setup.py RENAMED Viewed

@@ -19,8 +19,8 @@ setup(
     packages=find_packages(where="src"),
     package_dir={"": "src"},
     package_data={},
-    install_requires=["python-docx", "simple_error_log"],
-    tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
+    install_requires=["python-docx==1.2.0", "simple_error_log>=0.6.0"],
+    tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv", "pyyaml"],
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Developers",

raw_docx-0.9.0/src/raw_docx/__info__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __package_version__ = "0.9.0"

raw_docx-0.9.0/src/raw_docx/docx/docx_table.py ADDED Viewed

@@ -0,0 +1,170 @@
+from simple_error_log import Errors
+from simple_error_log.error_location import KlassMethodLocation
+from raw_docx.raw_table import RawTable
+from docx.table import _Cell
+class TableCell:
+    def __init__(
+        self,
+        row: int,
+        col: int,
+        bottom: int,
+        right: int,
+        cell: _Cell,
+        h_merge: bool,
+        v_merge: bool,
+    ):
+        self.cell = cell
+        self.top = row
+        self.bottom = bottom
+        self.left = col
+        self.right = right
+        self.v_merge = v_merge
+        self.h_merge = h_merge
+    def __str__(self):
+        text = ""
+        for paragraph in self.cell.paragraphs:
+            text += paragraph.text
+        return f"[{self.top}, {self.left}] --> [{self.bottom}, {self.right}] (H: {self.h_merge}, V: {self.v_merge}) {text}"
+class TableRow:
+    def __init__(self, row: int):
+        self._row = row
+        self._data = []
+    def cell(self, col: int) -> TableCell:
+        return self._data[col]
+    def add(self, col: int, cell: TableCell):
+        try:
+            self._data[col] = cell
+        except IndexError:
+            if col >= 0:
+                self._data.extend(((col + 1) - len(self._data)) * [None])
+                self._data[col] = cell
+    def pad(self, width: int):
+        if len(self._data) < width:
+            self._data.extend((width - len(self._data)) * [None])
+    def __iter__(self):
+        return iter(self._data)
+class TableMatrix:
+    MODULE = "raw_docx.docx.docx_table.TableMatrix"
+    class LogicError(Exception):
+        pass
+    def __init__(self, table: RawTable, errors: Errors):
+        try:
+            self._errors = errors
+            self._table = table
+            self._height = 0
+            self._width = 0
+            self._matrix: list[list[TableCell]] = []
+            for cell in self._iter_cells():
+                self._add(cell)
+                self._width = cell.left if cell.left > self._width else self._width
+                self._height = cell.top
+            self._height += 1  # Set length not index
+            self._width += 1  # Set length not index
+            self._pad()
+        except Exception as e:
+            self._errors.exception(
+                "Exception raised building table matrix",
+                e,
+                KlassMethodLocation(self.MODULE, "__init__"),
+            )
+    def _pad(self):
+        row: TableRow
+        for row in self._matrix:
+            row.pad(self._width)
+    def _add(self, cell: TableCell):
+        row = cell.top
+        col = cell.left
+        if row >= 0 and row < len(self._matrix):
+            row_data: TableRow = self._matrix[row]
+            row_data.add(col, cell)
+        elif row >= 0:
+            self._matrix.extend(((row + 1) - len(self._matrix)) * [None])
+            row_data = TableRow(row)
+            self._matrix[row] = row_data
+            row_data.add(col, cell)
+        else:
+            pass  # negative row!
+    def _iter_cells(self):
+        table = self._table
+        for r, row in enumerate(table.rows):
+            for c, cell in enumerate(row.cells):
+                right = c
+                bottom = r
+                v_merge = False
+                h_merge = False
+                # Check if the cell equals the previous cell either horizontally or vertically
+                #   so it can be ignored (part of a merge)
+                if (
+                    r > 0
+                    and c < len(table.rows[r - 1].cells)
+                    and cell._tc is table.rows[r - 1].cells[c]._tc
+                ) or (c > 0 and cell._tc is row.cells[c - 1]._tc):
+                    continue
+                # Verical merge check
+                if (
+                    r >= 0
+                    and r + 1 < len(table.rows)
+                    and c < len(table.rows[r + 1].cells)
+                    and cell._tc is table.rows[r + 1].cells[c]._tc
+                ):
+                    v_merge = True
+                    bottom = self._v_extent(r, c) - 1
+                # Horizontal merge check
+                if (
+                    c >= 0
+                    and c + 1 < len(table.rows[r].cells)
+                    and cell._tc is row.cells[c + 1]._tc
+                ):
+                    h_merge = True
+                    right = self._h_extent(r, c) - 1
+                yield TableCell(r, c, bottom, right, cell, h_merge, v_merge)
+    def _v_extent(self, row: int, col: int) -> int:
+        table = self._table
+        next_row = row + 1
+        height = len(table.rows)
+        while next_row < height:
+            if (
+                next_row >= 0
+                and col < len(table.rows[next_row].cells)
+                and table.rows[row].cells[col]._tc
+                is not table.rows[next_row].cells[col]._tc
+            ):
+                return next_row
+            else:
+                next_row += 1
+        return height
+    def _h_extent(self, row: int, col: int) -> int:
+        table = self._table
+        next_col = col + 1
+        width = len(table.rows[row].cells)
+        while next_col < width:
+            if (
+                next_col >= 0
+                and table.rows[row].cells[col]._tc
+                is not table.rows[row].cells[next_col]._tc
+            ):
+                return next_col
+            else:
+                next_col += 1
+        return width
+    def __iter__(self):
+        return iter(self._matrix)

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_document.py RENAMED Viewed

@@ -22,7 +22,7 @@ class RawDocument:
         return self.sections[-1]
     def section_by_ordinal(self, ordinal: int) -> RawSection:
-        if 1 >= ordinal <= len(self.sections):
+        if 1 <= ordinal <= len(self.sections):
             return self.sections[ordinal - 1]
         else:
             return None

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_docx.py RENAMED Viewed

@@ -13,6 +13,7 @@ from raw_docx.raw_table_cell import RawTableCell
 from raw_docx.raw_list import RawList
 from raw_docx.raw_list_item import RawListItem
 from raw_docx.docx.docx_paragraph import install
+from raw_docx.docx.docx_table import TableMatrix
 from docx import Document as DocXProcessor
 from docx.document import Document
 from docx.oxml.table import CT_Tbl, CT_TcPr
@@ -21,23 +22,26 @@ from docx.table import Table, _Cell
 from docx.text.paragraph import Paragraph
 from lxml import etree
 from simple_error_log import Errors
+from simple_error_log.error_location import KlassMethodLocation
 class RawDocx:
+    MODULE = "raw_docx.raw_docx.RawDocx"
     class LogicError(Exception):
         pass
     def __init__(self, full_path: str):
         install()
-        self.errors = Errors()
+        self._errors = Errors()
         path = Path(full_path)
-        # path.stem, path.suffix[1:]
         self.full_path = full_path
         self.dir = path.parent
         self.filename = path.name
         self.image_path = os.path.join(self.dir, "images")
-        self.errors.debug(
-            f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}"
+        self._errors.debug(
+            f"RawDocx initialisation: full_path='{self.full_path}', dir='{self.dir}', image_path0'{self.image_path}', filename='{self.filename}",
+            KlassMethodLocation(self.MODULE, "__init__"),
         )
         self.image_rels = {}
         self._organise_dir()
@@ -45,13 +49,21 @@ class RawDocx:
         self.target_document = RawDocument()
         self._process()
+    @property
+    def errors(self) -> Errors:
+        return self._errors
     def _organise_dir(self):
         try:
             os.mkdir(self.image_path)
         except FileExistsError:
             pass
         except Exception as e:
-            self.errors.exception("Failed to create image directory", e)
+            self._errors.exception(
+                "Failed to create image directory",
+                e,
+                KlassMethodLocation(self.MODULE, "_organise_dir"),
+            )
     def _process(self):
         try:
@@ -63,10 +75,16 @@ class RawDocx:
                 elif isinstance(block_item, Table):
                     self._process_table(block_item, target_section)
                 else:
-                    self.errors.warning("Ignoring element")
+                    self._errors.warning(
+                        "Ignoring element", KlassMethodLocation(self.MODULE, "_process")
+                    )
                     raise ValueError
         except Exception as e:
-            self.errors.exception("Exception raised processing document", e)
+            self._errors.exception(
+                "Exception raised processing document",
+                e,
+                KlassMethodLocation(self.MODULE, "_process"),
+            )
     def _process_images(self):
         # Extract images to image dir
@@ -94,7 +112,10 @@ class RawDocx:
         for child in parent_elm.iterchildren():
             if isinstance(child, str):
-                self.errors.warning(f"Ignoring eTree element {child}")
+                self._errors.warning(
+                    f"Ignoring eTree element {child}",
+                    KlassMethodLocation(self.MODULE, "_iter_block_items"),
+                )
             elif isinstance(child, CT_P):
                 yield Paragraph(child, parent)
             elif isinstance(child, CT_Tbl):
@@ -111,7 +132,10 @@ class RawDocx:
                 ):
                     pass
                 else:
-                    self.errors.warning(f"Ignoring eTree element {self._tree(child)}")
+                    self._errors.warning(
+                        f"Ignoring eTree element {self._tree(child)}",
+                        KlassMethodLocation(self.MODULE, "_iter_block_items"),
+                    )
             else:
                 raise ValueError(f"something's not right with a child {type(child)}")
@@ -123,59 +147,100 @@ class RawDocx:
     def _process_table(self, table, target: RawSection | RawTableCell):
         target_table = RawTable()
         target.add(target_table)
-        for r_index, row in enumerate(table.rows):
+        matrix = TableMatrix(table, self._errors)
+        for r_index, row in enumerate(matrix):
             target_row = RawTableRow()
             target_table.add(target_row)
-            cells = row.cells
-            for c_index, cell in enumerate(cells):
-                if cell._tc is not None:
-                    x = cell._tc
-                    right = x.right
-                    left = x.left
-                    top = x.top
-                    try:
-                        # Bottom method seems to have a bug.
-                        # See https://github.com/python-openxml/python-docx/issues/1433
-                        bottom = x.bottom
-                    except Exception:
-                        bottom = top + 1
-                    h_span = right - left
-                    v_span = bottom - top
-                else:
-                    h_span = 1
-                    v_span = 1
-                first = r_index == cell._tc.top and c_index == cell._tc.left
-                target_cell = RawTableCell(h_span, v_span, first)
-                target_row.add(target_cell)
-                for block_item in self._iter_block_items(cell):
-                    if isinstance(block_item, Paragraph):
-                        self._process_cell(block_item, target_cell)
-                    elif isinstance(block_item, Table):
-                        raise self.LogicError("Table within table detected")
-                    elif isinstance(block_item, etree._Element):
-                        if block_item.tag == CT_TcPr:
-                            pass
+            for c_index, row_cell in enumerate(row):
+                if row_cell:
+                    h_span = row_cell.right - row_cell.left + 1
+                    v_span = row_cell.bottom - row_cell.top + 1
+                    first = r_index == row_cell.top and c_index == row_cell.left
+                    target_cell = RawTableCell(h_span, v_span, first)
+                    target_row.add(target_cell)
+                    for block_item in self._iter_block_items(row_cell.cell):
+                        if isinstance(block_item, Paragraph):
+                            self._process_cell(block_item, target_cell)
+                        elif isinstance(block_item, Table):
+                            raise self.LogicError("Table within table detected")
+                        elif isinstance(block_item, etree._Element):
+                            if block_item.tag == CT_TcPr:
+                                pass
+                            else:
+                                self._errors.warning(
+                                    f"Ignoring eTree element {block_item.tag}",
+                                    KlassMethodLocation(self.MODULE, "_process_table"),
+                                )
                         else:
-                            self.errors.warning(
-                                f"Ignoring eTree element {block_item.tag}"
+                            raise self.LogicError(
+                                f"Something's not right with a child {type(block_item)}"
                             )
-                    else:
-                        raise self.LogicError(
-                            f"something's not right with a child {type(block_item)}"
-                        )
+    # def _process_table(self, table, target: RawSection | RawTableCell):
+    #     target_table = RawTable()
+    #     target.add(target_table)
+    #     for r_index, row in enumerate(table.rows):
+    #         target_row = RawTableRow()
+    #         target_table.add(target_row)
+    #         cells = row.cells
+    #         for c_index, cell in enumerate(cells):
+    #             if cell._tc is not None:
+    #                 x = cell._tc
+    #                 right = x.right
+    #                 left = x.left
+    #                 top = x.top
+    #                 try:
+    #                     # Bottom method seems to have a bug.
+    #                     # See https://github.com/python-openxml/python-docx/issues/1433
+    #                     bottom = x.bottom
+    #                 except Exception as e:
+    #                     self._errors.exception(
+    #                         f"Row span exception! {x.xml}",
+    #                         e,
+    #                         KlassMethodLocation(self.MODULE, "_process_table"),
+    #                     )
+    #                     bottom = top + 1
+    #                 h_span = right - left
+    #                 v_span = bottom - top
+    #             else:
+    #                 h_span = 1
+    #                 v_span = 1
+    #             if cell._tc is not None:
+    #                 first = r_index == cell._tc.top and c_index == cell._tc.left
+    #             else:
+    #                 first = r_index == 0 and c_index == 0
+    #             target_cell = RawTableCell(h_span, v_span, first)
+    #             target_row.add(target_cell)
+    #             for block_item in self._iter_block_items(cell):
+    #                 if isinstance(block_item, Paragraph):
+    #                     self._process_cell(block_item, target_cell)
+    #                 elif isinstance(block_item, Table):
+    #                     raise self.LogicError("Table within table detected")
+    #                 elif isinstance(block_item, etree._Element):
+    #                     if block_item.tag == CT_TcPr:
+    #                         pass
+    #                     else:
+    #                         self._errors.warning(
+    #                             f"Ignoring eTree element {block_item.tag}",
+    #                             KlassMethodLocation(self.MODULE, "_process_table"),
+    #                         )
+    #                 else:
+    #                     raise self.LogicError(
+    #                         f"Something's not right with a child {type(block_item)}"
+    #                     )
     def _process_cell(self, paragraph, target_cell: RawTableCell):
         if self._is_list(paragraph):
             list_level = self.get_list_level(paragraph)
-            item = RawListItem(paragraph.extract_runs(self.errors), list_level)
+            item = RawListItem(paragraph.extract_runs(self._errors), list_level)
             if target_cell.is_in_list():
                 list = target_cell.current_list()
             else:
-                list = RawList(self.errors)
+                list = RawList(self._errors)
                 target_cell.add(list)
             list.add(item)
         else:
-            target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
+            target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
             target_cell.add(target_paragraph)
     def _process_paragraph(
@@ -187,38 +252,47 @@ class RawDocx:
             self.target_document.add(target_section)
         elif self._is_list(paragraph):
             list_level = self.get_list_level(paragraph)
-            item = RawListItem(paragraph.extract_runs(self.errors), list_level)
+            item = RawListItem(paragraph.extract_runs(self._errors), list_level)
             if target_section.is_in_list():
                 list = target_section.current_list()
             else:
-                list = RawList(self.errors)
+                list = RawList(self._errors)
                 target_section.add(list)
             list.add(item)
         elif "Graphic" in paragraph._p.xml:
             for rId in image_rels:
                 if rId in paragraph._p.xml:
-                    target_image = RawImage(image_rels[rId], self.errors)
+                    target_image = RawImage(image_rels[rId], self._errors)
                     target_section.add(target_image)
         else:
-            target_paragraph = RawParagraph(paragraph.extract_runs(self.errors))
+            target_paragraph = RawParagraph(paragraph.extract_runs(self._errors))
             target_section.add(target_paragraph)
     def get_list_level(self, paragraph):
         list_level = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
         return int(str(list_level[0])) if list_level else 0
-    def _is_heading(self, text):
-        if re.match(r"^\d\dHeading \d", text):
-            try:
-                level = int(text[0:2])
-                return True, level
-            except Exception:
-                return True, 0
-        if re.match(r"^Heading \d", text):
+    def _is_heading(self, text) -> tuple[bool, int]:
+        """
+        Extract heading level from text containing "Heading <N>" pattern.
+        Args:
+            text: Text to analyze for heading pattern
+        Returns:
+            tuple[bool, int]: (success, level) where success indicates if heading
+            pattern was found and level is the extracted integer value
+        """
+        if not text:
+            return False, 0
+        # Look for "Heading <N>" pattern where <N> is one or more digits
+        match = re.search(r"Heading\s+(\d+)", text, re.IGNORECASE)
+        if match:
             try:
-                level = int(text[8])
+                level = int(match.group(1))
                 return True, level
-            except Exception:
+            except (ValueError, IndexError):
                 return True, 0
         return False, 0

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_list_item.py RENAMED Viewed

@@ -1,4 +1,3 @@
-from html import escape
 from .raw_paragraph import RawParagraph
 from .raw_run import RawRun

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_paragraph.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from .raw_run import RawRun
-from html import escape
 class RawParagraph:

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_table.py RENAMED Viewed

@@ -3,7 +3,7 @@ class RawTable:
         from .raw_table_row import RawTableRow
         self.rows: list[RawTableRow] = []
-        self.klasses = ["ich-m11-table"]
+        self.klasses = ["raw-docx-table"]
     # @ToDo Would like RawTableRow here but gets a circular import
     def add(self, item):
@@ -40,7 +40,8 @@ class RawTable:
         self.klasses.append(klass)
     def replace_class(self, old_klass, new_klass):
-        self.klasses.remove(old_klass)
+        if old_klass in self.klasses:
+            self.klasses.remove(old_klass)
         self.klasses.append(new_klass)
     def to_dict(self) -> dict:

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx/raw_table_cell.py RENAMED Viewed

@@ -42,7 +42,8 @@ class RawTableCell:
             return ""
         lines = []
         colspan = f' colspan="{self.h_span}"' if self.h_merged else ""
-        lines.append(f"<td{colspan}>")
+        rowspan = f' rowspan="{self.v_span}"' if self.v_merged else ""
+        lines.append(f"<td{colspan}{rowspan}>")
         for item in self.items:
             lines.append(item.to_html())
         lines.append("</td>")

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: raw_docx
-Version: 0.7.0
+Version: 0.9.0
 Summary: A package for processing and analyzing raw document formats
 Home-page: https://github.com/daveih/raw_docx
 Author: Dave Iberson-Hurst
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: python-docx
-Requires-Dist: simple_error_log
+Requires-Dist: python-docx==1.2.0
+Requires-Dist: simple_error_log>=0.6.0
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -38,4 +38,4 @@ Simple package to build on top of python-docx to assist in the handling of word
 Build as a normal package
 - Build with `python3 -m build --sdist --wheel`
-- Upload to pypi.org using `twine upload dist/* `
+- Upload to pypi.org using `twine upload dist/*`

{raw_docx-0.7.0 → raw_docx-0.9.0}/src/raw_docx.egg-info/SOURCES.txt RENAMED Viewed

@@ -21,16 +21,4 @@ src/raw_docx.egg-info/requires.txt
 src/raw_docx.egg-info/top_level.txt
 src/raw_docx/docx/__init__.py
 src/raw_docx/docx/docx_paragraph.py
-tests/test_docx_paragraph.py
-tests/test_integration.py
-tests/test_raw_document.py
-tests/test_raw_docx.py
-tests/test_raw_image.py
-tests/test_raw_list.py
-tests/test_raw_list_item.py
-tests/test_raw_paragraph.py
-tests/test_raw_run.py
-tests/test_raw_section.py
-tests/test_raw_table.py
-tests/test_raw_table_cell.py
-tests/test_raw_table_row.py
+src/raw_docx/docx/docx_table.py

raw_docx-0.9.0/src/raw_docx.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ python-docx==1.2.0
2	+ simple_error_log>=0.6.0

raw_docx-0.7.0/src/raw_docx/__info__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __package_version__ = "0.7.0"

raw_docx-0.7.0/src/raw_docx.egg-info/requires.txt DELETED Viewed

	@@ -1,2 +0,0 @@
1	- python-docx
2	- simple_error_log

raw-docx 0.7.0__tar.gz → 0.9.0__tar.gz

raw-docx 0.7.0__tar.gz → 0.9.0__tar.gz