PyPI - raw-docx - Versions diffs - 0.4.0__tar.gz → 0.6.0__tar.gz - Mend

raw-docx 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{raw_docx-0.4.0 → raw_docx-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.2
 Name: raw_docx
-Version: 0.4.0
+Version: 0.6.0
 Summary: A package for processing and analyzing raw document formats
 Home-page: https://github.com/daveih/raw_docx
-Author: Dave Berson-Hurst
+Author: Dave Iberson-Hurst
 Author-email:
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -17,11 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: python-docx==1.1.2
-Requires-Dist: ruff==0.8.6
-Requires-Dist: python-json-logger==3.2.1
-Requires-Dist: pytest==7.4.4
-Requires-Dist: pytest-cov==4.1.0
+Requires-Dist: python-docx
+Requires-Dist: python-json-logger
 Dynamic: author
 Dynamic: classifier
 Dynamic: description

{raw_docx-0.4.0 → raw_docx-0.6.0}/setup.py RENAMED Viewed

@@ -3,19 +3,14 @@ from setuptools import setup, find_packages
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
-with open("requirements.txt", "r", encoding="utf-8") as fh:
-    requirements = [
-        line.strip() for line in fh if line.strip() and not line.startswith("#")
-    ]
-version = {}
-with open("src/__init__.py") as fp:
-    exec(fp.read(), version)
+package_info = {}
+with open("src/raw_docx/__version__.py") as fp:
+    exec(fp.read(), package_info)
 setup(
     name="raw_docx",
-    version=version["__package_version__"],
-    author="Dave Berson-Hurst",
+    version=package_info["__package_version__"],
+    author="Dave Iberson-Hurst",
     author_email="",
     description="A package for processing and analyzing raw document formats",
     long_description=long_description,
@@ -23,6 +18,9 @@ setup(
     url="https://github.com/daveih/raw_docx",
     packages=find_packages(where="src"),
     package_dir={"": "src"},
+    package_data={},
+    install_requires=["python-docx", "python-json-logger"],
+    tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Developers",
@@ -35,5 +33,4 @@ setup(
         "Programming Language :: Python :: 3.11",
     ],
     python_requires=">=3.8",
-    install_requires=requirements,
 )

raw_docx-0.6.0/src/raw_docx/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+from .raw_docx import RawDocx
+from .raw_document import RawDocument
+from .raw_image import RawImage
+from .raw_list_item import RawListItem
+from .raw_list import RawList
+from .raw_logger import RawLogger
+from .raw_paragraph import RawParagraph
+from .raw_run import RawRun
+from .raw_section import RawSection
+from .raw_table_cell import RawTableCell
+from .raw_table_row import RawTableRow
+from .raw_table import RawTable
+# Names re-exported as the package's public API.
+__all__ = [
+    "RawDocx",
+    "RawDocument",
+    "RawImage",
+    "RawList",
+    "RawListItem",
+    "RawLogger",
+    "RawParagraph",
+    "RawRun",
+    "RawSection",
+    "RawTableCell",
+    "RawTableRow",
+    "RawTable",
+]

raw_docx-0.6.0/src/raw_docx/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+__package_version__ = "0.6.0"

raw_docx-0.6.0/src/raw_docx/docx_paragraph.py ADDED Viewed

@@ -0,0 +1,89 @@
+from docx.text.paragraph import Paragraph
+from docx.styles.style import ParagraphStyle
+from docx.text.run import Run
+from .raw_logger import logger
+from .raw_run import RawRun
+def extract_runs(paragraph: Paragraph) -> list[RawRun]:
+    """Split a paragraph into RawRun objects, merging adjacent look-alike runs.
+    Each run is described by its text, resolved font colour and highlight
+    colour; neighbouring runs that share both colours are then collapsed into
+    one by _tidy_runs_color. Every run is tagged with the paragraph's style name.
+    Args:
+        paragraph: The python-docx paragraph whose runs are extracted.
+    Returns:
+        The merged runs as RawRun instances, in the order of ``paragraph.runs``.
+    """
+    style = paragraph.style
+    # Logs the style name of any paragraph that starts with this specific sentence.
+    if paragraph.text.startswith(
+        "This template is intended for interventional clinical trials.  The template is suitable"
+    ):
+        logger.info(f"Paragraph style {style.name}")
+    data = [
+        {
+            "text": run.text,
+            "color": _get_run_color(style, run),
+            "highlight": _get_highlight_color(run),
+            "keep": True,
+            # The paragraph's style name is recorded for every run; run-level styles are not used.
+            "style": style.name,
+        }
+        for run in paragraph.runs
+    ]
+    return [
+        RawRun(x["text"], x["color"], x["highlight"], x["style"])
+        for x in _tidy_runs_color(data)
+    ]
+def _tidy_runs_color(data: list[dict]) -> list[dict]:
+    """Merge neighbouring run records that share colour and highlight.
+    When a record matches its predecessor, the predecessor's text is prepended
+    to it and the predecessor is flagged ``keep = False``. The input dicts are
+    modified in place; the returned list holds only the records still flagged
+    to keep. If anything was merged, the survivors are passed through again.
+    """
+    merged_any = False
+    previous = None
+    for current in data:
+        if (
+            previous is not None
+            and current["color"] == previous["color"]
+            and current["highlight"] == previous["highlight"]
+        ):
+            current["text"] = previous["text"] + current["text"]
+            previous["keep"] = False
+            merged_any = True
+        previous = current
+    survivors = [entry for entry in data if entry["keep"]]
+    return _tidy_runs_color(survivors) if merged_any else survivors
+def _get_run_color(paragraph: ParagraphStyle, run: Run) -> str:
+    """Resolve the colour to report for a run.
+    Precedence: the run's own font colour, then the colour found by
+    _run_style_color, then the font colour of the paragraph style.
+    Args:
+        paragraph: The paragraph *style* (extract_runs passes ``paragraph.style``),
+            used as the fallback colour source.
+        run: The run being inspected.
+    Returns:
+        ``str()`` of the chosen colour. When no colour is found anywhere this is
+        the string ``"None"`` (``str(None)``), not the ``None`` object.
+    """
+    paragraph_color = _get_font_colour(paragraph)
+    font_color = _get_font_colour(run)
+    style_color = _run_style_color(run)
+    return str(font_color or style_color or paragraph_color)
+def _get_highlight_color(run: Run) -> str | None:
+    """Return ``str(run.font.highlight_color)``, or None (after logging) if the lookup raises."""
+    try:
+        highlight = str(run.font.highlight_color)
+    except Exception as e:
+        logger.exception("Failed to get run highlight color", e)
+        return None
+    else:
+        return highlight
+def _run_style_color(run: Run) -> str | None:
+    """Return the first truthy ``font.color.rgb`` on the run's style or its base styles.
+    The search starts at ``run.style`` and follows ``base_style`` links until a
+    colour is found or the chain ends. Returns None when nothing is found, or
+    (after logging) when any lookup raises.
+    """
+    try:
+        style = run.style
+        while style:
+            rgb = style.font.color.rgb
+            if rgb:
+                return rgb
+            style = style.base_style
+        return None
+    except Exception as e:
+        logger.exception("Failed to get run style color", e)
+        return None
+def _get_font_colour(item: Run | ParagraphStyle) -> str | None:
+    """Return ``item.font.color.rgb``, or None (after logging) if the lookup raises."""
+    try:
+        rgb = item.font.color.rgb
+    except Exception as e:
+        logger.exception("Failed to get font color", e)
+        return None
+    return rgb
+# Attach extract_runs to python-docx's Paragraph class so it can be called as a method.
+Paragraph.extract_runs = extract_runs

raw_docx-0.6.0/src/raw_docx/raw_document.py ADDED Viewed

@@ -0,0 +1,64 @@
+from .raw_section import RawSection
+class RawDocument:
+    """An ordered collection of sections with lookups by ordinal, number and title.
+    A new document starts with one section created as ``RawSection(None, None, 1)``
+    that is added without being numbered.
+    """
+    def __init__(self):
+        self.sections = []
+        # Per-level counters; a section number is built from index 1 upwards.
+        self._levels = [0, 0, 0, 0, 0, 0]
+        self._section_number_mapping = {}
+        self._section_title_mapping = {}
+        section = RawSection(None, None, 1)
+        self.add(section, False)  # No section number increment
+    def add(self, section: RawSection, increment=True):
+        """Append *section*, registering it under its number and title.
+        Args:
+            section: The section to append.
+            increment: When true, advance the counter for the section's level
+                and assign the resulting number to ``section.number`` first.
+        """
+        if increment:
+            self._inc_section_number(section.level)
+            section.number = self._get_section_number(section.level)
+        self._section_number_mapping[section.number] = section
+        self._section_title_mapping[section.title] = section
+        self.sections.append(section)
+    def current_section(self) -> RawSection:
+        """Return the most recently added section."""
+        return self.sections[-1]
+    def section_by_ordinal(self, ordinal: int) -> RawSection | None:
+        """Return the section at 1-based position *ordinal*, or None if out of range."""
+        if 1 <= ordinal <= len(self.sections):
+            return self.sections[ordinal - 1]
+        return None
+    def section_by_number(self, section_number: str) -> RawSection | None:
+        """Return the section registered under *section_number*, or None."""
+        return self._section_number_mapping.get(section_number)
+    def section_by_title(self, section_title: str) -> RawSection | None:
+        """Return the section registered under *section_title*, or None."""
+        return self._section_title_mapping.get(section_title)
+    def _inc_section_number(self, level: int) -> None:
+        """Advance the counter at *level* and reset every deeper counter to zero."""
+        # Grow the counter list on demand so a deep level cannot raise IndexError.
+        if level >= len(self._levels):
+            self._levels.extend([0] * (level + 1 - len(self._levels)))
+        self._levels[level] += 1
+        for index in range(level + 1, len(self._levels)):
+            self._levels[index] = 0
+    def _get_section_number(self, level: int) -> str:
+        """Return the dotted number built from counters 1..*level*, e.g. ``"1.2"``."""
+        return ".".join(str(x) for x in self._levels[1 : level + 1])
+    def to_dict(self) -> dict:
+        """Convert the document to a dictionary representation"""
+        return {
+            "type": "document",
+            "sections": [section.to_dict() for section in self.sections],
+            "levels": self._levels,
+            "section_number_mapping": {
+                num: section.to_dict()
+                for num, section in self._section_number_mapping.items()
+            },
+            "section_title_mapping": {
+                title: section.to_dict()
+                for title, section in self._section_title_mapping.items()
+            },
+        }

raw_docx-0.6.0/src/raw_docx/raw_docx.py ADDED Viewed

@@ -0,0 +1,256 @@
+import os
+import re
+import docx
+import zipfile
+from pathlib import Path
+from .raw_document import RawDocument
+from .raw_section import RawSection
+from .raw_paragraph import RawParagraph
+from .raw_image import RawImage
+from .raw_table import RawTable
+from .raw_table_row import RawTableRow
+from .raw_table_cell import RawTableCell
+from .raw_list import RawList
+from .raw_list_item import RawListItem
+from docx import Document as DocXProcessor
+from docx.document import Document
+from docx.oxml.table import CT_Tbl, CT_TcPr
+from docx.oxml.text.paragraph import CT_P
+from docx.table import Table, _Cell
+from docx.text.paragraph import Paragraph
+from lxml import etree
+from .raw_logger import logger
+from .docx_paragraph import extract_runs  # Needed such that method inserted into class
+class RawDocx:
+    """Read a .docx file and convert its body into a RawDocument.
+    Construction does all the work: an ``images`` directory is created next to
+    the file, embedded media is copied into it, and the document body is walked
+    to fill ``target_document``. Errors raised while processing are logged, not
+    propagated, so a partially filled document may result.
+    """
+    class LogicError(Exception):
+        """Raised for document structures this reader does not handle."""
+    # Child elements that are skipped without a warning while walking a parent.
+    _IGNORED_TAGS = (
+        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}tcPr",
+        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdt",
+    )
+    def __init__(self, full_path: str):
+        path = Path(full_path)
+        self.full_path = full_path
+        self.dir = path.parent
+        self.filename = path.name
+        self.image_path = os.path.join(self.dir, "images")
+        # Relationship id -> path of the extracted image file.
+        self.image_rels = {}
+        self._organise_dir()
+        self.source_document = DocXProcessor(self.full_path)
+        self.target_document = RawDocument()
+        self._process()
+    def _organise_dir(self):
+        """Create the image directory; an already existing one is fine."""
+        try:
+            os.mkdir(self.image_path)
+        except FileExistsError:
+            pass
+        except Exception as e:
+            logger.exception("Failed to create image directory", e)
+    def _process(self):
+        """Extract images, then route every body block into the current section."""
+        try:
+            self._extract_images()
+            for block_item in self._iter_block_items(self.source_document):
+                target_section = self.target_document.current_section()
+                if isinstance(block_item, Paragraph):
+                    self._process_paragraph(block_item, target_section, self.image_rels)
+                elif isinstance(block_item, Table):
+                    self._process_table(block_item, target_section)
+                else:
+                    logger.warning("Ignoring element")
+                    raise ValueError
+        except Exception as e:
+            logger.exception("Exception raised processing document", e)
+    def _extract_images(self):
+        """Copy embedded media to the image directory and index it by relationship id."""
+        # Extract images to image dir
+        self._extract_image_files()
+        # Save all 'rId:filenames' as references
+        for r in self.source_document.part.rels.values():
+            if isinstance(r._target, docx.parts.image.ImagePart):
+                self.image_rels[r.rId] = os.path.join(
+                    self.image_path, os.path.basename(r._target.partname)
+                )
+    def _extract_image_files(self):
+        """Write every ``word/media/`` member of the .docx archive into the image directory."""
+        with zipfile.ZipFile(self.full_path) as archive:
+            for file in archive.filelist:
+                if file.filename.startswith("word/media/"):
+                    # Only the member's base name is used for the target file.
+                    image_name = Path(file.filename).name
+                    target_path = os.path.join(self.image_path, image_name)
+                    with archive.open(file) as source, open(target_path, "wb") as target:
+                        target.write(source.read())
+    def _iter_block_items(self, parent):
+        """
+        Yield each paragraph and table child within *parent*, in document
+        order. Each returned value is an instance of either Table or
+        Paragraph. *parent* would most commonly be a reference to a main
+        Document object, but also works for a _Cell object, which itself can
+        contain paragraphs and tables.
+        """
+        if isinstance(parent, Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("something's not right with the parent")
+        for child in parent_elm.iterchildren():
+            if isinstance(child, str):
+                logger.warning(f"Ignoring eTree element {child}")
+            elif isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+            elif isinstance(child, etree._Element):
+                if child.tag not in self._IGNORED_TAGS:
+                    logger.warning(f"Ignoring eTree element {self._tree(child)}")
+            else:
+                raise ValueError(f"something's not right with a child {type(child)}")
+    def _tree(self, node, tab=1):
+        """Return an indented outline of *node* and its descendants, one ``tag text`` per line."""
+        lines = [f"{'  ' * tab}{node.tag} {node.text}"]
+        lines.extend(self._tree(child, tab + 1) for child in node)
+        return "\n".join(lines)
+    def _cell_spans(self, cell):
+        """Return ``(h_span, v_span)`` for *cell*; ``(1, 1)`` when it has no ``_tc``."""
+        x = cell._tc
+        if x is None:
+            return 1, 1
+        top = x.top
+        try:
+            # Bottom method seems to have a bug.
+            # See https://github.com/python-openxml/python-docx/issues/1433
+            bottom = x.bottom
+        except Exception:
+            bottom = top + 1
+        return x.right - x.left, bottom - top
+    def _process_table(self, table, target: RawSection | RawTableCell):
+        """Add a RawTable mirroring *table* to *target*.
+        Raises:
+            LogicError: If a cell contains a nested table.
+        """
+        target_table = RawTable()
+        target.add(target_table)
+        for r_index, row in enumerate(table.rows):
+            target_row = RawTableRow()
+            target_table.add(target_row)
+            for c_index, cell in enumerate(row.cells):
+                h_span, v_span = self._cell_spans(cell)
+                # True only at the cell's own top-left position.
+                first = r_index == cell._tc.top and c_index == cell._tc.left
+                target_cell = RawTableCell(h_span, v_span, first)
+                target_row.add(target_cell)
+                # _iter_block_items yields only Paragraph or Table objects.
+                for block_item in self._iter_block_items(cell):
+                    if isinstance(block_item, Paragraph):
+                        self._process_cell(block_item, target_cell)
+                    elif isinstance(block_item, Table):
+                        raise self.LogicError("Table within table detected")
+                    else:
+                        raise self.LogicError(
+                            f"something's not right with a child {type(block_item)}"
+                        )
+    def _add_list_item(self, paragraph, target: RawSection | RawTableCell):
+        """Append *paragraph* as a list item to *target*'s current list, starting one if needed."""
+        list_level = self.get_list_level(paragraph)
+        item = RawListItem(paragraph.extract_runs(), list_level)
+        if target.is_in_list():
+            target_list = target.current_list()
+        else:
+            target_list = RawList()
+            target.add(target_list)
+        target_list.add(item)
+    def _process_cell(self, paragraph, target_cell: RawTableCell):
+        """Add a table-cell paragraph to *target_cell* as a list item or a plain paragraph."""
+        if self._is_list(paragraph):
+            self._add_list_item(paragraph, target_cell)
+        else:
+            target_paragraph = RawParagraph(paragraph.extract_runs())
+            target_cell.add(target_paragraph)
+    def _process_paragraph(
+        self, paragraph, target_section: RawSection, image_rels: dict
+    ):
+        """Classify a body paragraph and add the matching object to the document.
+        A heading starts a new section; a list paragraph joins or starts a list;
+        a paragraph whose XML mentions ``Graphic`` contributes the images it
+        references; anything else becomes a plain paragraph.
+        """
+        is_heading, level = self._is_heading(paragraph.style.name)
+        if is_heading:
+            target_section = RawSection(paragraph.text, paragraph.text, level)
+            self.target_document.add(target_section)
+        elif self._is_list(paragraph):
+            self._add_list_item(paragraph, target_section)
+        elif "Graphic" in (xml := paragraph._p.xml):
+            for rId, image_file in image_rels.items():
+                # Match the quoted attribute value so "rId1" does not also hit "rId10".
+                if f'"{rId}"' in xml:
+                    target_section.add(RawImage(image_file))
+        else:
+            target_paragraph = RawParagraph(paragraph.extract_runs())
+            target_section.add(target_paragraph)
+    def get_list_level(self, paragraph):
+        """Return the paragraph's ``w:ilvl`` numbering level, or 0 when it has none."""
+        list_level = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
+        return int(str(list_level[0])) if list_level else 0
+    def _is_heading(self, text):
+        """Return ``(is_heading, level)`` for a style name.
+        ``"NNHeading N"`` takes its level from the two leading digits and
+        ``"Heading N"`` from the digit after the space.
+        """
+        if re.match(r"^\d\dHeading \d", text):
+            try:
+                level = int(text[0:2])
+                return True, level
+            except Exception:
+                return True, 0
+        if re.match(r"^Heading \d", text):
+            try:
+                level = int(text[8])
+                return True, level
+            except Exception:
+                return True, 0
+        return False, 0
+    def _is_list(self, paragraph):
+        """Return True for numbered paragraphs, bullet styles, or text starting with U+2022."""
+        if paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val"):
+            return True
+        if paragraph.style.name in ["CPT_List Bullet", "List Bullet"]:
+            return True
+        return paragraph.text.startswith("\u2022")
+    def to_dict(self) -> dict:
+        """Convert the RawDocx instance to a dictionary representation"""
+        if hasattr(self, "target_document"):
+            return {
+                "type": "raw_docx",
+                "document": self.target_document.to_dict()
+                if hasattr(self.target_document, "to_dict")
+                else None,
+            }
+        return {"type": "raw_docx", "document": None}

raw_docx-0.6.0/src/raw_docx/raw_image.py ADDED Viewed

@@ -0,0 +1,37 @@
+import os
+import base64
+from .raw_logger import logger
+class RawImage:
+    """An image file on disk that can be rendered as an inline HTML ``<img>`` tag."""
+    # Lower-case file extension -> type name used in the data URI and in to_dict().
+    FILE_TYPE_MAP = {".png": "png", ".jpg": "jpg", ".jpeg": "jpg"}
+    def __init__(self, filepath: str):
+        self.filepath = filepath
+    def _extension_and_type(self) -> tuple[str, str | None]:
+        """Return the file extension as written and its mapped type (None if unsupported)."""
+        file_extension = os.path.splitext(self.filepath)[1]
+        # Compare case-insensitively so ".PNG" and ".JPG" are recognised too.
+        return file_extension, self.FILE_TYPE_MAP.get(file_extension.lower())
+    def to_html(self):
+        """Return the image as a base64 ``<img>`` tag.
+        An unsupported extension, or any error while reading the file, yields a
+        red note paragraph instead; errors are logged rather than raised.
+        """
+        try:
+            file_extension, file_type = self._extension_and_type()
+            if file_type is None:
+                return f"""<p style="color:red">Note: Unable to process embedded image of type '{file_extension}', image ignored.</p>"""
+            with open(self.filepath, "rb") as image_file:
+                data = base64.b64encode(image_file.read())
+            decoded = data.decode("ascii")
+            return f'<img alt="alt text" src="data:image/{file_type};base64,{decoded}"/>'
+        except Exception as e:
+            logger.exception("Exception converting image", e)
+            return (
+                """<p style="color:red">Note: Error encountered processing image.</p>"""
+            )
+    def to_dict(self) -> dict:
+        """Convert the image to a dictionary representation"""
+        file_extension, file_type = self._extension_and_type()
+        return {
+            "type": "image",
+            "filepath": self.filepath,
+            "extension": file_extension,
+            "file_type": file_type if file_type is not None else "unknown",
+        }

raw_docx-0.6.0/src/raw_docx/raw_list.py ADDED Viewed

@@ -0,0 +1,69 @@
+from .raw_list_item import RawListItem
+from .raw_logger import logger
+class RawList:
+    """A list at a given nesting level whose entries are items or nested RawLists."""
+    def __init__(self, level=0):
+        self.items = []  # List to store RawListItems and nested RawLists
+        self.level = level
+    def add(self, item: RawListItem) -> None:
+        """Add *item* to this list or to a nested list.
+        An item at this list's level is appended directly. A deeper item goes
+        into the trailing nested list, which is created at the item's level if
+        the last entry is not already a RawList; a jump of more than one level
+        is logged as a warning. A shallower item is not added and an error is logged.
+        """
+        if item.level == self.level:
+            self.items.append(item)
+        elif item.level > self.level:
+            nested = self.items[-1] if self.items else None
+            if not isinstance(nested, RawList):
+                nested = RawList(item.level)
+                self.items.append(nested)
+            nested.add(item)
+            if item.level > self.level + 1:
+                logger.warning(
+                    f"Adding list item '{item}' to item but level jump greater than 1"
+                )
+        else:
+            logger.error(
+                f"Failed to add list item '{item}' to list '{self}', levels are in error"
+            )
+    def to_text(self) -> str:
+        """Return the text of every entry, one per line."""
+        return "\n".join(f"{item.to_text()}" for item in self.items)
+    def all_items(self) -> list[RawListItem]:
+        """Return every RawListItem in this list and its nested lists, depth first."""
+        result = []
+        for item in self.items:
+            if isinstance(item, RawListItem):
+                result.append(item)
+            elif isinstance(item, RawList):
+                result += item.all_items()
+        return result
+    def to_html(self) -> str:
+        """Return the list as a ``<ul>`` with one ``<li>`` per entry."""
+        lines = ["<ul>"]
+        lines.extend(f"<li>{item.to_html()}</li>" for item in self.items)
+        lines.append("</ul>")
+        return "\n".join(lines)
+    def to_dict(self) -> dict:
+        """Convert the list to a dictionary representation"""
+        return {
+            "type": "list",
+            "level": self.level,
+            "items": [
+                item.to_dict() if hasattr(item, "to_dict") else str(item)
+                for item in self.items
+            ],
+        }
+    def __str__(self) -> str:
+        """Return a string representation of the list showing its level and item count.
+        Returns:
+            str: String representation of the list
+        """
+        return f"[level='{self.level}', item_count='{len(self.items)}']"

raw_docx-0.6.0/src/raw_docx/raw_list_item.py ADDED Viewed

@@ -0,0 +1,21 @@
+from html import escape
+from .raw_paragraph import RawParagraph
+from .raw_run import RawRun
+class RawListItem(RawParagraph):
+    """A paragraph that is an entry in a list, carrying its nesting level."""
+    def __init__(self, runs: list[RawRun], level: int):
+        # The level is assigned before the base initialiser runs.
+        self.level = level
+        super().__init__(runs)
+    def to_text(self) -> str:
+        """Return the item text indented by two spaces per nesting level."""
+        indent = "  " * self.level
+        return f"{indent}{self.text}"
+    def to_html(self) -> str:
+        """Return the item text with HTML special characters escaped."""
+        return escape(self.text)
+    def to_dict(self) -> dict:
+        """Convert the list item to a dictionary representation"""
+        return dict(type="list_item", text=self.text, level=self.level)
+    def __str__(self) -> str:
+        """Return a short description showing the item's text and level."""
+        return f"[text='{self.text}', level='{self.level}']"

raw-docx 0.4.0__tar.gz → 0.6.0__tar.gz

raw-docx 0.4.0__tar.gz → 0.6.0__tar.gz