PyPI - raw-docx - Versions diffs - 0.3.0__tar.gz → 0.5.0__tar.gz - Mend

raw-docx 0.3.0 (tar.gz) → 0.5.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{raw_docx-0.3.0 → raw_docx-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.2
 Name: raw_docx
-Version: 0.3.0
+Version: 0.5.0
 Summary: A package for processing and analyzing raw document formats
 Home-page: https://github.com/daveih/raw_docx
-Author: Dave Berson-Hurst
+Author: Dave Iberson-Hurst
 Author-email:
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -17,11 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: python-docx==1.1.2
-Requires-Dist: ruff==0.8.6
-Requires-Dist: python-json-logger==3.2.1
-Requires-Dist: pytest==7.4.4
-Requires-Dist: pytest-cov==4.1.0
+Requires-Dist: python-docx
+Requires-Dist: python-json-logger
 Dynamic: author
 Dynamic: classifier
 Dynamic: description

{raw_docx-0.3.0 → raw_docx-0.5.0}/setup.py RENAMED Viewed

@@ -3,19 +3,14 @@ from setuptools import setup, find_packages
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
-with open("requirements.txt", "r", encoding="utf-8") as fh:
-    requirements = [
-        line.strip() for line in fh if line.strip() and not line.startswith("#")
-    ]
-version = {}
-with open("src/__init__.py") as fp:
-    exec(fp.read(), version)
+package_info = {}
+with open("src/raw_docx/__version__.py") as fp:
+    exec(fp.read(), package_info)
 setup(
     name="raw_docx",
-    version=version["__package_version__"],
-    author="Dave Berson-Hurst",
+    version=package_info["__package_version__"],
+    author="Dave Iberson-Hurst",
     author_email="",
     description="A package for processing and analyzing raw document formats",
     long_description=long_description,
@@ -23,6 +18,9 @@ setup(
     url="https://github.com/daveih/raw_docx",
     packages=find_packages(where="src"),
     package_dir={"": "src"},
+    package_data={},
+    install_requires=["python-docx", "python-json-logger"],
+    tests_require=["pytest", "pytest-cov", "pytest-mock", "python-dotenv"],
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Developers",
@@ -35,5 +33,4 @@ setup(
         "Programming Language :: Python :: 3.11",
     ],
     python_requires=">=3.8",
-    install_requires=requirements,
 )

raw_docx-0.5.0/src/raw_docx/__init__.py ADDED Viewed

File without changes

raw_docx-0.5.0/src/raw_docx/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+__package_version__ = "0.5.0"

raw_docx-0.5.0/src/raw_docx/docx_paragraph.py ADDED Viewed

@@ -0,0 +1,89 @@
+from docx.text.paragraph import Paragraph
+from docx.styles.style import ParagraphStyle
+from docx.text.run import Run
+from .raw_logger import logger
+from .raw_run import RawRun
+def extract_runs(paragraph: Paragraph) -> list[RawRun]:
+    """Convert the runs of a python-docx paragraph into merged RawRun objects.
+
+    Each run is captured with its text, resolved colour and highlight colour;
+    the style recorded for every run is the paragraph's style name. Adjacent
+    runs that share colour and highlight are then merged by
+    ``_tidy_runs_color``.
+
+    Args:
+        paragraph: The paragraph whose runs are extracted.
+
+    Returns:
+        One RawRun per group of consecutive runs with equal colour and highlight.
+    """
+    # Diagnostic only: logs the style name of one specific template paragraph.
+    if paragraph.text.startswith(
+        "This template is intended for interventional clinical trials.  The template is suitable"
+    ):
+        logger.info(f"Paragraph style {paragraph.style.name}")
+    data = [
+        {
+            "text": run.text,
+            # The paragraph *style* (not the paragraph) supplies the fallback colour
+            "color": _get_run_color(paragraph.style, run),
+            "highlight": _get_highlight_color(run),
+            # Cleared by _tidy_runs_color on runs that were merged into a neighbour
+            "keep": True,
+            # "style": run.style.name if run.style else paragraph.style.name
+            "style": paragraph.style.name,
+        }
+        for run in paragraph.runs
+    ]
+    data = _tidy_runs_color(data)
+    return [RawRun(x["text"], x["color"], x["highlight"], x["style"]) for x in data]
+def _tidy_runs_color(data: list[dict]) -> list[dict]:
+    """Merge consecutive run dictionaries that share colour and highlight.
+
+    Text is accumulated left to right into the last dictionary of each group,
+    which is the one that survives; the dictionaries it absorbed are marked
+    ``keep = False`` and dropped. One pass is enough because each run is
+    compared with its immediate predecessor, which already carries the
+    accumulated text of its group.
+
+    Args:
+        data: Run dictionaries with at least "text", "color", "highlight"
+            and "keep" keys. The dictionaries are modified in place.
+
+    Returns:
+        The surviving dictionaries, in their original order.
+    """
+    merged: list[dict] = []
+    for run in data:
+        previous = merged[-1] if merged else None
+        if (
+            previous is not None
+            and run["color"] == previous["color"]
+            and run["highlight"] == previous["highlight"]
+        ):
+            # Fold the predecessor into this run and retire it
+            run["text"] = previous["text"] + run["text"]
+            previous["keep"] = False
+            merged.pop()
+        merged.append(run)
+    return [x for x in merged if x["keep"]]
+def _get_run_color(paragraph: ParagraphStyle, run: Run) -> str:
+    """Resolve the colour of a run as a string.
+
+    Precedence: the run's own font colour, then the first colour found on the
+    run's character style chain, then the font colour of ``paragraph``.
+
+    Args:
+        paragraph: The object supplying the fallback colour. Despite the
+            parameter name, ``extract_runs`` passes the paragraph's style here.
+        run: The run being inspected.
+
+    Returns:
+        ``str()`` of the chosen colour. When no colour is found anywhere the
+        result is the literal string "None", because ``str(None)`` is returned.
+    """
+    paragraph_color = _get_font_colour(paragraph)
+    font_color = _get_font_colour(run)
+    style_color = _run_style_color(run)
+    if font_color:
+        result = str(font_color)
+    elif style_color:
+        result = str(style_color)
+    else:
+        result = str(paragraph_color)
+    return result
+def _get_highlight_color(run: Run) -> str | None:
+    """Return the run's highlight colour as a string.
+
+    A run without a highlight yields the string "None" (``str(None)``).
+    ``None`` itself is returned only when reading the attribute raises, in
+    which case the exception is logged.
+    """
+    try:
+        highlight = run.font.highlight_color
+        return str(highlight)
+    except Exception as error:
+        logger.exception("Failed to get run highlight color", error)
+    return None
+def _run_style_color(run: Run) -> str | None:
+    """Return the first colour found on the run's style or its base styles.
+
+    The run's style is checked first, then each ``base_style`` in turn, until
+    a style with a truthy ``font.color.rgb`` is found. The value is returned
+    as stored on the style (it is not converted to a string here).
+
+    Returns:
+        The colour value, or None when no style in the chain defines one or
+        when an exception occurs (the exception is logged).
+    """
+    try:
+        style = run.style
+        while style:
+            rgb = style.font.color.rgb
+            if rgb:
+                return rgb
+            style = style.base_style
+        return None
+    except Exception as error:
+        logger.exception("Failed to get run style color", error)
+        return None
+def _get_font_colour(item: Run | ParagraphStyle) -> str | None:
+    """Return ``item.font.color.rgb``, or None (after logging) if reading it raises."""
+    try:
+        font = item.font
+        return font.color.rgb
+    except Exception as error:
+        logger.exception("Failed to get font color", error)
+    return None
+# Monkey patch: attach extract_runs to python-docx's Paragraph class at import
+# time so that it can be called as a method, i.e. paragraph.extract_runs().
+setattr(Paragraph, "extract_runs", extract_runs)

raw_docx-0.5.0/src/raw_docx/raw_document.py ADDED Viewed

@@ -0,0 +1,64 @@
+from .raw_section import RawSection
+class RawDocument:
+    """Ordered collection of RawSection objects with lookup by ordinal, number and title.
+
+    A new document starts with one untitled, unnumbered level 1 section, so
+    ``current_section()`` always has something to return.
+    """
+    def __init__(self):
+        self.sections = []
+        # Per-level heading counters. Index 0 is never part of a section
+        # number: numbers are built from index 1 upwards.
+        self._levels = [0, 0, 0, 0, 0, 0]
+        self._section_number_mapping = {}
+        self._section_title_mapping = {}
+        section = RawSection(None, None, 1)
+        self.add(section, False)  # No section number increment
+    def add(self, section: RawSection, increment=True):
+        """Append *section*, optionally assigning it the next section number.
+
+        Args:
+            section: The section to append.
+            increment: When True the counter for the section's level is
+                advanced and the resulting number stored on the section.
+        """
+        if increment:
+            self._inc_section_number(section.level)
+            section.number = self._get_section_number(section.level)
+        # A later section with the same number or title replaces the earlier
+        # one in these lookup tables.
+        self._section_number_mapping[section.number] = section
+        self._section_title_mapping[section.title] = section
+        self.sections.append(section)
+    def current_section(self) -> RawSection:
+        """Return the most recently added section."""
+        return self.sections[-1]
+    def section_by_ordinal(self, ordinal: int) -> RawSection | None:
+        """Return the section at 1-based position *ordinal*, or None if out of range."""
+        if 1 <= ordinal <= len(self.sections):
+            return self.sections[ordinal - 1]
+        return None
+    def section_by_number(self, section_number: str) -> RawSection | None:
+        """Return the section registered under *section_number*, or None."""
+        return self._section_number_mapping.get(section_number)
+    def section_by_title(self, section_title: str) -> RawSection | None:
+        """Return the section registered under *section_title*, or None."""
+        return self._section_title_mapping.get(section_title)
+    def _inc_section_number(self, level: int) -> None:
+        """Advance the counter for *level* and reset all deeper counters."""
+        # Grow the counter list for heading levels deeper than those pre-allocated
+        if level >= len(self._levels):
+            self._levels.extend([0] * (level + 1 - len(self._levels)))
+        self._levels[level] += 1
+        for index in range(level + 1, len(self._levels)):
+            self._levels[index] = 0
+    def _get_section_number(self, level: int) -> str:
+        """Return the dotted section number for *level*, e.g. "1.2.3"."""
+        return ".".join(str(x) for x in self._levels[1 : level + 1])
+    def to_dict(self) -> dict:
+        """Convert the document to a dictionary representation"""
+        return {
+            "type": "document",
+            "sections": [section.to_dict() for section in self.sections],
+            "levels": self._levels,
+            "section_number_mapping": {
+                num: section.to_dict()
+                for num, section in self._section_number_mapping.items()
+            },
+            "section_title_mapping": {
+                title: section.to_dict()
+                for title, section in self._section_title_mapping.items()
+            },
+        }

raw_docx-0.5.0/src/raw_docx/raw_docx.py ADDED Viewed

@@ -0,0 +1,256 @@
+import os
+import re
+import docx
+import zipfile
+from pathlib import Path
+from .raw_document import RawDocument
+from .raw_section import RawSection
+from .raw_paragraph import RawParagraph
+from .raw_image import RawImage
+from .raw_table import RawTable
+from .raw_table_row import RawTableRow
+from .raw_table_cell import RawTableCell
+from .raw_list import RawList
+from .raw_list_item import RawListItem
+from docx import Document as DocXProcessor
+from docx.document import Document
+from docx.oxml.table import CT_Tbl, CT_TcPr
+from docx.oxml.text.paragraph import CT_P
+from docx.table import Table, _Cell
+from docx.text.paragraph import Paragraph
+from lxml import etree
+from .raw_logger import logger
+from .docx_paragraph import extract_runs  # Needed such that method inserted into class
+class RawDocx:
+    class LogicError(Exception):
+        """Raised when document structure is met that the converter does not handle, e.g. a table nested in a table."""
+        pass
+    def __init__(self, full_path: str):
+        """Load the .docx file at *full_path* and convert it into a RawDocument.
+
+        An "images" directory is created next to the file, the document is
+        opened with python-docx, and processing runs immediately; the result
+        is held in ``self.target_document``.
+
+        Args:
+            full_path: Path of the .docx file to read.
+        """
+        path = Path(full_path)
+        # path.stem, path.suffix[1:]
+        self.full_path = full_path
+        self.dir = path.parent
+        self.filename = path.name
+        # Extracted media files are written to <document directory>/images
+        self.image_path = os.path.join(self.dir, "images")
+        # Intended to map relationship id -> extracted image file path
+        self.image_rels = {}
+        self._organise_dir()
+        self.source_document = DocXProcessor(self.full_path)
+        self.target_document = RawDocument()
+        self._process()
+    def _organise_dir(self):
+        """Ensure the image output directory exists.
+
+        An existing directory is left untouched. Any failure is logged rather
+        than raised; that includes the case where the path exists but is not
+        a directory.
+        """
+        try:
+            os.makedirs(self.image_path, exist_ok=True)
+        except Exception as e:
+            logger.exception("Failed to create image directory", e)
+    def _process(self):
+        """Extract images, then convert each top-level paragraph and table in document order.
+
+        Every block is added to whichever section is current when the block
+        is reached. Any exception stops processing and is logged rather than
+        raised, so whatever was converted up to that point is kept.
+        """
+        try:
+            self._extract_images()
+            for block_item in self._iter_block_items(self.source_document):
+                target_section = self.target_document.current_section()
+                if isinstance(block_item, Paragraph):
+                    # print(f"PARA BLOCK: {block_item.text}")
+                    self._process_paragraph(block_item, target_section, self.image_rels)
+                elif isinstance(block_item, Table):
+                    self._process_table(block_item, target_section)
+                else:
+                    # NOTE(review): the message says "ignoring" but the
+                    # ValueError below aborts the whole loop via the handler.
+                    logger.warning("Ignoring element")
+                    raise ValueError
+        except Exception as e:
+            logger.exception("Exception raised processing document", e)
+    def _extract_images(self):
+        """Record the target file path of every image relationship in ``self.image_rels``.
+
+        NOTE(review): a second method named ``_extract_images`` is defined
+        later in this class. The later definition replaces this one, so this
+        body is not the one that runs; if it did run, the call below would
+        invoke this same method again. Confirm the intended split between
+        "extract files" and "record relationships".
+        """
+        # Extract images to image dir
+        self._extract_images()
+        # Save all 'rId:filenames' as references
+        for r in self.source_document.part.rels.values():
+            if isinstance(r._target, docx.parts.image.ImagePart):
+                self.image_rels[r.rId] = os.path.join(
+                    self.image_path, os.path.basename(r._target.partname)
+                )
+    def _iter_block_items(self, parent):
+        """
+        Yield each paragraph and table child within *parent*, in document
+        order. Each returned value is an instance of either Table or
+        Paragraph. *parent* would most commonly be a reference to a main
+        Document object, but also works for a _Cell object, which itself can
+        contain paragraphs and tables.
+
+        Cell property (tcPr) and structured document tag (sdt) elements are
+        skipped silently; any other element is skipped with a warning that
+        names its tag.
+
+        Raises:
+            ValueError: If *parent* is neither a Document nor a _Cell, or a
+                child is not an element at all.
+        """
+        if isinstance(parent, Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("something's not right with the parent")
+        word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+        silently_ignored = {f"{word_ns}tcPr", f"{word_ns}sdt"}
+        for child in parent_elm.iterchildren():
+            if isinstance(child, str):
+                logger.warning(f"Ignoring eTree element {child}")
+            elif isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+            elif isinstance(child, etree._Element):
+                if child.tag not in silently_ignored:
+                    # Report the tag itself so the log entry identifies the element
+                    logger.warning(f"Ignoring eTree element {child.tag}")
+            else:
+                raise ValueError(f"something's not right with a child {type(child)}")
+    def _tree(self, node, tab=1):
+        """Return an indented outline of *node* and its descendants, one tag per line.
+
+        Args:
+            node: An element that exposes ``tag`` and iterates over its children.
+            tab: Indentation depth of *node*; each level adds two spaces.
+
+        Returns:
+            The outline as a newline-separated string.
+        """
+        lines = [f"{'  ' * tab}{node.tag}"]
+        for child in node:
+            lines.append(self._tree(child, tab + 1))
+        return "\n".join(lines)
+    def _process_table(self, table, target: RawSection | RawTableCell):
+        """Convert a python-docx table into a RawTable and add it to *target*.
+
+        One RawTableRow is created per row and one RawTableCell per entry of
+        ``row.cells``. Spans are derived from the cell element's
+        left/right/top/bottom values, and every paragraph inside a cell is
+        passed to ``_process_cell``.
+
+        Raises:
+            RawDocx.LogicError: If a table is found inside a cell, or a cell
+                yields an unexpected kind of item.
+        """
+        target_table = RawTable()
+        target.add(target_table)
+        for r_index, row in enumerate(table.rows):
+            target_row = RawTableRow()
+            target_table.add(target_row)
+            cells = row.cells
+            for c_index, cell in enumerate(cells):
+                if cell._tc is not None:
+                    x = cell._tc
+                    right = x.right
+                    left = x.left
+                    top = x.top
+                    try:
+                        # Bottom method seems to have a bug.
+                        # See https://github.com/python-openxml/python-docx/issues/1433
+                        bottom = x.bottom
+                    except Exception:
+                        # Fall back to a single-row span when bottom cannot be read
+                        bottom = top + 1
+                    h_span = right - left
+                    v_span = bottom - top
+                else:
+                    h_span = 1
+                    v_span = 1
+                # True only at the grid position matching the cell element's own top/left.
+                # NOTE(review): this dereferences cell._tc even though the branch
+                # above allows for it being None - confirm whether that can occur.
+                first = r_index == cell._tc.top and c_index == cell._tc.left
+                target_cell = RawTableCell(h_span, v_span, first)
+                target_row.add(target_cell)
+                for block_item in self._iter_block_items(cell):
+                    if isinstance(block_item, Paragraph):
+                        self._process_cell(block_item, target_cell)
+                    elif isinstance(block_item, Table):
+                        raise self.LogicError("Table within table detected")
+                    elif isinstance(block_item, etree._Element):
+                        # NOTE(review): compares a tag with the CT_TcPr class
+                        # object - confirm this comparison is what was intended.
+                        if block_item.tag == CT_TcPr:
+                            pass
+                        else:
+                            logger.warning(f"Ignoring eTree element {block_item.tag}")
+                    else:
+                        raise self.LogicError(
+                            f"something's not right with a child {type(block_item)}"
+                        )
+    def _process_cell(self, paragraph, target_cell: RawTableCell):
+        """Add one paragraph of a table cell to *target_cell*.
+
+        A list paragraph becomes a RawListItem appended to the cell's current
+        list (a new RawList is started when the cell is not already in one);
+        any other paragraph becomes a RawParagraph.
+        """
+        if not self._is_list(paragraph):
+            target_cell.add(RawParagraph(paragraph.extract_runs()))
+            return
+        level = self.get_list_level(paragraph)
+        list_item = RawListItem(paragraph.extract_runs(), level)
+        if target_cell.is_in_list():
+            current = target_cell.current_list()
+        else:
+            current = RawList()
+            target_cell.add(current)
+        current.add(list_item)
+    def _process_paragraph(
+        self, paragraph, target_section: RawSection, image_rels: dict
+    ):
+        """Convert one body paragraph and add the result to the document.
+
+        * A heading starts a new RawSection on the target document.
+        * A list paragraph is appended to the section's current list; a new
+          RawList is started if the section is not already in one.
+        * A paragraph whose XML contains "Graphic" adds a RawImage for every
+          known relationship id it references.
+        * Anything else becomes a RawParagraph.
+
+        Args:
+            paragraph: The python-docx paragraph being converted.
+            target_section: The section that receives non-heading content.
+            image_rels: Mapping of relationship id to extracted image path.
+        """
+        is_heading, level = self._is_heading(paragraph.style.name)
+        if is_heading:
+            target_section = RawSection(paragraph.text, paragraph.text, level)
+            self.target_document.add(target_section)
+            return
+        if self._is_list(paragraph):
+            list_level = self.get_list_level(paragraph)
+            item = RawListItem(paragraph.extract_runs(), list_level)
+            if target_section.is_in_list():
+                current_list = target_section.current_list()
+            else:
+                current_list = RawList()
+                target_section.add(current_list)
+            current_list.add(item)
+            return
+        # Serialise the paragraph once instead of once per relationship id
+        xml = paragraph._p.xml
+        if "Graphic" in xml:
+            for rId, image_file in image_rels.items():
+                # Match the id as a complete quoted attribute value so that
+                # "rId1" is not found inside "rId10".
+                if f'"{rId}"' in xml:
+                    target_section.add(RawImage(image_file))
+            return
+        target_section.add(RawParagraph(paragraph.extract_runs()))
+    def get_list_level(self, paragraph):
+        """Return the paragraph's numbering level (w:ilvl) as an int, or 0 when it has none."""
+        levels = paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
+        if not levels:
+            return 0
+        return int(str(levels[0]))
+    def _is_heading(self, text):
+        """Decide whether a style name denotes a heading and at which level.
+
+        Two forms are recognised: "<two digits>Heading <digit>...", where the
+        level is the two-digit prefix, and "Heading <digit>...", where the
+        level is the first digit after "Heading ".
+
+        Returns:
+            ``(True, level)`` for a heading style, otherwise ``(False, 0)``.
+        """
+        prefixed = re.match(r"^(\d\d)Heading \d", text)
+        if prefixed:
+            return True, int(prefixed.group(1))
+        plain = re.match(r"^Heading (\d)", text)
+        if plain:
+            return True, int(plain.group(1))
+        return False, 0
+    def _is_list(self, paragraph):
+        """Return True when the paragraph should be treated as a list item.
+
+        A paragraph qualifies if it carries a numbering level (w:ilvl), uses
+        the "CPT_List Bullet" or "List Bullet" style, or its text starts with
+        a bullet character (U+2022).
+        """
+        if paragraph._p.xpath("./w:pPr/w:numPr/w:ilvl/@w:val"):
+            return True
+        if paragraph.style.name in ["CPT_List Bullet", "List Bullet"]:
+            return True
+        text = paragraph.text
+        return bool(text) and text[0] == "\u2022"
+    def _extract_images(self):
+        """Copy embedded media into the image directory and index it by relationship id.
+
+        Every file under "word/media/" in the .docx archive is written to
+        ``self.image_path`` under its base name. Each image relationship of
+        the source document is then recorded in ``self.image_rels`` as
+        relationship id -> extracted file path, which is the mapping
+        ``_process_paragraph`` uses to attach images to sections.
+        """
+        # The context manager closes the archive even if a write fails
+        with zipfile.ZipFile(self.full_path) as archive:
+            for file in archive.filelist:
+                if file.is_dir() or not file.filename.startswith("word/media/"):
+                    continue
+                # Extract the image file name from the path
+                image_name = Path(file.filename).name
+                # Create the target path for the image
+                target_path = os.path.join(self.image_path, image_name)
+                # Extract the file to the target path
+                with archive.open(file) as source, open(target_path, "wb") as target:
+                    target.write(source.read())
+        # Save all 'rId:filenames' as references
+        for rel in self.source_document.part.rels.values():
+            if isinstance(rel._target, docx.parts.image.ImagePart):
+                self.image_rels[rel.rId] = os.path.join(
+                    self.image_path, os.path.basename(rel._target.partname)
+                )
+    def to_dict(self) -> dict:
+        """Convert the RawDocx instance to a dictionary representation.
+
+        The "document" entry is the target document's own dictionary, or None
+        when there is no target document or it cannot serialise itself.
+        """
+        target = getattr(self, "target_document", None)
+        if hasattr(target, "to_dict"):
+            document = target.to_dict()
+        else:
+            document = None
+        return {"type": "raw_docx", "document": document}

raw_docx-0.5.0/src/raw_docx/raw_image.py ADDED Viewed

@@ -0,0 +1,37 @@
+import os
+import base64
+from .raw_logger import logger
+class RawImage:
+    """Reference to an image file that can be rendered as an inline HTML ``<img>`` tag."""
+    # Supported file extensions (lower case) and the type name used in the data URI
+    FILE_TYPE_MAP = {".png": "png", ".jpg": "jpg", ".jpeg": "jpg"}
+    def __init__(self, filepath: str):
+        self.filepath = filepath
+    def _file_type(self, file_extension: str) -> str | None:
+        """Return the mapped type for *file_extension*, ignoring case, or None if unsupported."""
+        return self.FILE_TYPE_MAP.get(file_extension.lower())
+    def to_html(self):
+        """Return the image as an ``<img>`` tag with the file embedded as base64.
+
+        Returns:
+            The ``<img>`` element for a supported type; otherwise a red
+            ``<p>`` note saying the type is unsupported. If reading or
+            encoding the file fails, the exception is logged and a red
+            ``<p>`` error note is returned.
+        """
+        try:
+            file_root, file_extension = os.path.splitext(self.filepath)
+            file_type = self._file_type(file_extension)
+            if file_type is None:
+                return f"""<p style="color:red">Note: Unable to process embedded image of type '{file_extension}', image ignored.</p>"""
+            with open(self.filepath, "rb") as image_file:
+                data = base64.b64encode(image_file.read())
+            decoded = data.decode("ascii")
+            return f'<img alt="alt text" src="data:image/{file_type};base64,{decoded}"/>'
+        except Exception as e:
+            logger.exception("Exception converting image", e)
+            return (
+                """<p style="color:red">Note: Error encountered processing image.</p>"""
+            )
+    def to_dict(self) -> dict:
+        """Convert the image to a dictionary representation"""
+        file_root, file_extension = os.path.splitext(self.filepath)
+        return {
+            "type": "image",
+            "filepath": self.filepath,
+            # Reported exactly as it appears in the path
+            "extension": file_extension,
+            "file_type": self._file_type(file_extension) or "unknown",
+        }

raw_docx-0.5.0/src/raw_docx/raw_list.py ADDED Viewed

@@ -0,0 +1,69 @@
+from .raw_list_item import RawListItem
+from .raw_logger import logger
+class RawList:
+    """A list of RawListItem objects, with deeper levels held in nested RawList objects."""
+    def __init__(self, level=0):
+        self.items = []  # List to store RawListItems and nested RawLists
+        self.level = level
+    def add(self, item: RawListItem) -> None:
+        """Add *item* at the position its level calls for.
+
+        An item at this list's level is appended directly. A deeper item is
+        passed to the nested list at the end of this list, which is created
+        first if the last entry is not a list. An item shallower than this
+        list is not added and an error is logged.
+        """
+        if item.level == self.level:
+            self.items.append(item)
+        elif item.level > self.level:
+            nested = self.items[-1] if self.items else None
+            if not isinstance(nested, RawList):
+                nested = RawList(item.level)
+                self.items.append(nested)
+            nested.add(item)
+            if item.level > self.level + 1:
+                logger.warning(
+                    f"Adding list item '{item}' to item but level jump greater than 1"
+                )
+        else:
+            logger.error(
+                f"Failed to add list item '{item}' to list '{self}', levels are in error"
+            )
+    def to_text(self) -> str:
+        """Return the text of every entry, one per line, nested lists included."""
+        return "\n".join(f"{item.to_text()}" for item in self.items)
+    def all_items(self) -> list[RawListItem]:
+        """Return every RawListItem in this list and its nested lists, flattened in order."""
+        result = []
+        for item in self.items:
+            if isinstance(item, RawListItem):
+                result.append(item)
+            elif isinstance(item, RawList):
+                result += item.all_items()
+        return result
+    def to_html(self) -> str:
+        """Return the list as a ``<ul>`` element with one ``<li>`` per entry."""
+        lines = ["<ul>"]
+        lines.extend(f"<li>{item.to_html()}</li>" for item in self.items)
+        lines.append("</ul>")
+        return "\n".join(lines)
+    def to_dict(self) -> dict:
+        """Convert the list to a dictionary representation."""
+        return {
+            "type": "list",
+            "level": self.level,
+            "items": [
+                item.to_dict() if hasattr(item, "to_dict") else str(item)
+                for item in self.items
+            ],
+        }
+    def __str__(self) -> str:
+        """Return a string representation of the list showing its level and item count.
+        Returns:
+            str: String representation of the list
+        """
+        return f"[level='{self.level}', item_count='{len(self.items)}']"

raw_docx-0.5.0/src/raw_docx/raw_list_item.py ADDED Viewed

@@ -0,0 +1,21 @@
+from html import escape
+from .raw_paragraph import RawParagraph
+from .raw_run import RawRun
+class RawListItem(RawParagraph):
+    """A paragraph that is an entry of a list, together with its nesting level."""
+    def __init__(self, runs: list[RawRun], level: int):
+        self.level = level
+        super().__init__(runs)
+    def to_text(self) -> str:
+        """Return the item text indented by two spaces per level."""
+        indent = "  " * self.level
+        return f"{indent}{self.text}"
+    def to_html(self) -> str:
+        """Return the HTML-escaped item text, without any surrounding tag."""
+        return f"{escape(self.text)}"
+    def to_dict(self) -> dict:
+        """Convert the item to a dictionary representation."""
+        return {"type": "list_item", "text": self.text, "level": self.level}
+    def __str__(self) -> str:
+        """Return a short description showing the item's text and level."""
+        return f"[text='{self.text}', level='{self.level}']"

raw_docx-0.5.0/src/raw_docx/raw_logger.py ADDED Viewed

@@ -0,0 +1,67 @@
+import sys
+import logging
+from pathlib import Path
+from typing import Optional
+from pythonjsonlogger import jsonlogger
+class RawLogger:
+    """Single shared wrapper around the "raw_docx" logger that writes JSON-formatted records."""
+    _instance = None
+    _initialized = False
+    def __new__(cls):
+        # Hand back the one shared instance, creating it on first use
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def __init__(self):
+        # __init__ runs on every RawLogger() call, so do the set-up only once
+        if RawLogger._initialized:
+            return
+        self.logger = logging.getLogger("raw_docx")
+        self.logger.setLevel(logging.INFO)
+        # Console handler
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(self._json_formatter())
+        self.logger.addHandler(console_handler)
+        RawLogger._initialized = True
+    @staticmethod
+    def _json_formatter():
+        """Build the JSON formatter shared by the console and file handlers."""
+        return jsonlogger.JsonFormatter(
+            fmt="%(asctime)s %(name)s %(levelname)s %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    def setup_file_logging(self, log_dir: Optional[str] = None):
+        """Setup file logging in addition to console logging"""
+        if not log_dir:
+            return
+        directory = Path(log_dir)
+        directory.mkdir(parents=True, exist_ok=True)
+        file_handler = logging.FileHandler(directory / "raw_docx.log")
+        file_handler.setFormatter(self._json_formatter())
+        self.logger.addHandler(file_handler)
+    def info(self, message: str):
+        """Log info message"""
+        self.logger.info(message)
+    def warning(self, message: str):
+        """Log warning message"""
+        self.logger.warning(message)
+    def error(self, message: str):
+        """Log error message"""
+        self.logger.error(message)
+    def exception(self, message: str, exc: Exception):
+        """Log exception with message"""
+        self.logger.exception(message, exc_info=exc)
+# Create singleton instance
+logger = RawLogger()

raw_docx-0.5.0/src/raw_docx/raw_paragraph.py ADDED Viewed

@@ -0,0 +1,35 @@
+from .raw_run import RawRun
+from html import escape
+class RawParagraph:
+    """A paragraph held as a sequence of runs, with its combined text and optional CSS classes."""
+    def __init__(self, runs: list[RawRun]):
+        self.runs = runs
+        self.klasses = []
+        self.text = self._run_text()
+    def to_html(self) -> str:
+        """Return the paragraph as a ``<p>`` element containing the HTML-escaped text."""
+        if self.klasses:
+            klass_list = " ".join(self.klasses)
+            open_tag = f'<p class="{klass_list}">'
+        else:
+            open_tag = "<p>"
+        return f"{open_tag}{escape(self.text)}</p>"
+    def find(self, text: str) -> bool:
+        """Return True when *text* occurs anywhere in the paragraph (case-sensitive)."""
+        return text in self.text
+    def find_at_start(self, text: str) -> bool:
+        """Return True when the paragraph starts with *text*, ignoring case."""
+        return self.text.upper().startswith(text.upper())
+    def add_class(self, klass) -> None:
+        """Append a CSS class name for use in the HTML output."""
+        self.klasses.append(klass)
+    def to_dict(self) -> dict:
+        """Convert the paragraph to a dictionary representation"""
+        return {
+            "type": "paragraph",
+            "text": self.text,
+            "runs": [run.to_dict() for run in self.runs],
+            "classes": self.klasses,
+        }
+    def _run_text(self) -> str:
+        """Concatenate the text of all runs."""
+        return "".join(run.text for run in self.runs)

raw_docx-0.5.0/src/raw_docx/raw_run.py ADDED Viewed

@@ -0,0 +1,15 @@
+class RawRun:
+    """A stretch of text together with its colour, highlight and style name."""
+    def __init__(self, text: str, color: str | None, highlight: str | None, style: str):
+        self.text = text
+        self.color = color
+        self.highlight = highlight
+        self.style = style
+    def to_dict(self) -> dict:
+        """Convert the instance to a dictionary representation"""
+        fields = ("text", "color", "highlight", "style")
+        return {name: getattr(self, name) for name in fields}

raw_docx-0.5.0/src/raw_docx/raw_section.py ADDED Viewed

@@ -0,0 +1,119 @@
+from .raw_paragraph import RawParagraph
+from .raw_list import RawList
+from .raw_table import RawTable
+from .raw_image import RawImage
+class RawSection:
+    """A document section: optional title and number, a heading level and an ordered list of content items."""
+    def __init__(self, title: str | None, number: str | None, level: int):
+        # Falsy values (None or "") are stored unchanged; anything else is stripped
+        self.title = title.strip() if title else title
+        self.number = number.strip() if number else number
+        self.level = level
+        self.items = []
+    def add(self, item: RawParagraph | RawList | RawTable | RawImage) -> None:
+        """Append a content item to the section."""
+        self.items.append(item)
+    def is_in_list(self) -> bool:
+        """Return True when the most recently added item is a RawList."""
+        return bool(self.items) and isinstance(self.items[-1], RawList)
+    def current_list(self) -> RawList:
+        """Return the trailing RawList, or None when the last item is not a list."""
+        return self.items[-1] if self.is_in_list() else None
+    def to_dict(self) -> dict:
+        """Convert the section to a dictionary representation"""
+        serialised = [
+            item.to_dict() if hasattr(item, "to_dict") else str(item)
+            for item in self.items
+        ]
+        return {
+            "type": "section",
+            "title": self.title,
+            "number": self.number,
+            "level": self.level,
+            "items": serialised,
+        }
+    def to_html(self):
+        """Return the HTML of every item, one item per line."""
+        return "\n".join([item.to_html() for item in self.items])
+    def to_html_between(self, start, end):
+        """Return the HTML of the items whose index lies in ``start <= index < end``."""
+        selected = [
+            item.to_html()
+            for index, item in enumerate(self.items)
+            if start <= index < end
+        ]
+        return "\n".join(selected)
+    def _items_of(self, kind) -> list:
+        """Return the items that are instances of *kind*, in order."""
+        return [x for x in self.items if isinstance(x, kind)]
+    def paragraphs(self) -> list[RawParagraph]:
+        """Return the RawParagraph items of the section."""
+        return self._items_of(RawParagraph)
+    def tables(self) -> list[RawTable]:
+        """Return the RawTable items of the section."""
+        return self._items_of(RawTable)
+    def lists(self) -> list[RawList]:
+        """Return the RawList items of the section."""
+        return self._items_of(RawList)
+    def items_between(self, start_index, end_index):
+        """Return the slice ``items[start_index:end_index]``."""
+        return self.items[start_index:end_index]
+    def find(self, text) -> list[RawParagraph]:
+        """Return the paragraphs that contain *text*."""
+        return [x for x in self.paragraphs() if x.find(text)]
+    def find_at_start(self, text) -> list[RawParagraph]:
+        """Return the paragraphs that start with *text*."""
+        return [x for x in self.paragraphs() if x.find_at_start(text)]
+    def find_first_at_start(self, text) -> tuple[RawParagraph, int]:
+        """Return the first paragraph starting with *text* and its index, or ``(None, -1)``."""
+        for index, item in enumerate(self.items):
+            if not isinstance(item, RawParagraph):
+                continue
+            if item.find_at_start(text):
+                return item, index
+        return None, -1
+    def has_lists(self) -> bool:
+        """Return True when the section contains at least one list."""
+        return bool(self.lists())
+    def has_content(self) -> bool:
+        """Return True when the section holds any item."""
+        return not self.is_empty()
+    def is_empty(self) -> bool:
+        """Return True when the section holds no items."""
+        return not self.items
+    def next(self, index: int):
+        """Return the item after position *index*, or None when there is none."""
+        following = index + 1
+        if following < len(self.items):
+            return self.items[following]
+        return None
+    def _first_from(self, start_index: int, kind):
+        """Return the first item of *kind* at an index >= *start_index*, or None."""
+        candidates = (
+            item
+            for index, item in enumerate(self.items)
+            if index >= start_index and isinstance(item, kind)
+        )
+        return next(candidates, None)
+    def next_paragraph(self, start_index: int) -> RawParagraph:
+        """Return the first paragraph at or after *start_index*, or None."""
+        return self._first_from(start_index, RawParagraph)
+    def next_table(self, start_index: int) -> RawTable:
+        """Return the first table at or after *start_index*, or None."""
+        return self._first_from(start_index, RawTable)
+    def _format_heading(self):
+        """Return an ``<hN>`` element built from number and title, or "" when both are missing."""
+        label = " ".join(part for part in (self.number, self.title) if part)
+        if not label:
+            return ""
+        return f"<h{self.level}>{label}</h{self.level}>"

raw_docx-0.5.0/src/raw_docx/raw_table.py ADDED Viewed

@@ -0,0 +1,48 @@
+class RawTable:
+    def __init__(self):
+        from .raw_table_row import RawTableRow
+        self.rows: list[RawTableRow] = []
+        self.klasses = ["ich-m11-table"]
+    # @ToDo Would like RawTableRow here but gets a circular import
+    def add(self, item):
+        self.rows.append(item)
+    def row(self, index: int):
+        return self.rows[index] if (index) < len(self.rows) else None
+    def next(self, index: int) -> tuple[object, int]:
+        return (
+            (self.rows[index + 1], index + 1)
+            if (index + 1) < len(self.rows)
+            else (None, -1)
+        )
+    def find_row(self, text: str) -> tuple[object, int]:
+        for index, row in enumerate(self.rows):
+            if row.cells[0].is_text():
+                if text.upper() in row.cells[0].text().upper():
+                    return row, index
+        return None, -1
+    def to_html(self):
+        lines = []
+        klass_list = " ".join(self.klasses)
+        open_tag = f'<table class="{klass_list}">' if self.klasses else "<table>"
+        lines.append(open_tag)
+        for item in self.rows:
+            lines.append(item.to_html())
+        lines.append("</table>")
+        return ("\n").join(lines)
+    def add_class(self, klass):
+        self.klasses.append(klass)
+    def replace_class(self, old_klass, new_klass):
+        self.klasses.remove(old_klass)
+        self.klasses.append(new_klass)
+    def to_dict(self) -> dict:
+        """Convert the table to a dictionary representation"""
+        return {"type": "table", "rows": [row.to_dict() for row in self.rows]}

raw_docx-0.5.0/src/raw_docx/raw_table_cell.py ADDED Viewed

@@ -0,0 +1,62 @@
+from .raw_paragraph import RawParagraph
+from .raw_list import RawList
+from .raw_table import RawTable
+class RawTableCell:
+    def __init__(self, h_span: int = 1, v_span: int = 1, first: bool = True):
+        self.h_span = h_span
+        self.v_span = v_span
+        self.h_merged = h_span > 1
+        self.v_merged = v_span > 1
+        self.merged = self.h_merged or self.v_merged
+        self.first = first
+        self.items = []
+    def add(self, item: RawParagraph | RawList | RawTable) -> None:
+        self.items.append(item)
+    def is_text(self) -> bool:
+        for item in self.items:
+            if not isinstance(item, RawParagraph):
+                return False
+        return True
+    def text(self) -> str:
+        return ("\n").join([x.text for x in self.items])
+    def is_in_list(self) -> bool:
+        if self.items:
+            if isinstance(self.items[-1], RawList):
+                return True
+        return False
+    def current_list(self) -> RawList:
+        if self.items:
+            return self.items[-1] if isinstance(self.items[-1], RawList) else None
+        else:
+            return None
+    def to_html(self):
+        if not self.first:
+            return ""
+        lines = []
+        colspan = f' colspan="{self.h_span}"' if self.h_merged else ""
+        lines.append(f"<td{colspan}>")
+        for item in self.items:
+            lines.append(item.to_html())
+        lines.append("</td>")
+        return ("\n").join(lines)
+    def to_dict(self) -> dict:
+        """Convert the table cell to a dictionary representation"""
+        return {
+            "type": "table_cell",
+            "row_span": self.v_span,
+            "col_span": self.h_span,
+            "first": self.first,
+            "content": [
+                item.to_dict() if hasattr(item, "to_dict") else str(item)
+                for item in self.items
+            ],
+        }

raw_docx-0.5.0/src/raw_docx/raw_table_row.py ADDED Viewed

@@ -0,0 +1,41 @@
+from .raw_table_cell import RawTableCell
+class RawTableRow:
+    def __init__(self):
+        self.cells: list[RawTableCell] = []
+    def add(self, cell: RawTableCell):
+        self.cells.append(cell)
+    def find_cell(self, text: str) -> RawTableCell:
+        for cell in self.cells:
+            if cell.is_text():
+                if text.upper() in cell.text().upper():
+                    return cell
+        return None
+    def find_cell_next_to(self, text: str) -> RawTableCell:
+        for index, cell in enumerate(self.cells):
+            if cell.is_text():
+                if text.upper() in cell.text().upper():
+                    return self.next_cell(index)
+        return None
+    def to_html(self):
+        lines = []
+        lines.append("<tr>")
+        for item in self.cells:
+            lines.append(item.to_html())
+        lines.append("</tr>")
+        return ("\n").join(lines)
+    def next_cell(self, start_index: int) -> RawTableCell:
+        for index, cell in enumerate(self.cells):
+            if index > start_index and cell.first:
+                return cell
+        return None
+    def to_dict(self) -> dict:
+        """Convert the table row to a dictionary representation"""
+        return {"type": "table_row", "cells": [cell.to_dict() for cell in self.cells]}

{raw_docx-0.3.0 → raw_docx-0.5.0}/src/raw_docx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.2
 Name: raw_docx
-Version: 0.3.0
+Version: 0.5.0
 Summary: A package for processing and analyzing raw document formats
 Home-page: https://github.com/daveih/raw_docx
-Author: Dave Berson-Hurst
+Author: Dave Iberson-Hurst
 Author-email:
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -17,11 +17,8 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: python-docx==1.1.2
-Requires-Dist: ruff==0.8.6
-Requires-Dist: python-json-logger==3.2.1
-Requires-Dist: pytest==7.4.4
-Requires-Dist: pytest-cov==4.1.0
+Requires-Dist: python-docx
+Requires-Dist: python-json-logger
 Dynamic: author
 Dynamic: classifier
 Dynamic: description

{raw_docx-0.3.0 → raw_docx-0.5.0}/src/raw_docx.egg-info/SOURCES.txt RENAMED Viewed

@@ -1,6 +1,21 @@
 LICENSE
 README.md
 setup.py
+src/raw_docx/__init__.py
+src/raw_docx/__version__.py
+src/raw_docx/docx_paragraph.py
+src/raw_docx/raw_document.py
+src/raw_docx/raw_docx.py
+src/raw_docx/raw_image.py
+src/raw_docx/raw_list.py
+src/raw_docx/raw_list_item.py
+src/raw_docx/raw_logger.py
+src/raw_docx/raw_paragraph.py
+src/raw_docx/raw_run.py
+src/raw_docx/raw_section.py
+src/raw_docx/raw_table.py
+src/raw_docx/raw_table_cell.py
+src/raw_docx/raw_table_row.py
 src/raw_docx.egg-info/PKG-INFO
 src/raw_docx.egg-info/SOURCES.txt
 src/raw_docx.egg-info/dependency_links.txt

raw_docx-0.5.0/src/raw_docx.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,2 @@
+python-docx
+python-json-logger

raw_docx-0.5.0/src/raw_docx.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+raw_docx

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_docx_paragraph.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from unittest.mock import Mock, PropertyMock
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
-from docx_paragraph import (
+from src.raw_docx.docx_paragraph import (
     extract_runs,
     _tidy_runs_color,
     _get_highlight_color,

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_integration.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import json
-from raw_docx import RawDocx
+from src.raw_docx.raw_docx import RawDocx
 WRITE_FILE = True

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_document.py RENAMED Viewed

@@ -1,8 +1,8 @@
 import pytest
-from raw_document import RawDocument
-from raw_section import RawSection
-from raw_paragraph import RawParagraph
-from raw_run import RawRun
+from src.raw_docx.raw_document import RawDocument
+from src.raw_docx.raw_section import RawSection
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_docx.py RENAMED Viewed

@@ -2,8 +2,8 @@ import pytest
 import os
 from docx import Document as DocxDocument
 from docx.shared import Inches
-from raw_docx import RawDocx
-from raw_document import RawDocument
+from src.raw_docx.raw_docx import RawDocx
+from src.raw_docx.raw_document import RawDocument
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_image.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import pytest
-from raw_image import RawImage
+from src.raw_docx.raw_image import RawImage
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_list.py RENAMED Viewed

@@ -1,8 +1,8 @@
 import pytest
 from unittest.mock import patch
-from raw_list import RawList
-from raw_list_item import RawListItem
-from raw_run import RawRun
+from src.raw_docx.raw_list import RawList
+from src.raw_docx.raw_list_item import RawListItem
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture
@@ -58,7 +58,7 @@ def test_add_multiple_items():
 def test_add_multiple_items_level_error():
     """Test adding multiple items with different levels with level error"""
-    with patch("raw_list.logger") as mock_logger:
+    with patch("src.raw_docx.raw_list.logger") as mock_logger:
         list = RawList(1)
         items = [
             RawListItem([RawRun("Item 1", "", None, "Normal")], 1),
@@ -105,7 +105,7 @@ def test_nested_list_to_html():
 def test_add_item_lower_level_logs_error():
     """Test that adding an item with lower level than list level logs an error"""
-    with patch("raw_list.logger") as mock_logger:
+    with patch("src.raw_docx.raw_list.logger") as mock_logger:
         list_obj = RawList(2)  # List with level 2
         item = RawListItem(
             [RawRun("Test Item", "", None, "Normal")], 1

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_list_item.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import pytest
-from raw_list_item import RawListItem
-from raw_run import RawRun
+from src.raw_docx.raw_list_item import RawListItem
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_logger.py RENAMED Viewed

@@ -2,7 +2,7 @@ import json
 import logging
 import pytest
 from pathlib import Path
-from raw_logger import RawLogger
+from src.raw_docx.raw_logger import RawLogger
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_paragraph.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import pytest
-from raw_paragraph import RawParagraph
-from raw_run import RawRun
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_run.py RENAMED Viewed

@@ -2,7 +2,7 @@ import pytest
 from docx import Document
 from docx.shared import RGBColor
 from docx.enum.text import WD_COLOR_INDEX
-from raw_run import RawRun
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_section.py RENAMED Viewed

@@ -1,13 +1,13 @@
 import pytest
-from raw_section import RawSection
-from raw_paragraph import RawParagraph
-from raw_list import RawList
-from raw_image import RawImage
-from raw_table import RawTable
-from raw_table_row import RawTableRow
-from raw_table_cell import RawTableCell
-from raw_list_item import RawListItem
-from raw_run import RawRun
+from src.raw_docx.raw_section import RawSection
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_list import RawList
+from src.raw_docx.raw_image import RawImage
+from src.raw_docx.raw_table import RawTable
+from src.raw_docx.raw_table_row import RawTableRow
+from src.raw_docx.raw_table_cell import RawTableCell
+from src.raw_docx.raw_list_item import RawListItem
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_table.py RENAMED Viewed

@@ -1,9 +1,9 @@
 import pytest
-from raw_table import RawTable
-from raw_table_row import RawTableRow
-from raw_table_cell import RawTableCell
-from raw_paragraph import RawParagraph
-from raw_run import RawRun
+from src.raw_docx.raw_table import RawTable
+from src.raw_docx.raw_table_row import RawTableRow
+from src.raw_docx.raw_table_cell import RawTableCell
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_table_cell.py RENAMED Viewed

@@ -1,9 +1,9 @@
 import pytest
-from raw_table_cell import RawTableCell
-from raw_paragraph import RawParagraph
-from raw_list import RawList
-from raw_table import RawTable
-from raw_run import RawRun
+from src.raw_docx.raw_table_cell import RawTableCell
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_list import RawList
+from src.raw_docx.raw_table import RawTable
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

{raw_docx-0.3.0 → raw_docx-0.5.0}/tests/test_raw_table_row.py RENAMED Viewed

@@ -1,8 +1,8 @@
 import pytest
-from raw_table_row import RawTableRow
-from raw_table_cell import RawTableCell
-from raw_paragraph import RawParagraph
-from raw_run import RawRun
+from src.raw_docx.raw_table_row import RawTableRow
+from src.raw_docx.raw_table_cell import RawTableCell
+from src.raw_docx.raw_paragraph import RawParagraph
+from src.raw_docx.raw_run import RawRun
 @pytest.fixture

raw_docx-0.3.0/src/raw_docx.egg-info/requires.txt DELETED Viewed

@@ -1,5 +0,0 @@
-python-docx==1.1.2
-ruff==0.8.6
-python-json-logger==3.2.1
-pytest==7.4.4
-pytest-cov==4.1.0

raw_docx-0.3.0/src/raw_docx.egg-info/top_level.txt DELETED Viewed

@@ -1 +0,0 @@
-

{raw_docx-0.3.0 → raw_docx-0.5.0}/LICENSE RENAMED Viewed

File without changes

{raw_docx-0.3.0 → raw_docx-0.5.0}/README.md RENAMED Viewed

File without changes

{raw_docx-0.3.0 → raw_docx-0.5.0}/setup.cfg RENAMED Viewed

File without changes

{raw_docx-0.3.0 → raw_docx-0.5.0}/src/raw_docx.egg-info/dependency_links.txt RENAMED Viewed

File without changes

raw-docx 0.3.0__tar.gz → 0.5.0__tar.gz

raw-docx 0.3.0tar.gz → 0.5.0tar.gz