PyPI - paraencoder - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

paraencoder-0.1.1-py3-none-any.whl → paraencoder-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

para/handlers.py ADDED Viewed

@@ -0,0 +1,326 @@
+"""File format handlers for different document types."""
+from __future__ import annotations
+import json
+import re
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Callable, Optional
+# Optional imports - will be None if not installed
+try:
+    import openpyxl
+except ImportError:
+    openpyxl = None
+try:
+    from docx import Document as DocxDocument
+except ImportError:
+    DocxDocument = None
+try:
+    from odf import text as odf_text
+    from odf.opendocument import load as odf_load
+except ImportError:
+    odf_text = None
+    odf_load = None
+# File extensions grouped by handler type.
+# Every entry is lowercase and keeps its leading dot so it can be compared
+# directly with ``Path.suffix.lower()``, which is how the handlers look them up.
+PLAIN_TEXT_EXTENSIONS = {
+    # Plain text
+    ".txt", ".text", ".log", ".md", ".rst", ".asc",
+    # Web/markup
+    ".html", ".htm", ".xhtml", ".xml", ".csv", ".tsv",
+    ".json", ".yaml", ".yml",
+    # Documentation
+    ".tex", ".latex", ".adoc", ".org", ".wiki", ".mediawiki",
+    # Config files
+    ".ini", ".cfg", ".conf", ".properties", ".env", ".toml", ".lock",
+    # Source code
+    ".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
+    ".php", ".rb", ".go", ".rs", ".sh", ".bat", ".ps1", ".sql",
+    # Notes/misc
+    ".note", ".eml", ".mbox",
+    # Subtitles
+    ".srt", ".vtt", ".sub",
+    # Translation
+    ".po", ".pot",
+    # Other
+    ".texi", ".man", ".nfo", ".readme",
+}
+# Suffixes routed to DocxHandler.
+# NOTE(review): confirm python-docx actually opens macro-enabled ".docm" files.
+DOCX_EXTENSIONS = {".docx", ".docm"}
+# Suffixes routed to XlsxHandler.
+XLSX_EXTENSIONS = {".xlsx", ".xlsm"}
+# Suffixes routed to OdtHandler.
+ODT_EXTENSIONS = {".odt"}
+# NOTE(review): nothing in the code shown here references RTF_EXTENSIONS, so
+# ".rtf" files reach the plain-text fallback in get_handler().
+RTF_EXTENSIONS = {".rtf"}
+class FileHandler(ABC):
+    """Abstract interface shared by every document-format handler.
+
+    Subclasses implement ``read`` to extract text and ``convert`` to rewrite a
+    document through a text-to-text callable.
+    """
+    @staticmethod
+    def can_handle(path: Path) -> bool:
+        """Report whether this handler accepts ``path``; the base accepts nothing."""
+        return False
+    @abstractmethod
+    def read(self, path: Path) -> str:
+        """Return the text content of the file at ``path``."""
+    @abstractmethod
+    def convert(
+        self,
+        input_path: Path,
+        output_path: Path,
+        converter: Callable[[str], str],
+    ) -> None:
+        """Run the text of ``input_path`` through ``converter`` and save it.
+
+        The result is written to ``output_path`` in the same file format;
+        ``output_path`` may be the input file itself for in-place conversion.
+        """
+class PlainTextHandler(FileHandler):
+    """Handler for files whose content is already plain text."""
+    @staticmethod
+    def can_handle(path: Path) -> bool:
+        """Accept extension-less paths and suffixes in PLAIN_TEXT_EXTENSIONS."""
+        suffix = path.suffix
+        if not suffix:
+            return True
+        return suffix.lower() in PLAIN_TEXT_EXTENSIONS
+    def read(self, path: Path, encoding: str = "utf-8") -> str:
+        """Return the whole file at ``path`` decoded with ``encoding``."""
+        return path.read_text(encoding=encoding)
+    def convert(
+        self,
+        input_path: Path,
+        output_path: Path,
+        converter: Callable[[str], str],
+        encoding: str = "utf-8",
+    ) -> None:
+        """Write ``converter(text of input_path)`` to ``output_path``.
+
+        The same ``encoding`` is used for decoding the input and encoding the
+        output.
+        """
+        result = converter(self.read(input_path, encoding=encoding))
+        output_path.write_text(result, encoding=encoding)
+class DocxHandler(FileHandler):
+    """Handler for Microsoft Word .docx files, backed by python-docx."""
+    def __init__(self):
+        """Fail fast when the optional python-docx dependency is missing.
+
+        Raises:
+            ImportError: if python-docx could not be imported.
+        """
+        if DocxDocument is None:
+            raise ImportError(
+                "python-docx is required for .docx support. "
+                "Install with: pip install paraencoder[office]"
+            )
+    def read(self, path: Path) -> str:
+        """Return the body paragraphs of the document joined by newlines.
+
+        Only the paragraphs exposed by ``Document.paragraphs`` are returned;
+        table, header and footer text is not included.
+        """
+        doc = DocxDocument(str(path))
+        return "\n".join(p.text for p in doc.paragraphs)
+    @staticmethod
+    def _convert_paragraphs(paragraphs, converter: Callable[[str], str]) -> None:
+        """Rewrite every non-empty run of ``paragraphs`` in place.
+
+        Text is replaced run by run so each run keeps its own formatting.
+        """
+        for para in paragraphs:
+            for run in para.runs:
+                if run.text:
+                    run.text = converter(run.text)
+    def convert(
+        self,
+        input_path: Path,
+        output_path: Path,
+        converter: Callable[[str], str],
+    ) -> None:
+        """Convert body, table, header and footer text and save the document.
+
+        Args:
+            input_path: Document to load.
+            output_path: Where the converted document is saved.
+            converter: Callable applied to the text of each run. It is not
+                assumed to be idempotent, so each run is converted once only.
+        """
+        doc = DocxDocument(str(input_path))
+        self._convert_paragraphs(doc.paragraphs, converter)
+        # python-docx reports a merged cell once for every grid column it
+        # spans.  Keep the underlying <w:tc> elements so that each physical
+        # cell is converted a single time.
+        seen_cells = set()
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    tc = cell._tc
+                    if tc in seen_cells:
+                        continue
+                    seen_cells.add(tc)
+                    self._convert_paragraphs(cell.paragraphs, converter)
+        for section in doc.sections:
+            parts = (
+                section.header,
+                section.first_page_header,
+                section.even_page_header,
+                section.footer,
+                section.first_page_footer,
+                section.even_page_footer,
+            )
+            for part in parts:
+                # A linked header/footer has no content of its own: reading it
+                # resolves to the previous section's definition (already
+                # converted) or creates an empty one in the first section.
+                if part.is_linked_to_previous:
+                    continue
+                self._convert_paragraphs(part.paragraphs, converter)
+        doc.save(str(output_path))
+    @staticmethod
+    def can_handle(path: Path) -> bool:
+        """Accept paths whose lowercased suffix is in DOCX_EXTENSIONS."""
+        return path.suffix.lower() in DOCX_EXTENSIONS
+class XlsxHandler(FileHandler):
+    """Handler for Microsoft Excel .xlsx/.xlsm workbooks, backed by openpyxl."""
+    # Worksheet header/footer items and the sections each item is split into.
+    _HEADER_FOOTER_ITEMS = (
+        "oddHeader", "oddFooter",
+        "evenHeader", "evenFooter",
+        "firstHeader", "firstFooter",
+    )
+    _HEADER_FOOTER_SECTIONS = ("left", "center", "right")
+    def __init__(self):
+        """Fail fast when the optional openpyxl dependency is missing.
+
+        Raises:
+            ImportError: if openpyxl could not be imported.
+        """
+        if openpyxl is None:
+            raise ImportError(
+                "openpyxl is required for .xlsx support. "
+                "Install with: pip install paraencoder[office]"
+            )
+    def read(self, path: Path) -> str:
+        """Return every non-empty string cell of every worksheet, one per line.
+
+        The workbook is opened with ``data_only=True``, so formula cells yield
+        their stored values rather than the formula text.
+        """
+        wb = openpyxl.load_workbook(str(path), data_only=True)
+        lines = []
+        for sheet in wb.worksheets:
+            for row in sheet.iter_rows():
+                for cell in row:
+                    if cell.value and isinstance(cell.value, str):
+                        lines.append(cell.value)
+        return "\n".join(lines)
+    def convert(
+        self,
+        input_path: Path,
+        output_path: Path,
+        converter: Callable[[str], str],
+    ) -> None:
+        """Convert cell text, comments, headers/footers and sheet titles.
+
+        Args:
+            input_path: Workbook to load.
+            output_path: Where the converted workbook is saved.
+            converter: Callable applied to each piece of text. It is not
+                assumed to be idempotent, so each value is converted once only.
+        """
+        # openpyxl drops the VBA project unless keep_vba is set.  Only keep it
+        # when the result is also macro-enabled, so a ".xlsx" output stays a
+        # plain workbook as before.
+        keep_vba = (
+            Path(input_path).suffix.lower() == ".xlsm"
+            and Path(output_path).suffix.lower() == ".xlsm"
+        )
+        # NOTE: openpyxl does not round-trip every workbook feature; content it
+        # does not read is absent from the saved file.
+        wb = openpyxl.load_workbook(str(input_path), keep_vba=keep_vba)
+        for sheet in wb.worksheets:
+            # One pass handles values and comments.  The value of a merged
+            # range lives in its top-left cell, which this loop already visits,
+            # so merged ranges need no pass of their own (a second pass would
+            # convert that text twice).
+            for row in sheet.iter_rows():
+                for cell in row:
+                    if cell.value and isinstance(cell.value, str):
+                        cell.value = converter(cell.value)
+                    if cell.comment and cell.comment.text:
+                        cell.comment.text = converter(cell.comment.text)
+            # Convert every section of every header/footer item.
+            for item_name in self._HEADER_FOOTER_ITEMS:
+                item = getattr(sheet, item_name, None)
+                if not item:
+                    continue
+                for section_name in self._HEADER_FOOTER_SECTIONS:
+                    part = getattr(item, section_name)
+                    if part and part.text:
+                        part.text = converter(part.text)
+        # Convert sheet names; untouched titles are not reassigned.
+        for sheet in wb.worksheets:
+            original_title = sheet.title
+            converted_title = converter(original_title)
+            if converted_title != original_title:
+                sheet.title = converted_title
+        wb.save(str(output_path))
+    @staticmethod
+    def can_handle(path: Path) -> bool:
+        """Accept paths whose lowercased suffix is in XLSX_EXTENSIONS."""
+        return path.suffix.lower() in XLSX_EXTENSIONS
+class OdtHandler(FileHandler):
+    """Handler for OpenDocument .odt files, backed by odfpy."""
+    def __init__(self):
+        """Fail fast when the optional odfpy dependency is missing.
+
+        Raises:
+            ImportError: if odfpy could not be imported.
+        """
+        if odf_load is None:
+            raise ImportError(
+                "odfpy is required for .odt support. "
+                "Install with: pip install paraencoder[office]"
+            )
+    def _get_text_elements(self, element):
+        """Return every text node below ``element`` in document order.
+
+        ``odf.text.P``/``H``/``Span`` are element factory functions, not
+        classes, so they cannot be used with ``isinstance``.  Descending into
+        every node that has children reaches the same text nodes.
+        """
+        from odf.element import Text
+        found = []
+        for child in element.childNodes:
+            # Test for text nodes first: they expose ``childNodes`` as well.
+            if isinstance(child, Text):
+                found.append(child)
+            elif hasattr(child, "childNodes"):
+                found.extend(self._get_text_elements(child))
+        return found
+    def read(self, path: Path) -> str:
+        """Return the text of every ``text:p`` element, one per line."""
+        doc = odf_load(str(path))
+        text_content = []
+        for para in doc.getElementsByType(odf_text.P):
+            text_content.append(str(para))
+        return "\n".join(text_content)
+    def convert(
+        self,
+        input_path: Path,
+        output_path: Path,
+        converter: Callable[[str], str],
+    ) -> None:
+        """Convert every text node of the document body and save the result.
+
+        Args:
+            input_path: Document to load.
+            output_path: Where the converted document is saved.
+            converter: Callable applied to the data of each non-empty text node.
+        """
+        doc = odf_load(str(input_path))
+        for text_node in self._get_text_elements(doc.body):
+            if text_node.data:
+                text_node.data = converter(text_node.data)
+        doc.save(str(output_path))
+    @staticmethod
+    def can_handle(path: Path) -> bool:
+        """Accept paths whose lowercased suffix is in ODT_EXTENSIONS."""
+        return path.suffix.lower() in ODT_EXTENSIONS
+def get_handler(path: Path) -> FileHandler:
+    """Return a new handler instance chosen by the suffix of ``path``.
+
+    Office suffixes map to their dedicated handlers; every other suffix,
+    including none at all, is treated as plain text.
+    """
+    ext = Path(path).suffix.lower()
+    office_handlers = (
+        (DOCX_EXTENSIONS, DocxHandler),
+        (XLSX_EXTENSIONS, XlsxHandler),
+        (ODT_EXTENSIONS, OdtHandler),
+    )
+    for extensions, handler_cls in office_handlers:
+        if ext in extensions:
+            return handler_cls()
+    # Known plain-text suffixes, extension-less paths and unrecognised
+    # suffixes all end up here.
+    return PlainTextHandler()
+def get_supported_extensions() -> set[str]:
+    """Return a fresh set with the suffixes of all handler groups combined."""
+    return set().union(
+        PLAIN_TEXT_EXTENSIONS,
+        DOCX_EXTENSIONS,
+        XLSX_EXTENSIONS,
+        ODT_EXTENSIONS,
+    )
+def is_supported(path: Path) -> bool:
+    """Tell whether ``path`` has no suffix or a suffix known to a handler."""
+    ext = Path(path).suffix.lower()
+    if ext == "":
+        return True
+    return ext in get_supported_extensions()

para/io.py CHANGED Viewed

@@ -6,16 +6,19 @@ from pathlib import Path
 from typing import Optional
 from para.convert import zg_to_unicode
+from para.handlers import get_handler, is_supported, PlainTextHandler
 DEFAULT_ENCODING = "utf-8"
 def read_text(path: str, *, encoding: str = DEFAULT_ENCODING) -> str:
+    """Return the contents of the file at ``path`` decoded with ``encoding``.
+
+    Intended for plain text files only.
+    """
     return Path(path).read_text(encoding=encoding)
 def write_text(path: str, data: str, *, encoding: str = DEFAULT_ENCODING) -> None:
+    """Write ``data`` to the file at ``path`` encoded with ``encoding``.
+
+    Intended for plain text files only.
+    """
     Path(path).write_text(data, encoding=encoding)
@@ -30,13 +33,37 @@ def convert_file(
     """
     Convert a file from Zawgyi to Unicode and write the result.
-    Returns the converted text. When ``output_path`` is None, the caller can
-    capture the returned string.
+    Supports multiple file formats:
+    - Plain text files (.txt, .md, .csv, .json, .xml, .html, etc.)
+    - Microsoft Word (.docx) - requires: pip install paraencoder[office]
+    - Microsoft Excel (.xlsx) - requires: pip install paraencoder[office]
+    - OpenDocument (.odt) - requires: pip install paraencoder[office]
+    Returns the converted text. When ``output_path`` is None for plain text
+    files, the caller can capture the returned string. For binary formats
+    like .docx and .xlsx, output_path is required.
     """
-    data = read_text(input_path, encoding=encoding)
-    converted = zg_to_unicode(data, normalize=normalize, force=assume_zawgyi)
+    input_p = Path(input_path)
+    handler = get_handler(input_p)
+    # Create converter function
+    def converter(text: str) -> str:
+        return zg_to_unicode(text, normalize=normalize, force=assume_zawgyi)
+    # For plain text, we can return the string
+    if isinstance(handler, PlainTextHandler):
+        data = handler.read(input_p, encoding=encoding)
+        converted = converter(data)
+        if output_path:
+            Path(output_path).write_text(converted, encoding=encoding)
+        return converted
+    else:
+        # Binary formats require output path
+        if not output_path:
+            output_path = input_path  # Overwrite in place
+        handler.convert(input_p, Path(output_path), converter)
-    if output_path:
-        write_text(output_path, converted, encoding=encoding)
+        # Return text content for display
+        return handler.read(Path(output_path))
-    return converted

{paraencoder-0.1.1.dist-info → paraencoder-0.2.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: paraencoder
-Version: 0.1.1
+Version: 0.2.1
 Summary: Burmese text detection and conversion toolkit for Zawgyi and Unicode
 Project-URL: Homepage, https://github.com/Laitei40/ParaEncoder
 Project-URL: Repository, https://github.com/Laitei40/ParaEncoder
@@ -15,6 +15,14 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.9
+Provides-Extra: all
+Requires-Dist: odfpy>=1.4; extra == 'all'
+Requires-Dist: openpyxl>=3.1; extra == 'all'
+Requires-Dist: python-docx>=1.0; extra == 'all'
+Provides-Extra: office
+Requires-Dist: odfpy>=1.4; extra == 'office'
+Requires-Dist: openpyxl>=3.1; extra == 'office'
+Requires-Dist: python-docx>=1.0; extra == 'office'
 Provides-Extra: test
 Requires-Dist: pytest>=7; extra == 'test'
 Description-Content-Type: text/markdown
@@ -37,6 +45,27 @@ Para is a small, boring, and transparent toolkit for working with Burmese text.
 pip install paraencoder
 ```
+For Office document support (.docx, .xlsx, .odt):
+```bash
+pip install paraencoder[office]
+```
+## Supported File Formats
+### Plain Text (built-in, no extra dependencies)
+- **Text files:** `.txt`, `.text`, `.log`, `.md`, `.rst`, `.asc`
+- **Web/markup:** `.html`, `.htm`, `.xhtml`, `.xml`, `.json`, `.yaml`, `.yml`, `.csv`, `.tsv`
+- **Documentation:** `.tex`, `.latex`, `.adoc`, `.org`, `.wiki`, `.mediawiki`
+- **Config:** `.ini`, `.cfg`, `.conf`, `.properties`, `.env`, `.toml`, `.lock`
+- **Source code:** `.py`, `.js`, `.ts`, `.java`, `.c`, `.cpp`, `.h`, `.cs`, `.php`, `.rb`, `.go`, `.rs`, `.sh`, `.bat`, `.ps1`, `.sql`
+- **Subtitles:** `.srt`, `.vtt`, `.sub`
+- **Other:** `.po`, `.pot`, `.texi`, `.man`, `.nfo`, `.readme`, `.eml`, `.mbox`
+### Office Documents (requires `paraencoder[office]`)
+- **Microsoft Word:** `.docx`, `.docm`
+- **Microsoft Excel:** `.xlsx`, `.xlsm`
+- **OpenDocument:** `.odt`
 ## Usage
 ```python
 from para.detect import is_zawgyi, detect_encoding
@@ -65,6 +94,12 @@ Process a file in place (write to stdout by default):
 para convert --input input.txt --output output.txt
 ```
+Convert Office documents (requires `paraencoder[office]`):
+```bash
+para convert --input "Document.docx" --output "Document_Unicode.docx"
+para convert --input "Spreadsheet.xlsx" --output "Spreadsheet_Unicode.xlsx"
+```
 #### Windows / PowerShell note
 PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
 ```powershell

{paraencoder-0.1.1.dist-info → paraencoder-0.2.1.dist-info}/RECORD RENAMED Viewed

@@ -2,12 +2,13 @@ para/__init__.py,sha256=XgvX7tM1z4fLz6yEjcJJU4jW1OzR5SAUaXYuKwZ352s,319
 para/cli.py,sha256=_hZsUTXKAS_X1zO7GDM1zbNjexnJ8dfPsFpMUz9BJIg,3154
 para/convert.py,sha256=hpsqjjt8kgEnOfryw1sDYE6RsTX-INNm8hGuL1pqZeA,1370
 para/detect.py,sha256=rGask21S1ST1KwZnvPT-SFpODGXJ6-VAAkLfaalsKKk,1929
-para/io.py,sha256=jG-vB7y_x7dn-nHjMrygn3e9jz-FDsxRRtjJylCHDeA,1074
+para/handlers.py,sha256=3qZqJ_qHTqiSARx0eZQFY8WDcvM2jlJFqCrjE7UZHMQ,10773
+para/io.py,sha256=_XvEBzKegEhi6rJi-jK_yNr6QDasHFkVmHYdYX4ms3E,2256
 para/normalize.py,sha256=k4a8-OtYh-bbPAwGytpP92CwiX_R9QNZSDjdccSgYEM,784
 para/pyproject.toml,sha256=2qF-g_VqBwQzoE_gRs0Q9dNudPramF747dfAhGkdjH0,1056
 para/rules.py,sha256=U1uIxYW2Ag-Y8ZNa0DY5KsdLKNw5fwHztpYhjRo9GfA,9753
-paraencoder-0.1.1.dist-info/METADATA,sha256=62ZtYixYQ3T6pmvr6dKI8FBWoah_RPI_-p4tg0mZ7mo,5707
-paraencoder-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-paraencoder-0.1.1.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
-paraencoder-0.1.1.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
-paraencoder-0.1.1.dist-info/RECORD,,
+paraencoder-0.2.1.dist-info/METADATA,sha256=Nki2CNRYMjmZjCLvWiMXIVukH66tQeaJZLpb-SeS14w,7151
+paraencoder-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+paraencoder-0.2.1.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
+paraencoder-0.2.1.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
+paraencoder-0.2.1.dist-info/RECORD,,

{paraencoder-0.1.1.dist-info → paraencoder-0.2.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{paraencoder-0.1.1.dist-info → paraencoder-0.2.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{paraencoder-0.1.1.dist-info → paraencoder-0.2.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

paraencoder 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

paraencoder-0.1.1-py3-none-any.whl → paraencoder-0.2.1-py3-none-any.whl