paraencoder 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
para/handlers.py ADDED
@@ -0,0 +1,326 @@
1
+ """File format handlers for different document types."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from abc import ABC, abstractmethod
8
+ from pathlib import Path
9
+ from typing import Callable, Optional
10
+
11
+ # Optional imports - will be None if not installed
12
+ try:
13
+ import openpyxl
14
+ except ImportError:
15
+ openpyxl = None
16
+
17
+ try:
18
+ from docx import Document as DocxDocument
19
+ except ImportError:
20
+ DocxDocument = None
21
+
22
+ try:
23
+ from odf import text as odf_text
24
+ from odf.opendocument import load as odf_load
25
+ except ImportError:
26
+ odf_text = None
27
+ odf_load = None
28
+
29
+
30
+ # File extensions grouped by handler type
31
# Suffixes (lower-case, with leading dot) that are read and written as plain
# text without any format-specific handling.
PLAIN_TEXT_EXTENSIONS = {
    # Plain text
    ".asc", ".log", ".md", ".rst", ".text", ".txt",
    # Web/markup and structured text
    ".csv", ".htm", ".html", ".json", ".tsv", ".xhtml", ".xml",
    ".yaml", ".yml",
    # Documentation
    ".adoc", ".latex", ".mediawiki", ".org", ".tex", ".wiki",
    # Config files
    ".cfg", ".conf", ".env", ".ini", ".lock", ".properties", ".toml",
    # Source code
    ".bat", ".c", ".cpp", ".cs", ".go", ".h", ".java", ".js",
    ".php", ".ps1", ".py", ".rb", ".rs", ".sh", ".sql", ".ts",
    # Notes/mail
    ".eml", ".mbox", ".note",
    # Subtitles
    ".srt", ".sub", ".vtt",
    # Translation catalogs
    ".po", ".pot",
    # Other
    ".man", ".nfo", ".readme", ".texi",
}

# Suffixes handled by the format-specific (office) handlers.
DOCX_EXTENSIONS = {".docm", ".docx"}
XLSX_EXTENSIONS = {".xlsm", ".xlsx"}
ODT_EXTENSIONS = {".odt"}
RTF_EXTENSIONS = {".rtf"}
58
+
59
+
60
class FileHandler(ABC):
    """Abstract interface implemented by every file-format handler.

    Subclasses must provide ``read`` and ``convert``; ``can_handle`` may be
    overridden to advertise which paths the handler accepts.
    """

    @abstractmethod
    def read(self, path: Path) -> str:
        """Return the text content of the file at ``path``."""

    @abstractmethod
    def convert(
        self,
        input_path: Path,
        output_path: Path,
        converter: Callable[[str], str],
    ) -> None:
        """Run ``converter`` over the text of ``input_path`` and save to ``output_path``.

        The output keeps the input's file format; ``output_path`` may equal
        ``input_path`` for an in-place conversion.
        """

    @staticmethod
    def can_handle(path: Path) -> bool:
        """Report whether this handler accepts ``path``; the base class accepts nothing."""
        return False
82
+
83
+
84
class PlainTextHandler(FileHandler):
    """Handler that treats a file as a single block of encoded text."""

    def read(self, path: Path, encoding: str = "utf-8") -> str:
        """Return the whole file decoded with ``encoding``."""
        contents = path.read_text(encoding=encoding)
        return contents

    def convert(
        self,
        input_path: Path,
        output_path: Path,
        converter: Callable[[str], str],
        encoding: str = "utf-8",
    ) -> None:
        """Read ``input_path``, pass its text through ``converter``, write ``output_path``.

        The same ``encoding`` is used for reading and for writing.
        """
        output_path.write_text(
            converter(self.read(input_path, encoding=encoding)),
            encoding=encoding,
        )

    @staticmethod
    def can_handle(path: Path) -> bool:
        """Accept known plain-text suffixes and files without any suffix."""
        suffix = path.suffix
        if suffix == "":
            return True
        return suffix.lower() in PLAIN_TEXT_EXTENSIONS
104
+
105
+
106
class DocxHandler(FileHandler):
    """Handler for Microsoft Word .docx files.

    Depends on the optional python-docx package; constructing the handler
    raises ``ImportError`` when that package is not installed.
    """

    def __init__(self):
        if DocxDocument is None:
            raise ImportError(
                "python-docx is required for .docx support. "
                "Install with: pip install paraencoder[office]"
            )

    def read(self, path: Path) -> str:
        """Return the text of the body paragraphs, one per line.

        Only ``doc.paragraphs`` is read, so text that lives in tables,
        headers or footers is not part of the returned string.
        """
        doc = DocxDocument(str(path))
        return "\n".join(p.text for p in doc.paragraphs)

    @staticmethod
    def _convert_paragraphs(paragraphs, converter: Callable[[str], str]) -> None:
        """Apply ``converter`` to every non-empty run of ``paragraphs``.

        Working run by run keeps each run's character formatting intact.
        """
        for para in paragraphs:
            for run in para.runs:
                if run.text:
                    run.text = converter(run.text)

    def _convert_container(
        self,
        container,
        converter: Callable[[str], str],
        seen_cells: dict,
    ) -> None:
        """Convert the paragraphs and (nested) tables of a block container.

        ``container`` is anything exposing ``paragraphs`` and ``tables``:
        the document body, a table cell, a header or a footer.
        ``seen_cells`` maps ``id(tc)`` to the underlying cell element so a
        cell is converted only once.
        """
        self._convert_paragraphs(container.paragraphs, converter)
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    # A merged cell is returned once per grid position it
                    # spans; converting it each time would run the converter
                    # repeatedly over already-converted text.
                    tc = cell._tc
                    if id(tc) in seen_cells:
                        continue
                    # Keep a reference so the id cannot be recycled.
                    seen_cells[id(tc)] = tc
                    self._convert_container(cell, converter, seen_cells)

    def convert(
        self,
        input_path: Path,
        output_path: Path,
        converter: Callable[[str], str],
    ) -> None:
        """Convert all run text in the document and save it to ``output_path``.

        Covers body paragraphs, tables (including nested tables) and the
        header/footer definitions owned by each section.
        """
        doc = DocxDocument(str(input_path))
        seen_cells: dict = {}

        self._convert_container(doc, converter, seen_cells)

        for section in doc.sections:
            parts = (
                section.header,
                section.first_page_header,
                section.even_page_header,
                section.footer,
                section.first_page_footer,
                section.even_page_footer,
            )
            for part in parts:
                # A linked header/footer has no definition of its own: its
                # content belongs to an earlier section and is converted
                # there. Touching it here would convert that text again, or
                # create an empty definition on the first section.
                if part.is_linked_to_previous:
                    continue
                self._convert_container(part, converter, seen_cells)

        doc.save(str(output_path))

    @staticmethod
    def can_handle(path: Path) -> bool:
        """Accept paths whose suffix is a known Word extension."""
        return path.suffix.lower() in DOCX_EXTENSIONS
164
+
165
+
166
class XlsxHandler(FileHandler):
    """Handler for Microsoft Excel .xlsx/.xlsm files.

    Depends on the optional openpyxl package; constructing the handler
    raises ``ImportError`` when that package is not installed.
    """

    # Worksheet attributes that hold a header/footer with left/center/right parts.
    _HEADER_FOOTER_ATTRS = (
        "oddHeader",
        "oddFooter",
        "evenHeader",
        "evenFooter",
        "firstHeader",
        "firstFooter",
    )

    def __init__(self):
        if openpyxl is None:
            raise ImportError(
                "openpyxl is required for .xlsx support. "
                "Install with: pip install paraencoder[office]"
            )

    def read(self, path: Path) -> str:
        """Return every non-empty string cell value, one per line.

        The workbook is loaded with ``data_only=True``, so formula cells
        contribute their cached result rather than the formula text.
        """
        wb = openpyxl.load_workbook(str(path), data_only=True)
        lines = [
            cell.value
            for sheet in wb.worksheets
            for row in sheet.iter_rows()
            for cell in row
            if cell.value and isinstance(cell.value, str)
        ]
        return "\n".join(lines)

    def convert(
        self,
        input_path: Path,
        output_path: Path,
        converter: Callable[[str], str],
    ) -> None:
        """Convert string cells, comments, headers/footers and sheet titles.

        The converted workbook is saved to ``output_path``.
        """
        # Without keep_vba the macro project of an .xlsm file is dropped on
        # save, leaving a macro-enabled extension on a macro-less workbook.
        keep_vba = input_path.suffix.lower() == ".xlsm"
        wb = openpyxl.load_workbook(str(input_path), keep_vba=keep_vba)

        for sheet in wb.worksheets:
            # One pass over the grid handles values and comments. The value
            # of a merged range lives in its top-left cell, which this
            # iteration already visits, so merged ranges need no second pass
            # (a second pass would convert the same text twice).
            for row in sheet.iter_rows():
                for cell in row:
                    value = cell.value
                    if value and isinstance(value, str):
                        cell.value = converter(value)
                    comment = cell.comment
                    if comment and comment.text:
                        comment.text = converter(comment.text)

            # Headers and footers: every variant, every part.
            for attr in self._HEADER_FOOTER_ATTRS:
                header_footer = getattr(sheet, attr, None)
                if header_footer is None:
                    continue
                for part in (
                    header_footer.left,
                    header_footer.center,
                    header_footer.right,
                ):
                    if part is not None and part.text:
                        part.text = converter(part.text)

            # Sheet name; only assign when it actually changes.
            original_title = sheet.title
            converted_title = converter(original_title)
            if converted_title != original_title:
                sheet.title = converted_title

        wb.save(str(output_path))

    @staticmethod
    def can_handle(path: Path) -> bool:
        """Accept paths whose suffix is a known Excel extension."""
        return path.suffix.lower() in XLSX_EXTENSIONS
240
+
241
+
242
class OdtHandler(FileHandler):
    """Handler for OpenDocument .odt files.

    Depends on the optional odfpy package; constructing the handler raises
    ``ImportError`` when that package is not installed.
    """

    def __init__(self):
        if odf_load is None:
            raise ImportError(
                "odfpy is required for .odt support. "
                "Install with: pip install paraencoder[office]"
            )

    def _get_text_elements(self, element):
        """Return all text nodes below ``element`` in document order.

        The tree is walked recursively through ``childNodes``. Node kinds
        are told apart by the ``Text`` class only: odfpy's ``odf.text.P``,
        ``H`` and ``Span`` are factory functions rather than classes, so
        they cannot be used with ``isinstance``.
        """
        from odf.element import Text

        elements = []
        for child in element.childNodes:
            if isinstance(child, Text):
                elements.append(child)
            elif hasattr(child, "childNodes"):
                elements.extend(self._get_text_elements(child))
        return elements

    def read(self, path: Path) -> str:
        """Return the text of every paragraph element, one per line.

        NOTE(review): only ``text:p`` elements are collected here; headings
        (``text:h``) appear to be left out - confirm whether that is intended.
        """
        doc = odf_load(str(path))
        return "\n".join(
            str(para) for para in doc.getElementsByType(odf_text.P)
        )

    def convert(
        self,
        input_path: Path,
        output_path: Path,
        converter: Callable[[str], str],
    ) -> None:
        """Convert every text node under the document body and save the result.

        Text nodes are rewritten in place, leaving the surrounding element
        structure untouched.
        """
        doc = odf_load(str(input_path))

        for text_node in self._get_text_elements(doc.body):
            if text_node.data:
                text_node.data = converter(text_node.data)

        doc.save(str(output_path))

    @staticmethod
    def can_handle(path: Path) -> bool:
        """Accept paths whose suffix is a known OpenDocument text extension."""
        return path.suffix.lower() in ODT_EXTENSIONS
293
+
294
+
295
def get_handler(path: Path) -> FileHandler:
    """Return a handler instance chosen by the suffix of ``path``.

    Word, Excel and OpenDocument suffixes map to their dedicated handlers;
    every other suffix (including none at all) falls back to plain text.
    Constructing an office handler raises ``ImportError`` when its optional
    dependency is missing.
    """
    suffix = Path(path).suffix.lower()

    dispatch = (
        (DOCX_EXTENSIONS, DocxHandler),
        (XLSX_EXTENSIONS, XlsxHandler),
        (ODT_EXTENSIONS, OdtHandler),
    )
    for extensions, handler_cls in dispatch:
        if suffix in extensions:
            return handler_cls()

    # Known plain-text suffixes, suffix-less files and unknown suffixes all
    # end up here: plain text is the fallback.
    return PlainTextHandler()
311
+
312
+
313
def get_supported_extensions() -> set[str]:
    """Return a new set with every suffix that has a registered handler.

    The result combines the plain-text, Word, Excel and OpenDocument suffix
    sets; mutating it does not affect the module-level constants.
    """
    return {
        *PLAIN_TEXT_EXTENSIONS,
        *DOCX_EXTENSIONS,
        *XLSX_EXTENSIONS,
        *ODT_EXTENSIONS,
    }
320
+
321
+
322
def is_supported(path: Path) -> bool:
    """Report whether the suffix of ``path`` is supported.

    Files without a suffix count as supported; suffix matching is
    case-insensitive.
    """
    suffix = Path(path).suffix.lower()
    if suffix == "":
        return True
    return suffix in get_supported_extensions()
para/io.py CHANGED
@@ -6,16 +6,19 @@ from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
8
  from para.convert import zg_to_unicode
9
+ from para.handlers import get_handler, is_supported, PlainTextHandler
9
10
 
10
11
 
11
12
  DEFAULT_ENCODING = "utf-8"
12
13
 
13
14
 
14
15
def read_text(path: str, *, encoding: str = DEFAULT_ENCODING) -> str:
    """Return the contents of the plain-text file at ``path``, decoded with ``encoding``."""
    source = Path(path)
    return source.read_text(encoding=encoding)
16
18
 
17
19
 
18
20
def write_text(path: str, data: str, *, encoding: str = DEFAULT_ENCODING) -> None:
    """Write ``data`` to the plain-text file at ``path`` using ``encoding``."""
    target = Path(path)
    target.write_text(data, encoding=encoding)
20
23
 
21
24
 
@@ -30,13 +33,37 @@ def convert_file(
30
33
  """
31
34
  Convert a file from Zawgyi to Unicode and write the result.
32
35
 
33
- Returns the converted text. When ``output_path`` is None, the caller can
34
- capture the returned string.
36
+ Supports multiple file formats:
37
+ - Plain text files (.txt, .md, .csv, .json, .xml, .html, etc.)
38
+ - Microsoft Word (.docx) - requires: pip install paraencoder[office]
39
+ - Microsoft Excel (.xlsx) - requires: pip install paraencoder[office]
40
+ - OpenDocument (.odt) - requires: pip install paraencoder[office]
41
+
42
+ Returns the converted text. When ``output_path`` is None for plain text
43
+ files, the caller can capture the returned string. For binary formats
44
+ like .docx and .xlsx, output_path is required.
35
45
  """
36
- data = read_text(input_path, encoding=encoding)
37
- converted = zg_to_unicode(data, normalize=normalize, force=assume_zawgyi)
46
+ input_p = Path(input_path)
47
+ handler = get_handler(input_p)
48
+
49
+ # Create converter function
50
+ def converter(text: str) -> str:
51
+ return zg_to_unicode(text, normalize=normalize, force=assume_zawgyi)
52
+
53
+ # For plain text, we can return the string
54
+ if isinstance(handler, PlainTextHandler):
55
+ data = handler.read(input_p, encoding=encoding)
56
+ converted = converter(data)
57
+ if output_path:
58
+ Path(output_path).write_text(converted, encoding=encoding)
59
+ return converted
60
+ else:
61
+ # Binary formats require output path
62
+ if not output_path:
63
+ output_path = input_path # Overwrite in place
64
+
65
+ handler.convert(input_p, Path(output_path), converter)
38
66
 
39
- if output_path:
40
- write_text(output_path, converted, encoding=encoding)
67
+ # Return text content for display
68
+ return handler.read(Path(output_path))
41
69
 
42
- return converted
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paraencoder
3
- Version: 0.1.1
3
+ Version: 0.2.1
4
4
  Summary: Burmese text detection and conversion toolkit for Zawgyi and Unicode
5
5
  Project-URL: Homepage, https://github.com/Laitei40/ParaEncoder
6
6
  Project-URL: Repository, https://github.com/Laitei40/ParaEncoder
@@ -15,6 +15,14 @@ Classifier: Programming Language :: Python :: 3
15
15
  Classifier: Programming Language :: Python :: 3 :: Only
16
16
  Classifier: Topic :: Text Processing :: Linguistic
17
17
  Requires-Python: >=3.9
18
+ Provides-Extra: all
19
+ Requires-Dist: odfpy>=1.4; extra == 'all'
20
+ Requires-Dist: openpyxl>=3.1; extra == 'all'
21
+ Requires-Dist: python-docx>=1.0; extra == 'all'
22
+ Provides-Extra: office
23
+ Requires-Dist: odfpy>=1.4; extra == 'office'
24
+ Requires-Dist: openpyxl>=3.1; extra == 'office'
25
+ Requires-Dist: python-docx>=1.0; extra == 'office'
18
26
  Provides-Extra: test
19
27
  Requires-Dist: pytest>=7; extra == 'test'
20
28
  Description-Content-Type: text/markdown
@@ -37,6 +45,27 @@ Para is a small, boring, and transparent toolkit for working with Burmese text.
37
45
  pip install paraencoder
38
46
  ```
39
47
 
48
+ For Office document support (.docx, .xlsx, .odt):
49
+ ```bash
50
+ pip install paraencoder[office]
51
+ ```
52
+
53
+ ## Supported File Formats
54
+
55
+ ### Plain Text (built-in, no extra dependencies)
56
+ - **Text files:** `.txt`, `.text`, `.log`, `.md`, `.rst`, `.asc`
57
+ - **Web/markup:** `.html`, `.htm`, `.xhtml`, `.xml`, `.json`, `.yaml`, `.yml`, `.csv`, `.tsv`
58
+ - **Documentation:** `.tex`, `.latex`, `.adoc`, `.org`, `.wiki`, `.mediawiki`
59
+ - **Config:** `.ini`, `.cfg`, `.conf`, `.properties`, `.env`, `.toml`, `.lock`
60
+ - **Source code:** `.py`, `.js`, `.ts`, `.java`, `.c`, `.cpp`, `.h`, `.cs`, `.php`, `.rb`, `.go`, `.rs`, `.sh`, `.bat`, `.ps1`, `.sql`
61
+ - **Subtitles:** `.srt`, `.vtt`, `.sub`
62
+ - **Other:** `.po`, `.pot`, `.texi`, `.man`, `.nfo`, `.readme`, `.eml`, `.mbox`
63
+
64
+ ### Office Documents (requires `paraencoder[office]`)
65
+ - **Microsoft Word:** `.docx`, `.docm`
66
+ - **Microsoft Excel:** `.xlsx`, `.xlsm`
67
+ - **OpenDocument:** `.odt`
68
+
40
69
  ## Usage
41
70
  ```python
42
71
  from para.detect import is_zawgyi, detect_encoding
@@ -65,6 +94,12 @@ Process a file in place (write to stdout by default):
65
94
  para convert --input input.txt --output output.txt
66
95
  ```
67
96
 
97
+ Convert Office documents (requires `paraencoder[office]`):
98
+ ```bash
99
+ para convert --input "Document.docx" --output "Document_Unicode.docx"
100
+ para convert --input "Spreadsheet.xlsx" --output "Spreadsheet_Unicode.xlsx"
101
+ ```
102
+
68
103
  #### Windows / PowerShell note
69
104
  PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
70
105
  ```powershell
@@ -2,12 +2,13 @@ para/__init__.py,sha256=XgvX7tM1z4fLz6yEjcJJU4jW1OzR5SAUaXYuKwZ352s,319
2
2
  para/cli.py,sha256=_hZsUTXKAS_X1zO7GDM1zbNjexnJ8dfPsFpMUz9BJIg,3154
3
3
  para/convert.py,sha256=hpsqjjt8kgEnOfryw1sDYE6RsTX-INNm8hGuL1pqZeA,1370
4
4
  para/detect.py,sha256=rGask21S1ST1KwZnvPT-SFpODGXJ6-VAAkLfaalsKKk,1929
5
- para/io.py,sha256=jG-vB7y_x7dn-nHjMrygn3e9jz-FDsxRRtjJylCHDeA,1074
5
+ para/handlers.py,sha256=3qZqJ_qHTqiSARx0eZQFY8WDcvM2jlJFqCrjE7UZHMQ,10773
6
+ para/io.py,sha256=_XvEBzKegEhi6rJi-jK_yNr6QDasHFkVmHYdYX4ms3E,2256
6
7
  para/normalize.py,sha256=k4a8-OtYh-bbPAwGytpP92CwiX_R9QNZSDjdccSgYEM,784
7
8
  para/pyproject.toml,sha256=2qF-g_VqBwQzoE_gRs0Q9dNudPramF747dfAhGkdjH0,1056
8
9
  para/rules.py,sha256=U1uIxYW2Ag-Y8ZNa0DY5KsdLKNw5fwHztpYhjRo9GfA,9753
9
- paraencoder-0.1.1.dist-info/METADATA,sha256=62ZtYixYQ3T6pmvr6dKI8FBWoah_RPI_-p4tg0mZ7mo,5707
10
- paraencoder-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
11
- paraencoder-0.1.1.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
12
- paraencoder-0.1.1.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
13
- paraencoder-0.1.1.dist-info/RECORD,,
10
+ paraencoder-0.2.1.dist-info/METADATA,sha256=Nki2CNRYMjmZjCLvWiMXIVukH66tQeaJZLpb-SeS14w,7151
11
+ paraencoder-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
12
+ paraencoder-0.2.1.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
13
+ paraencoder-0.2.1.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
14
+ paraencoder-0.2.1.dist-info/RECORD,,