openconvert 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,596 @@
1
+ """
2
+ Document converter module for handling document format conversions.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ import tempfile
8
+ from pathlib import Path
9
+ from typing import Union, Optional, Dict, Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
# Conversion matrix: each key is a source format, each value lists the target
# formats that convert() accepts for that source.
SUPPORTED_CONVERSIONS = {
    "txt": ["pdf", "docx", "rtf", "md", "html", "csv"],
    "docx": ["pdf", "txt", "rtf", "html", "md", "epub"],
    "pdf": ["docx", "txt", "jpg", "png", "epub", "html"],
    "html": ["pdf", "docx", "txt", "md"],
    "md": ["pdf", "docx", "html", "txt"],
    "rtf": ["docx", "pdf", "txt"],
    "csv": ["xlsx", "json", "txt", "xml", "sql"],
    "xlsx": ["csv", "json", "xml", "sql"],
    "epub": ["pdf", "docx", "txt", "html"],
}
25
+
26
def convert(
    filepath: Union[str, Path],
    source_format: str,
    target_format: str,
    output_path: Union[str, Path],
    options: Optional[Dict[str, Any]] = None
) -> str:
    """
    Convert a document from one format to another.

    The (source_format, target_format) pair is validated against
    SUPPORTED_CONVERSIONS, then the work is delegated to the matching
    ``_convert_from_<source_format>`` helper.

    Args:
        filepath: Path to the source document file
        source_format: Source document format
        target_format: Target document format
        output_path: Path to save the converted document
        options: Additional conversion options

    Returns:
        Path to the converted document

    Raises:
        ValueError: If the conversion is not supported
        RuntimeError: If the conversion fails
    """
    if options is None:
        options = {}

    # Check if conversion is supported
    if target_format not in SUPPORTED_CONVERSIONS.get(source_format, []):
        raise ValueError(f"Conversion from {source_format} to {target_format} is not supported")

    # Route to the appropriate conversion helper.
    converters = {
        'txt': _convert_from_txt,
        'docx': _convert_from_docx,
        'pdf': _convert_from_pdf,
        'html': _convert_from_html,
        'md': _convert_from_md,
        'rtf': _convert_from_rtf,
        'csv': _convert_from_csv,
        'xlsx': _convert_from_xlsx,
        'epub': _convert_from_epub,
    }
    converter = converters.get(source_format)
    if converter is None:
        # Raised outside the try block so it surfaces as the documented
        # ValueError instead of being re-wrapped as RuntimeError.
        raise ValueError(f"Unsupported source format: {source_format}")

    filepath = Path(filepath)
    output_path = Path(output_path)

    try:
        return converter(filepath, target_format, output_path, options)
    except Exception as e:
        logger.error("Error converting %s to %s: %s", filepath, target_format, e)
        # Chain the original exception so the root cause stays attached.
        raise RuntimeError(f"Failed to convert {filepath} to {target_format}: {str(e)}") from e
86
+
87
+ def _convert_from_txt(
88
+ filepath: Path,
89
+ target_format: str,
90
+ output_path: Path,
91
+ options: Dict[str, Any]
92
+ ) -> str:
93
+ """Convert from TXT to other formats."""
94
+ if target_format == 'pdf':
95
+ try:
96
+ from reportlab.lib.pagesizes import letter
97
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
98
+ from reportlab.lib.styles import getSampleStyleSheet
99
+
100
+ # Read the text file
101
+ with open(filepath, 'r', encoding='utf-8') as f:
102
+ text_content = f.read()
103
+
104
+ # Create PDF
105
+ doc = SimpleDocTemplate(str(output_path), pagesize=letter)
106
+ styles = getSampleStyleSheet()
107
+ story = []
108
+
109
+ # Split text into paragraphs
110
+ paragraphs = text_content.split('\n\n')
111
+ for para in paragraphs:
112
+ if para.strip():
113
+ p = Paragraph(para.replace('\n', '<br/>'), styles['Normal'])
114
+ story.append(p)
115
+ story.append(Spacer(1, 12))
116
+
117
+ doc.build(story)
118
+
119
+ except ImportError:
120
+ # Alternative method using pandoc
121
+ _convert_using_pandoc(filepath, target_format, output_path)
122
+
123
+ elif target_format == 'docx':
124
+ try:
125
+ from docx import Document
126
+
127
+ # Read the text file
128
+ with open(filepath, 'r', encoding='utf-8') as f:
129
+ text_content = f.read()
130
+
131
+ # Create DOCX
132
+ doc = Document()
133
+
134
+ # Split text into paragraphs
135
+ paragraphs = text_content.split('\n\n')
136
+ for para in paragraphs:
137
+ if para.strip():
138
+ doc.add_paragraph(para)
139
+
140
+ doc.save(str(output_path))
141
+
142
+ except ImportError:
143
+ # Alternative method using pandoc
144
+ _convert_using_pandoc(filepath, target_format, output_path)
145
+
146
+ elif target_format == 'rtf':
147
+ # Direct conversion to RTF without pandoc
148
+ try:
149
+ with open(filepath, 'r', encoding='utf-8') as f:
150
+ text_content = f.read()
151
+
152
+ # Create a simple RTF file
153
+ rtf_header = r"{\rtf1\ansi\ansicpg1252\cocoartf2580\cocoasubrtf220"
154
+ rtf_header += r"{\fonttbl\f0\fswiss\fcharset0 Helvetica;}"
155
+ rtf_header += r"{\colortbl;\red255\green255\blue255;}"
156
+ rtf_header += r"{\*\expandedcolortbl;;}"
157
+ rtf_header += r"\margl1440\margr1440\vieww11520\viewh8400\viewkind0"
158
+ rtf_header += r"\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0"
159
+ rtf_header += r"\f0\fs24 \cf0 "
160
+
161
+ # Prepare content for RTF
162
+ rtf_content = text_content.replace("\n", "\\par\n")
163
+ rtf_content = rtf_content.replace("\\", "\\\\")
164
+ rtf_content = rtf_content.replace("{", "\\{")
165
+ rtf_content = rtf_content.replace("}", "\\}")
166
+
167
+ # Write the RTF file
168
+ with open(output_path, 'w', encoding='utf-8') as f:
169
+ f.write(rtf_header + rtf_content + "}")
170
+
171
+ except Exception as e:
172
+ logger.warning(f"Direct RTF conversion failed: {str(e)}. Trying with pandoc...")
173
+ # Fallback to pandoc
174
+ try:
175
+ _convert_using_pandoc(filepath, target_format, output_path)
176
+ except Exception:
177
+ # Create a very simple RTF as a last resort
178
+ with open(output_path, 'w', encoding='utf-8') as f:
179
+ f.write(r'{\rtf1\ansi\deff0{\fonttbl{\f0 Times New Roman;}}{\colortbl;\red0\green0\blue0;}\f0\fs24\cf1 ')
180
+ with open(filepath, 'r', encoding='utf-8') as in_f:
181
+ for line in in_f:
182
+ f.write(line.replace('\n', '\\par\n'))
183
+ f.write('}')
184
+
185
+ elif target_format in ['html', 'md']:
186
+ # Use pandoc for these conversions
187
+ _convert_using_pandoc(filepath, target_format, output_path)
188
+
189
+ elif target_format == 'csv':
190
+ # Simple conversion - each line becomes a row with a single column
191
+ with open(filepath, 'r', encoding='utf-8') as f:
192
+ text_content = f.read()
193
+
194
+ with open(output_path, 'w', encoding='utf-8', newline='') as f:
195
+ import csv
196
+ writer = csv.writer(f)
197
+ for line in text_content.split('\n'):
198
+ writer.writerow([line])
199
+
200
+ return str(output_path)
201
+
202
+ def _convert_from_docx(
203
+ filepath: Path,
204
+ target_format: str,
205
+ output_path: Path,
206
+ options: Dict[str, Any]
207
+ ) -> str:
208
+ """Convert from DOCX to other formats."""
209
+ if target_format == 'txt':
210
+ try:
211
+ from docx import Document
212
+
213
+ doc = Document(filepath)
214
+ with open(output_path, 'w', encoding='utf-8') as f:
215
+ for para in doc.paragraphs:
216
+ f.write(para.text + '\n')
217
+ if para.text.strip() == '':
218
+ f.write('\n') # Add extra newline for paragraph breaks
219
+
220
+ except ImportError:
221
+ # Alternative method using pandoc
222
+ _convert_using_pandoc(filepath, target_format, output_path)
223
+
224
+ elif target_format in ['pdf', 'html', 'md', 'rtf', 'epub']:
225
+ # Use pandoc for these conversions
226
+ _convert_using_pandoc(filepath, target_format, output_path)
227
+
228
+ return str(output_path)
229
+
230
+ def _convert_from_pdf(
231
+ filepath: Path,
232
+ target_format: str,
233
+ output_path: Path,
234
+ options: Dict[str, Any]
235
+ ) -> str:
236
+ """Convert from PDF to other formats."""
237
+ if target_format == 'txt':
238
+ try:
239
+ import PyPDF2
240
+
241
+ with open(filepath, 'rb') as f:
242
+ pdf_reader = PyPDF2.PdfReader(f)
243
+ text = ''
244
+ for page_num in range(len(pdf_reader.pages)):
245
+ text += pdf_reader.pages[page_num].extract_text() + '\n\n'
246
+
247
+ with open(output_path, 'w', encoding='utf-8') as f:
248
+ f.write(text)
249
+
250
+ except ImportError:
251
+ # Alternative method using pdfminer
252
+ try:
253
+ from pdfminer.high_level import extract_text
254
+
255
+ text = extract_text(str(filepath))
256
+ with open(output_path, 'w', encoding='utf-8') as f:
257
+ f.write(text)
258
+
259
+ except ImportError:
260
+ # Last resort: try pandoc
261
+ _convert_using_pandoc(filepath, target_format, output_path)
262
+
263
+ elif target_format in ['docx', 'html', 'epub']:
264
+ # Try using pandoc
265
+ _convert_using_pandoc(filepath, target_format, output_path)
266
+
267
+ elif target_format in ['jpg', 'png']:
268
+ # Convert PDF to image
269
+ try:
270
+ from pdf2image import convert_from_path
271
+
272
+ images = convert_from_path(filepath)
273
+
274
+ # If there's only one page, save it directly
275
+ if len(images) == 1:
276
+ images[0].save(str(output_path))
277
+ else:
278
+ # If there are multiple pages, save them with sequential numbering
279
+ base_path = output_path.with_suffix('')
280
+ for i, image in enumerate(images):
281
+ page_path = f"{base_path}_page_{i+1}{output_path.suffix}"
282
+ image.save(page_path)
283
+
284
+ # Return the directory containing all pages
285
+ return str(base_path.parent)
286
+
287
+ except ImportError:
288
+ raise RuntimeError("pdf2image library is required for PDF to image conversion. Please install it.")
289
+
290
+ return str(output_path)
291
+
292
+ def _convert_from_html(
293
+ filepath: Path,
294
+ target_format: str,
295
+ output_path: Path,
296
+ options: Dict[str, Any]
297
+ ) -> str:
298
+ """Convert from HTML to other formats."""
299
+ if target_format == 'txt':
300
+ try:
301
+ from bs4 import BeautifulSoup
302
+
303
+ with open(filepath, 'r', encoding='utf-8') as f:
304
+ soup = BeautifulSoup(f, 'html.parser')
305
+ text = soup.get_text(separator='\n\n')
306
+
307
+ with open(output_path, 'w', encoding='utf-8') as f:
308
+ f.write(text)
309
+
310
+ except ImportError:
311
+ # Alternative method using pandoc
312
+ _convert_using_pandoc(filepath, target_format, output_path)
313
+
314
+ elif target_format == 'pdf':
315
+ # Try multiple methods for HTML to PDF conversion
316
+ success = False
317
+
318
+ # Method 1: Try using pdfkit (wkhtmltopdf)
319
+ try:
320
+ import pdfkit
321
+
322
+ # Configure pdfkit
323
+ options_dict = {
324
+ 'encoding': 'UTF-8',
325
+ 'quiet': ''
326
+ }
327
+
328
+ # Add any user-provided options
329
+ if options:
330
+ for key, value in options.items():
331
+ if key not in ['quality', 'resize']: # Skip general options
332
+ options_dict[key] = value
333
+
334
+ pdfkit.from_file(str(filepath), str(output_path), options=options_dict)
335
+ success = True
336
+ logger.info("Converted HTML to PDF using pdfkit")
337
+
338
+ except (ImportError, Exception) as e:
339
+ logger.warning(f"pdfkit conversion failed: {str(e)}. Trying next method...")
340
+
341
+ # Method 2: Try using weasyprint
342
+ if not success:
343
+ try:
344
+ from weasyprint import HTML
345
+
346
+ HTML(filename=str(filepath)).write_pdf(str(output_path))
347
+ success = True
348
+ logger.info("Converted HTML to PDF using weasyprint")
349
+
350
+ except (ImportError, Exception) as e:
351
+ logger.warning(f"weasyprint conversion failed: {str(e)}. Trying next method...")
352
+
353
+ # Method 3: Try using pandoc as a last resort
354
+ if not success:
355
+ try:
356
+ _convert_using_pandoc(filepath, target_format, output_path)
357
+ logger.info("Converted HTML to PDF using pandoc")
358
+ success = True
359
+ except Exception as e:
360
+ logger.error(f"All HTML to PDF conversion methods failed: {str(e)}")
361
+ raise RuntimeError(f"Failed to convert HTML to PDF. Please install pdfkit, weasyprint, or pandoc with LaTeX.")
362
+
363
+ elif target_format == 'docx' or target_format == 'md':
364
+ # Use pandoc for these conversions
365
+ _convert_using_pandoc(filepath, target_format, output_path)
366
+
367
+ return str(output_path)
368
+
369
+ def _convert_from_md(
370
+ filepath: Path,
371
+ target_format: str,
372
+ output_path: Path,
373
+ options: Dict[str, Any]
374
+ ) -> str:
375
+ """Convert from Markdown to other formats."""
376
+ if target_format == 'txt':
377
+ # Simple conversion - just strip markdown syntax
378
+ with open(filepath, 'r', encoding='utf-8') as f:
379
+ md_content = f.read()
380
+
381
+ # Very basic markdown stripping
382
+ import re
383
+
384
+ # Remove headers
385
+ text = re.sub(r'^#+\s+', '', md_content, flags=re.MULTILINE)
386
+
387
+ # Remove bold/italic
388
+ text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
389
+ text = re.sub(r'\*(.*?)\*', r'\1', text)
390
+
391
+ # Remove links
392
+ text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
393
+
394
+ # Remove code blocks
395
+ text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
396
+
397
+ with open(output_path, 'w', encoding='utf-8') as f:
398
+ f.write(text)
399
+
400
+ elif target_format in ['pdf', 'docx', 'html']:
401
+ # Use pandoc for these conversions
402
+ _convert_using_pandoc(filepath, target_format, output_path)
403
+
404
+ return str(output_path)
405
+
406
def _convert_from_rtf(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert an RTF document to ``target_format`` via pandoc.

    Every RTF conversion is delegated to pandoc; ``options`` is accepted
    for signature parity with the other converters and is not used.

    Returns:
        ``str(output_path)``.
    """
    destination = str(output_path)
    _convert_using_pandoc(filepath, target_format, output_path)
    return destination
416
+
417
+ def _convert_from_csv(
418
+ filepath: Path,
419
+ target_format: str,
420
+ output_path: Path,
421
+ options: Dict[str, Any]
422
+ ) -> str:
423
+ """Convert from CSV to other formats."""
424
+ if target_format == 'xlsx':
425
+ try:
426
+ import pandas as pd
427
+
428
+ df = pd.read_csv(filepath)
429
+ df.to_excel(output_path, index=False)
430
+
431
+ except ImportError:
432
+ raise RuntimeError("pandas library is required for CSV to XLSX conversion. Please install it.")
433
+
434
+ elif target_format == 'json':
435
+ try:
436
+ import pandas as pd
437
+
438
+ df = pd.read_csv(filepath)
439
+ df.to_json(output_path, orient='records', indent=4)
440
+
441
+ except ImportError:
442
+ # Alternative method using csv and json modules
443
+ import csv
444
+ import json
445
+
446
+ with open(filepath, 'r', encoding='utf-8', newline='') as f:
447
+ reader = csv.DictReader(f)
448
+ data = list(reader)
449
+
450
+ with open(output_path, 'w', encoding='utf-8') as f:
451
+ json.dump(data, f, indent=4)
452
+
453
+ elif target_format == 'txt':
454
+ # Simple conversion - just copy the CSV content
455
+ with open(filepath, 'r', encoding='utf-8') as f:
456
+ content = f.read()
457
+
458
+ with open(output_path, 'w', encoding='utf-8') as f:
459
+ f.write(content)
460
+
461
+ elif target_format == 'xml':
462
+ try:
463
+ import pandas as pd
464
+ import dicttoxml
465
+
466
+ df = pd.read_csv(filepath)
467
+ data = df.to_dict(orient='records')
468
+ xml = dicttoxml.dicttoxml(data, custom_root='data', attr_type=False)
469
+
470
+ with open(output_path, 'wb') as f:
471
+ f.write(xml)
472
+
473
+ except ImportError:
474
+ raise RuntimeError("pandas and dicttoxml libraries are required for CSV to XML conversion. Please install them.")
475
+
476
+ elif target_format == 'sql':
477
+ try:
478
+ import pandas as pd
479
+ import sqlite3
480
+
481
+ df = pd.read_csv(filepath)
482
+
483
+ # Create a temporary SQLite database
484
+ conn = sqlite3.connect(str(output_path))
485
+
486
+ # Get table name from file name or use default
487
+ table_name = options.get('table_name', filepath.stem)
488
+
489
+ # Write to SQL
490
+ df.to_sql(table_name, conn, index=False)
491
+ conn.close()
492
+
493
+ except ImportError:
494
+ raise RuntimeError("pandas library is required for CSV to SQL conversion. Please install it.")
495
+
496
+ return str(output_path)
497
+
498
def _convert_from_xlsx(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from XLSX to other formats.

    The workbook is read with pandas and then written as 'csv', 'json'
    (records), 'xml' (via dicttoxml) or 'sql' (a table inside an SQLite
    database file at ``output_path``). Any other target writes nothing.

    Args:
        filepath: Path of the XLSX file to read.
        target_format: Target document format.
        output_path: Path the converted document is written to.
        options: Additional conversion options. ``sheet_name`` selects the
            sheet to read (default: the first sheet, index 0);
            ``table_name`` names the table for the 'sql' target (defaults
            to the source file stem).

    Returns:
        ``str(output_path)``.

    Raises:
        RuntimeError: If pandas (or, for 'xml', dicttoxml) cannot be
            imported.
    """
    try:
        import pandas as pd

        # Read the Excel file
        df = pd.read_excel(filepath, sheet_name=options.get('sheet_name', 0))

        if target_format == 'csv':
            df.to_csv(output_path, index=False)

        elif target_format == 'json':
            df.to_json(output_path, orient='records', indent=4)

        elif target_format == 'xml':
            try:
                import dicttoxml
            except ImportError as exc:
                # Report the library that is actually missing rather than
                # falling through to the generic pandas message below.
                raise RuntimeError("dicttoxml library is required for XLSX to XML conversion. Please install it.") from exc

            data = df.to_dict(orient='records')
            xml = dicttoxml.dicttoxml(data, custom_root='data', attr_type=False)

            with open(output_path, 'wb') as f:
                f.write(xml)

        elif target_format == 'sql':
            import sqlite3

            # The output file itself is the SQLite database.
            conn = sqlite3.connect(str(output_path))
            try:
                # Get table name from options or fall back to the file name
                table_name = options.get('table_name', filepath.stem)
                df.to_sql(table_name, conn, index=False)
            finally:
                # Close even when to_sql raises so the handle is not leaked.
                conn.close()

    except ImportError as exc:
        raise RuntimeError("pandas library is required for XLSX conversions. Please install it.") from exc

    return str(output_path)
543
+
544
def _convert_from_epub(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert an EPUB document to ``target_format`` via pandoc.

    Every EPUB conversion is delegated to pandoc; ``options`` is accepted
    for signature parity with the other converters and is not used.

    Returns:
        ``str(output_path)``.
    """
    destination = str(output_path)
    _convert_using_pandoc(filepath, target_format, output_path)
    return destination
554
+
555
def _convert_using_pandoc(
    filepath: Path,
    target_format: str,
    output_path: Path
) -> None:
    """Use pandoc for document conversion.

    The pandoc input format is derived from the extension of ``filepath``;
    the output format comes from ``target_format``.

    Args:
        filepath: Path of the source document.
        target_format: Target document format (this module's format name).
        output_path: Path pandoc writes the result to.

    Raises:
        RuntimeError: If the pandoc executable cannot be started or exits
            with a non-zero status.
    """
    # Imported outside the try block so the except clause below can always
    # resolve ``subprocess.SubprocessError``.
    import subprocess

    # Map our format names to pandoc reader names.
    source_format_map = {
        'txt': 'markdown',  # Use markdown as the input format for .txt files
        'md': 'markdown',
        'html': 'html',
        'docx': 'docx',
        'pdf': 'pdf',
        'rtf': 'rtf',
        'epub': 'epub'
    }
    # Writer names differ from reader names for plain text: a 'txt' target
    # must use pandoc's 'plain' writer, not 'markdown'.
    target_format_map = dict(source_format_map, txt='plain')

    source_ext = filepath.suffix.lower().lstrip('.')
    if source_ext == 'jpeg':
        source_ext = 'jpg'

    pandoc_source = source_format_map.get(source_ext, source_ext)
    pandoc_target = target_format_map.get(target_format, target_format)

    cmd = [
        'pandoc',
        '-f', pandoc_source,
        '-t', pandoc_target,
        '-o', str(output_path),
        str(filepath)
    ]

    # For debug purposes, log the command
    logger.debug("Running pandoc command: %s", ' '.join(cmd))

    try:
        subprocess.run(cmd, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers FileNotFoundError, which is what subprocess raises
        # when the pandoc executable is not installed.
        raise RuntimeError(f"Failed to convert using pandoc: {str(e)}. Please install pandoc.") from e