openconvert 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openconvert/__init__.py +7 -0
- openconvert/cli.py +145 -0
- openconvert/converter.py +152 -0
- openconvert/converters/__init__.py +3 -0
- openconvert/converters/archive_converter.py +277 -0
- openconvert/converters/audio_converter.py +223 -0
- openconvert/converters/code_converter.py +412 -0
- openconvert/converters/document_converter.py +596 -0
- openconvert/converters/image_converter.py +214 -0
- openconvert/converters/model_converter.py +208 -0
- openconvert/converters/video_converter.py +259 -0
- openconvert/launcher.py +0 -0
- openconvert-0.1.0.dist-info/METADATA +232 -0
- openconvert-0.1.0.dist-info/RECORD +17 -0
- openconvert-0.1.0.dist-info/WHEEL +5 -0
- openconvert-0.1.0.dist-info/entry_points.txt +2 -0
- openconvert-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,596 @@
|
|
1
|
+
"""
|
2
|
+
Document converter module for handling document format conversions.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
import os
|
7
|
+
import tempfile
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import Union, Optional, Dict, Any
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)

# Conversion matrix: each key is a source format and its value lists the
# target formats that convert() accepts for that source.
SUPPORTED_CONVERSIONS = {
    "txt": ["pdf", "docx", "rtf", "md", "html", "csv"],
    "docx": ["pdf", "txt", "rtf", "html", "md", "epub"],
    "pdf": ["docx", "txt", "jpg", "png", "epub", "html"],
    "html": ["pdf", "docx", "txt", "md"],
    "md": ["pdf", "docx", "html", "txt"],
    "rtf": ["docx", "pdf", "txt"],
    "csv": ["xlsx", "json", "txt", "xml", "sql"],
    "xlsx": ["csv", "json", "xml", "sql"],
    "epub": ["pdf", "docx", "txt", "html"],
}
|
25
|
+
|
26
|
+
def convert(
    filepath: Union[str, Path],
    source_format: str,
    target_format: str,
    output_path: Union[str, Path],
    options: Optional[Dict[str, Any]] = None
) -> str:
    """
    Convert a document from one format to another.

    Args:
        filepath: Path to the source document file
        source_format: Source document format (a key of SUPPORTED_CONVERSIONS)
        target_format: Target document format
        output_path: Path to save the converted document
        options: Additional conversion options, forwarded unchanged to the
            format-specific converter

    Returns:
        The path string returned by the format-specific converter. This is
        normally ``output_path``; the PDF-to-image converter returns the
        containing directory when it writes one file per page.

    Raises:
        ValueError: If the source/target pair is not in SUPPORTED_CONVERSIONS
        RuntimeError: If the conversion fails; the underlying exception is
            attached as ``__cause__``
    """
    if options is None:
        options = {}

    # Reject unsupported pairs up front so callers get a ValueError rather
    # than a wrapped RuntimeError.
    if target_format not in SUPPORTED_CONVERSIONS.get(source_format, []):
        raise ValueError(f"Conversion from {source_format} to {target_format} is not supported")

    filepath = Path(filepath)
    output_path = Path(output_path)

    # One private converter per source format.
    handlers = {
        'txt': _convert_from_txt,
        'docx': _convert_from_docx,
        'pdf': _convert_from_pdf,
        'html': _convert_from_html,
        'md': _convert_from_md,
        'rtf': _convert_from_rtf,
        'csv': _convert_from_csv,
        'xlsx': _convert_from_xlsx,
        'epub': _convert_from_epub,
    }

    try:
        handler = handlers.get(source_format)
        if handler is None:
            raise ValueError(f"Unsupported source format: {source_format}")
        return handler(filepath, target_format, output_path, options)

    except Exception as e:
        logger.error("Error converting %s to %s: %s", filepath, target_format, e)
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to convert {filepath} to {target_format}: {str(e)}") from e
|
86
|
+
|
87
|
+
# Header of the RTF document produced by the direct TXT -> RTF conversion.
_RTF_HEADER = (
    r"{\rtf1\ansi\ansicpg1252\cocoartf2580\cocoasubrtf220"
    r"{\fonttbl\f0\fswiss\fcharset0 Helvetica;}"
    r"{\colortbl;\red255\green255\blue255;}"
    r"{\*\expandedcolortbl;;}"
    r"\margl1440\margr1440\vieww11520\viewh8400\viewkind0"
    r"\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0"
    r"\f0\fs24 \cf0 "
)

# Minimal header used by the last-resort RTF writer.
_RTF_FALLBACK_HEADER = r'{\rtf1\ansi\deff0{\fonttbl{\f0 Times New Roman;}}{\colortbl;\red0\green0\blue0;}\f0\fs24\cf1 '


def _rtf_escape(text: str) -> str:
    """Return *text* escaped for use as RTF body text.

    Backslashes and braces are escaped, each newline becomes a ``\\par``
    paragraph break, and every non-ASCII character is emitted as ``\\uN?``
    escapes (one per UTF-16 code unit, as a signed 16-bit number) so the
    output is pure ASCII regardless of the declared code page.
    """
    pieces = []
    for ch in text:
        if ch in '\\{}':
            pieces.append('\\' + ch)
        elif ch == '\n':
            pieces.append('\\par\n')
        elif ord(ch) < 128:
            pieces.append(ch)
        else:
            units = ch.encode('utf-16-le')
            for i in range(0, len(units), 2):
                code = int.from_bytes(units[i:i + 2], 'little', signed=True)
                pieces.append(f'\\u{code}?')
    return ''.join(pieces)


def _convert_from_txt(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from TXT to other formats.

    Args:
        filepath: UTF-8 encoded text file to read
        target_format: One of 'pdf', 'docx', 'rtf', 'html', 'md', 'csv';
            any other value writes nothing
        output_path: Destination file
        options: Accepted for signature parity with the other converters;
            not used here

    Returns:
        ``str(output_path)``
    """
    if target_format == 'pdf':
        try:
            from xml.sax.saxutils import escape

            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
            from reportlab.lib.styles import getSampleStyleSheet

            # Read the text file
            with open(filepath, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Create PDF
            doc = SimpleDocTemplate(str(output_path), pagesize=letter)
            styles = getSampleStyleSheet()
            story = []

            # Blank lines separate paragraphs; single newlines become <br/>.
            for para in text_content.split('\n\n'):
                if para.strip():
                    # Paragraph parses its text as markup, so plain-text
                    # '&', '<' and '>' must be escaped before <br/> is added.
                    markup = escape(para).replace('\n', '<br/>')
                    story.append(Paragraph(markup, styles['Normal']))
                    story.append(Spacer(1, 12))

            doc.build(story)

        except ImportError:
            # Alternative method using pandoc
            _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format == 'docx':
        try:
            from docx import Document

            # Read the text file
            with open(filepath, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Create DOCX, one paragraph per blank-line-separated chunk
            doc = Document()
            for para in text_content.split('\n\n'):
                if para.strip():
                    doc.add_paragraph(para)

            doc.save(str(output_path))

        except ImportError:
            # Alternative method using pandoc
            _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format == 'rtf':
        # Direct conversion to RTF without pandoc
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Escaping and \par insertion happen in one pass, so the
            # backslash of an inserted \par is never re-escaped.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(_RTF_HEADER + _rtf_escape(text_content) + "}")

        except Exception as e:
            logger.warning("Direct RTF conversion failed: %s. Trying with pandoc...", e)
            # Fallback to pandoc
            try:
                _convert_using_pandoc(filepath, target_format, output_path)
            except Exception:
                # Last resort: a very simple RTF. Undecodable bytes are
                # replaced so this path can succeed where the strict read
                # above failed.
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(_RTF_FALLBACK_HEADER)
                    with open(filepath, 'r', encoding='utf-8', errors='replace') as in_f:
                        for line in in_f:
                            f.write(_rtf_escape(line))
                    f.write('}')

    elif target_format in ['html', 'md']:
        # Use pandoc for these conversions
        _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format == 'csv':
        import csv

        # Simple conversion - each line becomes a row with a single column
        with open(filepath, 'r', encoding='utf-8') as f:
            text_content = f.read()

        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for line in text_content.split('\n'):
                writer.writerow([line])

    return str(output_path)
|
201
|
+
|
202
|
+
def _convert_from_docx(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert a DOCX file to another document format.

    'txt' is produced with python-docx when it can be imported, otherwise
    with pandoc. 'pdf', 'html', 'md', 'rtf' and 'epub' always go through
    pandoc. Any other target writes nothing.

    Returns:
        ``str(output_path)``
    """
    result = str(output_path)

    if target_format == 'txt':
        try:
            from docx import Document

            document = Document(filepath)
            with open(output_path, 'w', encoding='utf-8') as out:
                for paragraph in document.paragraphs:
                    content = paragraph.text
                    out.write(content + '\n')
                    # An empty paragraph gets a second newline so the
                    # paragraph break stays visible in the text output.
                    if not content.strip():
                        out.write('\n')
        except ImportError:
            # python-docx is unavailable: let pandoc do the work
            _convert_using_pandoc(filepath, target_format, output_path)
    elif target_format in ('pdf', 'html', 'md', 'rtf', 'epub'):
        _convert_using_pandoc(filepath, target_format, output_path)

    return result
|
229
|
+
|
230
|
+
def _convert_from_pdf(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from PDF to other formats.

    Args:
        filepath: PDF file to read
        target_format: 'txt', 'docx', 'html', 'epub', 'jpg' or 'png';
            any other value writes nothing
        output_path: Destination file
        options: Accepted for signature parity; not used here

    Returns:
        ``str(output_path)``, except for a multi-page PDF converted to an
        image format: one file per page is written next to ``output_path``
        and the containing directory is returned.

    Raises:
        RuntimeError: If an image target is requested and pdf2image cannot
            be imported
    """
    if target_format == 'txt':
        try:
            import PyPDF2

            with open(filepath, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # Guard against extract_text() yielding None for a page;
                # every page is followed by a blank line.
                text = ''.join(
                    (page.extract_text() or '') + '\n\n'
                    for page in pdf_reader.pages
                )

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)

        except ImportError:
            # Alternative method using pdfminer
            try:
                from pdfminer.high_level import extract_text

                text = extract_text(str(filepath))
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)

            except ImportError:
                # Last resort: try pandoc
                _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format in ['docx', 'html', 'epub']:
        # NOTE(review): pandoc does not appear to offer a PDF reader, so
        # this branch presumably fails at runtime - confirm.
        _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format in ['jpg', 'png']:
        # Convert PDF to image
        try:
            from pdf2image import convert_from_path

            images = convert_from_path(filepath)

            # If there's only one page, save it directly
            if len(images) == 1:
                images[0].save(str(output_path))
            else:
                # Multiple pages: <stem>_page_<n><suffix>, numbered from 1
                base_path = output_path.with_suffix('')
                for page_number, image in enumerate(images, start=1):
                    page_path = f"{base_path}_page_{page_number}{output_path.suffix}"
                    image.save(page_path)

                # Return the directory containing all pages
                return str(base_path.parent)

        except ImportError as exc:
            raise RuntimeError("pdf2image library is required for PDF to image conversion. Please install it.") from exc

    return str(output_path)
|
291
|
+
|
292
|
+
def _convert_from_html(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from HTML to other formats.

    Args:
        filepath: UTF-8 encoded HTML file to read
        target_format: 'txt', 'pdf', 'docx' or 'md'; any other value
            writes nothing
        output_path: Destination file
        options: For 'pdf', every entry except 'quality' and 'resize' is
            forwarded to pdfkit as an option

    Returns:
        ``str(output_path)``

    Raises:
        RuntimeError: If pdfkit, weasyprint and pandoc all fail to produce
            the PDF; the pandoc failure is attached as ``__cause__``
    """
    if target_format == 'txt':
        try:
            from bs4 import BeautifulSoup

            with open(filepath, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
                text = soup.get_text(separator='\n\n')

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)

        except ImportError:
            # Alternative method using pandoc
            _convert_using_pandoc(filepath, target_format, output_path)

    elif target_format == 'pdf':
        # Try each backend in turn until one succeeds.
        success = False

        # Method 1: Try using pdfkit (wkhtmltopdf)
        try:
            import pdfkit

            options_dict = {
                'encoding': 'UTF-8',
                'quiet': ''
            }

            # Add any user-provided options, skipping the general ones
            if options:
                for key, value in options.items():
                    if key not in ('quality', 'resize'):
                        options_dict[key] = value

            pdfkit.from_file(str(filepath), str(output_path), options=options_dict)
            success = True
            logger.info("Converted HTML to PDF using pdfkit")

        except Exception as e:  # best-effort: includes ImportError
            logger.warning("pdfkit conversion failed: %s. Trying next method...", e)

        # Method 2: Try using weasyprint
        if not success:
            try:
                from weasyprint import HTML

                HTML(filename=str(filepath)).write_pdf(str(output_path))
                success = True
                logger.info("Converted HTML to PDF using weasyprint")

            except Exception as e:  # best-effort: includes ImportError
                logger.warning("weasyprint conversion failed: %s. Trying next method...", e)

        # Method 3: Try using pandoc as a last resort
        if not success:
            try:
                _convert_using_pandoc(filepath, target_format, output_path)
                logger.info("Converted HTML to PDF using pandoc")
                success = True
            except Exception as e:
                logger.error("All HTML to PDF conversion methods failed: %s", e)
                raise RuntimeError("Failed to convert HTML to PDF. Please install pdfkit, weasyprint, or pandoc with LaTeX.") from e

    elif target_format in ('docx', 'md'):
        # Use pandoc for these conversions
        _convert_using_pandoc(filepath, target_format, output_path)

    return str(output_path)
|
368
|
+
|
369
|
+
def _convert_from_md(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from Markdown to other formats.

    'txt' is produced by a small regex-based stripper; 'pdf', 'docx' and
    'html' go through pandoc. Any other target writes nothing.

    Args:
        filepath: UTF-8 encoded Markdown file to read
        target_format: Target document format
        output_path: Destination file
        options: Accepted for signature parity; not used here

    Returns:
        ``str(output_path)``
    """
    if target_format == 'txt':
        import re

        with open(filepath, 'r', encoding='utf-8') as f:
            md_content = f.read()

        # Drop fenced code blocks first. If emphasis were stripped first,
        # a '*' inside a fence could pair with a '*' outside it and eat
        # surrounding text.
        text = re.sub(r'```.*?```', '', md_content, flags=re.DOTALL)

        # Remove header markers at the start of a line
        text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

        # Remove bold before italic so '**' is not read as two '*'
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
        text = re.sub(r'\*(.*?)\*', r'\1', text)

        # Replace links with their link text
        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

    elif target_format in ['pdf', 'docx', 'html']:
        # Use pandoc for these conversions
        _convert_using_pandoc(filepath, target_format, output_path)

    return str(output_path)
|
405
|
+
|
406
|
+
def _convert_from_rtf(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert an RTF file to ``target_format`` by delegating to pandoc.

    ``options`` is accepted for signature parity with the other converters
    and is not used.

    Returns:
        ``str(output_path)``
    """
    _convert_using_pandoc(filepath, target_format, output_path)
    converted = str(output_path)
    return converted
|
416
|
+
|
417
|
+
def _convert_from_csv(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from CSV to other formats.

    Args:
        filepath: CSV file to read
        target_format: 'xlsx', 'json', 'txt', 'xml' or 'sql'; any other
            value writes nothing
        output_path: Destination file. For 'sql' this is opened as an
            SQLite database file, not written as a SQL script.
        options: 'table_name' selects the table used for 'sql'
            (defaults to the source file's stem)

    Returns:
        ``str(output_path)``

    Raises:
        RuntimeError: If a library required for the requested target
            cannot be imported; the ImportError is attached as ``__cause__``
    """
    if target_format == 'xlsx':
        try:
            import pandas as pd

            df = pd.read_csv(filepath)
            df.to_excel(output_path, index=False)

        except ImportError as exc:
            raise RuntimeError("pandas library is required for CSV to XLSX conversion. Please install it.") from exc

    elif target_format == 'json':
        try:
            import pandas as pd

            df = pd.read_csv(filepath)
            df.to_json(output_path, orient='records', indent=4)

        except ImportError:
            # Alternative method using csv and json modules
            import csv
            import json

            with open(filepath, 'r', encoding='utf-8', newline='') as f:
                data = list(csv.DictReader(f))

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)

    elif target_format == 'txt':
        # Simple conversion - just copy the CSV content
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

    elif target_format == 'xml':
        try:
            import pandas as pd
            import dicttoxml

            df = pd.read_csv(filepath)
            data = df.to_dict(orient='records')
            xml = dicttoxml.dicttoxml(data, custom_root='data', attr_type=False)

            # The serialized document is written in binary mode.
            with open(output_path, 'wb') as f:
                f.write(xml)

        except ImportError as exc:
            raise RuntimeError("pandas and dicttoxml libraries are required for CSV to XML conversion. Please install them.") from exc

    elif target_format == 'sql':
        try:
            import pandas as pd
            import sqlite3

            df = pd.read_csv(filepath)

            # Get table name from options or fall back to the file name
            table_name = options.get('table_name', filepath.stem)

            # Write into an SQLite database at output_path; always close
            # the connection, even when to_sql raises.
            conn = sqlite3.connect(str(output_path))
            try:
                df.to_sql(table_name, conn, index=False)
            finally:
                conn.close()

        except ImportError as exc:
            raise RuntimeError("pandas library is required for CSV to SQL conversion. Please install it.") from exc

    return str(output_path)
|
497
|
+
|
498
|
+
def _convert_from_xlsx(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert from XLSX to other formats.

    Args:
        filepath: Excel workbook to read
        target_format: 'csv', 'json', 'xml' or 'sql'; any other value
            reads the sheet but writes nothing
        output_path: Destination file. For 'sql' this is opened as an
            SQLite database file, not written as a SQL script.
        options: 'sheet_name' selects the sheet (default 0);
            'table_name' selects the table used for 'sql'
            (defaults to the source file's stem)

    Returns:
        ``str(output_path)``

    Raises:
        RuntimeError: If pandas (or something it imports while reading the
            workbook) or dicttoxml cannot be imported; the ImportError is
            attached as ``__cause__``
    """
    try:
        import pandas as pd

        # Read the Excel file
        df = pd.read_excel(filepath, sheet_name=options.get('sheet_name', 0))

        if target_format == 'csv':
            df.to_csv(output_path, index=False)

        elif target_format == 'json':
            df.to_json(output_path, orient='records', indent=4)

        elif target_format == 'xml':
            # Report a missing dicttoxml by name instead of blaming pandas.
            try:
                import dicttoxml
            except ImportError as exc:
                raise RuntimeError("pandas and dicttoxml libraries are required for XLSX to XML conversion. Please install them.") from exc

            data = df.to_dict(orient='records')
            xml = dicttoxml.dicttoxml(data, custom_root='data', attr_type=False)

            with open(output_path, 'wb') as f:
                f.write(xml)

        elif target_format == 'sql':
            import sqlite3

            # Get table name from options or fall back to the file name
            table_name = options.get('table_name', filepath.stem)

            # Write into an SQLite database at output_path; always close
            # the connection, even when to_sql raises.
            conn = sqlite3.connect(str(output_path))
            try:
                df.to_sql(table_name, conn, index=False)
            finally:
                conn.close()

    except ImportError as exc:
        raise RuntimeError("pandas library is required for XLSX conversions. Please install it.") from exc

    return str(output_path)
|
543
|
+
|
544
|
+
def _convert_from_epub(
    filepath: Path,
    target_format: str,
    output_path: Path,
    options: Dict[str, Any]
) -> str:
    """Convert an EPUB file to ``target_format`` by delegating to pandoc.

    ``options`` is accepted for signature parity with the other converters
    and is not used.

    Returns:
        ``str(output_path)``
    """
    _convert_using_pandoc(filepath, target_format, output_path)
    converted = str(output_path)
    return converted
|
554
|
+
|
555
|
+
def _convert_using_pandoc(
    filepath: Path,
    target_format: str,
    output_path: Path
) -> None:
    """Use pandoc for document conversion.

    The input format is derived from ``filepath``'s extension, and the
    ``pandoc`` executable is run as a subprocess.

    Args:
        filepath: Source document
        target_format: This module's name for the target format
        output_path: Destination file

    Raises:
        RuntimeError: If pandoc cannot be started (for example, it is not
            installed) or exits with a non-zero status; the underlying
            exception is attached as ``__cause__``
    """
    import subprocess

    # Map our format names to pandoc reader names
    reader_map = {
        'txt': 'markdown',  # Use markdown as the input format for .txt files
        'md': 'markdown',
        'html': 'html',
        'docx': 'docx',
        'pdf': 'pdf',
        'rtf': 'rtf',
        'epub': 'epub'
    }
    # Writers differ from readers for plain text: a 'txt' target must use
    # the 'plain' writer, otherwise the output is Markdown.
    writer_map = dict(reader_map, txt='plain')

    source_ext = filepath.suffix.lower().lstrip('.')
    if source_ext == 'jpeg':
        source_ext = 'jpg'

    pandoc_source = reader_map.get(source_ext, source_ext)
    pandoc_target = writer_map.get(target_format, target_format)

    cmd = ['pandoc', '-f', pandoc_source]
    # 'pdf' is not a pandoc writer name; PDF output is selected by the
    # .pdf extension of the -o file, so -t is omitted for that target.
    if target_format != 'pdf':
        cmd += ['-t', pandoc_target]
    cmd += ['-o', str(output_path), str(filepath)]

    # For debug purposes, log the command
    logger.debug("Running pandoc command: %s", ' '.join(cmd))

    try:
        subprocess.run(cmd, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing executable (FileNotFoundError);
        # SubprocessError covers a non-zero exit status.
        raise RuntimeError(f"Failed to convert using pandoc: {str(e)}. Please install pandoc.") from e
|