mdify-cli 3.0.7__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/PKG-INFO +1 -1
  2. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/__init__.py +1 -1
  3. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/cli.py +76 -15
  4. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/PKG-INFO +1 -1
  5. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/pyproject.toml +1 -1
  6. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/LICENSE +0 -0
  7. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/README.md +0 -0
  8. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/assets/mdify.png +0 -0
  9. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/__main__.py +0 -0
  10. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/container.py +0 -0
  11. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/docling_client.py +0 -0
  12. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/formatting.py +0 -0
  13. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/ssh/__init__.py +0 -0
  14. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/ssh/client.py +0 -0
  15. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/ssh/models.py +0 -0
  16. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/ssh/remote_container.py +0 -0
  17. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify/ssh/transfer.py +0 -0
  18. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/SOURCES.txt +0 -0
  19. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/dependency_links.txt +0 -0
  20. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/entry_points.txt +0 -0
  21. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/requires.txt +0 -0
  22. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/mdify_cli.egg-info/top_level.txt +0 -0
  23. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/setup.cfg +0 -0
  24. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/tests/test_cli.py +0 -0
  25. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/tests/test_container.py +0 -0
  26. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/tests/test_docling_client.py +0 -0
  27. {mdify_cli-3.0.7 → mdify_cli-3.1.0}/tests/test_ssh_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 3.0.7
3
+ Version: 3.1.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "3.0.7"
3
+ __version__ = "3.1.0"
@@ -8,12 +8,14 @@ is lightweight and has no ML dependencies.
8
8
  """
9
9
 
10
10
  import argparse
11
+ import asyncio
11
12
  import json
12
13
  import os
13
14
  import platform
14
15
  import shutil
15
16
  import subprocess
16
17
  import sys
18
+ import tempfile
17
19
  import threading
18
20
  import time
19
21
  from pathlib import Path
@@ -883,7 +885,7 @@ Examples:
883
885
  "--timeout",
884
886
  type=int,
885
887
  default=None,
886
- help="Conversion timeout in seconds (default: 1200, can be set via MDIFY_TIMEOUT env var)",
888
+ help="Conversion timeout in seconds (default: 1200s for local, 3600s for remote with large PDFs, can be set via MDIFY_TIMEOUT env var)",
887
889
  )
888
890
 
889
891
  parser.add_argument(
@@ -1057,6 +1059,10 @@ def main_async_remote(args) -> int:
1057
1059
  # Resolve timeout value: CLI > env > default 1200
1058
1060
  timeout = args.timeout or int(os.environ.get("MDIFY_TIMEOUT", 1200))
1059
1061
 
1062
+ # For remote operations, extend timeout significantly for large PDF processing
1063
+ # Remote conversions include network latency, file upload/download, and OCR processing
1064
+ remote_conversion_timeout = max(timeout, 3600) # At least 1 hour for remote conversion
1065
+
1060
1066
  # Build SSH config from CLI arguments and SSH config files
1061
1067
  try:
1062
1068
  # Build config with proper precedence (lowest to highest):
@@ -1178,7 +1184,8 @@ def main_async_remote(args) -> int:
1178
1184
  return 1
1179
1185
 
1180
1186
  if not args.quiet:
1181
- print(color.cyan(f"\nFound {len(files_to_convert)} file(s) to convert"), file=sys.stderr)
1187
+ print(color.cyan(f"Found {len(files_to_convert)} file(s) to convert"), file=sys.stderr)
1188
+ print(color.cyan(f"Conversion timeout: {remote_conversion_timeout}s (for large PDFs with OCR)"), file=sys.stderr)
1182
1189
 
1183
1190
  # Import remote container and transfer manager
1184
1191
  from mdify.ssh.transfer import FileTransferManager
@@ -1242,7 +1249,8 @@ def main_async_remote(args) -> int:
1242
1249
  if isinstance(exc, SSHConnectionError):
1243
1250
  return True
1244
1251
  msg = str(exc).lower()
1245
- return "broken pipe" in msg or "connection closed" in msg
1252
+ # Errno 32 = Broken pipe, Errno 54 = Connection reset by peer
1253
+ return any(x in msg for x in ["broken pipe", "connection closed", "connection reset", "errno 32", "errno 54", "ssh connection"])
1246
1254
 
1247
1255
  try:
1248
1256
  for idx, input_file in enumerate(files_to_convert, 1):
@@ -1309,8 +1317,13 @@ def main_async_remote(args) -> int:
1309
1317
  remote_output_path = f"{work_dir}/{input_file.stem}.md"
1310
1318
 
1311
1319
  # Build conversion command on remote - use -F for multipart form data
1320
+ # Important: use generous timeouts since large PDFs with OCR take time
1321
+ # --connect-timeout: max time to establish connection (60s)
1322
+ # --max-time: max total operation time (extended timeout)
1312
1323
  convert_cmd = (
1313
1324
  f"curl -X POST "
1325
+ f"--connect-timeout 60 "
1326
+ f"--max-time {remote_conversion_timeout} "
1314
1327
  f"-F 'files=@{remote_file_path}' "
1315
1328
  f"-F 'to_formats=md' "
1316
1329
  f"-F 'do_ocr=true' "
@@ -1326,27 +1339,50 @@ def main_async_remote(args) -> int:
1326
1339
  while conversion_attempt < 3 and not conversion_success:
1327
1340
  try:
1328
1341
  if conversion_attempt > 0 and not args.quiet:
1329
- print(f" ↻ Conversion retry {conversion_attempt}...", file=sys.stderr)
1342
+ # Exponential backoff: 2s, 4s, 8s
1343
+ backoff_delay = 2 ** conversion_attempt
1344
+ print(f" ↻ Conversion retry {conversion_attempt} (waiting {backoff_delay}s for server recovery)...", file=sys.stderr)
1345
+ await asyncio.sleep(backoff_delay)
1330
1346
 
1331
- conversion_output, _, conv_code = await ssh_client.run_command(convert_cmd, timeout=timeout)
1347
+ conversion_output, _, conv_code = await ssh_client.run_command(convert_cmd, timeout=remote_conversion_timeout)
1332
1348
 
1333
1349
  if conv_code == 0:
1334
1350
  conversion_success = True
1335
1351
  break
1336
1352
  else:
1337
- conversion_attempt += 1
1353
+ # Non-zero exit code - fail without retry for non-connection errors
1354
+ break
1338
1355
  except Exception as conv_exc:
1339
- if is_connection_error(conv_exc) and conversion_attempt < 2:
1356
+ is_conn_err = is_connection_error(conv_exc)
1357
+ if is_conn_err and conversion_attempt < 2:
1340
1358
  conversion_attempt += 1
1341
1359
  if not args.quiet:
1342
- print(f" ↻ Connection lost during conversion. Reconnecting (attempt {conversion_attempt})...", file=sys.stderr)
1360
+ # Exponential backoff: 5s, 10s
1361
+ backoff_delay = 5 * conversion_attempt
1362
+ print(f" ↻ Connection reset during conversion. Reconnecting in {backoff_delay}s...", file=sys.stderr)
1363
+
1364
+ await asyncio.sleep(backoff_delay)
1365
+
1343
1366
  try:
1344
1367
  await ssh_client.disconnect()
1345
1368
  except Exception:
1346
1369
  pass
1347
- await ssh_client.connect()
1370
+
1371
+ # Reconnect with retry
1372
+ try:
1373
+ await ssh_client.connect()
1374
+ except Exception:
1375
+ if not args.quiet:
1376
+ print(f" ⚠ Reconnection failed: retrying...", file=sys.stderr)
1377
+ continue
1348
1378
  else:
1349
- conversion_attempt += 1
1379
+ # Either not a connection error, or we've exhausted retries
1380
+ if not args.quiet:
1381
+ print(f" [DEBUG] Breaking loop: not conn_err or exhausted retries", file=sys.stderr)
1382
+ if conversion_attempt >= 2 and is_conn_err:
1383
+ if not args.quiet:
1384
+ print(f" ↻ Connection error on final retry attempt", file=sys.stderr)
1385
+ break
1350
1386
 
1351
1387
  if not conversion_success:
1352
1388
  print(f" ✗ Failed: Conversion failed after {conversion_attempt} attempt(s)", file=sys.stderr)
@@ -1386,12 +1422,37 @@ def main_async_remote(args) -> int:
1386
1422
  # Ultimate fallback
1387
1423
  markdown_content = conversion_output
1388
1424
 
1389
- # Write markdown content to remote file
1390
- write_cmd = f"cat > {remote_output_path} << 'MDIFY_EOF'\n{markdown_content}\nMDIFY_EOF"
1391
- _, _, write_code = await ssh_client.run_command(write_cmd, timeout=30)
1425
+ # Write markdown content to local temp file first, then upload via SFTP
1426
+ # (Piping large content through SSH here-documents can crash the connection)
1427
+ content_size_kb = len(markdown_content) / 1024
1428
+ if not args.quiet:
1429
+ print(f" {color.cyan('Writing')} {content_size_kb:.1f}KB markdown via SFTP...", file=sys.stderr)
1392
1430
 
1393
- if write_code != 0:
1394
- print(f" ✗ Failed to write markdown output", file=sys.stderr)
1431
+ try:
1432
+ # Write to temporary local file
1433
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file:
1434
+ temp_file.write(markdown_content)
1435
+ temp_path = temp_file.name
1436
+
1437
+ # Upload via SFTP (more reliable for large files)
1438
+ await transfer_manager.upload_file(
1439
+ local_path=temp_path,
1440
+ remote_path=remote_output_path,
1441
+ overwrite=True,
1442
+ compress=False,
1443
+ )
1444
+
1445
+ # Cleanup temp file
1446
+ try:
1447
+ os.unlink(temp_path)
1448
+ except Exception:
1449
+ pass
1450
+
1451
+ if not args.quiet:
1452
+ print(f" {color.green('✓')} Markdown written", file=sys.stderr)
1453
+ except Exception as write_exc:
1454
+ if not args.quiet:
1455
+ print(f" ✗ Failed to write markdown: {write_exc}", file=sys.stderr)
1395
1456
  failed += 1
1396
1457
  break
1397
1458
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 3.0.7
3
+ Version: 3.1.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mdify-cli"
3
- version = "3.0.7"
3
+ version = "3.1.0"
4
4
  description = "Convert PDFs and document images into structured Markdown for LLM workflows"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes