PyPI - markitai - Versions diffs - 0.3.0__tar.gz → 0.4.1__tar.gz - Mend

markitai 0.3.0.tar.gz → 0.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170) hide show

{markitai-0.3.0 → markitai-0.4.1}/.gitignore RENAMED Viewed

@@ -38,6 +38,7 @@ ENV/
 # Testing
 .pytest_cache/
 .coverage
+coverage.xml
 htmlcov/
 .tox/
 .nox/
@@ -46,6 +47,9 @@ htmlcov/
 .mypy_cache/
 .pytype/
+# Linting
+.ruff_cache/
 # Markitai output
 output/
 output-*/
@@ -55,6 +59,7 @@ markitai.json
 # Logs
 logs/
+logs_*/
 *.log
 # Environment variables (API keys)
@@ -66,13 +71,8 @@ logs/
 .DS_Store
 Thumbs.db
-# SQLite cache (including WAL mode files)
-cache.db
-cache.db-wal
-cache.db-shm
-*.db-wal
-*.db-shm
-fetch_cache.db
+# Markitai cache directory
+.markitai/
 # VitePress (website)
 website/node_modules/

markitai-0.4.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,196 @@
+Metadata-Version: 2.4
+Name: markitai
+Version: 0.4.1
+Summary: Opinionated Markdown converter with native LLM enhancement support
+Project-URL: Homepage, https://markitai.ynewtime.com
+Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
+Project-URL: Repository, https://github.com/Ynewtime/markitai
+Project-URL: Changelog, https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md
+Author-email: Ynewtime <longqiliuye@gmail.com>
+License-Expression: MIT
+Keywords: converter,docx,llm,markdown,ocr,pdf
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Utilities
+Requires-Python: <3.14,>=3.11
+Requires-Dist: aiofiles>=25.1.0
+Requires-Dist: click>=8.1.0
+Requires-Dist: instructor>=1.14.0
+Requires-Dist: litellm>=1.80.16
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: markitdown[all]>=0.1.4
+Requires-Dist: opencv-python>=4.8.0
+Requires-Dist: pillow>=12.1.0
+Requires-Dist: pydantic>=2.10.0
+Requires-Dist: pymupdf4llm>=0.2.9
+Requires-Dist: python-dotenv>=1.2.1
+Requires-Dist: pywin32>=310; sys_platform == 'win32'
+Requires-Dist: rapidocr>=3.5.0
+Requires-Dist: rich>=14.2.0
+Provides-Extra: all
+Requires-Dist: claude-agent-sdk>=0.1.0; extra == 'all'
+Requires-Dist: github-copilot-sdk>=0.1.0; extra == 'all'
+Requires-Dist: playwright>=1.50.0; extra == 'all'
+Provides-Extra: browser
+Requires-Dist: playwright>=1.50.0; extra == 'browser'
+Provides-Extra: claude-agent
+Requires-Dist: claude-agent-sdk>=0.1.0; extra == 'claude-agent'
+Provides-Extra: copilot
+Requires-Dist: github-copilot-sdk>=0.1.0; extra == 'copilot'
+Description-Content-Type: text/markdown
+# Markitai
+English | [简体中文](./README_ZH.md)
+Opinionated Markdown converter with native LLM enhancement support.
+## Features
+- **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
+- **LLM Enhancement** - Format cleaning, metadata generation, image analysis
+- **Batch Processing** - Concurrent conversion, resume capability, progress display
+- **OCR Recognition** - Text extraction from scanned PDFs and images
+- **URL Conversion** - Direct webpage conversion with SPA browser rendering support
+- **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
+## Installation
+### One-Click Setup (Recommended)
+```bash
+# Linux/macOS
+curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
+# Windows (PowerShell)
+irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
+```
+### Manual Installation
+```bash
+# Requires Python 3.11-3.13 (3.14 not yet supported)
+uv tool install markitai
+# Or using uv pip (for virtual environment)
+uv pip install markitai
+```
+## Quick Start
+```bash
+# Basic conversion
+markitai document.docx
+# URL conversion
+markitai https://example.com/article
+# LLM enhancement
+markitai document.docx --llm
+# Using presets
+markitai document.pdf --preset rich      # LLM + alt + desc + screenshot
+markitai document.pdf --preset standard  # LLM + alt + desc
+markitai document.pdf --preset minimal   # Basic conversion only
+# Batch processing
+markitai ./docs -o ./output
+# Resume interrupted job
+markitai ./docs -o ./output --resume
+# Batch URL processing (auto-detect .urls files)
+markitai urls.urls -o ./output
+```
+## Output Structure
+```
+output/
+├── document.docx.md        # Basic Markdown
+├── document.docx.llm.md    # LLM-enhanced version
+├── assets/
+│   ├── document.docx.0001.jpg
+│   └── images.json         # Image descriptions
+├── screenshots/            # Page screenshots (with --screenshot)
+│   └── example_com.full.jpg
+```
+## Configuration
+Priority: CLI arguments > Environment variables > Config file > Defaults
+```bash
+# View configuration
+markitai config list
+# Initialize config file
+markitai config init -o .
+# View cache status
+markitai cache stats
+# Clear cache
+markitai cache clear
+# Check system health and dependencies
+markitai doctor
+```
+Config file location: `./markitai.json` or `~/.markitai/config.json`
+### Local Providers (Subscription-based)
+Use your existing Claude Code or GitHub Copilot subscription:
+```bash
+# Claude Agent (requires Claude Code CLI)
+markitai document.pdf --llm  # Configure claude-agent/sonnet in config
+# GitHub Copilot (requires Copilot CLI)
+markitai document.pdf --llm  # Configure copilot/gpt-5.2 in config
+```
+Install CLI tools:
+```bash
+# Claude Code CLI
+curl -fsSL https://claude.ai/install.sh | bash
+# GitHub Copilot CLI
+curl -fsSL https://gh.io/copilot-install | bash
+```
+## Environment Variables
+| Variable | Description |
+|----------|-------------|
+| `OPENAI_API_KEY` | OpenAI API Key |
+| `GEMINI_API_KEY` | Google Gemini API Key |
+| `DEEPSEEK_API_KEY` | DeepSeek API Key |
+| `ANTHROPIC_API_KEY` | Anthropic API Key |
+| `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
+## Dependencies
+- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
+- [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
+- [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
+- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
+## Documentation
+- [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
+- [Configuration](https://markitai.ynewtime.com/guide/configuration)
+- [CLI Reference](https://markitai.ynewtime.com/guide/cli)
+## License
+MIT

markitai-0.4.1/README.md ADDED Viewed

@@ -0,0 +1,147 @@
+# Markitai
+English | [简体中文](./README_ZH.md)
+Opinionated Markdown converter with native LLM enhancement support.
+## Features
+- **Multi-format Support** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
+- **LLM Enhancement** - Format cleaning, metadata generation, image analysis
+- **Batch Processing** - Concurrent conversion, resume capability, progress display
+- **OCR Recognition** - Text extraction from scanned PDFs and images
+- **URL Conversion** - Direct webpage conversion with SPA browser rendering support
+- **Smart Caching** - LLM result caching, SPA domain learning, auto-proxy detection
+## Installation
+### One-Click Setup (Recommended)
+```bash
+# Linux/macOS
+curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
+# Windows (PowerShell)
+irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
+```
+### Manual Installation
+```bash
+# Requires Python 3.11-3.13 (3.14 not yet supported)
+uv tool install markitai
+# Or using uv pip (for virtual environment)
+uv pip install markitai
+```
+## Quick Start
+```bash
+# Basic conversion
+markitai document.docx
+# URL conversion
+markitai https://example.com/article
+# LLM enhancement
+markitai document.docx --llm
+# Using presets
+markitai document.pdf --preset rich      # LLM + alt + desc + screenshot
+markitai document.pdf --preset standard  # LLM + alt + desc
+markitai document.pdf --preset minimal   # Basic conversion only
+# Batch processing
+markitai ./docs -o ./output
+# Resume interrupted job
+markitai ./docs -o ./output --resume
+# Batch URL processing (auto-detect .urls files)
+markitai urls.urls -o ./output
+```
+## Output Structure
+```
+output/
+├── document.docx.md        # Basic Markdown
+├── document.docx.llm.md    # LLM-enhanced version
+├── assets/
+│   ├── document.docx.0001.jpg
+│   └── images.json         # Image descriptions
+├── screenshots/            # Page screenshots (with --screenshot)
+│   └── example_com.full.jpg
+```
+## Configuration
+Priority: CLI arguments > Environment variables > Config file > Defaults
+```bash
+# View configuration
+markitai config list
+# Initialize config file
+markitai config init -o .
+# View cache status
+markitai cache stats
+# Clear cache
+markitai cache clear
+# Check system health and dependencies
+markitai doctor
+```
+Config file location: `./markitai.json` or `~/.markitai/config.json`
+### Local Providers (Subscription-based)
+Use your existing Claude Code or GitHub Copilot subscription:
+```bash
+# Claude Agent (requires Claude Code CLI)
+markitai document.pdf --llm  # Configure claude-agent/sonnet in config
+# GitHub Copilot (requires Copilot CLI)
+markitai document.pdf --llm  # Configure copilot/gpt-5.2 in config
+```
+Install CLI tools:
+```bash
+# Claude Code CLI
+curl -fsSL https://claude.ai/install.sh | bash
+# GitHub Copilot CLI
+curl -fsSL https://gh.io/copilot-install | bash
+```
+## Environment Variables
+| Variable | Description |
+|----------|-------------|
+| `OPENAI_API_KEY` | OpenAI API Key |
+| `GEMINI_API_KEY` | Google Gemini API Key |
+| `DEEPSEEK_API_KEY` | DeepSeek API Key |
+| `ANTHROPIC_API_KEY` | Anthropic API Key |
+| `JINA_API_KEY` | Jina Reader API Key (URL conversion) |
+## Dependencies
+- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF conversion
+- [markitdown](https://github.com/microsoft/markitdown) - Office documents and URL conversion
+- [LiteLLM](https://github.com/BerriAI/litellm) - LLM gateway
+- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR recognition
+## Documentation
+- [Getting Started](https://markitai.ynewtime.com/guide/getting-started)
+- [Configuration](https://markitai.ynewtime.com/guide/configuration)
+- [CLI Reference](https://markitai.ynewtime.com/guide/cli)
+## License
+MIT

{markitai-0.3.0 → markitai-0.4.1}/pyproject.toml RENAMED Viewed

@@ -1,10 +1,10 @@
 [project]
 name = "markitai"
-version = "0.3.0"
-description = "Document to Markdown converter with LLM enhancement"
+version = "0.4.1"
+description = "Opinionated Markdown converter with native LLM enhancement support"
 license = "MIT"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.11,<3.14"
 authors = [
     { name = "Ynewtime", email = "longqiliuye@gmail.com" }
 ]
@@ -32,6 +32,7 @@ dependencies = [
     "loguru>=0.7.3",
     "rich>=14.2.0",
     "Pillow>=12.1.0",
+    "opencv-python>=4.8.0",
     "aiofiles>=25.1.0",
     "pydantic>=2.10.0",
     "python-dotenv>=1.2.1",
@@ -48,7 +49,10 @@ Changelog = "https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md"
 markitai = "markitai.cli:app"
 [project.optional-dependencies]
-all = []
+claude-agent = ["claude-agent-sdk>=0.1.0"]
+copilot = ["github-copilot-sdk>=0.1.0"]
+browser = ["playwright>=1.50.0"]
+all = ["claude-agent-sdk>=0.1.0", "github-copilot-sdk>=0.1.0", "playwright>=1.50.0"]
 [dependency-groups]
 dev = [
@@ -71,9 +75,13 @@ packages = ["src/markitai"]
 testpaths = ["tests"]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "network: marks tests that require network access (deselect with '-m \"not network\"')",
+]
 [tool.ruff]
-target-version = "py311"
+target-version = "py313"
 line-length = 88
 src = ["src", "tests"]
@@ -114,13 +122,15 @@ skip-magic-trailing-comma = false
 line-ending = "auto"
 [tool.pyright]
-pythonVersion = "3.11"
+pythonVersion = "3.13"
 typeCheckingMode = "basic"
 include = ["src"]
 exclude = ["tests", "**/__pycache__"]
 venvPath = "../.."
 venv = ".venv"
-reportMissingImports = true
+# Allow optional dependencies to be missing (claude-agent-sdk)
+# These are runtime-checked before import using importlib.util.find_spec
+reportMissingImports = "warning"
 reportMissingTypeStubs = false
 reportUnusedImport = true
 reportUnusedVariable = "warning"

markitai-0.4.1/src/markitai/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Markitai - Opinionated Markdown converter with native LLM enhancement support."""
+__version__ = "0.4.1"

{markitai-0.3.0 → markitai-0.4.1}/src/markitai/batch.py RENAMED Viewed

@@ -13,7 +13,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from loguru import logger
-from rich.console import Console, Group
+from rich.console import Group
 from rich.live import Live
 from rich.panel import Panel
 from rich.progress import (
@@ -28,9 +28,11 @@ from rich.progress import (
 from rich.table import Table
 from rich.text import Text
+from markitai.cli.console import get_console
 from markitai.constants import DEFAULT_LOG_PANEL_MAX_LINES
 from markitai.json_order import order_report, order_state
 from markitai.security import atomic_write_json
+from markitai.utils.text import format_error_message
 if TYPE_CHECKING:
     from markitai.config import BatchConfig
@@ -464,10 +466,15 @@ class BatchProcessor:
         self.state_file = self._get_state_file_path()
         self.report_file = self._get_report_file_path()
         self.state: BatchState | None = None
-        self.console = Console()
+        self.console = get_console()
         # Collect image analysis results for JSON aggregation
         self.image_analysis_results: list[ImageAnalysisResult] = []
+        # Optimization: Lock for state saving to prevent IO congestion
+        import threading
+        self._save_lock = threading.Lock()
         # Live display state (managed by start_live_display/stop_live_display)
         self._live: Live | None = None
         self._log_panel: LogPanel | None = None
@@ -515,7 +522,7 @@ class BatchProcessor:
             "options": key_options,
         }
         hash_str = json.dumps(hash_params, sort_keys=True)
-        return hashlib.md5(hash_str.encode()).hexdigest()[:6]
+        return hashlib.md5(hash_str.encode(), usedforsecurity=False).hexdigest()[:6]
     def _get_state_file_path(self) -> Path:
         """Generate state file path for resume capability.
@@ -543,11 +550,17 @@ class BatchProcessor:
             return base_path
         else:  # rename
             seq = 2
-            while True:
+            max_seq = 9999  # Safety limit to prevent infinite loop
+            while seq <= max_seq:
                 new_path = reports_dir / f"markitai.{self.task_hash}.v{seq}.report.json"
                 if not new_path.exists():
                     return new_path
                 seq += 1
+            # Fallback: use timestamp if too many versions exist
+            import time
+            ts = int(time.time())
+            return reports_dir / f"markitai.{self.task_hash}.{ts}.report.json"
     def start_live_display(
         self,
@@ -807,6 +820,7 @@ class BatchProcessor:
         Optimized with interval-based throttling:
         - Checks interval BEFORE serialization to avoid unnecessary work
         - Uses minimal serialization when possible
+        - Uses thread lock to prevent concurrent disk writes
         Args:
             force: Force save even if interval hasn't passed
@@ -816,27 +830,35 @@ class BatchProcessor:
             return
         now = datetime.now().astimezone()
-        interval = getattr(self.config, "state_flush_interval_seconds", 0) or 0
+        # Default to 5 seconds if not specified in config to prevent $O(N^2)$ IO
+        interval = getattr(self.config, "state_flush_interval_seconds", 5) or 5
         # Check interval BEFORE any serialization work (optimization)
-        if not force and interval > 0:
+        if not force:
             last_saved = getattr(self, "_last_state_save", None)
             if last_saved and (now - last_saved).total_seconds() < interval:
                 return  # Skip: interval not passed, no work done
-        self.state.updated_at = now.isoformat()
+        # Ensure only one thread is writing at a time
+        if not self._save_lock.acquire(blocking=force):
+            return  # Skip if another thread is already saving, unless forced
+        try:
+            self.state.updated_at = now.isoformat()
-        # Build minimal state document (only what's needed for resume)
-        state_data = self.state.to_minimal_dict()
+            # Build minimal state document (only what's needed for resume)
+            state_data = self.state.to_minimal_dict()
-        # Ensure states directory exists
-        self.state_file.parent.mkdir(parents=True, exist_ok=True)
+            # Ensure states directory exists
+            self.state_file.parent.mkdir(parents=True, exist_ok=True)
-        atomic_write_json(self.state_file, state_data, order_func=order_state)
-        self._last_state_save = now
+            atomic_write_json(self.state_file, state_data, order_func=order_state)
+            self._last_state_save = now
-        if log:
-            logger.info(f"State file saved: {self.state_file.resolve()}")
+            if log:
+                logger.info(f"State file saved: {self.state_file.resolve()}")
+        finally:
+            self._save_lock.release()
     def _compute_summary(self) -> dict[str, Any]:
         """Compute summary statistics for report."""
@@ -1037,6 +1059,15 @@ class BatchProcessor:
             self.save_state(force=True)
             return self.state
+        # Preheat OCR engine if OCR is enabled to eliminate cold start delay
+        if options and options.get("ocr_enabled"):
+            try:
+                from markitai.ocr import OCRProcessor
+                OCRProcessor.preheat()
+            except ImportError:
+                logger.debug("OCR preheat skipped: RapidOCR not installed")
         # Create semaphore for concurrency control
         semaphore = asyncio.Semaphore(self.config.concurrency)
@@ -1126,8 +1157,10 @@ class BatchProcessor:
             except Exception as e:
                 file_state.status = FileStatus.FAILED
-                file_state.error = str(e)
-                logger.error(f"Failed to process {file_path.name}: {e}")
+                file_state.error = format_error_message(e)
+                logger.error(
+                    f"Failed to process {file_path.name}: {format_error_message(e)}"
+                )
             finally:
                 end_time = asyncio.get_event_loop().time()

markitai-0.4.1/src/markitai/cli/__init__.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""CLI package for Markitai.
+This package provides the command-line interface for Markitai.
+Usage:
+    from markitai.cli import app
+"""
+from __future__ import annotations
+# Re-export CLI app
+from markitai.cli.main import app
+# Re-export validators from processors
+from markitai.cli.processors.validators import (
+    warn_case_sensitivity_mismatches as _warn_case_sensitivity_mismatches,
+)
+# Re-export utilities from refactored modules
+from markitai.utils.cli_helpers import (
+    compute_task_hash,
+    get_report_file_path,
+    is_url,
+    sanitize_filename,
+    url_to_filename,
+)
+from markitai.utils.output import resolve_output_path
+from markitai.utils.progress import ProgressReporter
+# Re-export from workflow helpers
+from markitai.workflow.helpers import write_images_json
+# Re-export types from workflow for backward compatibility
+from markitai.workflow.single import ImageAnalysisResult
+# Backward compatibility alias (deprecated, use sanitize_filename instead)
+_sanitize_filename = sanitize_filename
+__all__ = [
+    "app",
+    "ProgressReporter",
+    "is_url",
+    "url_to_filename",
+    "sanitize_filename",
+    "_sanitize_filename",  # Deprecated alias
+    "_warn_case_sensitivity_mismatches",
+    "compute_task_hash",
+    "get_report_file_path",
+    "resolve_output_path",
+    "write_images_json",
+    "ImageAnalysisResult",
+]

markitai-0.4.1/src/markitai/cli/commands/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""CLI commands package.
+This package contains CLI command groups for Markitai.
+Available command groups:
+- config: Configuration management commands
+- cache: Cache management commands
+- doctor: System health and dependency checking command
+- check_deps: Alias for doctor (backward compatibility)
+"""
+from __future__ import annotations
+from markitai.cli.commands.cache import cache
+from markitai.cli.commands.config import config
+from markitai.cli.commands.doctor import check_deps, doctor
+__all__ = ["cache", "config", "doctor", "check_deps"]

markitai 0.3.0__tar.gz → 0.4.1__tar.gz

markitai 0.3.0.tar.gz → 0.4.1.tar.gz