pdf-transcriber 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_transcriber/__init__.py +6 -0
- pdf_transcriber/cli.py +291 -0
- pdf_transcriber/config.py +109 -0
- pdf_transcriber/core/__init__.py +21 -0
- pdf_transcriber/core/linter/__init__.py +5 -0
- pdf_transcriber/core/linter/engine.py +184 -0
- pdf_transcriber/core/linter/models.py +72 -0
- pdf_transcriber/core/linter/rules/__init__.py +55 -0
- pdf_transcriber/core/linter/rules/artifacts.py +1030 -0
- pdf_transcriber/core/linter/rules/markdown.py +191 -0
- pdf_transcriber/core/linter/rules/math.py +633 -0
- pdf_transcriber/core/metadata_parser.py +245 -0
- pdf_transcriber/core/pdf_processor.py +173 -0
- pdf_transcriber/core/state_manager.py +325 -0
- pdf_transcriber/core/transcription.py +476 -0
- pdf_transcriber/server.py +50 -0
- pdf_transcriber/skills/__init__.py +1 -0
- pdf_transcriber/skills/transcribe.md +48 -0
- pdf_transcriber/tools/__init__.py +4 -0
- pdf_transcriber/tools/lint.py +72 -0
- pdf_transcriber/tools/transcribe.py +333 -0
- pdf_transcriber-1.0.0.dist-info/METADATA +401 -0
- pdf_transcriber-1.0.0.dist-info/RECORD +26 -0
- pdf_transcriber-1.0.0.dist-info/WHEEL +4 -0
- pdf_transcriber-1.0.0.dist-info/entry_points.txt +3 -0
- pdf_transcriber-1.0.0.dist-info/licenses/LICENSE +21 -0
pdf_transcriber/cli.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""CLI for pdf-transcriber.
|
|
2
|
+
|
|
3
|
+
Provides direct terminal access to transcription functionality without MCP.
|
|
4
|
+
"""
|
|
5
|
+
import argparse
|
|
6
|
+
import asyncio
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from pdf_transcriber import __version__
|
|
11
|
+
from pdf_transcriber.config import Config
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def main():
    """Entry point for the pdf-transcriber command-line interface.

    Builds the argument parser with three sub-commands (transcribe,
    check, install-skill) and dispatches to the matching handler.
    """
    parser = argparse.ArgumentParser(
        prog="pdf-transcriber-cli",
        description="Convert math-heavy PDFs to Markdown using Marker OCR"
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}"
    )

    subparsers = parser.add_subparsers(dest="command", required=True)

    # transcribe command
    transcribe_parser = subparsers.add_parser(
        "transcribe", help="Transcribe a PDF to Markdown"
    )
    transcribe_parser.add_argument("pdf_path", type=Path, help="Path to PDF file")
    transcribe_parser.add_argument("-o", "--output", type=Path, help="Output directory")
    transcribe_parser.add_argument(
        "-q", "--quality",
        choices=["fast", "balanced", "high-quality"],
        default="balanced",
        help="Quality preset (default: balanced)"
    )
    transcribe_parser.add_argument(
        "--no-llm", action="store_true",
        help="Disable LLM enhancement (faster, less accurate)"
    )
    transcribe_parser.add_argument(
        "--no-lint", action="store_true",
        help="Skip post-transcription linting"
    )
    transcribe_parser.add_argument(
        "--no-resume", action="store_true",
        help="Don't resume from previous progress"
    )

    # check command
    subparsers.add_parser("check", help="Health check (config, paths, Ollama)")

    # install-skill command
    skill_parser = subparsers.add_parser(
        "install-skill", help="Install Claude Code skill"
    )
    skill_parser.add_argument(
        "--force", action="store_true",
        help="Overwrite existing skill"
    )

    args = parser.parse_args()

    # Dispatch to the selected sub-command handler.
    if args.command == "transcribe":
        asyncio.run(transcribe_command(args))
    elif args.command == "check":
        check_command()
    elif args.command == "install-skill":
        install_skill_command(args)
|
69
|
+
async def transcribe_command(args):
    """Execute the transcribe command.

    Loads the config (honoring CLI overrides via environment variables),
    transcribes the PDF with resume support, writes the Markdown output
    with YAML frontmatter, and runs the post-transcription linter unless
    disabled.

    Args:
        args: Parsed argparse namespace from the ``transcribe`` subcommand.

    Exits with status 1 if the PDF cannot be found/opened or transcription
    fails (progress is saved so a rerun resumes).
    """
    import os

    # Apply CLI overrides to environment
    if args.no_llm:
        os.environ["PDF_TRANSCRIBER_USE_LLM"] = "false"

    # Load config after env overrides
    config = Config.load()

    # Validate PDF exists
    pdf_path = args.pdf_path.expanduser().resolve()
    if not pdf_path.exists():
        print(f"Error: PDF not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    # Import transcription components lazily (heavy dependencies)
    from pdf_transcriber.core.pdf_processor import PDFProcessor
    from pdf_transcriber.core.transcription import get_transcription_engine
    from pdf_transcriber.core.state_manager import StateManager
    from pdf_transcriber.core.metadata_parser import (
        create_initial_metadata,
        generate_frontmatter
    )
    from pdf_transcriber.core.linter import engine as lint_engine

    # Determine output location
    paper_name = pdf_path.stem
    out_dir = args.output.expanduser() if args.output else config.output_dir
    paper_dir = out_dir / paper_name
    paper_dir.mkdir(parents=True, exist_ok=True)

    dpi = config.get_dpi(args.quality)

    print(f"Transcribing: {pdf_path.name}")
    print(f" Quality: {args.quality} ({dpi} DPI)")
    print(f" Output: {paper_dir}")
    print(f" LLM: {'enabled' if config.use_llm else 'disabled'}")
    print()

    # Initialize state manager
    state_mgr = StateManager(out_dir, paper_name)

    # Try to resume from a previous run. BUGFIX: previously, if an existing
    # job was detected but load_state() returned None (corrupt/unreadable
    # state file), `state` stayed None and state.total_pages crashed later.
    # Now a failed load falls through to creating a fresh job.
    state = None
    resume = not args.no_resume
    if resume and state_mgr.has_existing_job():
        state = state_mgr.load_state()
        if state:
            print(f"Resuming: {len(state.completed_pages)}/{state.total_pages} pages done")

    if state is None:
        # No resumable state: probe the PDF for its page count and
        # create a fresh job.
        try:
            with PDFProcessor(str(pdf_path), dpi) as proc:
                total_pages = proc.total_pages
        except Exception as e:
            print(f"Error: Failed to open PDF: {e}", file=sys.stderr)
            sys.exit(1)

        state = state_mgr.create_job(str(pdf_path), total_pages, "markdown", args.quality)
        print(f"Processing {total_pages} pages...")

    # Get transcription engine
    engine = get_transcription_engine(
        use_gpu=config.use_gpu,
        batch_size=config.marker_batch_size,
        langs=config.marker_langs,
        use_llm=config.use_llm,
        llm_service=config.llm_service,
        ollama_base_url=config.ollama_base_url,
        ollama_model=config.ollama_model
    )

    # Chunking is auto-enabled only for large PDFs; 0 disables it.
    if state.total_pages > config.auto_chunk_threshold:
        chunk_size = config.chunk_size
        print(f" Chunking: {chunk_size} pages/chunk (auto-enabled)")
    else:
        chunk_size = 0

    # Transcribe; on failure, progress is already persisted by the
    # state manager so the user can rerun to resume.
    try:
        with PDFProcessor(str(pdf_path), dpi) as proc:
            content = await engine.transcribe_streaming(
                proc, "markdown", state_mgr,
                chunk_size=chunk_size
            )
    except Exception as e:
        summary = state_mgr.get_progress_summary()
        print(f"\nError: Transcription failed: {e}", file=sys.stderr)
        print(f"Progress saved: {summary['completed']}/{summary['total']} pages")
        print("Run again with same PDF to resume")
        sys.exit(1)

    # Build metadata
    paper_meta = create_initial_metadata(
        title=paper_name,
        pdf_source=pdf_path,
        total_pages=state.total_pages,
        output_format="markdown",
        quality=args.quality,
    )

    summary = state_mgr.get_progress_summary()
    paper_meta.transcribed_pages = summary["completed"]

    # Write output with YAML frontmatter prepended
    output_path = paper_dir / f"{paper_name}.md"
    final_content = generate_frontmatter(paper_meta) + "\n" + content
    output_path.write_text(final_content, encoding="utf-8")

    # Cleanup progress state only when every page completed
    if summary["completed"] == summary["total"]:
        state_mgr.cleanup()

    print(f"\nTranscribed {summary['completed']}/{summary['total']} pages")

    # Run linting (keeps a pristine .original.md copy first)
    if not args.no_lint:
        print("Linting...")
        original_path = paper_dir / f"{paper_name}.original.md"
        original_path.write_text(final_content, encoding="utf-8")

        try:
            lint_report = await lint_engine.lint_file(output_path, fix=True)
            print(f" {lint_report.total_issues} issues found, {len(lint_report.fixed)} auto-fixed")
        except Exception as e:
            # Linting is best-effort; a lint failure must not discard output.
            print(f" Warning: Linting failed: {e}")

    print(f"\nOutput: {output_path}")
|
+
|
|
200
|
+
def check_command():
    """Run a health check: effective config, output directory, Ollama
    connectivity (when LLM mode is on), and the available MCP tools."""
    print(f"PDF Transcriber v{__version__}")
    print("=" * 40)

    # Show the effective configuration after env overrides.
    config = Config.load()
    print("\nConfiguration:")
    print(f" Output directory: {config.output_dir}")
    print(f" Default quality: {config.default_quality} ({config.get_dpi()}dpi)")
    print(f" GPU: {config.use_gpu}")
    print(f" LLM enhanced: {config.use_llm}")

    # Report on the output directory state.
    print("\nOutput directory:")
    if not config.output_dir.exists():
        print(" Status: will be created on first transcription")
    else:
        # A "paper" is any subdirectory containing at least one .md file.
        num_papers = sum(
            1 for entry in config.output_dir.iterdir()
            if entry.is_dir() and any(entry.glob("*.md"))
        )
        print(f" Status: exists ({num_papers} papers)")

    # Probe the Ollama server only when LLM enhancement is enabled.
    if config.use_llm:
        print("\nOllama connection:")
        try:
            import urllib.request
            import json

            request = urllib.request.Request(
                f"{config.ollama_base_url}/api/tags",
                method="GET"
            )
            with urllib.request.urlopen(request, timeout=5) as resp:
                payload = json.loads(resp.read().decode())
            installed = [entry["name"] for entry in payload.get("models", [])]

            print(" Status: connected")
            if config.ollama_model in installed:
                print(f" Model: {config.ollama_model} (available)")
            else:
                print(f" Model: {config.ollama_model} (NOT INSTALLED)")
                print(f" Run: ollama pull {config.ollama_model}")

        except Exception as e:
            print(" Status: NOT CONNECTED")
            print(f" Error: {e}")
            print(f" URL: {config.ollama_base_url}")
            print(" Run: ollama serve")

    # List the MCP tools this package exposes.
    print("\nMCP tools:")
    print(" - transcribe_pdf")
    print(" - clear_transcription_cache")
    print(" - lint_paper")

    print("\n" + "=" * 40)
    print("Ready to transcribe!")
|
|
263
|
+
def install_skill_command(args):
    """Copy the bundled Claude Code skill into ~/.claude/skills.

    Exits with status 1 if the destination exists (without --force) or
    the skill resource is missing from the installed package.
    """
    import importlib.resources as resources

    skill_dir = Path.home() / ".claude" / "skills"
    skill_dir.mkdir(parents=True, exist_ok=True)
    dest = skill_dir / "transcribe.md"

    # Refuse to clobber an existing skill unless --force was given.
    if dest.exists() and not args.force:
        print(f"Skill already exists: {dest}")
        print("Use --force to overwrite")
        sys.exit(1)

    # Copy from package resources
    try:
        skill_resource = resources.files("pdf_transcriber.skills").joinpath("transcribe.md")
        dest.write_text(skill_resource.read_text())
    except Exception as e:
        print(f"Error: Failed to read skill from package: {e}", file=sys.stderr)
        print("The skill file may not be included in this installation.")
        sys.exit(1)

    print(f"Installed skill: {dest}")
    print("Restart Claude Code to load the skill")
    print("Usage: /transcribe ~/path/to/paper.pdf")
+
|
|
290
|
+
# Allow running the CLI module directly (e.g. `python -m pdf_transcriber.cli`).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Configuration management with environment variable overrides."""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class Config:
|
|
9
|
+
"""Configuration for PDF transcriber MCP server."""
|
|
10
|
+
|
|
11
|
+
# Output directory (relative to current working directory)
|
|
12
|
+
output_dir: Path = field(
|
|
13
|
+
default_factory=lambda: Path.cwd() / "transcriptions"
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Quality presets (DPI values)
|
|
18
|
+
quality_presets: dict = field(default_factory=lambda: {
|
|
19
|
+
"fast": 100, # ~1275x1650px - Lightweight
|
|
20
|
+
"balanced": 150, # ~1913x2475px - DEFAULT - Best quality/size ratio
|
|
21
|
+
"high-quality": 200 # ~2550x3300px - High quality (may approach API limits)
|
|
22
|
+
})
|
|
23
|
+
default_quality: str = "balanced"
|
|
24
|
+
|
|
25
|
+
# Processing (markdown only - LaTeX removed for distribution)
|
|
26
|
+
default_mode: str = "streaming" # "streaming" or "batch"
|
|
27
|
+
max_concurrent_pages: int = 3 # For batch mode (future)
|
|
28
|
+
|
|
29
|
+
# Marker OCR settings
|
|
30
|
+
ocr_engine: str = "marker"
|
|
31
|
+
use_gpu: bool = True # Auto-detected in load()
|
|
32
|
+
marker_batch_size: int = 1 # Pages per batch (not currently used)
|
|
33
|
+
marker_langs: list = field(default_factory=lambda: ["English"])
|
|
34
|
+
|
|
35
|
+
# LLM-enhanced OCR settings (Marker's built-in LLM mode)
|
|
36
|
+
# NOTE: Requires a VISION model (VLM) - text-only models won't work
|
|
37
|
+
use_llm: bool = True # Enable Marker's LLM-enhanced OCR (default: on)
|
|
38
|
+
llm_service: str = "marker.services.ollama.OllamaService" # LLM service class
|
|
39
|
+
ollama_base_url: str = "http://localhost:11434" # Ollama server URL
|
|
40
|
+
# Model options (vision models only):
|
|
41
|
+
# - qwen2.5vl:3b (3.2 GB) - Recommended for 16GB RAM systems
|
|
42
|
+
# - qwen2.5vl:7b (5.5 GB) - Better quality, needs 24GB+ RAM
|
|
43
|
+
# - qwen3-vl:4b (3.5 GB) - Newest Qwen VL, excellent quality
|
|
44
|
+
ollama_model: str = "qwen2.5vl:3b" # Default: Qwen2.5-VL 3B (memory-safe)
|
|
45
|
+
|
|
46
|
+
# Chunking settings
|
|
47
|
+
chunk_size: int = 25 # Pages per chunk for large PDFs
|
|
48
|
+
auto_chunk_threshold: int = 100 # Auto-enable chunking for PDFs larger than this
|
|
49
|
+
|
|
50
|
+
# State management
|
|
51
|
+
progress_dir_name: str = ".pdf-progress"
|
|
52
|
+
|
|
53
|
+
# Index
|
|
54
|
+
index_file: str = ".paper-index.json"
|
|
55
|
+
|
|
56
|
+
# Versioning
|
|
57
|
+
version: str = "1.0.0"
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
def load(cls) -> "Config":
|
|
61
|
+
"""Load config with environment variable overrides."""
|
|
62
|
+
config = cls()
|
|
63
|
+
|
|
64
|
+
# Override output directory from env
|
|
65
|
+
if val := os.environ.get("PDF_TRANSCRIBER_OUTPUT_DIR"):
|
|
66
|
+
config.output_dir = Path(val).expanduser()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Override quality preset from env
|
|
70
|
+
if val := os.environ.get("PDF_TRANSCRIBER_QUALITY"):
|
|
71
|
+
if val in config.quality_presets:
|
|
72
|
+
config.default_quality = val
|
|
73
|
+
|
|
74
|
+
# Auto-detect GPU
|
|
75
|
+
try:
|
|
76
|
+
import torch
|
|
77
|
+
config.use_gpu = torch.cuda.is_available()
|
|
78
|
+
except ImportError:
|
|
79
|
+
config.use_gpu = False
|
|
80
|
+
|
|
81
|
+
# Override GPU setting from env
|
|
82
|
+
if val := os.environ.get("PDF_TRANSCRIBER_USE_GPU"):
|
|
83
|
+
config.use_gpu = val.lower() in ("true", "1", "yes")
|
|
84
|
+
|
|
85
|
+
# Override chunking settings from env
|
|
86
|
+
if val := os.environ.get("PDF_TRANSCRIBER_CHUNK_SIZE"):
|
|
87
|
+
config.chunk_size = int(val)
|
|
88
|
+
if val := os.environ.get("PDF_TRANSCRIBER_AUTO_CHUNK_THRESHOLD"):
|
|
89
|
+
config.auto_chunk_threshold = int(val)
|
|
90
|
+
|
|
91
|
+
# Override LLM settings from env
|
|
92
|
+
if val := os.environ.get("PDF_TRANSCRIBER_USE_LLM"):
|
|
93
|
+
config.use_llm = val.lower() in ("true", "1", "yes")
|
|
94
|
+
if val := os.environ.get("PDF_TRANSCRIBER_LLM_SERVICE"):
|
|
95
|
+
config.llm_service = val
|
|
96
|
+
if val := os.environ.get("PDF_TRANSCRIBER_OLLAMA_URL"):
|
|
97
|
+
config.ollama_base_url = val
|
|
98
|
+
if val := os.environ.get("PDF_TRANSCRIBER_OLLAMA_MODEL"):
|
|
99
|
+
config.ollama_model = val
|
|
100
|
+
|
|
101
|
+
# Ensure output directory exists
|
|
102
|
+
config.output_dir.mkdir(parents=True, exist_ok=True)
|
|
103
|
+
|
|
104
|
+
return config
|
|
105
|
+
|
|
106
|
+
def get_dpi(self, quality: str | None = None) -> int:
|
|
107
|
+
"""Get DPI value for a quality preset."""
|
|
108
|
+
quality = quality or self.default_quality
|
|
109
|
+
return self.quality_presets.get(quality, self.quality_presets["balanced"])
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Core modules for PDF transcription."""
|
|
2
|
+
from .transcription import (
|
|
3
|
+
TranscriptionEngine,
|
|
4
|
+
TranscriptionResult,
|
|
5
|
+
get_transcription_engine,
|
|
6
|
+
clear_engine_cache,
|
|
7
|
+
)
|
|
8
|
+
from .pdf_processor import PDFProcessor
|
|
9
|
+
from .state_manager import StateManager, TranscriptionState
|
|
10
|
+
from .metadata_parser import PaperMetadata
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"TranscriptionEngine",
|
|
14
|
+
"TranscriptionResult",
|
|
15
|
+
"get_transcription_engine",
|
|
16
|
+
"clear_engine_cache",
|
|
17
|
+
"PDFProcessor",
|
|
18
|
+
"StateManager",
|
|
19
|
+
"TranscriptionState",
|
|
20
|
+
"PaperMetadata",
|
|
21
|
+
]
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Lint engine - runs rules and applies fixes."""
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .models import LintIssue, LintReport, Severity, Fix
|
|
8
|
+
from .rules import RULES, DEFAULT_AUTO_FIX
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
async def lint_file(
    path: Path,
    fix: bool = False,
    rules: Optional[list[str]] = None
) -> LintReport:
    """
    Lint a markdown file on disk.

    Args:
        path: Path to the .md file
        fix: If True, apply auto-fixes and write back
        rules: Specific rules to run (default: all)

    Returns:
        LintReport with all issues found
    """
    original = path.read_text(encoding='utf-8')

    report = await lint_content(original, str(path), rules=rules)

    # Only attempt rewriting when fixing was requested and at least one
    # issue is marked auto-fixable.
    if fix and report.auto_fixable > 0:
        updated, applied = apply_fixes(original, report.issues)
        report.fixed = applied

        # Avoid a pointless write (and mtime bump) when nothing changed.
        if updated != original:
            path.write_text(updated, encoding='utf-8')
            logger.info(f"Wrote {len(applied)} fixes to {path}")

    return report
|
+
|
|
44
|
+
async def lint_content(
    content: str,
    source_path: str = "<string>",
    rules: Optional[list[str]] = None
) -> LintReport:
    """
    Lint markdown content held in memory.

    Args:
        content: The markdown content to lint
        source_path: Path for reporting (doesn't need to exist)
        rules: Specific rules to run (default: all)

    Returns:
        LintReport with all issues found
    """
    report = LintReport(paper_path=source_path)

    # Default to every registered rule.
    selected = rules if rules else list(RULES.keys())

    # Frontmatter is metadata, not document body — lint past it.
    body, fm_line_count = _extract_frontmatter(content)

    for rule_name in selected:
        checker = RULES.get(rule_name)
        if checker is None:
            logger.warning(f"Unknown rule: {rule_name}")
            continue

        # A broken rule must not abort the whole lint run.
        try:
            for issue in checker(body):
                # Shift line numbers back into whole-file coordinates.
                issue.line += fm_line_count
                report.add_issue(issue)
        except Exception as e:
            logger.error(f"Rule {rule_name} failed: {e}")

    # Present issues in document order.
    report.issues.sort(key=lambda item: item.line)

    return report
+
|
|
89
|
+
def apply_fixes(content: str, issues: list[LintIssue]) -> tuple[str, list[str]]:
    """
    Apply auto-fixes to content.

    Only applies fixes for issues with Severity.AUTO_FIX that carry a fix.
    Line-based fixes (trailing_whitespace) are applied by line number; all
    other fixes are applied as first-occurrence string replacements, longest
    pattern first so a short pattern cannot clobber part of a longer one.

    A rule is reported as applied only if at least one of its fixes actually
    changed the content (previously rules were reported even when the fix
    text was not found, inflating the "fixed" list).

    Args:
        content: Original content
        issues: List of issues from linting

    Returns:
        Tuple of (fixed_content, sorted_list_of_applied_rule_names)
    """
    # Filter to auto-fixable issues that carry a concrete fix.
    fixable = [
        i for i in issues
        if i.severity == Severity.AUTO_FIX and i.fix is not None
    ]

    if not fixable:
        return content, []

    # Track which rules actually modified the content.
    applied_rules: set[str] = set()

    # Separate line-based fixes (trailing whitespace) from content-based
    # string replacements.
    line_fixes: dict[int, tuple[str, Fix]] = {}     # line_num -> (rule, fix)
    content_fixes: list[tuple[str, str, str]] = []  # (rule, old, new)

    for issue in fixable:
        if issue.rule == "trailing_whitespace":
            line_fixes[issue.line] = (issue.rule, issue.fix)
        else:
            content_fixes.append((issue.rule, issue.fix.old, issue.fix.new))

    # Apply line-based fixes first; count a rule only when the line still
    # matches the expected original text.
    if line_fixes:
        lines = content.split('\n')
        for line_num, (rule, fix) in line_fixes.items():
            idx = line_num - 1
            if 0 <= idx < len(lines) and lines[idx] == fix.old:
                lines[idx] = fix.new
                applied_rules.add(rule)
        content = '\n'.join(lines)

    # Apply content-based fixes, longest `old` first to avoid partial
    # replacements by shorter patterns.
    content_fixes.sort(key=lambda x: len(x[1]), reverse=True)

    for rule, old, new in content_fixes:
        # Replace only the first occurrence to be safe; count the rule
        # only if the pattern was actually present.
        if old in content:
            content = content.replace(old, new, 1)
            applied_rules.add(rule)

    return content, sorted(applied_rules)
|
|
153
|
+
def _extract_frontmatter(content: str) -> tuple[str, int]:
|
|
154
|
+
"""
|
|
155
|
+
Extract YAML frontmatter from content.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
Tuple of (content_without_frontmatter, num_frontmatter_lines)
|
|
159
|
+
"""
|
|
160
|
+
if not content.startswith('---'):
|
|
161
|
+
return content, 0
|
|
162
|
+
|
|
163
|
+
# Find the closing ---
|
|
164
|
+
match = re.match(r'^---\s*\n.*?\n---\s*\n', content, re.DOTALL)
|
|
165
|
+
if not match:
|
|
166
|
+
return content, 0
|
|
167
|
+
|
|
168
|
+
frontmatter = match.group()
|
|
169
|
+
frontmatter_lines = frontmatter.count('\n')
|
|
170
|
+
|
|
171
|
+
return content[len(frontmatter):], frontmatter_lines
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def get_available_rules() -> dict[str, str]:
    """
    Get list of available rules with descriptions.

    Returns:
        Dict mapping each rule name to the first line of its docstring
        (or "No description" when the rule has none).
    """
    descriptions: dict[str, str] = {}
    for name, func in RULES.items():
        doc = func.__doc__ or "No description"
        descriptions[name] = doc.strip().split('\n')[0]
    return descriptions
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Data models for the linter."""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Severity(Enum):
    """Severity levels for lint issues.

    The string values are serialized via ``LintIssue.to_dict`` and
    ``LintReport.to_dict``, so they are part of the report format.
    """
    AUTO_FIX = "auto_fix"  # Safe to fix automatically
    WARNING = "warning"    # Needs review
    ERROR = "error"        # Must address
13
|
+
|
|
14
|
+
@dataclass
class Fix:
    """A proposed fix for a lint issue.

    `old` is the exact text currently in the document (an entire line for
    line-based rules, an inline span otherwise) and `new` is its
    replacement.
    """
    old: str
    new: str
20
|
+
|
|
21
|
+
@dataclass
class LintIssue:
    """A single lint issue found in the document."""
    rule: str
    severity: Severity
    line: int
    message: str
    fix: Optional[Fix] = None

    def to_dict(self) -> dict:
        """Serialize this issue into a JSON-friendly dict."""
        return dict(
            rule=self.rule,
            severity=self.severity.value,
            line=self.line,
            message=self.message,
            has_fix=self.fix is not None,
        )
|
39
|
+
|
|
40
|
+
@dataclass
class LintReport:
    """Complete lint report for a document."""
    paper_path: str
    total_issues: int = 0
    auto_fixable: int = 0
    warnings: int = 0
    errors: int = 0
    issues: list[LintIssue] = field(default_factory=list)
    fixed: list[str] = field(default_factory=list)

    def add_issue(self, issue: LintIssue) -> None:
        """Record *issue* and bump the matching severity counter."""
        self.issues.append(issue)
        self.total_issues += 1

        severity = issue.severity
        if severity == Severity.AUTO_FIX:
            self.auto_fixable += 1
        elif severity == Severity.WARNING:
            self.warnings += 1
        elif severity == Severity.ERROR:
            self.errors += 1

    def to_dict(self) -> dict:
        """Serialize the report, including nested issues, for JSON output."""
        return {
            "paper_path": self.paper_path,
            "total_issues": self.total_issues,
            "auto_fixable": self.auto_fixable,
            "warnings": self.warnings,
            "errors": self.errors,
            "issues": [entry.to_dict() for entry in self.issues],
            "fixed": self.fixed
        }