pdf-transcriber 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
"""PDF Transcriber MCP Server.

Convert math-heavy PDFs to Markdown using Marker OCR with optional LLM enhancement.
"""

# Package version string; surfaced by the CLI's --version flag.
# NOTE(review): Config.version elsewhere in this package carries the same
# "1.0.0" literal — keep the two in sync when releasing.
__version__ = "1.0.0"
pdf_transcriber/cli.py ADDED
@@ -0,0 +1,291 @@
1
+ """CLI for pdf-transcriber.
2
+
3
+ Provides direct terminal access to transcription functionality without MCP.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from pdf_transcriber import __version__
11
+ from pdf_transcriber.config import Config
12
+
13
+
14
def main():
    """Parse command-line arguments and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(
        prog="pdf-transcriber-cli",
        description="Convert math-heavy PDFs to Markdown using Marker OCR"
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}"
    )

    subparsers = parser.add_subparsers(dest="command", required=True)

    # "transcribe" subcommand: positional PDF plus tuning options.
    transcribe = subparsers.add_parser("transcribe", help="Transcribe a PDF to Markdown")
    transcribe.add_argument("pdf_path", type=Path, help="Path to PDF file")
    transcribe.add_argument("-o", "--output", type=Path, help="Output directory")
    transcribe.add_argument(
        "-q", "--quality",
        choices=["fast", "balanced", "high-quality"],
        default="balanced",
        help="Quality preset (default: balanced)"
    )
    # The three opt-out switches share the same shape; add them in a loop.
    for flag, flag_help in (
        ("--no-llm", "Disable LLM enhancement (faster, less accurate)"),
        ("--no-lint", "Skip post-transcription linting"),
        ("--no-resume", "Don't resume from previous progress"),
    ):
        transcribe.add_argument(flag, action="store_true", help=flag_help)

    # "check" subcommand takes no options.
    subparsers.add_parser("check", help="Health check (config, paths, Ollama)")

    # "install-skill" subcommand.
    installer = subparsers.add_parser("install-skill", help="Install Claude Code skill")
    installer.add_argument(
        "--force", action="store_true",
        help="Overwrite existing skill"
    )

    args = parser.parse_args()

    # required=True above guarantees args.command is one of the three.
    if args.command == "transcribe":
        asyncio.run(transcribe_command(args))
    elif args.command == "check":
        check_command()
    else:
        install_skill_command(args)
67
+
68
+
69
async def transcribe_command(args):
    """Execute the transcribe command.

    Resolves the PDF, loads configuration (honoring CLI overrides), runs the
    streaming Marker transcription with resume support, writes the Markdown
    output with YAML frontmatter, and optionally lints the result.

    Args:
        args: Parsed argparse namespace from the "transcribe" subparser
            (pdf_path, output, quality, no_llm, no_lint, no_resume).

    Exits with status 1 on a missing PDF, an unreadable PDF, or a
    transcription failure (progress is saved for resuming).
    """
    import os

    # Apply CLI overrides to environment BEFORE Config.load() reads it.
    if args.no_llm:
        os.environ["PDF_TRANSCRIBER_USE_LLM"] = "false"

    # Load config after env overrides
    config = Config.load()

    # Validate PDF exists
    pdf_path = args.pdf_path.expanduser().resolve()
    if not pdf_path.exists():
        print(f"Error: PDF not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    # Import transcription components lazily — they pull in heavy ML deps.
    from pdf_transcriber.core.pdf_processor import PDFProcessor
    from pdf_transcriber.core.transcription import get_transcription_engine
    from pdf_transcriber.core.state_manager import StateManager
    from pdf_transcriber.core.metadata_parser import (
        create_initial_metadata,
        generate_frontmatter
    )
    from pdf_transcriber.core.linter import engine as lint_engine

    # Determine output location: <out_dir>/<pdf stem>/
    paper_name = pdf_path.stem
    out_dir = args.output.expanduser() if args.output else config.output_dir
    paper_dir = out_dir / paper_name
    paper_dir.mkdir(parents=True, exist_ok=True)

    dpi = config.get_dpi(args.quality)

    print(f"Transcribing: {pdf_path.name}")
    print(f" Quality: {args.quality} ({dpi} DPI)")
    print(f" Output: {paper_dir}")
    print(f" LLM: {'enabled' if config.use_llm else 'disabled'}")
    print()

    # Initialize state manager (tracks per-page progress for resume)
    state_mgr = StateManager(out_dir, paper_name)

    # Resume a previous job if possible, otherwise start a fresh one.
    # FIX: previously, if load_state() returned a falsy value while
    # has_existing_job() was True, `state` stayed None and the later
    # state.total_pages access crashed. Now we fall through to create_job.
    state = None
    resume = not args.no_resume
    if resume and state_mgr.has_existing_job():
        state = state_mgr.load_state()
        if state:
            print(f"Resuming: {len(state.completed_pages)}/{state.total_pages} pages done")

    if state is None:
        try:
            with PDFProcessor(str(pdf_path), dpi) as proc:
                total_pages = proc.total_pages
        except Exception as e:
            print(f"Error: Failed to open PDF: {e}", file=sys.stderr)
            sys.exit(1)

        state = state_mgr.create_job(str(pdf_path), total_pages, "markdown", args.quality)
        print(f"Processing {total_pages} pages...")

    # Get transcription engine (cached per settings by the factory)
    engine = get_transcription_engine(
        use_gpu=config.use_gpu,
        batch_size=config.marker_batch_size,
        langs=config.marker_langs,
        use_llm=config.use_llm,
        llm_service=config.llm_service,
        ollama_base_url=config.ollama_base_url,
        ollama_model=config.ollama_model
    )

    # Determine chunk size: auto-enable chunking only for large PDFs.
    if state.total_pages > config.auto_chunk_threshold:
        chunk_size = config.chunk_size
        print(f" Chunking: {chunk_size} pages/chunk (auto-enabled)")
    else:
        chunk_size = 0

    # Transcribe; on failure, report saved progress so the user can resume.
    try:
        with PDFProcessor(str(pdf_path), dpi) as proc:
            content = await engine.transcribe_streaming(
                proc, "markdown", state_mgr,
                chunk_size=chunk_size
            )
    except Exception as e:
        summary = state_mgr.get_progress_summary()
        print(f"\nError: Transcription failed: {e}", file=sys.stderr)
        print(f"Progress saved: {summary['completed']}/{summary['total']} pages")
        print("Run again with same PDF to resume")
        sys.exit(1)

    # Build metadata for the YAML frontmatter
    paper_meta = create_initial_metadata(
        title=paper_name,
        pdf_source=pdf_path,
        total_pages=state.total_pages,
        output_format="markdown",
        quality=args.quality,
    )

    summary = state_mgr.get_progress_summary()
    paper_meta.transcribed_pages = summary["completed"]

    # Write output: frontmatter + transcribed content
    output_path = paper_dir / f"{paper_name}.md"
    final_content = generate_frontmatter(paper_meta) + "\n" + content
    output_path.write_text(final_content, encoding="utf-8")

    # Cleanup progress state only when every page completed
    if summary["completed"] == summary["total"]:
        state_mgr.cleanup()

    print(f"\nTranscribed {summary['completed']}/{summary['total']} pages")

    # Run linting; keep an unlinted .original.md copy first so auto-fixes
    # are always reversible.
    if not args.no_lint:
        print("Linting...")
        original_path = paper_dir / f"{paper_name}.original.md"
        original_path.write_text(final_content, encoding="utf-8")

        try:
            lint_report = await lint_engine.lint_file(output_path, fix=True)
            print(f" {lint_report.total_issues} issues found, {len(lint_report.fixed)} auto-fixed")
        except Exception as e:
            # Linting is best-effort; a lint failure must not fail the run.
            print(f" Warning: Linting failed: {e}")

    print(f"\nOutput: {output_path}")
198
+
199
+
200
def check_command():
    """Execute the check command.

    Prints a human-readable health report: effective configuration, output
    directory status, Ollama connectivity/model availability (only when LLM
    mode is enabled), and the list of MCP tools.
    """
    print(f"PDF Transcriber v{__version__}")
    print("=" * 40)

    # Configuration (env overrides already applied by Config.load)
    config = Config.load()
    print("\nConfiguration:")
    print(f" Output directory: {config.output_dir}")
    print(f" Default quality: {config.default_quality} ({config.get_dpi()}dpi)")
    print(f" GPU: {config.use_gpu}")
    print(f" LLM enhanced: {config.use_llm}")

    # Output directory: count subdirectories that contain at least one .md
    # file — each such directory is treated as one transcribed paper.
    print("\nOutput directory:")
    if config.output_dir.exists():
        paper_count = sum(
            1 for d in config.output_dir.iterdir()
            if d.is_dir() and any(d.glob("*.md"))
        )
        print(f" Status: exists ({paper_count} papers)")
    else:
        print(" Status: will be created on first transcription")

    # Ollama (if LLM enabled): hit the /api/tags endpoint to verify both
    # the server connection and that the configured model is pulled.
    if config.use_llm:
        print("\nOllama connection:")
        try:
            import urllib.request
            import json

            req = urllib.request.Request(
                f"{config.ollama_base_url}/api/tags",
                method="GET"
            )
            # 5s timeout so a down/hung server fails fast.
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read().decode())
                models = [m["name"] for m in data.get("models", [])]

            if config.ollama_model in models:
                print(" Status: connected")
                print(f" Model: {config.ollama_model} (available)")
            else:
                print(" Status: connected")
                print(f" Model: {config.ollama_model} (NOT INSTALLED)")
                print(f" Run: ollama pull {config.ollama_model}")

        except Exception as e:
            # Any failure (refused connection, timeout, bad JSON) is reported
            # the same way, with a hint to start the server.
            print(" Status: NOT CONNECTED")
            print(f" Error: {e}")
            print(f" URL: {config.ollama_base_url}")
            print(" Run: ollama serve")

    # Available tools (static list; mirrors the MCP server's registrations)
    print("\nMCP tools:")
    print(" - transcribe_pdf")
    print(" - clear_transcription_cache")
    print(" - lint_paper")

    print("\n" + "=" * 40)
    print("Ready to transcribe!")
261
+
262
+
263
def install_skill_command(args):
    """Copy the bundled Claude Code skill file into ~/.claude/skills.

    Refuses to overwrite an existing skill unless --force was given.
    Exits with status 1 when the skill exists (without --force) or when the
    packaged skill resource cannot be read.
    """
    import importlib.resources as resources

    target_dir = Path.home() / ".claude" / "skills"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / "transcribe.md"

    # Guard against clobbering a user-modified skill.
    if target.exists() and not args.force:
        print(f"Skill already exists: {target}")
        print("Use --force to overwrite")
        sys.exit(1)

    # Copy from package resources; the skill file may have been stripped
    # from some distributions, so read failures get a friendly message.
    try:
        source = resources.files("pdf_transcriber.skills").joinpath("transcribe.md")
        target.write_text(source.read_text())
    except Exception as e:
        print(f"Error: Failed to read skill from package: {e}", file=sys.stderr)
        print("The skill file may not be included in this installation.")
        sys.exit(1)

    print(f"Installed skill: {target}")
    print("Restart Claude Code to load the skill")
    print("Usage: /transcribe ~/path/to/paper.pdf")
288
+
289
+
290
# Support direct execution (e.g. `python -m pdf_transcriber.cli`).
if __name__ == "__main__":
    main()
@@ -0,0 +1,109 @@
1
+ """Configuration management with environment variable overrides."""
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ import os
5
+
6
+
7
+ @dataclass
8
+ class Config:
9
+ """Configuration for PDF transcriber MCP server."""
10
+
11
+ # Output directory (relative to current working directory)
12
+ output_dir: Path = field(
13
+ default_factory=lambda: Path.cwd() / "transcriptions"
14
+ )
15
+
16
+
17
+ # Quality presets (DPI values)
18
+ quality_presets: dict = field(default_factory=lambda: {
19
+ "fast": 100, # ~1275x1650px - Lightweight
20
+ "balanced": 150, # ~1913x2475px - DEFAULT - Best quality/size ratio
21
+ "high-quality": 200 # ~2550x3300px - High quality (may approach API limits)
22
+ })
23
+ default_quality: str = "balanced"
24
+
25
+ # Processing (markdown only - LaTeX removed for distribution)
26
+ default_mode: str = "streaming" # "streaming" or "batch"
27
+ max_concurrent_pages: int = 3 # For batch mode (future)
28
+
29
+ # Marker OCR settings
30
+ ocr_engine: str = "marker"
31
+ use_gpu: bool = True # Auto-detected in load()
32
+ marker_batch_size: int = 1 # Pages per batch (not currently used)
33
+ marker_langs: list = field(default_factory=lambda: ["English"])
34
+
35
+ # LLM-enhanced OCR settings (Marker's built-in LLM mode)
36
+ # NOTE: Requires a VISION model (VLM) - text-only models won't work
37
+ use_llm: bool = True # Enable Marker's LLM-enhanced OCR (default: on)
38
+ llm_service: str = "marker.services.ollama.OllamaService" # LLM service class
39
+ ollama_base_url: str = "http://localhost:11434" # Ollama server URL
40
+ # Model options (vision models only):
41
+ # - qwen2.5vl:3b (3.2 GB) - Recommended for 16GB RAM systems
42
+ # - qwen2.5vl:7b (5.5 GB) - Better quality, needs 24GB+ RAM
43
+ # - qwen3-vl:4b (3.5 GB) - Newest Qwen VL, excellent quality
44
+ ollama_model: str = "qwen2.5vl:3b" # Default: Qwen2.5-VL 3B (memory-safe)
45
+
46
+ # Chunking settings
47
+ chunk_size: int = 25 # Pages per chunk for large PDFs
48
+ auto_chunk_threshold: int = 100 # Auto-enable chunking for PDFs larger than this
49
+
50
+ # State management
51
+ progress_dir_name: str = ".pdf-progress"
52
+
53
+ # Index
54
+ index_file: str = ".paper-index.json"
55
+
56
+ # Versioning
57
+ version: str = "1.0.0"
58
+
59
+ @classmethod
60
+ def load(cls) -> "Config":
61
+ """Load config with environment variable overrides."""
62
+ config = cls()
63
+
64
+ # Override output directory from env
65
+ if val := os.environ.get("PDF_TRANSCRIBER_OUTPUT_DIR"):
66
+ config.output_dir = Path(val).expanduser()
67
+
68
+
69
+ # Override quality preset from env
70
+ if val := os.environ.get("PDF_TRANSCRIBER_QUALITY"):
71
+ if val in config.quality_presets:
72
+ config.default_quality = val
73
+
74
+ # Auto-detect GPU
75
+ try:
76
+ import torch
77
+ config.use_gpu = torch.cuda.is_available()
78
+ except ImportError:
79
+ config.use_gpu = False
80
+
81
+ # Override GPU setting from env
82
+ if val := os.environ.get("PDF_TRANSCRIBER_USE_GPU"):
83
+ config.use_gpu = val.lower() in ("true", "1", "yes")
84
+
85
+ # Override chunking settings from env
86
+ if val := os.environ.get("PDF_TRANSCRIBER_CHUNK_SIZE"):
87
+ config.chunk_size = int(val)
88
+ if val := os.environ.get("PDF_TRANSCRIBER_AUTO_CHUNK_THRESHOLD"):
89
+ config.auto_chunk_threshold = int(val)
90
+
91
+ # Override LLM settings from env
92
+ if val := os.environ.get("PDF_TRANSCRIBER_USE_LLM"):
93
+ config.use_llm = val.lower() in ("true", "1", "yes")
94
+ if val := os.environ.get("PDF_TRANSCRIBER_LLM_SERVICE"):
95
+ config.llm_service = val
96
+ if val := os.environ.get("PDF_TRANSCRIBER_OLLAMA_URL"):
97
+ config.ollama_base_url = val
98
+ if val := os.environ.get("PDF_TRANSCRIBER_OLLAMA_MODEL"):
99
+ config.ollama_model = val
100
+
101
+ # Ensure output directory exists
102
+ config.output_dir.mkdir(parents=True, exist_ok=True)
103
+
104
+ return config
105
+
106
+ def get_dpi(self, quality: str | None = None) -> int:
107
+ """Get DPI value for a quality preset."""
108
+ quality = quality or self.default_quality
109
+ return self.quality_presets.get(quality, self.quality_presets["balanced"])
@@ -0,0 +1,21 @@
1
+ """Core modules for PDF transcription."""
2
+ from .transcription import (
3
+ TranscriptionEngine,
4
+ TranscriptionResult,
5
+ get_transcription_engine,
6
+ clear_engine_cache,
7
+ )
8
+ from .pdf_processor import PDFProcessor
9
+ from .state_manager import StateManager, TranscriptionState
10
+ from .metadata_parser import PaperMetadata
11
+
12
+ __all__ = [
13
+ "TranscriptionEngine",
14
+ "TranscriptionResult",
15
+ "get_transcription_engine",
16
+ "clear_engine_cache",
17
+ "PDFProcessor",
18
+ "StateManager",
19
+ "TranscriptionState",
20
+ "PaperMetadata",
21
+ ]
@@ -0,0 +1,5 @@
1
+ """Markdown linter for transcribed papers."""
2
+ from .engine import lint_file, lint_content
3
+ from .models import LintIssue, LintReport, Severity, Fix
4
+
5
+ __all__ = ["lint_file", "lint_content", "LintIssue", "LintReport", "Severity", "Fix"]
@@ -0,0 +1,184 @@
1
+ """Lint engine - runs rules and applies fixes."""
2
+ import logging
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .models import LintIssue, LintReport, Severity, Fix
8
+ from .rules import RULES, DEFAULT_AUTO_FIX
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
async def lint_file(
    path: Path,
    fix: bool = False,
    rules: Optional[list[str]] = None
) -> LintReport:
    """
    Lint a markdown file, optionally applying auto-fixes in place.

    Args:
        path: Path to the .md file
        fix: If True, apply auto-fixes and write back
        rules: Specific rules to run (default: all)

    Returns:
        LintReport with all issues found
    """
    original = path.read_text(encoding='utf-8')
    report = await lint_content(original, str(path), rules=rules)

    # Only touch the file when fixing was requested and something is fixable.
    if fix and report.auto_fixable > 0:
        patched, applied = apply_fixes(original, report.issues)
        report.fixed = applied

        # Avoid a no-op rewrite (preserves mtime when nothing changed).
        if patched != original:
            path.write_text(patched, encoding='utf-8')
            logger.info(f"Wrote {len(applied)} fixes to {path}")

    return report
42
+
43
+
44
async def lint_content(
    content: str,
    source_path: str = "<string>",
    rules: Optional[list[str]] = None
) -> LintReport:
    """
    Lint markdown content.

    Args:
        content: The markdown content to lint
        source_path: Path for reporting (doesn't need to exist)
        rules: Specific rules to run (default: all)

    Returns:
        LintReport with all issues found, sorted by line number
    """
    report = LintReport(paper_path=source_path)
    selected = rules or list(RULES.keys())

    # Lint only the document body; remember the frontmatter height so issue
    # line numbers can be mapped back to the full file.
    body, fm_offset = _extract_frontmatter(content)

    for rule_name in selected:
        rule_func = RULES.get(rule_name)
        if rule_func is None:
            logger.warning(f"Unknown rule: {rule_name}")
            continue

        # One failing rule must not abort the whole lint run.
        try:
            for issue in rule_func(body):
                issue.line += fm_offset
                report.add_issue(issue)
        except Exception as e:
            logger.error(f"Rule {rule_name} failed: {e}")

    report.issues.sort(key=lambda issue: issue.line)
    return report
87
+
88
+
89
def apply_fixes(content: str, issues: list[LintIssue]) -> tuple[str, list[str]]:
    """
    Apply auto-fixes to content.

    Only applies fixes for issues with Severity.AUTO_FIX. Trailing-whitespace
    issues are fixed by exact line number; every other fix is a one-shot
    string replacement (first occurrence only), applied longest-`old` first
    so a shorter fix cannot clobber part of a longer one.

    Args:
        content: Original content
        issues: List of issues from linting

    Returns:
        Tuple of (fixed_content, list_of_applied_rule_names)
    """
    # Filter to auto-fixable issues with fixes
    fixable = [
        i for i in issues
        if i.severity == Severity.AUTO_FIX and i.fix is not None
    ]

    if not fixable:
        return content, []

    # Track which rules were applied
    applied_rules: set[str] = set()

    # Separate line-based vs content-based fixes: trailing_whitespace must
    # target an exact line; everything else is a text substitution.
    line_fixes: dict[int, Fix] = {}  # line_num -> fix (for trailing_whitespace)
    content_fixes: list[tuple[str, str]] = []  # (old, new) pairs

    for issue in fixable:
        if issue.fix is None:
            continue

        if issue.rule == "trailing_whitespace":
            line_fixes[issue.line] = issue.fix
            applied_rules.add(issue.rule)
        else:
            content_fixes.append((issue.fix.old, issue.fix.new))
            applied_rules.add(issue.rule)

    # Apply line-based fixes first (trailing whitespace); a line is only
    # rewritten if it still matches the recorded original, so stale issues
    # cannot corrupt unrelated lines.
    if line_fixes:
        lines = content.split('\n')
        for line_num, fix in line_fixes.items():
            idx = line_num - 1  # issue lines are 1-based
            if 0 <= idx < len(lines) and lines[idx] == fix.old:
                lines[idx] = fix.new
        content = '\n'.join(lines)

    # Apply content-based fixes
    # Sort by length of old string (longest first) to avoid partial replacements
    content_fixes.sort(key=lambda x: len(x[0]), reverse=True)

    for old, new in content_fixes:
        # Only replace first occurrence to be safe
        content = content.replace(old, new, 1)

    return content, sorted(applied_rules)
151
+
152
+
153
+ def _extract_frontmatter(content: str) -> tuple[str, int]:
154
+ """
155
+ Extract YAML frontmatter from content.
156
+
157
+ Returns:
158
+ Tuple of (content_without_frontmatter, num_frontmatter_lines)
159
+ """
160
+ if not content.startswith('---'):
161
+ return content, 0
162
+
163
+ # Find the closing ---
164
+ match = re.match(r'^---\s*\n.*?\n---\s*\n', content, re.DOTALL)
165
+ if not match:
166
+ return content, 0
167
+
168
+ frontmatter = match.group()
169
+ frontmatter_lines = frontmatter.count('\n')
170
+
171
+ return content[len(frontmatter):], frontmatter_lines
172
+
173
+
174
def get_available_rules() -> dict[str, str]:
    """
    Get list of available rules with descriptions.

    Returns:
        Dict mapping rule name to the first line of the rule's docstring
        (or "No description" when the rule has none).
    """
    descriptions: dict[str, str] = {}
    for name, func in RULES.items():
        doc = func.__doc__ or "No description"
        descriptions[name] = doc.strip().split('\n')[0]
    return descriptions
@@ -0,0 +1,72 @@
1
+ """Data models for the linter."""
2
+ from dataclasses import dataclass, field
3
+ from enum import Enum
4
+ from typing import Optional
5
+
6
+
7
class Severity(Enum):
    """Severity levels for lint issues."""
    AUTO_FIX = "auto_fix"  # Safe to fix automatically
    WARNING = "warning"    # Needs review
    ERROR = "error"        # Must address
12
+
13
+
14
@dataclass
class Fix:
    """A proposed fix for a lint issue: replace `old` text with `new`."""
    old: str  # exact original text to replace
    new: str  # replacement text
19
+
20
+
21
@dataclass
class LintIssue:
    """A single lint issue found in the document."""
    rule: str              # name of the rule that fired
    severity: Severity     # how the issue should be handled
    line: int              # 1-based line number in the document
    message: str           # human-readable description
    fix: Optional[Fix] = None  # proposed auto-fix, when one exists

    def to_dict(self) -> dict:
        """Serialize for JSON reporting (the fix text itself is elided)."""
        return dict(
            rule=self.rule,
            severity=self.severity.value,
            line=self.line,
            message=self.message,
            has_fix=self.fix is not None,
        )
38
+
39
+
40
@dataclass
class LintReport:
    """Complete lint report for a document."""
    paper_path: str           # path the content was reported against
    total_issues: int = 0
    auto_fixable: int = 0
    warnings: int = 0
    errors: int = 0
    issues: list[LintIssue] = field(default_factory=list)
    fixed: list[str] = field(default_factory=list)  # rule names actually applied

    def add_issue(self, issue: LintIssue) -> None:
        """Record *issue* and bump the counter matching its severity."""
        self.issues.append(issue)
        self.total_issues += 1

        # Map each severity to the attribute that counts it.
        counter = {
            Severity.AUTO_FIX: "auto_fixable",
            Severity.WARNING: "warnings",
            Severity.ERROR: "errors",
        }.get(issue.severity)
        if counter is not None:
            setattr(self, counter, getattr(self, counter) + 1)

    def to_dict(self) -> dict:
        """Serialize the whole report for JSON output."""
        return {
            "paper_path": self.paper_path,
            "total_issues": self.total_issues,
            "auto_fixable": self.auto_fixable,
            "warnings": self.warnings,
            "errors": self.errors,
            "issues": [issue.to_dict() for issue in self.issues],
            "fixed": self.fixed
        }