pdf-file-renamer 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_file_renamer-0.4.2.dist-info/METADATA +245 -0
- pdf_file_renamer-0.4.2.dist-info/RECORD +26 -0
- pdf_file_renamer-0.4.2.dist-info/WHEEL +5 -0
- pdf_file_renamer-0.4.2.dist-info/entry_points.txt +2 -0
- pdf_file_renamer-0.4.2.dist-info/licenses/LICENSE +21 -0
- pdf_file_renamer-0.4.2.dist-info/top_level.txt +1 -0
- pdf_renamer/__init__.py +3 -0
- pdf_renamer/application/__init__.py +7 -0
- pdf_renamer/application/filename_service.py +70 -0
- pdf_renamer/application/pdf_rename_workflow.py +144 -0
- pdf_renamer/application/rename_service.py +79 -0
- pdf_renamer/domain/__init__.py +25 -0
- pdf_renamer/domain/models.py +80 -0
- pdf_renamer/domain/ports.py +106 -0
- pdf_renamer/infrastructure/__init__.py +5 -0
- pdf_renamer/infrastructure/config.py +94 -0
- pdf_renamer/infrastructure/llm/__init__.py +5 -0
- pdf_renamer/infrastructure/llm/pydantic_ai_provider.py +234 -0
- pdf_renamer/infrastructure/pdf/__init__.py +7 -0
- pdf_renamer/infrastructure/pdf/composite.py +57 -0
- pdf_renamer/infrastructure/pdf/docling_extractor.py +116 -0
- pdf_renamer/infrastructure/pdf/pymupdf_extractor.py +165 -0
- pdf_renamer/main.py +6 -0
- pdf_renamer/presentation/__init__.py +6 -0
- pdf_renamer/presentation/cli.py +233 -0
- pdf_renamer/presentation/formatters.py +216 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
"""Domain models - core business entities."""
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from enum import Enum
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
from pydantic import BaseModel, Field
|
8
|
+
|
9
|
+
|
10
|
+
class ConfidenceLevel(str, Enum):
    """How much trust to place in a suggested filename.

    Members are also ``str`` instances, so each one compares equal to its
    plain lowercase value (e.g. ``ConfidenceLevel.HIGH == "high"``).
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    ERROR = "error"
|
17
|
+
|
18
|
+
|
19
|
+
class FilenameResult(BaseModel):
    """Structured outcome of one filename-generation request.

    Attributes:
        filename: Suggested name, without a file extension.
        confidence: Confidence level attached to the suggestion.
        reasoning: Explanation of why this name was chosen.
    """

    # Store enum fields as their underlying values rather than enum members.
    model_config = {"use_enum_values": True}

    filename: str = Field(description="Suggested filename without extension")
    confidence: ConfidenceLevel = Field(description="Confidence level of the suggestion")
    reasoning: str = Field(description="Explanation of why this filename was chosen")
|
27
|
+
|
28
|
+
|
29
|
+
@dataclass(frozen=True)
|
30
|
+
class PDFMetadata:
|
31
|
+
"""Metadata extracted from PDF."""
|
32
|
+
|
33
|
+
title: str | None = None
|
34
|
+
author: str | None = None
|
35
|
+
subject: str | None = None
|
36
|
+
keywords: str | None = None
|
37
|
+
creator: str | None = None
|
38
|
+
producer: str | None = None
|
39
|
+
creation_date: str | None = None
|
40
|
+
modification_date: str | None = None
|
41
|
+
# Focused metadata extracted from document content
|
42
|
+
header_text: str | None = None
|
43
|
+
year_hints: list[str] | None = None
|
44
|
+
email_hints: list[str] | None = None
|
45
|
+
author_hints: list[str] | None = None
|
46
|
+
|
47
|
+
def to_dict(self) -> dict[str, str | list[str] | None]:
|
48
|
+
"""Convert to dictionary, excluding None values."""
|
49
|
+
return {k: v for k, v in self.__dict__.items() if v is not None}
|
50
|
+
|
51
|
+
|
52
|
+
@dataclass(frozen=True)
class PDFContent:
    """Immutable result of extracting a PDF.

    Attributes:
        text: Extracted text.
        metadata: Metadata gathered alongside the text.
        page_count: Page count reported by the extractor.
    """

    text: str
    metadata: PDFMetadata
    page_count: int
|
59
|
+
|
60
|
+
|
61
|
+
@dataclass
class FileRenameOperation:
    """A proposed rename for a single PDF file.

    Attributes:
        original_path: Current location of the file.
        suggested_filename: Proposed name, without extension.
        confidence: Confidence level attached to the suggestion.
        reasoning: Explanation that accompanies the suggestion.
        text_excerpt: Text excerpt associated with the file.
        metadata: Metadata associated with the file.
    """

    original_path: Path
    suggested_filename: str
    confidence: ConfidenceLevel
    reasoning: str
    text_excerpt: str
    metadata: PDFMetadata

    @property
    def new_filename(self) -> str:
        """Suggested filename with the ``.pdf`` extension appended."""
        return f"{self.suggested_filename}.pdf"

    def create_new_path(self, output_dir: Path | None = None) -> Path:
        """Build the destination path for the renamed file.

        Args:
            output_dir: Directory to place the file in; when omitted, the
                directory of ``original_path`` is used.

        Returns:
            The target directory joined with ``new_filename``.
        """
        if output_dir:
            return output_dir / self.new_filename
        return self.original_path.parent / self.new_filename
|
@@ -0,0 +1,106 @@
|
|
1
|
+
"""Domain ports - interfaces for external dependencies (Dependency Inversion Principle)."""
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
from pdf_renamer.domain.models import FilenameResult, PDFContent
|
7
|
+
|
8
|
+
|
9
|
+
class PDFExtractor(ABC):
    """Port: anything that can turn a PDF file into ``PDFContent``."""

    @abstractmethod
    async def extract(self, pdf_path: Path) -> PDFContent:
        """Extract text and metadata from a PDF file.

        Args:
            pdf_path: Location of the PDF to read.

        Returns:
            The extracted text and metadata as a ``PDFContent``.

        Raises:
            RuntimeError: If extraction fails.
        """
|
27
|
+
|
28
|
+
|
29
|
+
class LLMProvider(ABC):
    """Port: a language-model backend that proposes filenames."""

    @abstractmethod
    async def generate_filename(
        self,
        original_filename: str,
        text_excerpt: str,
        metadata_dict: dict[str, str | list[str] | None],
    ) -> FilenameResult:
        """Ask the LLM for a filename suggestion.

        Args:
            original_filename: Name the file currently has.
            text_excerpt: Text extracted from the PDF.
            metadata_dict: PDF metadata as a dictionary.

        Returns:
            A ``FilenameResult`` carrying the suggestion and its confidence.

        Raises:
            RuntimeError: If generation fails.
        """
|
54
|
+
|
55
|
+
|
56
|
+
class FilenameGenerator(ABC):
    """Port: service that produces and cleans up filename suggestions."""

    @abstractmethod
    async def generate(self, original_filename: str, content: PDFContent) -> FilenameResult:
        """Suggest a filename from extracted PDF content.

        Args:
            original_filename: Name the file currently has.
            content: Content extracted from the PDF.

        Returns:
            A ``FilenameResult`` holding the suggestion.
        """

    @abstractmethod
    def sanitize(self, filename: str) -> str:
        """Make a raw filename safe for the filesystem.

        Args:
            filename: Unsanitized filename.

        Returns:
            The sanitized filename.
        """
|
85
|
+
|
86
|
+
|
87
|
+
class FileRenamer(ABC):
    """Port: performs (or simulates) renaming a file on disk."""

    @abstractmethod
    async def rename(self, original_path: Path, new_path: Path, dry_run: bool = True) -> bool:
        """Rename a file.

        Args:
            original_path: Where the file currently lives.
            new_path: Where the file should end up.
            dry_run: When True, the rename must not actually be carried out.

        Returns:
            True if successful.

        Raises:
            RuntimeError: If the rename fails.
        """
|
@@ -0,0 +1,94 @@
|
|
1
|
+
"""Configuration management using Pydantic Settings."""
|
2
|
+
|
3
|
+
from functools import lru_cache
|
4
|
+
from typing import Literal
|
5
|
+
|
6
|
+
from pydantic import Field
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
8
|
+
|
9
|
+
|
10
|
+
class Settings(BaseSettings):
    """Application configuration sourced from the environment.

    Values are read from environment variables and from a UTF-8 ``.env``
    file; names are matched case-insensitively and unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ---- LLM ----
    llm_provider: Literal["openai"] = Field(default="openai", description="LLM provider to use")
    llm_model: str = Field(default="llama3.2", description="Model name to use")
    llm_base_url: str = Field(
        default="http://localhost:11434/v1",
        description="Base URL for OpenAI-compatible API",
    )
    openai_api_key: str | None = Field(
        default=None,
        description="OpenAI API key (optional for local models)",
    )

    # ---- PDF extraction ----
    pdf_max_pages: int = Field(
        default=5, ge=1, le=50, description="Maximum pages to extract from PDF"
    )
    pdf_max_chars: int = Field(
        default=8000, ge=1000, le=50000, description="Maximum characters to extract from PDF"
    )
    pdf_extractor: Literal["docling", "pymupdf"] = Field(
        default="docling",
        description="PDF extractor to use (docling for better structure, pymupdf for speed)",
    )

    # ---- Concurrency limits ----
    max_concurrent_api: int = Field(
        default=3, ge=1, le=20, description="Maximum concurrent API calls"
    )
    max_concurrent_pdf: int = Field(
        default=10, ge=1, le=50, description="Maximum concurrent PDF extractions"
    )

    # ---- Retry / backoff ----
    retry_max_attempts: int = Field(
        default=3, ge=1, le=10, description="Maximum retry attempts for API calls"
    )
    retry_min_wait: int = Field(
        default=4,
        ge=1,
        le=60,
        description="Minimum wait time for exponential backoff (seconds)",
    )
    retry_max_wait: int = Field(
        default=30,
        ge=1,
        le=300,
        description="Maximum wait time for exponential backoff (seconds)",
    )
|
89
|
+
|
90
|
+
|
91
|
+
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call constructs ``Settings()``; ``lru_cache`` then hands the
    same object back on every later call.
    """
    settings = Settings()
    return settings
|
@@ -0,0 +1,234 @@
|
|
1
|
+
"""LLM provider using Pydantic AI for structured output generation."""
|
2
|
+
|
3
|
+
from openai import (
|
4
|
+
APIConnectionError,
|
5
|
+
APIError,
|
6
|
+
APITimeoutError,
|
7
|
+
AsyncOpenAI,
|
8
|
+
RateLimitError,
|
9
|
+
)
|
10
|
+
from pydantic_ai import Agent
|
11
|
+
from pydantic_ai.models.openai import OpenAIModel
|
12
|
+
from pydantic_ai.providers.openai import OpenAIProvider
|
13
|
+
from tenacity import (
|
14
|
+
retry,
|
15
|
+
retry_if_exception_type,
|
16
|
+
stop_after_attempt,
|
17
|
+
wait_exponential,
|
18
|
+
)
|
19
|
+
|
20
|
+
from pdf_renamer.domain.models import ConfidenceLevel, FilenameResult
|
21
|
+
from pdf_renamer.domain.ports import LLMProvider
|
22
|
+
|
23
|
+
# System prompt handed to the agent for every filename-generation request.
# NOTE(review): leading whitespace inside the prompt (e.g. bullet indentation)
# is reproduced as it appears in this view — confirm against the upstream file.
FILENAME_GENERATION_PROMPT = """You are an expert at creating concise, descriptive filenames for academic papers and technical documents.

Your task is to analyze PDF content and suggest a clear, descriptive filename that accurately captures the document's identity.

CRITICAL: PDF metadata (title, author, subject) is often UNRELIABLE or MISSING. Always prioritize what you find in the actual document text over metadata fields.

Filename Format: Author-Topic-Year
Example: Smith-Neural-Networks-Deep-Learning-2020

EXTRACTION STRATEGY:
1. AUTHOR: Look for author names in these locations (in order of reliability):
- First page header/title area
- After the title (often in smaller font or with affiliations)
- Paper byline (e.g., "by John Smith" or "Authors: Smith et al.")
- Email addresses can help confirm author names
- If multiple authors, use ONLY the first author's last name
- IGNORE metadata author field if it conflicts with document text

2. TOPIC/TITLE: Look for the main title in:
- Large text at top of first page (usually biggest font)
- Abstract section which often restates the title
- Running headers on subsequent pages
- Condense long titles to key terms (3-6 words)
- Remove generic words like "A Study of", "An Analysis of", "Introduction to"
- Keep domain-specific terminology intact

3. YEAR: Look for publication year in:
- Copyright notice or footer on first page
- Date near title or author information
- Conference/journal citation info
- Page headers/footers
- ONLY include year if you find it clearly stated
- Do NOT guess or estimate years

EXAMPLES OF GOOD FILENAMES:
- Hinton-Deep-Learning-Review-2015
- Vapnik-Support-Vector-Networks-1995
- Goodfellow-Generative-Adversarial-Networks-2014
- Hochreiter-Long-Short-Term-Memory-1997

FORMATTING RULES:
- Use hyphens between ALL words (no spaces or underscores)
- Use title case for all words
- Remove special characters: colons, quotes, commas, parentheses
- Target 60-100 characters total (can be shorter or slightly longer if needed)
- If title is very long, focus on the most distinctive/searchable terms

CONFIDENCE LEVELS:
- HIGH: You found author (first page), clear title, and year in the document text
- MEDIUM: You found title and either author OR year, or title is very clear but other elements missing
- LOW: Document text is unclear, heavily formatted, or you can only extract partial information

IMPORTANT: When metadata contradicts document text, TRUST THE DOCUMENT TEXT. Explain your reasoning briefly."""
|
77
|
+
|
78
|
+
|
79
|
+
class PydanticAIProvider(LLMProvider):
    """LLM provider using Pydantic AI with structured outputs."""

    def __init__(
        self,
        model_name: str,
        api_key: str | None = None,
        base_url: str | None = None,
        retry_max_attempts: int = 3,
        retry_min_wait: int = 4,
        retry_max_wait: int = 30,
    ) -> None:
        """
        Initialize the Pydantic AI provider.

        Args:
            model_name: Model name to use
            api_key: API key (optional for local models)
            base_url: Base URL for OpenAI-compatible API
            retry_max_attempts: Maximum attempts per generate_filename call
            retry_min_wait: Minimum wait time between retries (seconds)
            retry_max_wait: Maximum wait time between retries (seconds)
        """
        self.model_name = model_name
        self.retry_max_attempts = retry_max_attempts
        self.retry_min_wait = retry_min_wait
        self.retry_max_wait = retry_max_wait

        # Pick the client configuration from what the caller supplied.
        if base_url:
            # A placeholder key is passed when none is given, so a custom
            # endpoint can be used without configuring a real key.
            client = AsyncOpenAI(base_url=base_url, api_key=api_key or "dummy-key")
            model = OpenAIModel(model_name, provider=OpenAIProvider(openai_client=client))
        elif api_key:
            client = AsyncOpenAI(api_key=api_key)
            model = OpenAIModel(model_name, provider=OpenAIProvider(openai_client=client))
        else:
            # No explicit client: OpenAIModel is left to its own defaults.
            model = OpenAIModel(model_name)

        # Agent that returns a FilenameResult as structured output.
        self.agent: Agent[None, FilenameResult] = Agent(
            model=model,
            output_type=FilenameResult,
            system_prompt=FILENAME_GENERATION_PROMPT,
        )

    async def generate_filename(
        self,
        original_filename: str,
        text_excerpt: str,
        metadata_dict: dict[str, str | list[str] | None],
    ) -> FilenameResult:
        """
        Generate filename using LLM with retry logic.

        The retry policy is built per call from this instance's
        ``retry_max_attempts`` / ``retry_min_wait`` / ``retry_max_wait``.
        Errors are converted to ``RuntimeError`` only *after* the retry
        wrapper has given up, so retryable API errors are actually visible
        to the retry predicate.

        Args:
            original_filename: Current filename
            text_excerpt: Extracted text from PDF
            metadata_dict: PDF metadata dictionary

        Returns:
            FilenameResult with suggestion

        Raises:
            RuntimeError: If generation fails after retries
        """
        # NOTE(review): depending on the pydantic-ai version, HTTP status
        # errors may surface as pydantic-ai exception types instead of the
        # openai ones listed here — confirm and extend the tuple if needed.
        generate_with_retry = retry(
            stop=stop_after_attempt(self.retry_max_attempts),
            wait=wait_exponential(
                multiplier=2, min=self.retry_min_wait, max=self.retry_max_wait
            ),
            retry=retry_if_exception_type(
                (APIError, APIConnectionError, RateLimitError, APITimeoutError)
            ),
            reraise=True,
        )(self._generate_once)

        try:
            return await generate_with_retry(original_filename, text_excerpt, metadata_dict)
        except Exception as e:
            msg = f"Failed to generate filename: {e}"
            raise RuntimeError(msg) from e

    async def _generate_once(
        self,
        original_filename: str,
        text_excerpt: str,
        metadata_dict: dict[str, str | list[str] | None],
    ) -> FilenameResult:
        """Run one generation attempt, with a focused second pass on low confidence."""
        context = self._build_context(original_filename, text_excerpt, metadata_dict)

        result = await self.agent.run(context)
        suggestion = result.output

        # If confidence is low, try a focused second pass
        if suggestion.confidence == ConfidenceLevel.LOW:
            suggestion = await self._retry_with_focus(original_filename, text_excerpt)

        return suggestion

    @staticmethod
    def _build_context(
        original_filename: str,
        text_excerpt: str,
        metadata_dict: dict[str, str | list[str] | None],
    ) -> str:
        """Assemble the user prompt from the filename, metadata hints and text."""
        context_parts = [f"Original filename: {original_filename}"]

        # Embedded PDF metadata, each flagged to the model as possibly unreliable.
        if title := metadata_dict.get("title"):
            context_parts.append(f"PDF Title metadata (may be unreliable): {title}")
        if author := metadata_dict.get("author"):
            context_parts.append(f"PDF Author metadata (may be unreliable): {author}")
        if subject := metadata_dict.get("subject"):
            context_parts.append(f"PDF Subject metadata (may be unreliable): {subject}")

        # Focused hints; list-valued entries are only used when they really are lists.
        year_hints = metadata_dict.get("year_hints")
        if year_hints and isinstance(year_hints, list):
            context_parts.append(f"Years found in document: {', '.join(year_hints)}")

        email_hints = metadata_dict.get("email_hints")
        if email_hints and isinstance(email_hints, list):
            # Only the first two addresses are included.
            context_parts.append(
                f"Email addresses found (often near authors): {', '.join(email_hints[:2])}"
            )

        author_hints = metadata_dict.get("author_hints")
        if author_hints and isinstance(author_hints, list):
            # Only the first two candidate sections are included.
            context_parts.append("Possible author sections:\n" + "\n".join(author_hints[:2]))
        if header_text := metadata_dict.get("header_text"):
            context_parts.append(f"First 500 chars (likely title/author area):\n{header_text}")

        # The full excerpt always goes last.
        context_parts.append(f"\nFull content excerpt (first ~5 pages):\n{text_excerpt}")

        return "\n".join(context_parts)

    async def _retry_with_focus(self, original_filename: str, text_excerpt: str) -> FilenameResult:
        """
        Retry filename generation with more focused prompting.

        Only the first 4000 characters of the excerpt are sent, and no
        metadata hints are included in this pass.

        Args:
            original_filename: Current filename
            text_excerpt: Extracted text from PDF

        Returns:
            FilenameResult from second pass
        """
        # Focus on first portion of text
        first_pages = text_excerpt[:4000]

        focused_context = f"""SECOND PASS - The initial analysis had low confidence. Please analyze more carefully.

Original filename: {original_filename}

FOCUS ON: The first few pages contain the most important metadata (title, author, year).
Look VERY carefully at:
1. The largest text on page 1 (this is usually the title)
2. Text immediately after the title (usually authors and affiliations)
3. Any dates, copyright notices, or publication info on page 1
4. Headers and footers that might contain publication info

First pages content:
{first_pages}

Please extract whatever information you can find with certainty. If you cannot find author or year, that's OK - just provide the best title you can determine."""

        result = await self.agent.run(focused_context)
        return result.output
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""PDF extraction implementations."""
|
2
|
+
|
3
|
+
from pdf_renamer.infrastructure.pdf.composite import CompositePDFExtractor
|
4
|
+
from pdf_renamer.infrastructure.pdf.docling_extractor import DoclingPDFExtractor
|
5
|
+
from pdf_renamer.infrastructure.pdf.pymupdf_extractor import PyMuPDFExtractor
|
6
|
+
|
7
|
+
__all__ = ["CompositePDFExtractor", "DoclingPDFExtractor", "PyMuPDFExtractor"]
|
@@ -0,0 +1,57 @@
|
|
1
|
+
"""Composite PDF extractor that tries multiple strategies."""
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
from pdf_renamer.domain.models import PDFContent
|
6
|
+
from pdf_renamer.domain.ports import PDFExtractor
|
7
|
+
|
8
|
+
|
9
|
+
class CompositePDFExtractor(PDFExtractor):
    """
    Composite PDF extractor that tries multiple extractors in sequence.

    Each extractor is tried in order; the first one that returns more than
    ``min_text_length`` characters of stripped text wins. Failures and
    too-short results are collected and reported together if none succeeds.
    """

    def __init__(self, extractors: list[PDFExtractor], min_text_length: int = 100) -> None:
        """
        Initialize the composite extractor.

        Args:
            extractors: List of extractors to try in order
            min_text_length: A result is accepted only when its stripped text
                is strictly longer than this many characters

        Raises:
            ValueError: If no extractors are provided
        """
        if not extractors:
            msg = "At least one extractor must be provided"
            raise ValueError(msg)
        # Copy so later mutation of the caller's list cannot alter the chain.
        self.extractors = list(extractors)
        self.min_text_length = min_text_length

    async def extract(self, pdf_path: Path) -> PDFContent:
        """
        Try extractors in sequence until one succeeds.

        Args:
            pdf_path: Path to PDF file

        Returns:
            PDFContent from first successful extractor

        Raises:
            RuntimeError: If all extractors fail; the most recent underlying
                exception (if any) is attached as the cause
        """
        errors: list[str] = []
        last_error: Exception | None = None

        for extractor in self.extractors:
            extractor_name = extractor.__class__.__name__
            try:
                content = await extractor.extract(pdf_path)
                # Only accept if we got meaningful text
                if len(content.text.strip()) > self.min_text_length:
                    return content
                errors.append(f"{extractor_name}: Insufficient text extracted")
            except Exception as e:
                # Record the failure and fall through to the next extractor.
                errors.append(f"{extractor_name}: {e}")
                last_error = e

        # All extractors failed
        error_msg = "; ".join(errors)
        msg = f"All PDF extractors failed for {pdf_path}: {error_msg}"
        raise RuntimeError(msg) from last_error
|