ebk 0.1.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk has been flagged as potentially problematic; see the package registry's advisory page for details.
- ebk/__init__.py +35 -0
- ebk/cli.py +1724 -664
- ebk/config.py +260 -22
- ebk/decorators.py +132 -0
- ebk/extract_metadata.py +76 -7
- ebk/library_db.py +744 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +444 -0
- ebk/plugins/registry.py +500 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +1633 -0
- ebk-0.3.1.dist-info/METADATA +755 -0
- ebk-0.3.1.dist-info/RECORD +19 -0
- {ebk-0.1.0.dist-info → ebk-0.3.1.dist-info}/WHEEL +1 -1
- ebk-0.3.1.dist-info/entry_points.txt +6 -0
- ebk-0.3.1.dist-info/licenses/LICENSE +21 -0
- ebk-0.3.1.dist-info/top_level.txt +2 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/hugo.py +0 -55
- ebk/exports/zip.py +0 -25
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +0 -144
- ebk/imports/ebooks.py +0 -116
- ebk/llm.py +0 -58
- ebk/manager.py +0 -44
- ebk/merge.py +0 -308
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +0 -185
- ebk/streamlit/display.py +0 -168
- ebk/streamlit/filters.py +0 -151
- ebk/streamlit/utils.py +0 -58
- ebk/utils.py +0 -311
- ebk-0.1.0.dist-info/METADATA +0 -457
- ebk-0.1.0.dist-info/RECORD +0 -29
- ebk-0.1.0.dist-info/entry_points.txt +0 -2
- ebk-0.1.0.dist-info/top_level.txt +0 -1
ebk/config.py
CHANGED
|
@@ -1,35 +1,273 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration management for EBK.
|
|
3
|
+
|
|
4
|
+
Handles loading and saving user configuration from:
|
|
5
|
+
- XDG config directory: ~/.config/ebk/config.json
|
|
6
|
+
- Fallback: ~/.ebk/config.json
|
|
7
|
+
- Legacy: ~/.ebkrc (for backward compatibility)
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
import configparser
|
|
11
|
+
import json
|
|
2
12
|
import os
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, Any, Optional
|
|
15
|
+
from dataclasses import dataclass, asdict, field
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class LLMConfig:
    """LLM provider configuration.

    Defaults target a local Ollama instance; ``api_key`` is only needed
    for hosted providers that require authentication.
    """
    # Provider backend name (default is a local Ollama server).
    provider: str = "ollama"
    # Model identifier passed to the provider.
    model: str = "llama3.2"
    # Hostname of the LLM server.
    host: str = "localhost"
    # TCP port of the LLM server.
    port: int = 11434
    # API key for providers that require one; None for local providers.
    api_key: Optional[str] = None
    # Sampling temperature.
    temperature: float = 0.7
    # Generation cap; None lets the provider choose its default.
    max_tokens: Optional[int] = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class ServerConfig:
    """Web server configuration."""
    # Bind address; 0.0.0.0 listens on all interfaces.
    host: str = "0.0.0.0"
    # TCP port to serve on.
    port: int = 8000
    # Whether to auto-open a browser when serving (presumably at startup
    # — confirm against the server entry point).
    auto_open_browser: bool = False
    # Items per page for server-side listings.
    page_size: int = 50
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class CLIConfig:
    """CLI default options."""
    # Verbose output by default.
    verbose: bool = False
    # Colored terminal output.
    color: bool = True
    # Items per page for CLI listings.
    page_size: int = 50
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class LibraryConfig:
    """Library-related settings."""
    # Default library path; None means no library configured yet.
    default_path: Optional[str] = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class EBKConfig:
    """Main EBK configuration, grouping all sub-configurations."""
    llm: LLMConfig = field(default_factory=LLMConfig)
    server: ServerConfig = field(default_factory=ServerConfig)
    cli: CLIConfig = field(default_factory=CLIConfig)
    library: LibraryConfig = field(default_factory=LibraryConfig)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary (e.g. for JSON serialization)."""
        return {
            "llm": asdict(self.llm),
            "server": asdict(self.server),
            "cli": asdict(self.cli),
            "library": asdict(self.library),
        }

    @staticmethod
    def _build_section(section_cls, data: Dict[str, Any]):
        """Instantiate *section_cls* from *data*, ignoring unknown keys.

        A config file written by a newer ebk version may carry extra keys;
        dropping them keeps loading forward-compatible instead of raising
        TypeError from the dataclass constructor.
        """
        known = section_cls.__dataclass_fields__
        return section_cls(**{k: v for k, v in data.items() if k in known})

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'EBKConfig':
        """Create an EBKConfig from a dictionary.

        Missing sections fall back to their defaults; unknown keys inside
        a section are silently ignored (see ``_build_section``).
        """
        return cls(
            llm=cls._build_section(LLMConfig, data.get("llm", {})),
            server=cls._build_section(ServerConfig, data.get("server", {})),
            cli=cls._build_section(CLIConfig, data.get("cli", {})),
            library=cls._build_section(LibraryConfig, data.get("library", {})),
        )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_config_path() -> Path:
    """
    Get configuration file path.

    Follows the XDG Base Directory specification:
    1. $XDG_CONFIG_HOME/ebk/config.json (usually ~/.config/ebk/config.json)
    2. Fallback: ~/.ebk/config.json

    Returns:
        Path to config file (the file itself may not exist yet)
    """
    # Honor an explicit $XDG_CONFIG_HOME first. The previous implementation
    # ignored the environment variable even though the docstring promised
    # XDG compliance, and always used ~/.config.
    xdg_env = os.environ.get("XDG_CONFIG_HOME")
    if xdg_env:
        return Path(xdg_env) / "ebk" / "config.json"

    # Default XDG location when the variable is unset.
    xdg_config_home = Path.home() / ".config"
    if xdg_config_home.exists():
        config_dir = xdg_config_home / "ebk"
    else:
        # Fallback for systems without ~/.config
        config_dir = Path.home() / ".ebk"

    return config_dir / "config.json"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def load_config() -> EBKConfig:
    """
    Load configuration from the config file.

    Returns:
        EBKConfig populated from disk, or a default instance when the
        file is missing or cannot be parsed.
    """
    path = get_config_path()

    if path.exists():
        try:
            with open(path, 'r') as fh:
                return EBKConfig.from_dict(json.load(fh))
        except (json.JSONDecodeError, OSError) as exc:
            # Corrupt or unreadable file: warn and fall through to defaults.
            print(f"Warning: Failed to load config from {path}: {exc}")
            print("Using default configuration")

    # No file (or load failed): defaults.
    return EBKConfig()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def save_config(config: EBKConfig) -> None:
    """
    Persist *config* as pretty-printed JSON to the configuration file.

    Args:
        config: Configuration to save
    """
    target = get_config_path()

    # Make sure the parent directory exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w') as fh:
        json.dump(config.to_dict(), fh, indent=2)

    print(f"Configuration saved to {target}")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def ensure_config_exists() -> Path:
    """
    Ensure the configuration file exists, creating it with defaults if not.

    Returns:
        Path to the config file
    """
    path = get_config_path()

    # Already present: nothing to do.
    if path.exists():
        return path

    # Write a default configuration to disk.
    save_config(EBKConfig())
    print(f"Created default configuration at {path}")
    return path
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def update_config(
    # LLM settings
    llm_provider: Optional[str] = None,
    llm_model: Optional[str] = None,
    llm_host: Optional[str] = None,
    llm_port: Optional[int] = None,
    llm_api_key: Optional[str] = None,
    llm_temperature: Optional[float] = None,
    llm_max_tokens: Optional[int] = None,
    # Server settings
    server_host: Optional[str] = None,
    server_port: Optional[int] = None,
    server_auto_open: Optional[bool] = None,
    server_page_size: Optional[int] = None,
    # CLI settings
    cli_verbose: Optional[bool] = None,
    cli_color: Optional[bool] = None,
    cli_page_size: Optional[int] = None,
    # Library settings
    library_default_path: Optional[str] = None,
) -> None:
    """
    Update configuration.

    Only updates provided values (non-None arguments), leaving others
    unchanged, then writes the merged configuration back to disk.
    """
    config = load_config()

    # (section object, attribute name, requested value); None means
    # "leave as-is". Replaces fifteen near-identical if-statements.
    requested = [
        (config.llm, "provider", llm_provider),
        (config.llm, "model", llm_model),
        (config.llm, "host", llm_host),
        (config.llm, "port", llm_port),
        (config.llm, "api_key", llm_api_key),
        (config.llm, "temperature", llm_temperature),
        (config.llm, "max_tokens", llm_max_tokens),
        (config.server, "host", server_host),
        (config.server, "port", server_port),
        (config.server, "auto_open_browser", server_auto_open),
        (config.server, "page_size", server_page_size),
        (config.cli, "verbose", cli_verbose),
        (config.cli, "color", cli_color),
        (config.cli, "page_size", cli_page_size),
        (config.library, "default_path", library_default_path),
    ]
    for section, attr, value in requested:
        if value is not None:
            setattr(section, attr, value)

    save_config(config)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# Backward compatibility
|
|
236
|
+
# Backward compatibility
def update_llm_config(
    provider: Optional[str] = None,
    model: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    api_key: Optional[str] = None,
    temperature: Optional[float] = None,
    max_tokens: Optional[int] = None
) -> None:
    """Update LLM configuration (legacy shim delegating to update_config)."""
    forwarded = {
        "llm_provider": provider,
        "llm_model": model,
        "llm_host": host,
        "llm_port": port,
        "llm_api_key": api_key,
        "llm_temperature": temperature,
        "llm_max_tokens": max_tokens,
    }
    update_config(**forwarded)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# Legacy support for ~/.ebkrc
|
|
4
258
|
def load_ebkrc_config():
    """
    Loads configuration from ~/.ebkrc (legacy).

    The configuration file can contain various sections for different
    features, e.g. a [streamlit] section for dashboard configuration.
    Returns an (possibly empty) ConfigParser; the file is optional.
    """
    parser = configparser.ConfigParser()
    rc_path = os.path.expanduser("~/.ebkrc")

    # The legacy rc file is optional: an absent file yields an empty parser.
    if os.path.exists(rc_path):
        parser.read(rc_path)

    return parser
|
ebk/decorators.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Decorators for ebk functionality."""
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Any
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def handle_library_errors(func: Callable) -> Callable:
    """
    Wrap a CLI command so common library errors become clean exits.

    Centralizes handling for:
    - FileNotFoundError: library or file missing
    - PermissionError: no access to files
    - ValueError: invalid data or arguments
    - KeyboardInterrupt: user abort (exit code 130)
    - anything else: logged, reported, exit code 1
    """
    @functools.wraps(func)
    def wrapped(*args, **kwargs) -> Any:
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            # Conventional 128 + SIGINT(2) exit status.
            console.print("\n[yellow]Operation cancelled by user[/yellow]")
            raise typer.Exit(code=130)
        except FileNotFoundError as err:
            console.print(f"[bold red]Error:[/bold red] Library or file not found: {err}")
            raise typer.Exit(code=1)
        except PermissionError as err:
            console.print(f"[bold red]Error:[/bold red] Permission denied: {err}")
            console.print("[yellow]Tip: Check file permissions or run with appropriate privileges[/yellow]")
            raise typer.Exit(code=1)
        except ValueError as err:
            console.print(f"[bold red]Error:[/bold red] Invalid input: {err}")
            raise typer.Exit(code=1)
        except Exception as err:
            # Unexpected: full traceback goes to the log, short message to user.
            logger.error(f"Unexpected error in {func.__name__}: {err}", exc_info=True)
            console.print(f"[bold red]Unexpected error:[/bold red] {err}")
            console.print("[dim]See log file for details[/dim]")
            raise typer.Exit(code=1)

    return wrapped
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def validate_path(path_type: str = "directory") -> Callable:
    """
    Decorator to validate and sanitize file paths for security.

    Args:
        path_type: Either "directory" or "file"

    Prevents:
    - Path traversal attacks
    - Access to system directories
    - Symbolic link attacks

    Validation failures print a security error and exit with code 1.
    Only the FIRST positional argument is checked; keyword-only path
    arguments pass through unvalidated.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            # Find path arguments (usually first positional arg)
            if args:
                # NOTE(review): resolve() also follows symlinks, which is what
                # makes the later checks apply to the real target.
                path = Path(args[0]).resolve()

                # Security checks
                try:
                    # Ensure path is within current directory or explicitly allowed
                    cwd = Path.cwd()
                    home = Path.home()

                    # Check if path is trying to escape to system directories
                    # NOTE(review): on POSIX every absolute resolved path has
                    # parts[0] == '/', so this rejects anything outside cwd/home.
                    if path.parts[0] in ('/', '\\') and not (
                        path.is_relative_to(cwd) or
                        path.is_relative_to(home)
                    ):
                        raise ValueError(f"Access to system path not allowed: {path}")

                    # Check for suspicious patterns
                    # NOTE(review): after resolve() a literal '../' cannot remain,
                    # so several of these patterns are belt-and-braces only.
                    suspicious_patterns = ['../', '...', '~/', '/etc/', '/usr/', '/bin/', '/sys/']
                    path_str = str(path)
                    for pattern in suspicious_patterns:
                        if pattern in path_str and not path.is_relative_to(home):
                            raise ValueError(f"Suspicious path pattern detected: {pattern}")

                    # Validate based on type
                    # Existing paths must match the declared kind; nonexistent
                    # paths are allowed through (e.g. a directory to be created).
                    if path_type == "directory":
                        if path.exists() and not path.is_dir():
                            raise ValueError(f"Path exists but is not a directory: {path}")
                    elif path_type == "file":
                        if path.exists() and not path.is_file():
                            raise ValueError(f"Path exists but is not a file: {path}")

                    # Replace the path with the resolved, safe version
                    args = (str(path),) + args[1:]

                except ValueError as e:
                    console.print(f"[bold red]Security Error:[/bold red] {e}")
                    raise typer.Exit(code=1)

            return func(*args, **kwargs)

        return wrapper
    return decorator
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def require_confirmation(message: str = "Are you sure you want to continue?") -> Callable:
    """
    Decorator to require user confirmation for destructive operations.

    A truthy ``yes`` keyword argument (the usual --yes flag) skips the
    prompt; declining the prompt exits with code 0.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def guarded(*args, **kwargs) -> Any:
            # Prompt only when the --yes flag was not supplied.
            if not kwargs.get('yes', False):
                console.print(f"[yellow]⚠️  {message}[/yellow]")
                if not typer.confirm("Continue?"):
                    console.print("[red]Operation cancelled[/red]")
                    raise typer.Exit(code=0)

            return func(*args, **kwargs)

        return guarded
    return decorator
|
ebk/extract_metadata.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import xmltodict
|
|
3
3
|
from typing import Dict, Optional
|
|
4
4
|
from slugify import slugify
|
|
5
|
-
import
|
|
5
|
+
import pypdf
|
|
6
6
|
from ebooklib import epub
|
|
7
7
|
|
|
8
8
|
def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
@@ -32,12 +32,17 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
|
32
32
|
simplified = {
|
|
33
33
|
"title": metadata.get("dc:title", metadata.get("title")),
|
|
34
34
|
"creators": None,
|
|
35
|
+
"contributors": None,
|
|
35
36
|
"subjects": None,
|
|
36
37
|
"description": metadata.get("dc:description", metadata.get("description")),
|
|
37
38
|
"language": metadata.get("dc:language", metadata.get("language")),
|
|
38
39
|
"date": metadata.get("dc:date", metadata.get("date")),
|
|
39
40
|
"publisher": metadata.get("dc:publisher", metadata.get("publisher")),
|
|
40
|
-
"identifiers": None
|
|
41
|
+
"identifiers": None,
|
|
42
|
+
"rights": metadata.get("dc:rights", metadata.get("rights")),
|
|
43
|
+
"source": metadata.get("dc:source", metadata.get("source")),
|
|
44
|
+
"series": None,
|
|
45
|
+
"series_index": None
|
|
41
46
|
}
|
|
42
47
|
|
|
43
48
|
# -- Creators
|
|
@@ -75,12 +80,64 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
|
75
80
|
text = identifiers.get("#text", "").strip()
|
|
76
81
|
simplified["identifiers"][scheme] = text
|
|
77
82
|
|
|
83
|
+
# -- Contributors (editors, translators, etc)
|
|
84
|
+
contributors_raw = metadata.get("dc:contributor", metadata.get("contributor"))
|
|
85
|
+
if contributors_raw:
|
|
86
|
+
simplified["contributors"] = []
|
|
87
|
+
if isinstance(contributors_raw, list):
|
|
88
|
+
for contrib in contributors_raw:
|
|
89
|
+
if isinstance(contrib, dict):
|
|
90
|
+
name = contrib.get("#text", "").strip()
|
|
91
|
+
role = contrib.get("@opf:role", "contributor")
|
|
92
|
+
file_as = contrib.get("@opf:file-as", "")
|
|
93
|
+
if name:
|
|
94
|
+
simplified["contributors"].append({
|
|
95
|
+
"name": name,
|
|
96
|
+
"role": role,
|
|
97
|
+
"file_as": file_as
|
|
98
|
+
})
|
|
99
|
+
elif isinstance(contrib, str):
|
|
100
|
+
simplified["contributors"].append({
|
|
101
|
+
"name": contrib.strip(),
|
|
102
|
+
"role": "contributor",
|
|
103
|
+
"file_as": ""
|
|
104
|
+
})
|
|
105
|
+
elif isinstance(contributors_raw, dict):
|
|
106
|
+
name = contributors_raw.get("#text", "").strip()
|
|
107
|
+
role = contributors_raw.get("@opf:role", "contributor")
|
|
108
|
+
file_as = contributors_raw.get("@opf:file-as", "")
|
|
109
|
+
if name:
|
|
110
|
+
simplified["contributors"] = [{
|
|
111
|
+
"name": name,
|
|
112
|
+
"role": role,
|
|
113
|
+
"file_as": file_as
|
|
114
|
+
}]
|
|
115
|
+
|
|
116
|
+
# -- Calibre-specific metadata (series, etc)
|
|
117
|
+
# Look for meta tags with name attributes
|
|
118
|
+
meta_tags = metadata.get("meta", [])
|
|
119
|
+
if not isinstance(meta_tags, list):
|
|
120
|
+
meta_tags = [meta_tags] if meta_tags else []
|
|
121
|
+
|
|
122
|
+
for meta in meta_tags:
|
|
123
|
+
if isinstance(meta, dict):
|
|
124
|
+
meta_name = meta.get("@name", "")
|
|
125
|
+
meta_content = meta.get("@content", "")
|
|
126
|
+
|
|
127
|
+
if meta_name == "calibre:series" and meta_content:
|
|
128
|
+
simplified["series"] = meta_content
|
|
129
|
+
elif meta_name == "calibre:series_index" and meta_content:
|
|
130
|
+
try:
|
|
131
|
+
simplified["series_index"] = float(meta_content)
|
|
132
|
+
except (ValueError, TypeError):
|
|
133
|
+
pass
|
|
134
|
+
|
|
78
135
|
return simplified
|
|
79
136
|
|
|
80
137
|
|
|
81
138
|
def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
82
139
|
"""
|
|
83
|
-
Extract metadata from a PDF file using
|
|
140
|
+
Extract metadata from a PDF file using pypdf.
|
|
84
141
|
Returns a dictionary with the same keys as the OPF-based dict.
|
|
85
142
|
"""
|
|
86
143
|
|
|
@@ -94,20 +151,23 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
|
94
151
|
"publisher": None,
|
|
95
152
|
"identifiers": None,
|
|
96
153
|
"keywords": None,
|
|
154
|
+
"creator_application": None,
|
|
97
155
|
}
|
|
98
156
|
|
|
99
157
|
try:
|
|
100
158
|
with open(pdf_path, "rb") as f:
|
|
101
|
-
reader =
|
|
159
|
+
reader = pypdf.PdfReader(f)
|
|
102
160
|
info = reader.metadata or {}
|
|
103
161
|
|
|
104
|
-
# NOTE: Depending on
|
|
162
|
+
# NOTE: Depending on pypdf version, metadata keys can differ
|
|
105
163
|
# e.g. info.title vs info.get('/Title')
|
|
106
164
|
pdf_title = info.get("/Title", None) or info.get("title", None)
|
|
107
165
|
pdf_author = info.get("/Author", None) or info.get("author", None)
|
|
108
166
|
pdf_subject = info.get("/Subject", None) or info.get("subject", None)
|
|
109
167
|
pdf_keywords = info.get("/Keywords", None) or info.get("keywords", None)
|
|
110
|
-
|
|
168
|
+
pdf_creator = info.get("/Creator", None) or info.get("creator", None) # Application used
|
|
169
|
+
pdf_producer = info.get("/Producer", None) or info.get("producer", None)
|
|
170
|
+
pdf_publisher = info.get("/Publisher", None) or info.get("publisher", None)
|
|
111
171
|
pdf_creation_date = info.get("/CreationDate", None)
|
|
112
172
|
|
|
113
173
|
if pdf_title:
|
|
@@ -130,10 +190,18 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
|
130
190
|
metadata["identifiers"] = {"pdf:identifier": pdf_path}
|
|
131
191
|
|
|
132
192
|
if pdf_keywords:
|
|
133
|
-
metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",")]
|
|
193
|
+
metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",") if kw.strip()]
|
|
194
|
+
|
|
195
|
+
# Creator is the application that created the PDF (e.g., LaTeX, Word)
|
|
196
|
+
if pdf_creator:
|
|
197
|
+
metadata["creator_application"] = pdf_creator.strip()
|
|
134
198
|
|
|
199
|
+
# Publisher: prefer explicit Publisher field, fallback to Producer
|
|
135
200
|
if pdf_publisher:
|
|
136
201
|
metadata["publisher"] = pdf_publisher.strip()
|
|
202
|
+
elif pdf_producer and not pdf_creator:
|
|
203
|
+
# Only use producer as publisher if there's no creator app
|
|
204
|
+
metadata["publisher"] = pdf_producer.strip()
|
|
137
205
|
|
|
138
206
|
metadata["file_paths"] = [pdf_path]
|
|
139
207
|
|
|
@@ -259,6 +327,7 @@ def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
|
|
|
259
327
|
if opf_file and os.path.isfile(opf_file):
|
|
260
328
|
opf_metadata = extract_metadata_from_opf(opf_file)
|
|
261
329
|
|
|
330
|
+
ebook_metadata = {}
|
|
262
331
|
_, ext = os.path.splitext(ebook_file.lower())
|
|
263
332
|
if ext == ".pdf":
|
|
264
333
|
ebook_metadata = extract_metadata_from_pdf(ebook_file)
|