ebk-0.1.0-py3-none-any.whl → ebk-0.3.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of ebk might be problematic.

Files changed (84)
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +443 -0
  4. ebk/ai/llm_providers/__init__.py +21 -0
  5. ebk/ai/llm_providers/base.py +230 -0
  6. ebk/ai/llm_providers/ollama.py +362 -0
  7. ebk/ai/metadata_enrichment.py +396 -0
  8. ebk/ai/question_generator.py +328 -0
  9. ebk/ai/reading_companion.py +224 -0
  10. ebk/ai/semantic_search.py +434 -0
  11. ebk/ai/text_extractor.py +394 -0
  12. ebk/cli.py +2828 -680
  13. ebk/config.py +260 -22
  14. ebk/db/__init__.py +37 -0
  15. ebk/db/migrations.py +180 -0
  16. ebk/db/models.py +526 -0
  17. ebk/db/session.py +144 -0
  18. ebk/decorators.py +132 -0
  19. ebk/exports/base_exporter.py +218 -0
  20. ebk/exports/html_library.py +1390 -0
  21. ebk/exports/html_utils.py +117 -0
  22. ebk/exports/hugo.py +7 -3
  23. ebk/exports/jinja_export.py +287 -0
  24. ebk/exports/multi_facet_export.py +164 -0
  25. ebk/exports/symlink_dag.py +479 -0
  26. ebk/extract_metadata.py +76 -7
  27. ebk/library_db.py +899 -0
  28. ebk/plugins/__init__.py +42 -0
  29. ebk/plugins/base.py +502 -0
  30. ebk/plugins/hooks.py +444 -0
  31. ebk/plugins/registry.py +500 -0
  32. ebk/repl/__init__.py +9 -0
  33. ebk/repl/find.py +126 -0
  34. ebk/repl/grep.py +174 -0
  35. ebk/repl/shell.py +1677 -0
  36. ebk/repl/text_utils.py +320 -0
  37. ebk/search_parser.py +413 -0
  38. ebk/server.py +1633 -0
  39. ebk/services/__init__.py +11 -0
  40. ebk/services/import_service.py +442 -0
  41. ebk/services/tag_service.py +282 -0
  42. ebk/services/text_extraction.py +317 -0
  43. ebk/similarity/__init__.py +77 -0
  44. ebk/similarity/base.py +154 -0
  45. ebk/similarity/core.py +445 -0
  46. ebk/similarity/extractors.py +168 -0
  47. ebk/similarity/metrics.py +376 -0
  48. ebk/vfs/__init__.py +101 -0
  49. ebk/vfs/base.py +301 -0
  50. ebk/vfs/library_vfs.py +124 -0
  51. ebk/vfs/nodes/__init__.py +54 -0
  52. ebk/vfs/nodes/authors.py +196 -0
  53. ebk/vfs/nodes/books.py +480 -0
  54. ebk/vfs/nodes/files.py +155 -0
  55. ebk/vfs/nodes/metadata.py +385 -0
  56. ebk/vfs/nodes/root.py +100 -0
  57. ebk/vfs/nodes/similar.py +165 -0
  58. ebk/vfs/nodes/subjects.py +184 -0
  59. ebk/vfs/nodes/tags.py +371 -0
  60. ebk/vfs/resolver.py +228 -0
  61. ebk-0.3.2.dist-info/METADATA +755 -0
  62. ebk-0.3.2.dist-info/RECORD +69 -0
  63. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
  64. ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
  65. ebk/imports/__init__.py +0 -0
  66. ebk/imports/calibre.py +0 -144
  67. ebk/imports/ebooks.py +0 -116
  68. ebk/llm.py +0 -58
  69. ebk/manager.py +0 -44
  70. ebk/merge.py +0 -308
  71. ebk/streamlit/__init__.py +0 -0
  72. ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
  73. ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
  74. ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
  75. ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
  76. ebk/streamlit/app.py +0 -185
  77. ebk/streamlit/display.py +0 -168
  78. ebk/streamlit/filters.py +0 -151
  79. ebk/streamlit/utils.py +0 -58
  80. ebk/utils.py +0 -311
  81. ebk-0.1.0.dist-info/METADATA +0 -457
  82. ebk-0.1.0.dist-info/RECORD +0 -29
  83. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
  84. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
ebk/exports/symlink_dag.py ADDED
@@ -0,0 +1,479 @@
+"""
+Export library as a navigable directory structure using symlinks to represent tag hierarchies.
+
+This module creates a filesystem view of the library where:
+- Tags are represented as directories in a hierarchy
+- Books appear in all relevant tag directories via symlinks
+- The DAG structure of tags is preserved through the directory tree
+"""
+
+import os
+import json
+import shutil
+from pathlib import Path
+from typing import Dict, List, Set, Optional, Tuple
+import re
+from collections import defaultdict
+
+
+class SymlinkDAGExporter:
+    """Creates a navigable directory structure using symlinks to represent tag hierarchies."""
+
+    def __init__(self):
+        self.tag_separator = "/"  # Separator for hierarchical tags
+        self.books_dir_name = "_books"  # Directory to store actual book files
+
+    def export(self, lib_dir: str, output_dir: str,
+               tag_field: str = "subjects",
+               include_files: bool = False,  # Changed default to False
+               create_index: bool = True,
+               flatten: bool = False,
+               min_books: int = 0):
+        """
+        Export library as symlink-based directory structure.
+
+        Args:
+            lib_dir: Path to the ebk library
+            output_dir: Output directory for the symlink structure
+            tag_field: Field to use for tags (default: "subjects")
+            include_files: Whether to copy actual ebook files (default: False)
+            create_index: Whether to create index.html files in directories
+            flatten: Whether to create direct symlinks to files instead of _books structure
+            min_books: Minimum books per tag folder; smaller folders go to _misc (default: 0)
+        """
+        lib_path = Path(lib_dir)
+        output_path = Path(output_dir)
+
+        # Load metadata
+        metadata_file = lib_path / "metadata.json"
+        with open(metadata_file, "r") as f:
+            entries = json.load(f)
+
+        # Create output directory
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        # Create books directory for actual files (unless flattening)
+        if not flatten:
+            books_path = output_path / self.books_dir_name
+            books_path.mkdir(exist_ok=True)
+
+        # Process each entry
+        entry_paths = {}  # Map entry ID to its path in _books
+        tag_entries = defaultdict(list)  # Map tag to list of entries
+
+        for i, entry in enumerate(entries):
+            entry_id = entry.get("unique_id", f"entry_{i}")
+
+            if not flatten:
+                # Create entry directory in _books
+                entry_dir = books_path / self._sanitize_filename(entry_id)
+                entry_dir.mkdir(exist_ok=True)
+                entry_paths[entry_id] = entry_dir
+
+                # Save metadata
+                with open(entry_dir / "metadata.json", "w") as f:
+                    json.dump(entry, f, indent=2)
+
+                # Handle files - either copy or symlink
+                if include_files:
+                    self._copy_entry_files(entry, lib_path, entry_dir)
+                else:
+                    # Create symlinks to original files
+                    self._symlink_entry_files(entry, lib_path, entry_dir)
+            else:
+                # For flatten mode, store original file paths
+                entry_paths[entry_id] = entry.get("file_paths", [])
+
+            # Create a readable symlink name
+            title = entry.get("title", "Unknown Title")
+            creators = entry.get("creators", [])
+            if creators:
+                readable_name = f"{self._sanitize_filename(title)} - {self._sanitize_filename(creators[0])}"
+            else:
+                readable_name = self._sanitize_filename(title)
+
+            # Store readable name for later use
+            entry["_readable_name"] = readable_name
+            entry["_entry_id"] = entry_id
+
+            # Extract tags and build hierarchy
+            tags = entry.get(tag_field, [])
+            if isinstance(tags, str):
+                tags = [tags]
+
+            for tag in tags:
+                # Add to this tag and all parent tags
+                tag_parts = tag.split(self.tag_separator)
+                for i in range(len(tag_parts)):
+                    parent_tag = self.tag_separator.join(tag_parts[:i+1])
+                    tag_entries[parent_tag].append(entry)
+
+        # Consolidate small tag folders if min_books is set
+        if min_books > 0:
+            tag_entries = self._consolidate_small_tags(tag_entries, min_books)
+
+        # Create tag directory structure with symlinks
+        self._create_tag_structure(output_path, tag_entries, entry_paths, flatten, lib_path)
+
+        # Create root index if requested
+        if create_index:
+            self._create_index_files(output_path, tag_entries, entries)
+
+        # Create a README
+        self._create_readme(output_path, len(entries), len(tag_entries))
+
+    def _consolidate_small_tags(self, tag_entries: Dict[str, List[Dict]],
+                                min_books: int) -> Dict[str, List[Dict]]:
+        """Consolidate tags with fewer than min_books into a _misc folder."""
+        consolidated = defaultdict(list)
+        misc_entries = []
+
+        for tag, entries in tag_entries.items():
+            # Get unique entries for this tag
+            seen_ids = set()
+            unique_entries = []
+            for entry in entries:
+                entry_id = entry.get("_entry_id", entry.get("unique_id"))
+                if entry_id not in seen_ids:
+                    seen_ids.add(entry_id)
+                    unique_entries.append(entry)
+
+            # Check if this tag has enough unique books
+            if len(unique_entries) < min_books:
+                # Check if it's a leaf tag (no children with enough books)
+                tag_prefix = tag + self.tag_separator
+                has_large_children = any(
+                    other_tag.startswith(tag_prefix) and
+                    len(set(e.get("_entry_id", e.get("unique_id")) for e in tag_entries[other_tag])) >= min_books
+                    for other_tag in tag_entries.keys()
+                )
+
+                if not has_large_children:
+                    # Add to misc folder with tag prefix
+                    for entry in unique_entries:
+                        misc_entry = entry.copy()
+                        # Store original tag for display in misc folder
+                        misc_entry["_original_tag"] = tag
+                        misc_entries.append(misc_entry)
+                else:
+                    # Keep it as is because it has large children
+                    consolidated[tag] = entries
+            else:
+                # Keep tags with enough books
+                consolidated[tag] = entries
+
+        # Add misc entries if any
+        if misc_entries:
+            consolidated["_misc"] = misc_entries
+
+        return dict(consolidated)
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Sanitize a string to be safe as a filename."""
+        # Replace problematic characters
+        name = re.sub(r'[<>:"/\\|?*]', '-', str(name))
+        # Remove leading/trailing spaces and dots
+        name = name.strip('. ')
+        # Limit length (being more conservative)
+        if len(name) > 150:
+            name = name[:147] + "..."
+        return name or "unnamed"
+
+    def _copy_entry_files(self, entry: Dict, lib_path: Path, entry_dir: Path):
+        """Copy ebook and cover files for an entry."""
+        # Copy ebook files
+        for file_path in entry.get("file_paths", []):
+            src_file = lib_path / file_path
+            if src_file.exists():
+                dest_file = entry_dir / src_file.name
+                shutil.copy2(src_file, dest_file)
+
+        # Copy cover file
+        cover_path = entry.get("cover_path")
+        if cover_path:
+            src_cover = lib_path / cover_path
+            if src_cover.exists():
+                dest_cover = entry_dir / src_cover.name
+                shutil.copy2(src_cover, dest_cover)
+
+    def _symlink_entry_files(self, entry: Dict, lib_path: Path, entry_dir: Path):
+        """Create symlinks to ebook and cover files for an entry."""
+        # Symlink ebook files
+        for file_path in entry.get("file_paths", []):
+            src_file = lib_path / file_path
+            if src_file.exists():
+                # Get absolute path of source file
+                abs_src = src_file.resolve()
+                dest_link = entry_dir / src_file.name
+
+                # Remove existing symlink if it exists
+                if dest_link.exists() or dest_link.is_symlink():
+                    dest_link.unlink()
+
+                try:
+                    # Create symlink using absolute path
+                    dest_link.symlink_to(abs_src)
+                except OSError as e:
+                    print(f"Warning: Could not create symlink for '{file_path}': {e}")
+
+        # Symlink cover file
+        cover_path = entry.get("cover_path")
+        if cover_path:
+            src_cover = lib_path / cover_path
+            if src_cover.exists():
+                # Get absolute path of source cover
+                abs_cover = src_cover.resolve()
+                dest_link = entry_dir / src_cover.name
+
+                if dest_link.exists() or dest_link.is_symlink():
+                    dest_link.unlink()
+
+                try:
+                    # Create symlink using absolute path
+                    dest_link.symlink_to(abs_cover)
+                except OSError as e:
+                    print(f"Warning: Could not create symlink for cover '{cover_path}': {e}")
+
+    def _create_tag_structure(self, output_path: Path,
+                              tag_entries: Dict[str, List[Dict]],
+                              entry_paths: Dict[str, Path],
+                              flatten: bool = False,
+                              lib_path: Path = None):
+        """Create the hierarchical tag directory structure with symlinks."""
+        # Sort tags to ensure parents are created before children
+        sorted_tags = sorted(tag_entries.keys())
+
+        for tag in sorted_tags:
+            # Create tag directory path
+            tag_parts = tag.split(self.tag_separator)
+            tag_dir = output_path
+            for part in tag_parts:
+                tag_dir = tag_dir / self._sanitize_filename(part)
+            tag_dir.mkdir(parents=True, exist_ok=True)
+
+            # Get unique entries for this tag (avoid duplicates)
+            seen_ids = set()
+            unique_entries = []
+            for entry in tag_entries[tag]:
+                entry_id = entry["_entry_id"]
+                if entry_id not in seen_ids:
+                    seen_ids.add(entry_id)
+                    unique_entries.append(entry)
+
+            # Create symlinks to entries
+            for entry in unique_entries:
+                entry_id = entry["_entry_id"]
+                readable_name = entry["_readable_name"]
+
+                # For _misc folder, include original tag in the name
+                if tag == "_misc" and "_original_tag" in entry:
+                    original_tag = entry["_original_tag"]
+                    # Shorten the tag to avoid filesystem limits
+                    tag_parts = original_tag.split(self.tag_separator)
+                    if len(tag_parts) > 2:
+                        # Use only the last two parts of hierarchical tags
+                        short_tag = self.tag_separator.join(tag_parts[-2:])
+                    else:
+                        short_tag = original_tag
+
+                    # Further limit tag length
+                    if len(short_tag) > 50:
+                        short_tag = short_tag[:47] + "..."
+
+                    tag_prefix = f"[{short_tag.replace(self.tag_separator, '-')}] "
+
+                    # Ensure the total name isn't too long
+                    max_name_length = 200  # Safe limit for most filesystems
+                    if len(tag_prefix + readable_name) > max_name_length:
+                        # Truncate the readable name to fit
+                        available_length = max_name_length - len(tag_prefix) - 3
+                        readable_name = readable_name[:available_length] + "..."
+
+                if not flatten:
+                    # Path to actual entry in _books
+                    target_path = Path(*[".."] * len(tag_parts)) / self.books_dir_name / self._sanitize_filename(entry_id)
+                    # Create symlink
+                    symlink_path = tag_dir / readable_name
+                else:
+                    # For flatten mode, create direct symlinks to original files
+                    file_paths = entry_paths.get(entry_id, [])
+                    if file_paths:
+                        # Use the first file path (usually the main ebook file)
+                        original_file = file_paths[0]
+                        # Get absolute path to the original file
+                        abs_file_path = (lib_path / original_file).resolve()
+                        # Use original filename as symlink name
+                        symlink_path = tag_dir / Path(original_file).name
+                        target_path = abs_file_path
+                    else:
+                        continue  # Skip if no files
+
+                # Remove existing symlink if it exists
+                if symlink_path.exists() or symlink_path.is_symlink():
+                    symlink_path.unlink()
+
+                # Create relative symlink
+                try:
+                    symlink_path.symlink_to(target_path)
+                except OSError as e:
+                    # On Windows, creating symlinks might require admin privileges
+                    print(f"Warning: Could not create symlink for '{readable_name}': {e}")
+
+    def _create_index_files(self, output_path: Path,
+                            tag_entries: Dict[str, List[Dict]],
+                            all_entries: List[Dict]):
+        """Create index.html files in each directory for web browsing."""
+        # Create root index with tag counts
+        root_child_tags = {}
+        for tag, entries in tag_entries.items():
+            if self.tag_separator not in tag:  # Top-level tags only
+                unique_count = len(set(e.get("_entry_id", e.get("unique_id"))
+                                       for e in entries))
+                root_child_tags[tag] = unique_count
+        self._write_index_file(output_path, "Library Root", all_entries, root_child_tags, output_path)
+
+        # Create index for each tag directory
+        for tag, entries in tag_entries.items():
+            tag_parts = tag.split(self.tag_separator)
+            tag_dir = output_path
+            for part in tag_parts:
+                tag_dir = tag_dir / self._sanitize_filename(part)
+
+            # Get child tags with counts
+            child_tags = {}
+            tag_prefix = tag + self.tag_separator
+            for other_tag, other_entries in tag_entries.items():
+                if other_tag.startswith(tag_prefix) and other_tag != tag:
+                    # Check if it's a direct child
+                    remaining = other_tag[len(tag_prefix):]
+                    if self.tag_separator not in remaining:
+                        # Count unique entries for this tag
+                        unique_count = len(set(e.get("_entry_id", e.get("unique_id"))
+                                               for e in other_entries))
+                        child_tags[other_tag] = unique_count
+
+            # Get unique entries
+            seen_ids = set()
+            unique_entries = []
+            for entry in entries:
+                entry_id = entry.get("_entry_id", entry.get("unique_id"))
+                if entry_id not in seen_ids:
+                    seen_ids.add(entry_id)
+                    unique_entries.append(entry)
+
+            self._write_index_file(tag_dir, tag, unique_entries, child_tags, output_path)
+
+    def _write_index_file(self, directory: Path, title: str,
+                          entries: List[Dict], child_tags: Dict[str, int], output_path: Path):
+        """Write an index.html file for a directory using Jinja2 template."""
+        from jinja2 import Environment, FileSystemLoader
+        import json
+        import re
+
+        # Prepare entries for JSON (clean and escape)
+        clean_entries = []
+        for entry in entries:
+            clean_entry = {}
+            for key, value in entry.items():
+                if isinstance(value, str):
+                    # Remove problematic HTML from descriptions
+                    if key == "description":
+                        # Strip HTML tags from description for JSON
+                        value = re.sub(r'<[^>]+>', '', value)
+                        # Limit description length
+                        if len(value) > 500:
+                            value = value[:500] + "..."
+                    clean_entry[key] = value
+                elif isinstance(value, list):
+                    clean_entry[key] = [str(v) for v in value]
+                else:
+                    clean_entry[key] = str(value)
+            clean_entries.append(clean_entry)
+
+        # Convert to JSON for JavaScript
+        entries_json = json.dumps(clean_entries, ensure_ascii=True)
+
+        # Set up Jinja2 environment
+        template_dir = Path(__file__).parent / "templates"
+        env = Environment(loader=FileSystemLoader(str(template_dir)))
+        template = env.get_template("advanced_index.html")
+
+        # Calculate if we're in a subdirectory (for proper _books path)
+        is_subdir = directory != output_path
+
+        # Render template
+        html_content = template.render(
+            title=title,
+            entries=entries,
+            entries_json=entries_json,
+            child_tags=child_tags,
+            tag_separator=self.tag_separator,
+            is_subdir=is_subdir
+        )
+
+        # Write the file
+        index_path = directory / "index.html"
+        with open(index_path, "w", encoding="utf-8") as f:
+            f.write(html_content)
+
+    def _create_readme(self, output_path: Path, num_entries: int, num_tags: int):
+        """Create a README file explaining the structure."""
+        readme_content = f"""# EBK Library - Symlink Navigation Structure
+
+This directory contains a navigable view of your ebook library organized by tags.
+
+## Statistics
+- Total books: {num_entries}
+- Total tags/categories: {num_tags}
+
+## Structure
+
+- **_books/**: Contains the actual ebook files and metadata
+- **Tag directories**: Each tag becomes a directory, with hierarchical tags creating nested directories
+- **Symlinks**: Books appear in multiple tag directories via symbolic links
+
+## Navigation
+
+You can navigate this structure using:
+1. Your file explorer (Finder, Windows Explorer, etc.)
+2. Command line tools (cd, ls, etc.)
+3. Web browser (open index.html files)
+
+## Hierarchical Tags
+
+Tags like "Programming/Python/Web" create a nested structure:
+```
+Programming/
+    Python/
+        Web/
+            (books tagged with Programming/Python/Web)
+        (books tagged with Programming/Python)
+    (books tagged with Programming)
+```
+
+Books appear at each relevant level in the hierarchy.
+
+## Notes
+
+- This is a read-only view. Modifying files here won't affect the original library.
+- Symlinks point to files in the _books directory.
+- On Windows, you may need administrator privileges to create symlinks.
+
+Generated by EBK - https://github.com/queelius/ebk
+"""
+
+        with open(output_path / "README.md", "w") as f:
+            f.write(readme_content)
+
+
+def export_symlink_dag(lib_dir: str, output_dir: str, **kwargs):
+    """
+    Convenience function to export library as symlink DAG.
+
+    Args:
+        lib_dir: Path to ebk library
+        output_dir: Output directory
+        **kwargs: Additional arguments passed to SymlinkDAGExporter.export()
+    """
+    exporter = SymlinkDAGExporter()
+    exporter.export(lib_dir, output_dir, **kwargs)
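
For orientation, here is a minimal, hypothetical usage sketch of the exporter added above; the paths are placeholders, and the keyword arguments mirror `SymlinkDAGExporter.export()` as shown in the diff:

```python
# Hypothetical invocation of the new symlink exporter (ebk/exports/symlink_dag.py).
# The library and output paths below are placeholders, not values from the package.
from ebk.exports.symlink_dag import export_symlink_dag

export_symlink_dag(
    lib_dir="/home/user/ebk-library",        # must contain metadata.json
    output_dir="/home/user/library-by-tag",  # destination for the tag tree
    tag_field="subjects",                    # field whose values become directories
    flatten=False,                           # keep the _books/ indirection
    min_books=3,                             # fold tags with fewer than 3 books into _misc/
)
```

With `flatten=True`, tag directories link straight to the original ebook files and the `_books/` indirection is skipped.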
ebk/extract_metadata.py CHANGED
@@ -2,7 +2,7 @@ import os
 import xmltodict
 from typing import Dict, Optional
 from slugify import slugify
-import PyPDF2
+import pypdf
 from ebooklib import epub
 
 def extract_metadata_from_opf(opf_file: str) -> Dict:
@@ -32,12 +32,17 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
     simplified = {
         "title": metadata.get("dc:title", metadata.get("title")),
         "creators": None,
+        "contributors": None,
         "subjects": None,
         "description": metadata.get("dc:description", metadata.get("description")),
         "language": metadata.get("dc:language", metadata.get("language")),
         "date": metadata.get("dc:date", metadata.get("date")),
         "publisher": metadata.get("dc:publisher", metadata.get("publisher")),
-        "identifiers": None
+        "identifiers": None,
+        "rights": metadata.get("dc:rights", metadata.get("rights")),
+        "source": metadata.get("dc:source", metadata.get("source")),
+        "series": None,
+        "series_index": None
     }
 
     # -- Creators
@@ -75,12 +80,64 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
             text = identifiers.get("#text", "").strip()
             simplified["identifiers"][scheme] = text
 
+    # -- Contributors (editors, translators, etc)
+    contributors_raw = metadata.get("dc:contributor", metadata.get("contributor"))
+    if contributors_raw:
+        simplified["contributors"] = []
+        if isinstance(contributors_raw, list):
+            for contrib in contributors_raw:
+                if isinstance(contrib, dict):
+                    name = contrib.get("#text", "").strip()
+                    role = contrib.get("@opf:role", "contributor")
+                    file_as = contrib.get("@opf:file-as", "")
+                    if name:
+                        simplified["contributors"].append({
+                            "name": name,
+                            "role": role,
+                            "file_as": file_as
+                        })
+                elif isinstance(contrib, str):
+                    simplified["contributors"].append({
+                        "name": contrib.strip(),
+                        "role": "contributor",
+                        "file_as": ""
+                    })
+        elif isinstance(contributors_raw, dict):
+            name = contributors_raw.get("#text", "").strip()
+            role = contributors_raw.get("@opf:role", "contributor")
+            file_as = contributors_raw.get("@opf:file-as", "")
+            if name:
+                simplified["contributors"] = [{
+                    "name": name,
+                    "role": role,
+                    "file_as": file_as
+                }]
+
+    # -- Calibre-specific metadata (series, etc)
+    # Look for meta tags with name attributes
+    meta_tags = metadata.get("meta", [])
+    if not isinstance(meta_tags, list):
+        meta_tags = [meta_tags] if meta_tags else []
+
+    for meta in meta_tags:
+        if isinstance(meta, dict):
+            meta_name = meta.get("@name", "")
+            meta_content = meta.get("@content", "")
+
+            if meta_name == "calibre:series" and meta_content:
+                simplified["series"] = meta_content
+            elif meta_name == "calibre:series_index" and meta_content:
+                try:
+                    simplified["series_index"] = float(meta_content)
+                except (ValueError, TypeError):
+                    pass
+
     return simplified
 
 
 def extract_metadata_from_pdf(pdf_path: str) -> Dict:
     """
-    Extract metadata from a PDF file using PyPDF2.
+    Extract metadata from a PDF file using pypdf.
     Returns a dictionary with the same keys as the OPF-based dict.
     """
 
@@ -94,20 +151,23 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
         "publisher": None,
         "identifiers": None,
         "keywords": None,
+        "creator_application": None,
     }
 
     try:
         with open(pdf_path, "rb") as f:
-            reader = PyPDF2.PdfReader(f)
+            reader = pypdf.PdfReader(f)
             info = reader.metadata or {}
 
-            # NOTE: Depending on PyPDF2 version, metadata keys can differ
+            # NOTE: Depending on pypdf version, metadata keys can differ
             # e.g. info.title vs info.get('/Title')
            pdf_title = info.get("/Title", None) or info.get("title", None)
             pdf_author = info.get("/Author", None) or info.get("author", None)
             pdf_subject = info.get("/Subject", None) or info.get("subject", None)
             pdf_keywords = info.get("/Keywords", None) or info.get("keywords", None)
-            pdf_publisher = info.get("/Producer", None) or info.get("producer", None) or info.get("/Publisher", None) or info.get("publisher", None)
+            pdf_creator = info.get("/Creator", None) or info.get("creator", None)  # Application used
+            pdf_producer = info.get("/Producer", None) or info.get("producer", None)
+            pdf_publisher = info.get("/Publisher", None) or info.get("publisher", None)
             pdf_creation_date = info.get("/CreationDate", None)
 
             if pdf_title:
@@ -130,10 +190,18 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
             metadata["identifiers"] = {"pdf:identifier": pdf_path}
 
             if pdf_keywords:
-                metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",")]
+                metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",") if kw.strip()]
+
+            # Creator is the application that created the PDF (e.g., LaTeX, Word)
+            if pdf_creator:
+                metadata["creator_application"] = pdf_creator.strip()
 
+            # Publisher: prefer explicit Publisher field, fallback to Producer
             if pdf_publisher:
                 metadata["publisher"] = pdf_publisher.strip()
+            elif pdf_producer and not pdf_creator:
+                # Only use producer as publisher if there's no creator app
+                metadata["publisher"] = pdf_producer.strip()
 
             metadata["file_paths"] = [pdf_path]
 
@@ -259,6 +327,7 @@ def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
     if opf_file and os.path.isfile(opf_file):
         opf_metadata = extract_metadata_from_opf(opf_file)
 
+    ebook_metadata = {}
     _, ext = os.path.splitext(ebook_file.lower())
     if ext == ".pdf":
         ebook_metadata = extract_metadata_from_pdf(ebook_file)
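
A brief, hypothetical sketch of how the reworked PDF path behaves after the PyPDF2 → pypdf switch shown above; the file path is a placeholder:

```python
# Hypothetical call into the updated extractor; the path is a placeholder.
from ebk.extract_metadata import extract_metadata_from_pdf

meta = extract_metadata_from_pdf("/tmp/sample.pdf")

# 0.3.2 splits /Creator from /Producer: the producing application lands in
# "creator_application", while "publisher" is filled from /Publisher, falling
# back to /Producer only when no creator application is present.
print(meta.get("title"), meta.get("creator_application"), meta.get("publisher"))
```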