ebk 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,447 @@
1
+ """
2
+ Import service for adding books to the database.
3
+
4
+ Handles file copying, deduplication, metadata extraction, and text indexing.
5
+ """
6
+
7
+ import shutil
8
+ import hashlib
9
+ from pathlib import Path
10
+ from typing import Optional, List, Dict, Any
11
+ from datetime import datetime
12
+ import logging
13
+
14
+ from sqlalchemy.orm import Session
15
+ from PIL import Image
16
+
17
+ from ..db.models import Book, Author, Subject, Identifier, File, Cover, PersonalMetadata
18
+ from ..db.session import get_or_create
19
+ from .text_extraction import TextExtractionService
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def get_sort_name(name: str) -> str:
    """Get sortable name (Last, First format).

    Converts a name like "John Smith" to "Smith, John" for proper sorting.
    The last whitespace-separated token is treated as the family name and
    everything before it as the given name(s).

    Args:
        name: Display name of a person; may be empty or ``None``.

    Returns:
        The name in "Last, First" form. Empty/``None`` input and single-word
        names are returned unchanged. A name that already contains a comma is
        assumed to be in sort form already and is returned with only the
        surrounding whitespace trimmed.
    """
    if not name:
        return name

    # A comma means the caller already supplied "Last, First"; re-ordering it
    # would produce nonsense such as "John, Smith,".
    if ',' in name:
        return name.strip()

    parts = name.split()
    if len(parts) < 2:
        return name

    *given, family = parts
    return f"{family}, {' '.join(given)}"
35
+
36
+
37
class ImportService:
    """Service for importing books into the library.

    Copies ebook files into a hash-addressed directory tree below
    ``library_root``, de-duplicates them by SHA-256 digest, creates the
    related database rows and optionally extracts a cover image and the
    full text of each file.
    """

    def __init__(self, library_root: Path, session: Session):
        """Bind the service to a library directory and a database session.

        Args:
            library_root: Root directory of the library. The ``files``,
                ``covers`` and ``covers/thumbnails`` sub-directories are
                created if they do not exist yet.
            session: SQLAlchemy session used for every query and write.
        """
        self.library_root = Path(library_root)
        self.session = session
        self.text_service = TextExtractionService(library_root)

        # Create directory structure
        (self.library_root / 'files').mkdir(parents=True, exist_ok=True)
        (self.library_root / 'covers').mkdir(parents=True, exist_ok=True)
        (self.library_root / 'covers' / 'thumbnails').mkdir(parents=True, exist_ok=True)

    def import_file(self, source_path: Path, metadata: Dict[str, Any],
                    extract_text: bool = True, extract_cover: bool = True) -> Optional[Book]:
        """
        Import a single ebook file into the library.

        The file is hashed first; if a file with the same hash is already
        registered, the book it belongs to is returned and nothing is
        written. Otherwise the file is attached to an existing book with the
        same generated unique id, or to a newly created book.

        Args:
            source_path: Path to source ebook file
            metadata: Metadata dictionary
            extract_text: Whether to extract full text
            extract_cover: Whether to extract cover image

        Returns:
            Book instance or None if import failed. Failures are logged and
            the session is rolled back; no exception is propagated.
        """
        import mimetypes

        source_path = Path(source_path)

        if not source_path.exists():
            logger.error(f"Source file not found: {source_path}")
            return None

        # Destination written by this call; removed again if the transaction
        # is rolled back so no unreferenced file is left in the library.
        copied_path: Optional[Path] = None

        try:
            # Compute file hash
            file_hash = self._compute_file_hash(source_path)

            # Check for duplicate by hash
            existing_file = self.session.query(File).filter_by(file_hash=file_hash).first()
            if existing_file:
                logger.info(f"Duplicate file detected (hash match): {source_path.name}")
                return existing_file.book

            # Generate unique ID for book
            unique_id = self._generate_unique_id(metadata)

            # Check if book already exists by unique_id
            existing_book = self.session.query(Book).filter_by(unique_id=unique_id).first()

            if existing_book:
                # Add this file format to existing book
                logger.info(f"Adding format to existing book: {metadata.get('title')}")
                book = existing_book
            else:
                # Create new book
                book = self._create_book(metadata, unique_id)

            # Copy file to library
            dest_path = self._get_file_path(file_hash, source_path.suffix)
            copied_path = dest_path  # set before copying so a partial copy is cleaned up too
            shutil.copy2(source_path, dest_path)

            # Get file metadata from filesystem.
            # Note: on Unix st_ctime is the last metadata change, not the
            # creation time; it is stored as-is.
            file_stat = source_path.stat()
            mime_type = mimetypes.guess_type(str(source_path))[0]
            created_date = datetime.fromtimestamp(file_stat.st_ctime)
            modified_date = datetime.fromtimestamp(file_stat.st_mtime)

            # Creator application is supplied by the metadata extractor, if any
            creator_app = metadata.get('creator_application')

            # Create file record with enhanced metadata
            file = File(
                book_id=book.id,
                path=str(dest_path.relative_to(self.library_root)),
                format=source_path.suffix[1:].lower(),  # Remove leading dot
                size_bytes=file_stat.st_size,
                file_hash=file_hash,
                mime_type=mime_type,
                created_date=created_date,
                modified_date=modified_date,
                creator_application=creator_app
            )
            self.session.add(file)
            self.session.flush()  # Get file.id

            # Extract cover if needed
            if extract_cover:
                self._extract_cover(source_path, book, file)

            # Extract text if needed
            if extract_text:
                self.text_service.extract_and_chunk_all(file, self.session)

            self.session.commit()
            logger.info(f"Successfully imported: {metadata.get('title')}")
            return book

        except Exception as e:
            self.session.rollback()
            if copied_path is not None:
                try:
                    copied_path.unlink()
                except FileNotFoundError:
                    pass  # copy never reached the disk
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove orphaned file {copied_path}: {cleanup_error}")
            logger.error(f"Error importing {source_path}: {e}")
            return None

    def _create_book(self, metadata: Dict[str, Any], unique_id: str) -> Book:
        """Create book record with metadata.

        Adds the book, its authors, subjects, contributors, identifiers and
        a default personal-metadata row to the session. The book is flushed
        so that ``book.id`` is available; nothing is committed here.

        Args:
            metadata: Metadata dictionary. Missing keys and ``None`` values
                for ``title``, ``creators``, ``subjects``, ``contributors``
                and ``identifiers`` are tolerated.
            unique_id: Identifier produced by ``_generate_unique_id``.

        Returns:
            The newly created Book instance.
        """
        from ..db.models import Contributor

        # ``title`` may be present but None; fall back exactly like a missing key.
        raw_title = metadata.get('title')

        # Create book with enhanced metadata
        book = Book(
            unique_id=unique_id,
            title=raw_title or 'Unknown Title',
            subtitle=metadata.get('subtitle'),
            sort_title=self._get_sort_title(raw_title or ''),
            language=metadata.get('language', 'en'),
            publisher=metadata.get('publisher'),
            publication_date=metadata.get('date'),
            description=metadata.get('description'),
            page_count=metadata.get('page_count'),
            # New fields
            series=metadata.get('series'),
            series_index=metadata.get('series_index'),
            edition=metadata.get('edition'),
            rights=metadata.get('rights'),
            source=metadata.get('source'),
            keywords=metadata.get('keywords')
        )
        self.session.add(book)
        self.session.flush()  # Get book.id

        # Add authors
        # NOTE(review): get_or_create is called with both name and sort_name;
        # presumably both are used as lookup keys - confirm against db.session.
        for author_name in metadata.get('creators') or []:
            if not author_name:  # Skip None/empty values
                continue
            author, _ = get_or_create(
                self.session,
                Author,
                name=author_name,
                sort_name=get_sort_name(author_name)
            )
            book.authors.append(author)

        # Add subjects/tags
        for subject_name in metadata.get('subjects') or []:
            if not subject_name:  # Skip None/empty values
                continue
            subject, _ = get_or_create(
                self.session,
                Subject,
                name=subject_name,
                type='topic'
            )
            book.subjects.append(subject)

        # Add contributors (editors, translators, etc.)
        for contrib in metadata.get('contributors') or []:
            if not isinstance(contrib, dict):
                continue
            name = contrib.get('name')
            if not name:
                continue
            self.session.add(Contributor(
                book_id=book.id,
                name=name,
                role=contrib.get('role', 'contributor'),
                file_as=contrib.get('file_as', '')
            ))

        # Add identifiers (``or {}`` mirrors the None-tolerance used above)
        for scheme, value in (metadata.get('identifiers') or {}).items():
            self.session.add(Identifier(
                book_id=book.id,
                scheme=scheme,
                value=value
            ))

        # Create personal metadata
        self.session.add(PersonalMetadata(
            book_id=book.id,
            reading_status='unread',
            owned=True
        ))

        return book

    def _extract_cover(self, source_path: Path, book: Book, file: File):
        """Extract and save cover image.

        Supports ``.pdf`` and ``.epub`` sources; other formats are ignored.
        On success a thumbnail is written and a primary ``Cover`` row is
        added to the session. Any failure is logged as a warning and
        swallowed so that a missing cover never aborts an import.

        Args:
            source_path: Path of the ebook file the cover is taken from.
            book: Book the cover belongs to.
            file: File record; its ``file_hash`` names the cover files.
        """
        try:
            suffix = source_path.suffix.lower()
            cover_path = None
            if suffix == '.pdf':
                cover_path = self._extract_pdf_cover(source_path, file.file_hash)
            elif suffix == '.epub':
                cover_path = self._extract_epub_cover(source_path, file.file_hash)

            if cover_path and cover_path.exists():
                # Create thumbnail (called for its side effect on disk)
                self._create_thumbnail(cover_path, file.file_hash)

                # Read the dimensions and release the file handle right away
                with Image.open(cover_path) as img:
                    width, height = img.size

                # Save cover record
                cover = Cover(
                    book_id=book.id,
                    path=str(cover_path.relative_to(self.library_root)),
                    width=width,
                    height=height,
                    is_primary=True,
                    source='extracted'
                )
                self.session.add(cover)
                logger.info(f"Extracted cover for {book.title}")

        except Exception as e:
            logger.warning(f"Cover extraction failed: {e}")

    def _extract_pdf_cover(self, pdf_path: Path, file_hash: str) -> Optional[Path]:
        """Extract first page of PDF as cover image.

        Args:
            pdf_path: Path to the PDF file.
            file_hash: Hash of the file; used to name the cover image.

        Returns:
            Path of the written PNG, or None if the PDF has no pages or the
            rendering failed (the error is logged).
        """
        try:
            import fitz
            doc = fitz.open(str(pdf_path))
            try:
                if len(doc) == 0:
                    return None

                pix = doc[0].get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x scale for quality

                cover_path = self._get_cover_path(file_hash, 'png')
                pix.save(str(cover_path))
                return cover_path
            finally:
                # Close on every path, including empty documents and errors
                doc.close()
        except Exception as e:
            logger.error(f"PDF cover extraction error: {e}")

        return None

    def _extract_epub_cover(self, epub_path: Path, file_hash: str) -> Optional[Path]:
        """Extract cover image from EPUB.

        Items typed as cover are preferred; otherwise the first image item
        whose name contains "cover" (case-insensitive) is used.

        Args:
            epub_path: Path to the EPUB file.
            file_hash: Hash of the file; used to name the cover image.

        Returns:
            Path of the written cover image, or None if no cover was found
            or extraction failed (the error is logged).
        """
        try:
            # The ITEM_* type constants are defined on the top-level
            # ``ebooklib`` package, not on ``ebooklib.epub``.
            import ebooklib
            from ebooklib import epub

            book = epub.read_epub(str(epub_path))
            items = list(book.get_items())

            # Method 1: item explicitly typed as cover
            cover_item = next(
                (item for item in items if item.get_type() == ebooklib.ITEM_COVER),
                None,
            )

            # Method 2: image item whose file name mentions "cover"
            if cover_item is None:
                cover_item = next(
                    (
                        item for item in items
                        if item.get_type() == ebooklib.ITEM_IMAGE
                        and 'cover' in item.get_name().lower()
                    ),
                    None,
                )

            if cover_item is not None:
                # Determine image format
                ext = Path(cover_item.get_name()).suffix or '.jpg'
                cover_path = self._get_cover_path(file_hash, ext[1:])

                cover_path.write_bytes(cover_item.get_content())
                return cover_path

        except Exception as e:
            logger.error(f"EPUB cover extraction error: {e}")

        return None

    def _create_thumbnail(self, cover_path: Path, file_hash: str) -> Path:
        """Create thumbnail from cover image.

        Args:
            cover_path: Path to the full-size cover image.
            file_hash: Hash of the ebook file; used to name the thumbnail.

        Returns:
            Path of the JPEG thumbnail (at most 200x300 pixels), or
            ``cover_path`` itself if the thumbnail could not be created.
        """
        thumb_path = self.library_root / 'covers' / 'thumbnails' / f"{file_hash}_thumb.jpg"

        try:
            with Image.open(cover_path) as img:
                # JPEG cannot store alpha or palette data; without this
                # conversion RGBA/P covers (common for PNG) fail to save.
                if img.mode not in ('RGB', 'L', 'CMYK'):
                    img = img.convert('RGB')
                img.thumbnail((200, 300))
                img.save(thumb_path, 'JPEG', quality=85)
            return thumb_path
        except Exception as e:
            logger.error(f"Thumbnail creation error: {e}")
            return cover_path

    def _get_file_path(self, file_hash: str, extension: str) -> Path:
        """Get storage path for file based on hash prefix.

        Args:
            file_hash: Hex digest of the file.
            extension: File extension including the leading dot.

        Returns:
            ``<library_root>/files/<first two hash chars>/<hash><extension>``;
            the directory is created if needed.
        """
        dir_path = self.library_root / 'files' / file_hash[:2]
        dir_path.mkdir(parents=True, exist_ok=True)
        return dir_path / f"{file_hash}{extension}"

    def _get_cover_path(self, file_hash: str, extension: str) -> Path:
        """Get storage path for cover based on hash prefix.

        Args:
            file_hash: Hex digest of the ebook file.
            extension: Image extension without the leading dot.

        Returns:
            ``<library_root>/covers/<first two hash chars>/<hash>.<extension>``;
            the directory is created if needed.
        """
        dir_path = self.library_root / 'covers' / file_hash[:2]
        dir_path.mkdir(parents=True, exist_ok=True)
        return dir_path / f"{file_hash}.{extension}"

    @staticmethod
    def _compute_file_hash(file_path: Path) -> str:
        """Compute SHA256 hash of file.

        The file is read in 8 KiB blocks so large files are never loaded
        into memory at once.

        Args:
            file_path: Path of the file to hash.

        Returns:
            Hexadecimal SHA-256 digest of the file contents.
        """
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for block in iter(lambda: f.read(8192), b''):
                sha256.update(block)
        return sha256.hexdigest()

    @staticmethod
    def _generate_unique_id(metadata: Dict[str, Any]) -> str:
        """Generate unique ID for book based on metadata.

        Args:
            metadata: Metadata dictionary. ``identifiers``, ``title`` and
                ``creators`` may be missing or None.

        Returns:
            ``"isbn_<isbn>"`` when a non-empty ISBN identifier is present,
            otherwise the first 16 hex characters of the MD5 of the
            lower-cased ``"<title>:<comma-joined creators>"``. A missing or
            None title/creators is replaced by ``"unknown"``.
        """
        # Use ISBN if available; an empty/None ISBN must not become the
        # shared id "isbn_" / "isbn_None".
        identifiers = metadata.get('identifiers') or {}
        isbn = identifiers.get('isbn')
        if isbn:
            return f"isbn_{isbn}"

        # Otherwise use hash of title + authors
        title = metadata.get('title')
        if title is None:
            title = 'unknown'

        creators = metadata.get('creators')
        if creators is None:
            creators = ['unknown']
        # None entries are skipped, matching how authors are stored
        authors = ','.join(str(creator) for creator in creators if creator is not None)

        content = f"{title}:{authors}".lower()
        # MD5 is used only as a short fingerprint, not for security
        return hashlib.md5(content.encode()).hexdigest()[:16]

    @staticmethod
    def _get_sort_title(title: str) -> str:
        """Get sortable title (remove leading articles).

        Args:
            title: Book title; None is treated as an empty string.

        Returns:
            The stripped title without a leading "The ", "A " or "An "
            (matched case-insensitively).
        """
        title = (title or '').strip()
        lowered = title.lower()
        for article in ('the ', 'a ', 'an '):
            if lowered.startswith(article):
                return title[len(article):]
        return title

    def import_calibre_book(self, calibre_metadata_path: Path) -> Optional[Book]:
        """
        Import book from Calibre metadata.opf file.

        All ``.pdf``, ``.epub`` and ``.mobi`` files located next to the OPF
        file are imported with the same metadata. Text and cover are
        extracted only for the first file that imports successfully; the
        remaining files are attached as additional formats.

        Args:
            calibre_metadata_path: Path to metadata.opf file

        Returns:
            Book instance or None if no ebook file was found or none could
            be imported
        """
        from ..extract_metadata import extract_metadata_from_opf

        calibre_metadata_path = Path(calibre_metadata_path)
        metadata = extract_metadata_from_opf(str(calibre_metadata_path))

        # Find ebook files in same directory
        book_dir = calibre_metadata_path.parent
        ebook_files = [
            path
            for pattern in ('*.pdf', '*.epub', '*.mobi')
            for path in book_dir.glob(pattern)
        ]

        if not ebook_files:
            logger.warning(f"No ebook files found in {book_dir}")
            return None

        book = None
        for ebook_file in ebook_files:
            # Until one file succeeds, treat the current file as the primary
            # one so a single unreadable format does not lose the whole book.
            is_primary = book is None
            imported = self.import_file(
                ebook_file,
                metadata,
                extract_text=is_primary,
                extract_cover=is_primary,
            )
            if book is None:
                book = imported

        return book

    def batch_import(self, file_paths: List[Path], metadata_list: List[Dict[str, Any]],
                     show_progress: bool = False) -> List[Book]:
        """
        Import multiple files with progress tracking.

        Files and metadata are paired positionally. If the two lists differ
        in length, the surplus entries are ignored and a warning is logged.

        Args:
            file_paths: List of file paths to import
            metadata_list: List of metadata dicts (one per file)
            show_progress: Whether to show progress bar

        Returns:
            List of imported Book instances (failed imports are omitted)
        """
        file_paths = list(file_paths)
        metadata_list = list(metadata_list)
        if len(file_paths) != len(metadata_list):
            logger.warning(
                f"batch_import received {len(file_paths)} files but "
                f"{len(metadata_list)} metadata entries; extra entries are ignored"
            )
        pairs = list(zip(file_paths, metadata_list))

        books: List[Book] = []

        def run(advance=None):
            """Import every pair, calling ``advance`` after each one."""
            for file_path, metadata in pairs:
                book = self.import_file(file_path, metadata)
                if book is not None:
                    books.append(book)
                if advance is not None:
                    advance()

        if show_progress:
            from rich.progress import Progress
            with Progress() as progress:
                task = progress.add_task("[green]Importing...", total=len(pairs))
                run(lambda: progress.advance(task))
        else:
            run()

        return books