ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ebk might be problematic. Click here for more details.

Files changed (61) hide show
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,442 @@
1
+ """
2
+ Import service for adding books to the database.
3
+
4
+ Handles file copying, deduplication, metadata extraction, and text indexing.
5
+ """
6
+
7
+ import shutil
8
+ import hashlib
9
+ from pathlib import Path
10
+ from typing import Optional, List, Dict, Any
11
+ from datetime import datetime
12
+ import logging
13
+
14
+ from sqlalchemy.orm import Session
15
+ from PIL import Image
16
+
17
+ from ..db.models import Book, Author, Subject, Identifier, File, Cover, PersonalMetadata
18
+ from ..db.session import get_or_create
19
+ from .text_extraction import TextExtractionService
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class ImportService:
25
+ """Service for importing books into the library."""
26
+
27
def __init__(self, library_root: Path, session: Session):
    """Bind the service to a library directory and a database session.

    Args:
        library_root: Root directory of the library on disk
        session: SQLAlchemy session used for all queries and writes
    """
    root = Path(library_root)
    self.library_root = root
    self.session = session
    # The text service is handed the caller's original argument, not the
    # normalised Path.
    self.text_service = TextExtractionService(library_root)

    # Make sure the storage layout exists before anything is imported.
    for top_level in ('files', 'covers'):
        (root / top_level).mkdir(parents=True, exist_ok=True)
    # 'covers' was created just above, so parents=True is not needed here.
    (root / 'covers' / 'thumbnails').mkdir(exist_ok=True)
36
+
37
def import_file(self, source_path: Path, metadata: Dict[str, Any],
                extract_text: bool = True, extract_cover: bool = True) -> Optional[Book]:
    """
    Import a single ebook file into the library.

    The file is hashed, checked against existing File rows, copied into
    hash-addressed storage and recorded in the database. If a book with the
    same generated unique ID already exists, the file is attached to it as
    an additional format. On any failure the session is rolled back and the
    copy made by this call is removed again.

    Args:
        source_path: Path to source ebook file
        metadata: Metadata dictionary
        extract_text: Whether to extract full text
        extract_cover: Whether to extract cover image

    Returns:
        Book instance (the owner of the identical file when a duplicate is
        detected) or None if import failed
    """
    import mimetypes

    source_path = Path(source_path)

    if not source_path.exists():
        logger.error(f"Source file not found: {source_path}")
        return None

    # Set only when this call is responsible for the file at the destination,
    # so the error path never deletes a file that was already in the library.
    copied_path = None

    try:
        # Compute file hash
        file_hash = self._compute_file_hash(source_path)

        # Check for duplicate by hash
        existing_file = self.session.query(File).filter_by(file_hash=file_hash).first()
        if existing_file:
            logger.info(f"Duplicate file detected (hash match): {source_path.name}")
            return existing_file.book

        # Generate unique ID for book
        unique_id = self._generate_unique_id(metadata)

        # Check if book already exists by unique_id
        existing_book = self.session.query(Book).filter_by(unique_id=unique_id).first()

        if existing_book:
            # Add this file format to existing book
            logger.info(f"Adding format to existing book: {metadata.get('title')}")
            book = existing_book
        else:
            # Create new book
            book = self._create_book(metadata, unique_id)

        # Copy file to library
        dest_path = self._get_file_path(file_hash, source_path.suffix)
        if not dest_path.exists():
            # Recorded before copying so a partially written file is also
            # cleaned up if the copy itself fails.
            copied_path = dest_path
        shutil.copy2(source_path, dest_path)

        # Get file metadata from filesystem
        file_stat = source_path.stat()
        mime_type = mimetypes.guess_type(str(source_path))[0]
        # NOTE: st_ctime is the creation time on Windows but the last
        # metadata-change time on Unix (see os.stat_result documentation).
        created_date = datetime.fromtimestamp(file_stat.st_ctime)
        modified_date = datetime.fromtimestamp(file_stat.st_mtime)

        # Creator application is taken from the metadata dict when present
        creator_app = metadata.get('creator_application')

        # Create file record with enhanced metadata
        file_record = File(
            book_id=book.id,
            path=str(dest_path.relative_to(self.library_root)),
            format=source_path.suffix[1:].lower(),  # Remove leading dot
            size_bytes=file_stat.st_size,
            file_hash=file_hash,
            mime_type=mime_type,
            created_date=created_date,
            modified_date=modified_date,
            creator_application=creator_app
        )
        self.session.add(file_record)
        self.session.flush()  # Get file_record.id

        # Extract cover if needed
        if extract_cover:
            self._extract_cover(source_path, book, file_record)

        # Extract text if needed
        if extract_text:
            self.text_service.extract_and_chunk_all(file_record, self.session)

        self.session.commit()
        logger.info(f"Successfully imported: {metadata.get('title')}")
        return book

    except Exception as e:
        self.session.rollback()
        # The rollback discards the File row, so the copy would otherwise be
        # left in storage with nothing in the database referring to it.
        if copied_path is not None:
            try:
                copied_path.unlink()
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial import {copied_path}: {cleanup_error}")
        logger.error(f"Error importing {source_path}: {e}")
        return None
127
+
128
def _create_book(self, metadata: Dict[str, Any], unique_id: str) -> Book:
    """
    Create book record with metadata.

    Adds the Book, its authors, subjects, contributors, identifiers and a
    default PersonalMetadata row to the session. The session is flushed once
    so that book.id is available; nothing is committed here.

    Args:
        metadata: Metadata dictionary; every key is optional and a key whose
            value is None is treated like a missing key
        unique_id: Identifier stored as Book.unique_id

    Returns:
        The newly created Book instance
    """
    raw_title = metadata.get('title')

    # Create book with enhanced metadata
    book = Book(
        unique_id=unique_id,
        # An explicit None title would otherwise be stored as-is and crash
        # the sort-title computation below.
        title=raw_title if raw_title is not None else 'Unknown Title',
        subtitle=metadata.get('subtitle'),
        sort_title=self._get_sort_title(raw_title or ''),
        language=metadata.get('language', 'en'),
        publisher=metadata.get('publisher'),
        publication_date=metadata.get('date'),
        description=metadata.get('description'),
        page_count=metadata.get('page_count'),
        # New fields
        series=metadata.get('series'),
        series_index=metadata.get('series_index'),
        edition=metadata.get('edition'),
        rights=metadata.get('rights'),
        source=metadata.get('source'),
        keywords=metadata.get('keywords')
    )
    self.session.add(book)
    self.session.flush()  # Get book.id

    # Add authors
    for author_name in metadata.get('creators') or []:
        if not author_name:  # Skip None/empty values
            continue
        author, _ = get_or_create(
            self.session,
            Author,
            name=author_name,
            sort_name=self._get_sort_name(author_name)
        )
        # A name listed twice would link the same Author to the book twice.
        if author not in book.authors:
            book.authors.append(author)

    # Add subjects/tags
    for subject_name in metadata.get('subjects') or []:
        if not subject_name:  # Skip None/empty values
            continue
        subject, _ = get_or_create(
            self.session,
            Subject,
            name=subject_name,
            type='topic'
        )
        if subject not in book.subjects:
            book.subjects.append(subject)

    # Add contributors (editors, translators, etc.)
    contributors = [
        contrib for contrib in (metadata.get('contributors') or [])
        if isinstance(contrib, dict) and contrib.get('name')
    ]
    if contributors:
        # Imported lazily, and only when there is something to store.
        from ..db.models import Contributor
        for contrib in contributors:
            self.session.add(Contributor(
                book_id=book.id,
                name=contrib.get('name'),
                role=contrib.get('role', 'contributor'),
                file_as=contrib.get('file_as', '')
            ))

    # Add identifiers ('or {}' mirrors the None handling of the lists above)
    for scheme, value in (metadata.get('identifiers') or {}).items():
        if value is None or value == '':  # Skip None/empty values
            continue
        self.session.add(Identifier(
            book_id=book.id,
            scheme=scheme,
            value=value
        ))

    # Create personal metadata
    personal = PersonalMetadata(
        book_id=book.id,
        reading_status='unread',
        owned=True
    )
    self.session.add(personal)

    return book
212
+
213
def _extract_cover(self, source_path: Path, book: Book, file: File):
    """
    Extract and save cover image.

    Only '.pdf' and '.epub' sources are handled. When a cover file is
    produced, a thumbnail is generated and a primary Cover row is added to
    the session. Failures are logged as warnings and never propagate, so a
    missing cover does not abort the import.

    Args:
        source_path: Path to the source ebook file
        book: Book the cover belongs to
        file: File record whose file_hash names the cover files
    """
    try:
        suffix = source_path.suffix.lower()
        if suffix == '.pdf':
            cover_path = self._extract_pdf_cover(source_path, file.file_hash)
        elif suffix == '.epub':
            cover_path = self._extract_epub_cover(source_path, file.file_hash)
        else:
            cover_path = None

        if cover_path and cover_path.exists():
            # Create thumbnail (only the file on disk matters here)
            self._create_thumbnail(cover_path, file.file_hash)

            # Read the dimensions and release the file handle immediately
            with Image.open(cover_path) as img:
                width, height = img.size

            # Save cover record
            cover = Cover(
                book_id=book.id,
                path=str(cover_path.relative_to(self.library_root)),
                width=width,
                height=height,
                is_primary=True,
                source='extracted'
            )
            self.session.add(cover)
            logger.info(f"Extracted cover for {book.title}")

    except Exception as e:
        logger.warning(f"Cover extraction failed: {e}")
242
+
243
def _extract_pdf_cover(self, pdf_path: Path, file_hash: str) -> Optional[Path]:
    """
    Extract first page of PDF as cover image.

    Args:
        pdf_path: Path to the PDF file
        file_hash: Hash used to name the stored cover

    Returns:
        Path of the written PNG, or None when the PDF has no pages or
        rendering failed (the error is logged)
    """
    try:
        import fitz
        doc = fitz.open(str(pdf_path))
        try:
            if len(doc) == 0:
                return None

            pix = doc[0].get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x scale for quality

            cover_path = self._get_cover_path(file_hash, 'png')
            pix.save(str(cover_path))
            return cover_path
        finally:
            # Close on every path, including empty documents and errors.
            doc.close()
    except Exception as e:
        logger.error(f"PDF cover extraction error: {e}")

    return None
260
+
261
def _extract_epub_cover(self, epub_path: Path, file_hash: str) -> Optional[Path]:
    """
    Extract cover image from EPUB.

    Lookup order:
      1. an item whose type is the cover type;
      2. an image item whose name contains 'cover' (case-insensitive).

    Args:
        epub_path: Path to the EPUB file
        file_hash: Hash used to name the stored cover

    Returns:
        Path of the written cover image, or None when no cover was found or
        extraction failed (the error is logged)
    """
    try:
        import ebooklib
        from ebooklib import epub

        # The ITEM_* constants are looked up on the ebooklib package itself;
        # checking for them on ebooklib.epub never matched. The fallbacks are
        # the numeric values ebooklib assigns to these types.
        item_cover = getattr(ebooklib, 'ITEM_COVER', 10)
        item_image = getattr(ebooklib, 'ITEM_IMAGE', 1)

        items = list(epub.read_epub(str(epub_path)).get_items())

        # Method 1: item explicitly typed as the cover
        cover_item = next(
            (item for item in items if item.get_type() == item_cover),
            None
        )

        # Method 2: image whose file name mentions 'cover'
        if cover_item is None:
            cover_item = next(
                (item for item in items
                 if item.get_type() == item_image
                 and 'cover' in item.get_name().lower()),
                None
            )

        if cover_item is not None:
            # Determine image format from the item name, defaulting to JPEG
            ext = Path(cover_item.get_name()).suffix or '.jpg'
            cover_path = self._get_cover_path(file_hash, ext[1:])

            cover_path.write_bytes(cover_item.get_content())
            return cover_path

    except Exception as e:
        logger.error(f"EPUB cover extraction error: {e}")

    return None
311
+
312
def _create_thumbnail(self, cover_path: Path, file_hash: str) -> Path:
    """
    Create thumbnail from cover image.

    The thumbnail fits within 200x300 pixels and is written as a JPEG to
    covers/thumbnails/<file_hash>_thumb.jpg.

    Args:
        cover_path: Path to the full-size cover image
        file_hash: Hash used to name the thumbnail

    Returns:
        Path of the thumbnail, or cover_path itself when thumbnail creation
        failed (the error is logged)
    """
    thumb_path = self.library_root / 'covers' / 'thumbnails' / f"{file_hash}_thumb.jpg"

    try:
        with Image.open(cover_path) as source:
            # JPEG cannot store alpha or palette images, so covers in those
            # modes are converted instead of failing on save.
            if source.mode in ('RGB', 'L', 'CMYK'):
                thumb = source
            else:
                thumb = source.convert('RGB')
            thumb.thumbnail((200, 300))
            thumb.save(thumb_path, 'JPEG', quality=85)
        return thumb_path
    except Exception as e:
        logger.error(f"Thumbnail creation error: {e}")
        return cover_path
324
+
325
+ def _get_file_path(self, file_hash: str, extension: str) -> Path:
326
+ """Get storage path for file based on hash prefix."""
327
+ prefix = file_hash[:2]
328
+ dir_path = self.library_root / 'files' / prefix
329
+ dir_path.mkdir(parents=True, exist_ok=True)
330
+ return dir_path / f"{file_hash}{extension}"
331
+
332
+ def _get_cover_path(self, file_hash: str, extension: str) -> Path:
333
+ """Get storage path for cover based on hash prefix."""
334
+ prefix = file_hash[:2]
335
+ dir_path = self.library_root / 'covers' / prefix
336
+ dir_path.mkdir(parents=True, exist_ok=True)
337
+ return dir_path / f"{file_hash}.{extension}"
338
+
339
+ @staticmethod
340
+ def _compute_file_hash(file_path: Path) -> str:
341
+ """Compute SHA256 hash of file."""
342
+ sha256 = hashlib.sha256()
343
+ with open(file_path, 'rb') as f:
344
+ for block in iter(lambda: f.read(8192), b''):
345
+ sha256.update(block)
346
+ return sha256.hexdigest()
347
+
348
+ @staticmethod
349
+ def _generate_unique_id(metadata: Dict[str, Any]) -> str:
350
+ """Generate unique ID for book based on metadata."""
351
+ # Use ISBN if available
352
+ identifiers = metadata.get('identifiers', {})
353
+ if 'isbn' in identifiers:
354
+ return f"isbn_{identifiers['isbn']}"
355
+
356
+ # Otherwise use hash of title + authors
357
+ title = metadata.get('title', 'unknown')
358
+ authors = ','.join(metadata.get('creators', ['unknown']))
359
+ content = f"{title}:{authors}".lower()
360
+ return hashlib.md5(content.encode()).hexdigest()[:16]
361
+
362
+ @staticmethod
363
+ def _get_sort_title(title: str) -> str:
364
+ """Get sortable title (remove leading articles)."""
365
+ title = title.strip()
366
+ for article in ['The ', 'A ', 'An ']:
367
+ if title.startswith(article):
368
+ return title[len(article):]
369
+ return title
370
+
371
+ @staticmethod
372
+ def _get_sort_name(name: str) -> str:
373
+ """Get sortable name (Last, First format)."""
374
+ parts = name.split()
375
+ if len(parts) >= 2:
376
+ return f"{parts[-1]}, {' '.join(parts[:-1])}"
377
+ return name
378
+
379
def import_calibre_book(self, calibre_metadata_path: Path) -> Optional[Book]:
    """
    Import book from Calibre metadata.opf file.

    Every *.pdf, *.epub and *.mobi file next to the OPF file is imported
    with the OPF's metadata. Text and cover extraction run for the first
    file that imports successfully; the remaining files are added as extra
    formats without extraction.

    Args:
        calibre_metadata_path: Path to metadata.opf file

    Returns:
        Book instance from the first successful import, or None when no
        ebook file was found or every import failed
    """
    from ..extract_metadata import extract_metadata_from_opf

    # Accept plain strings as well; .parent below needs a Path.
    calibre_metadata_path = Path(calibre_metadata_path)
    metadata = extract_metadata_from_opf(str(calibre_metadata_path))

    # Find ebook files in same directory
    book_dir = calibre_metadata_path.parent
    ebook_files = [
        ebook_file
        for pattern in ('*.pdf', '*.epub', '*.mobi')
        for ebook_file in book_dir.glob(pattern)
    ]

    if not ebook_files:
        logger.warning(f"No ebook files found in {book_dir}")
        return None

    book = None
    for ebook_file in ebook_files:
        # Until one file has imported, keep extraction enabled so a failed
        # first format does not leave the book without text or cover.
        needs_extraction = book is None
        imported = self.import_file(
            ebook_file,
            metadata,
            extract_text=needs_extraction,
            extract_cover=needs_extraction
        )
        if book is None:
            book = imported

    return book
411
+
412
+ def batch_import(self, file_paths: List[Path], metadata_list: List[Dict[str, Any]],
413
+ show_progress: bool = False) -> List[Book]:
414
+ """
415
+ Import multiple files with progress tracking.
416
+
417
+ Args:
418
+ file_paths: List of file paths to import
419
+ metadata_list: List of metadata dicts (one per file)
420
+ show_progress: Whether to show progress bar
421
+
422
+ Returns:
423
+ List of imported Book instances
424
+ """
425
+ books = []
426
+
427
+ if show_progress:
428
+ from rich.progress import Progress
429
+ with Progress() as progress:
430
+ task = progress.add_task("[green]Importing...", total=len(file_paths))
431
+ for file_path, metadata in zip(file_paths, metadata_list):
432
+ book = self.import_file(file_path, metadata)
433
+ if book:
434
+ books.append(book)
435
+ progress.advance(task)
436
+ else:
437
+ for file_path, metadata in zip(file_paths, metadata_list):
438
+ book = self.import_file(file_path, metadata)
439
+ if book:
440
+ books.append(book)
441
+
442
+ return books