ebk 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,28 @@
1
+ """
2
+ Services for ebk business logic.
3
+
4
+ Provides a unified service layer for all ebk operations.
5
+ """
6
+
7
+ from .text_extraction import TextExtractionService
8
+ from .import_service import ImportService
9
+ from .export_service import ExportService
10
+ from .queue_service import ReadingQueueService
11
+ from .personal_metadata_service import PersonalMetadataService
12
+ from .annotation_service import AnnotationService
13
+ from .view_service import ViewService
14
+
15
# Public API of the services package, grouped by area of responsibility.
__all__ = [
    # --- core services ---
    "TextExtractionService",
    "ImportService",
    "ExportService",
    # --- personal / per-user services ---
    "ReadingQueueService",
    "PersonalMetadataService",
    "AnnotationService",
    # --- library organization ---
    "ViewService",
]
@@ -0,0 +1,351 @@
1
+ """
2
+ Annotation extraction service for ebook files.
3
+
4
+ Extracts highlights, notes, and bookmarks from PDF and EPUB files.
5
+ """
6
+
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import List, Dict, Any, Optional
10
+ from dataclasses import dataclass
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
@dataclass
class ExtractedAnnotation:
    """A single annotation pulled out of an ebook file.

    Only ``annotation_type`` and ``content`` are required; every other field
    defaults to ``None`` when the source file does not provide it.
    """

    # One of 'highlight', 'note', 'bookmark', 'underline', 'strikeout'.
    annotation_type: str
    # The highlighted/noted text, or the body of the note itself.
    content: str
    # Page the annotation was found on, when known.
    page_number: Optional[int] = None
    # Annotation colour, when known.
    color: Optional[str] = None
    # Free-form position information.
    position: Optional[Dict[str, Any]] = None
    # Additional note attached to a highlight.
    note: Optional[str] = None
24
+
25
+
26
class AnnotationExtractionService:
    """Service for extracting annotations from ebook files.

    PDF files are read with PyMuPDF (``fitz``) and EPUB files with
    ``ebooklib``. Both libraries are imported lazily inside the methods that
    need them, so the service can be constructed without them. Extraction is
    best-effort: errors are logged and whatever was collected so far is
    returned instead of raising.
    """

    # PyMuPDF annotation type name -> ebk annotation type. Types that are not
    # listed are skipped. 'Ink' is mapped, but no text is ever collected for
    # it below, so such annotations are currently dropped for lack of content.
    _PDF_TYPE_MAPPING = {
        'Highlight': 'highlight',
        'Underline': 'underline',
        'StrikeOut': 'strikeout',
        'Squiggly': 'underline',
        'Text': 'note',  # Sticky note
        'FreeText': 'note',
        'Ink': 'drawing',
    }

    # Text markup annotations: their content is the page text they cover.
    _PDF_MARKUP_TYPES = frozenset({'Highlight', 'Underline', 'StrikeOut', 'Squiggly'})

    # Note annotations: their content is the note body itself.
    _PDF_NOTE_TYPES = frozenset({'Text', 'FreeText'})

    # W3C Web Annotation motivation -> ebk annotation type.
    _MOTIVATION_MAPPING = {
        'highlighting': 'highlight',
        'commenting': 'note',
        'bookmarking': 'bookmark',
        'describing': 'note',
    }

    def __init__(self, library_root: Path):
        """Remember the library root directory (coerced to ``Path``)."""
        self.library_root = Path(library_root)

    def extract_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """
        Extract all annotations from an ebook file.

        Args:
            file_path: Path to the ebook file (a ``str`` is accepted too)

        Returns:
            List of extracted annotations; empty when the file is missing,
            the format is unsupported, or nothing could be extracted
        """
        # Accept plain strings as well; the checks below need Path methods.
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return []

        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_annotations(file_path)
        if suffix == '.epub':
            return self._extract_epub_annotations(file_path)

        logger.warning(f"Unsupported format for annotation extraction: {suffix}")
        return []

    def _extract_pdf_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """Extract annotations from a PDF file using PyMuPDF.

        Returns an empty list when PyMuPDF is not installed. If an error
        occurs part-way through, the annotations collected up to that point
        are returned and the error is logged.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) not installed. Install with: pip install pymupdf")
            return []

        annotations: List[ExtractedAnnotation] = []

        try:
            doc = fitz.open(file_path)
            try:
                for page_num, page in enumerate(doc, start=1):
                    for annot in page.annots() or []:
                        extracted = self._convert_pdf_annotation(page, annot, page_num)
                        if extracted is not None:
                            annotations.append(extracted)
            finally:
                # Release the document even when a page fails to process.
                doc.close()

            logger.info(f"Extracted {len(annotations)} annotations from PDF: {file_path.name}")

        except Exception as e:
            logger.error(f"Error extracting PDF annotations: {e}")

        return annotations

    def _convert_pdf_annotation(self, page, annot, page_num: int) -> Optional[ExtractedAnnotation]:
        """Convert one PyMuPDF annotation; return None when it should be skipped.

        An annotation is skipped when its type is not in the type mapping or
        when neither covered text nor a note body could be obtained.
        """
        annot_type = annot.type[1]  # e.g., 'Highlight', 'Text', 'StrikeOut'
        our_type = self._PDF_TYPE_MAPPING.get(annot_type)
        if our_type is None:
            return None  # Skip unsupported types

        content = ""
        note_content = None

        if annot_type in self._PDF_MARKUP_TYPES:
            # Text under the markup, taken from the annotation's bounding
            # rectangle. Failure here is tolerated so that an attached note
            # can still be captured.
            try:
                if annot.vertices:
                    content = page.get_text("text", clip=annot.rect).strip()
            except Exception as e:
                logger.debug(
                    f"Could not read text under {annot_type} annotation on page {page_num}: {e}"
                )

            # Popup note attached to the markup, if any
            info = annot.info
            if info.get('content'):
                note_content = info['content']

        elif annot_type in self._PDF_NOTE_TYPES:
            info = annot.info
            content = info.get('content', '') or annot.get_text() or ''

        if not content and not note_content:
            return None

        colors = annot.colors
        color = self._rgb_to_hex(colors.get('stroke')) if colors else None

        rect = annot.rect
        position = {
            'x': rect.x0,
            'y': rect.y0,
            'width': rect.width,
            'height': rect.height,
        }

        return ExtractedAnnotation(
            annotation_type=our_type,
            # A markup with no readable text falls back to its note body.
            content=content or note_content or "",
            page_number=page_num,
            color=color,
            position=position,
            # Only report a separate note when it is not already the content.
            note=note_content if content else None,
        )

    @staticmethod
    def _rgb_to_hex(rgb) -> Optional[str]:
        """Convert a sequence of 0..1 colour components to ``'#rrggbb'``.

        Returns None when fewer than three components are given. Components
        are rounded (not truncated) and clamped to 0..255 so the result is
        always a six-digit hex string.
        """
        if not rgb or len(rgb) < 3:
            return None
        channels = [max(0, min(255, int(round(component * 255)))) for component in rgb[:3]]
        return '#{:02x}{:02x}{:02x}'.format(*channels)

    def _extract_epub_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """
        Extract annotations from an EPUB file.

        Note: EPUB files don't have a standard annotation format.
        This looks for common annotation storage patterns used by some readers:
        items whose name contains 'annotation' and ends in '.xml' or '.json'.
        """
        try:
            from ebooklib import epub
        except ImportError:
            logger.error("ebooklib not installed. Install with: pip install ebooklib")
            return []

        import json

        annotations: List[ExtractedAnnotation] = []

        try:
            book = epub.read_epub(file_path)

            # NOTE(review): only the items ebooklib exposes via get_items()
            # are scanned; confirm that this includes the locations readers
            # actually use (e.g. META-INF/annotations.xml).
            for item in book.get_items():
                name = item.get_name().lower()
                if 'annotation' not in name:
                    continue

                if name.endswith('.xml'):
                    content = item.get_content().decode('utf-8', errors='ignore')
                    annotations.extend(self._parse_epub_annotations_xml(content))

                elif name.endswith('.json'):
                    content = item.get_content().decode('utf-8', errors='ignore')
                    try:
                        data = json.loads(content)
                    except json.JSONDecodeError as e:
                        # A malformed annotation file must not stop the scan.
                        logger.debug(f"Skipping invalid annotation JSON in {name}: {e}")
                        continue
                    annotations.extend(self._parse_open_annotation_json(data))

            logger.info(f"Extracted {len(annotations)} annotations from EPUB: {file_path.name}")

        except Exception as e:
            logger.error(f"Error extracting EPUB annotations: {e}")

        return annotations

    def _parse_epub_annotations_xml(self, xml_content: str) -> List[ExtractedAnnotation]:
        """Parse common EPUB annotation XML formats.

        Every ``annotation``, ``highlight`` and ``note`` element with
        non-empty text becomes one annotation; ``annotation`` elements are
        reported as type 'note'.
        """
        annotations: List[ExtractedAnnotation] = []

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # Reported like the other missing optional dependencies.
            logger.error("beautifulsoup4 not installed. Install with: pip install beautifulsoup4 lxml")
            return annotations

        try:
            soup = BeautifulSoup(xml_content, 'xml')

            for annot in soup.find_all(['annotation', 'highlight', 'note']):
                content = annot.get_text(strip=True)
                if not content:
                    continue

                annot_type = 'note' if annot.name == 'annotation' else annot.name
                annotations.append(ExtractedAnnotation(
                    annotation_type=annot_type,
                    content=content,
                    page_number=None,
                    color=annot.get('color'),
                ))

        except Exception as e:
            logger.debug(f"Error parsing EPUB annotations XML: {e}")

        return annotations

    def _parse_open_annotation_json(self, data: Any) -> List[ExtractedAnnotation]:
        """Parse Open Annotation (W3C Web Annotation) format.

        ``data`` may be a single annotation object or a list of them.
        Entries that are not objects, or that carry no usable text, are
        skipped without affecting the remaining entries.
        """
        annotations: List[ExtractedAnnotation] = []
        items = data if isinstance(data, list) else [data]

        for item in items:
            if not isinstance(item, dict):
                continue

            content = self._open_annotation_content(item)
            if not content:
                continue

            annotations.append(ExtractedAnnotation(
                annotation_type=self._open_annotation_type(item),
                content=content,
                page_number=None,
            ))

        return annotations

    @staticmethod
    def _open_annotation_content(item: Dict[str, Any]) -> str:
        """Return the text of a Web Annotation object, or '' when it has none.

        The body is preferred: a plain string, or the ``value``/``text`` of a
        body object; when the body is a list the first entry with text wins.
        If the body yields nothing, the ``exact`` text of a target selector
        is used, so that highlights without a comment are still captured.
        """
        body = item.get('body')
        for entry in (body if isinstance(body, list) else [body]):
            if isinstance(entry, str):
                text = entry
            elif isinstance(entry, dict):
                text = entry.get('value') or entry.get('text') or ''
            else:
                continue
            if isinstance(text, str) and text:
                return text

        target = item.get('target')
        for entry in (target if isinstance(target, list) else [target]):
            if not isinstance(entry, dict):
                continue
            selector = entry.get('selector')
            for candidate in (selector if isinstance(selector, list) else [selector]):
                if not isinstance(candidate, dict):
                    continue
                exact = candidate.get('exact')
                if isinstance(exact, str) and exact:
                    return exact

        return ''

    @classmethod
    def _open_annotation_type(cls, item: Dict[str, Any]) -> str:
        """Map a Web Annotation ``motivation`` to an ebk annotation type.

        A missing motivation is treated as 'highlighting'. A list of
        motivations is accepted and the first recognised one wins. Anything
        unrecognised maps to 'note'.
        """
        motivation = item.get('motivation', 'highlighting')
        candidates = motivation if isinstance(motivation, list) else [motivation]
        for candidate in candidates:
            # The isinstance check keeps unhashable values out of the lookup.
            if isinstance(candidate, str) and candidate in cls._MOTIVATION_MAPPING:
                return cls._MOTIVATION_MAPPING[candidate]
        return 'note'
275
+
276
+
277
def extract_annotations_from_book(
    book,
    library_path: Path,
    file_format: Optional[str] = None
) -> List[ExtractedAnnotation]:
    """
    Extract annotations from a book's files (without saving to database).

    Args:
        book: Book ORM instance with files relationship loaded
        library_path: Path to the library root directory (a ``str`` is
            accepted too)
        file_format: Optional specific format to extract from (e.g., 'pdf');
            compared case-insensitively, a leading dot is ignored

    Returns:
        List of extracted annotations from all matching files, in file order
    """
    # Coerce once so that joining with file.path works for str input as well.
    library_path = Path(library_path)
    service = AnnotationExtractionService(library_path)

    # Normalise the filter once instead of on every iteration.
    wanted = file_format.lower().lstrip('.') if file_format else None

    all_annotations: List[ExtractedAnnotation] = []

    for file in book.files:
        if wanted:
            # A file without a recorded format can never match the filter.
            current = (file.format or '').lower().lstrip('.')
            if current != wanted:
                continue

        all_annotations.extend(service.extract_annotations(library_path / file.path))

    return all_annotations
306
+
307
+
308
def extract_and_save_annotations(
    library,
    book_id: int,
    file_format: Optional[str] = None
) -> int:
    """
    Extract annotations from a book's files and save to database.

    Annotations whose (type, page, content) already exists on the book, or
    was already saved earlier in the same call, are skipped.

    Args:
        library: Library instance
        book_id: Book ID to extract annotations for
        file_format: Optional specific format to extract from (e.g., 'pdf')

    Returns:
        Number of annotations extracted and saved (0 when the book is not
        found)
    """
    book = library.get_book(book_id)
    if not book:
        logger.error(f"Book {book_id} not found")
        return 0

    annotations = extract_annotations_from_book(book, library.library_path, file_format)

    # Build the duplicate index once; it is extended as annotations are saved
    # so that identical annotations within this batch are stored only once.
    seen = {
        (existing.annotation_type, existing.page_number, existing.content)
        for existing in (book.annotations or [])
    }

    total_saved = 0

    for annot in annotations:
        key = (annot.annotation_type, annot.page_number, annot.content)
        if key in seen:
            continue

        # NOTE(review): annot.note is not passed on; add_annotation's
        # signature is not visible here - confirm whether it can store it.
        library.add_annotation(
            book_id=book_id,
            content=annot.content,
            annotation_type=annot.annotation_type,
            page_number=annot.page_number,
            position=annot.position,
            color=annot.color
        )
        seen.add(key)
        total_saved += 1

    return total_saved