ebk 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +450 -0
- ebk/ai/llm_providers/__init__.py +26 -0
- ebk/ai/llm_providers/anthropic.py +209 -0
- ebk/ai/llm_providers/base.py +295 -0
- ebk/ai/llm_providers/gemini.py +285 -0
- ebk/ai/llm_providers/ollama.py +294 -0
- ebk/ai/metadata_enrichment.py +394 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +433 -0
- ebk/ai/text_extractor.py +393 -0
- ebk/calibre_import.py +66 -0
- ebk/cli.py +6433 -0
- ebk/config.py +230 -0
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +507 -0
- ebk/db/models.py +725 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +1 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/echo_export.py +279 -0
- ebk/exports/html_library.py +1743 -0
- ebk/exports/html_utils.py +87 -0
- ebk/exports/hugo.py +59 -0
- ebk/exports/jinja_export.py +286 -0
- ebk/exports/multi_facet_export.py +159 -0
- ebk/exports/opds_export.py +232 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/exports/zip.py +25 -0
- ebk/extract_metadata.py +341 -0
- ebk/ident.py +89 -0
- ebk/library_db.py +1440 -0
- ebk/opds.py +748 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +442 -0
- ebk/plugins/registry.py +499 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +173 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +3608 -0
- ebk/services/__init__.py +28 -0
- ebk/services/annotation_extraction.py +351 -0
- ebk/services/annotation_service.py +380 -0
- ebk/services/export_service.py +577 -0
- ebk/services/import_service.py +447 -0
- ebk/services/personal_metadata_service.py +347 -0
- ebk/services/queue_service.py +253 -0
- ebk/services/tag_service.py +281 -0
- ebk/services/text_extraction.py +317 -0
- ebk/services/view_service.py +12 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +471 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/skills/SKILL.md +182 -0
- ebk/skills/__init__.py +1 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +298 -0
- ebk/vfs/library_vfs.py +122 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk/vfs_router.py +275 -0
- ebk/views/__init__.py +32 -0
- ebk/views/dsl.py +668 -0
- ebk/views/service.py +619 -0
- ebk-0.4.4.dist-info/METADATA +755 -0
- ebk-0.4.4.dist-info/RECORD +87 -0
- ebk-0.4.4.dist-info/WHEEL +5 -0
- ebk-0.4.4.dist-info/entry_points.txt +2 -0
- ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
- ebk-0.4.4.dist-info/top_level.txt +1 -0
ebk/services/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Services for ebk business logic.
|
|
3
|
+
|
|
4
|
+
Provides a unified service layer for all ebk operations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .text_extraction import TextExtractionService
|
|
8
|
+
from .import_service import ImportService
|
|
9
|
+
from .export_service import ExportService
|
|
10
|
+
from .queue_service import ReadingQueueService
|
|
11
|
+
from .personal_metadata_service import PersonalMetadataService
|
|
12
|
+
from .annotation_service import AnnotationService
|
|
13
|
+
from .view_service import ViewService
|
|
14
|
+
|
|
15
|
+
# Public API of the services package, grouped by area.
__all__ = [
    # Core services
    "TextExtractionService",
    "ImportService",
    "ExportService",

    # Personal/user services
    "ReadingQueueService",
    "PersonalMetadataService",
    "AnnotationService",

    # Library organization
    "ViewService",
]
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Annotation extraction service for ebook files.
|
|
3
|
+
|
|
4
|
+
Extracts highlights, notes, and bookmarks from PDF and EPUB files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Dict, Any, Optional
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ExtractedAnnotation:
    """A single annotation pulled out of an ebook file.

    Only ``annotation_type`` and ``content`` are required; every other
    field defaults to ``None``.
    """

    # Kind of annotation: 'highlight', 'note', 'bookmark', 'underline', 'strikeout'
    annotation_type: str
    # The highlighted/noted text or the note content
    content: str
    page_number: Optional[int] = None
    color: Optional[str] = None
    # Position info
    position: Optional[Dict[str, Any]] = None
    # Additional note attached to a highlight
    note: Optional[str] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AnnotationExtractionService:
    """Service for extracting annotations from ebook files.

    PDF files are read with PyMuPDF; EPUB files are read with ebooklib and
    scanned for embedded annotation XML/JSON documents. Extraction is
    best-effort: failures are logged and whatever was collected so far is
    returned.
    """

    # PDF annotation subtype name -> annotation type used by ebk.
    # NOTE: 'Ink' is mapped, but the extraction below never derives any
    # content for it, so ink annotations are currently always skipped.
    _PDF_TYPE_MAPPING = {
        'Highlight': 'highlight',
        'Underline': 'underline',
        'StrikeOut': 'strikeout',
        'Squiggly': 'underline',
        'Text': 'note',  # Sticky note
        'FreeText': 'note',
        'Ink': 'drawing',
    }
    # Text markup subtypes: content is the page text under the annotation.
    _PDF_MARKUP_TYPES = ('Highlight', 'Underline', 'StrikeOut', 'Squiggly')
    # Note subtypes: content is the note text itself.
    _PDF_NOTE_TYPES = ('Text', 'FreeText')

    # W3C Web Annotation motivation -> annotation type used by ebk.
    _MOTIVATION_MAPPING = {
        'highlighting': 'highlight',
        'commenting': 'note',
        'bookmarking': 'bookmark',
        'describing': 'note',
    }

    def __init__(self, library_root: Path):
        """Store the library root directory (coerced to ``Path``)."""
        self.library_root = Path(library_root)

    def extract_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """
        Extract all annotations from an ebook file.

        Args:
            file_path: Path to the ebook file (a plain string is accepted too)

        Returns:
            List of extracted annotations; empty when the file is missing or
            its suffix is neither ``.pdf`` nor ``.epub``
        """
        file_path = Path(file_path)  # tolerate str paths, as __init__ does

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return []

        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_annotations(file_path)
        if suffix == '.epub':
            return self._extract_epub_annotations(file_path)

        logger.warning(f"Unsupported format for annotation extraction: {suffix}")
        return []

    def _extract_pdf_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """Extract annotations from a PDF file using PyMuPDF.

        Returns an empty list when PyMuPDF is not installed. If reading fails
        part-way through, the error is logged and the annotations collected
        up to that point are returned.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) not installed. Install with: pip install pymupdf")
            return []

        annotations = []

        try:
            # The context manager guarantees the document is closed even when
            # a page or annotation raises mid-iteration.
            with fitz.open(file_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    for annot in page.annots() or []:
                        extracted = self._convert_pdf_annotation(page, annot, page_num)
                        if extracted is not None:
                            annotations.append(extracted)

            logger.info(f"Extracted {len(annotations)} annotations from PDF: {file_path.name}")

        except Exception as e:
            logger.error(f"Error extracting PDF annotations: {e}")

        return annotations

    def _convert_pdf_annotation(self, page, annot, page_num: int) -> Optional[ExtractedAnnotation]:
        """Convert one PyMuPDF annotation, or return None if it should be skipped."""
        annot_type = annot.type[1]  # e.g., 'Highlight', 'Text', 'StrikeOut'

        our_type = self._PDF_TYPE_MAPPING.get(annot_type)
        if our_type is None:
            return None  # Skip unsupported types

        content = ""
        note_content = None

        if annot_type in self._PDF_MARKUP_TYPES:
            # Text under the annotation; best-effort, a failure here must not
            # discard a popup note that may still be attached.
            try:
                if annot.vertices:
                    content = page.get_text("text", clip=annot.rect).strip()
            except Exception as e:
                logger.debug(f"Could not read text under PDF annotation: {e}")

            # Popup note attached to the markup
            info = annot.info
            if info.get('content'):
                note_content = info['content']

        elif annot_type in self._PDF_NOTE_TYPES:
            content = annot.info.get('content', '') or annot.get_text() or ''

        if not content and not note_content:
            return None

        color = None
        colors = annot.colors
        if colors and colors.get('stroke'):
            color = self._rgb_to_hex(colors['stroke'])

        rect = annot.rect
        position = {
            'x': rect.x0,
            'y': rect.y0,
            'width': rect.width,
            'height': rect.height
        }

        return ExtractedAnnotation(
            annotation_type=our_type,
            # A markup without readable text falls back to its popup note.
            content=content or note_content or "",
            page_number=page_num,
            color=color,
            position=position,
            # Only report the note separately when it is not already the content.
            note=note_content if content else None
        )

    @staticmethod
    def _rgb_to_hex(rgb) -> Optional[str]:
        """Format the first three components of ``rgb`` (scaled by 255) as ``#rrggbb``."""
        if len(rgb) < 3:
            return None
        # Clamp so an out-of-range component cannot produce a malformed string.
        channels = [max(0, min(255, int(component * 255))) for component in rgb[:3]]
        return '#{:02x}{:02x}{:02x}'.format(*channels)

    def _extract_epub_annotations(self, file_path: Path) -> List[ExtractedAnnotation]:
        """
        Extract annotations from an EPUB file.

        Note: EPUB files don't have a standard annotation format.
        This looks for common annotation storage patterns used by some readers:
        items whose name contains 'annotation' and ends in '.xml' or '.json'.
        """
        import json  # local import: json is not among the module-level imports

        try:
            from ebooklib import epub
        except ImportError:
            logger.error("ebooklib not installed. Install with: pip install ebooklib")
            return []

        annotations = []

        try:
            book = epub.read_epub(file_path)

            for item in book.get_items():
                name = item.get_name().lower()
                if 'annotation' not in name:
                    continue

                if name.endswith('.xml'):
                    content = item.get_content().decode('utf-8', errors='ignore')
                    annotations.extend(self._parse_epub_annotations_xml(content))

                elif name.endswith('.json'):
                    content = item.get_content().decode('utf-8', errors='ignore')
                    try:
                        data = json.loads(content)
                    except json.JSONDecodeError as e:
                        # Not valid JSON after all; ignore this item only.
                        logger.debug(f"Skipping malformed annotation JSON {name}: {e}")
                        continue
                    annotations.extend(self._parse_open_annotation_json(data))

            logger.info(f"Extracted {len(annotations)} annotations from EPUB: {file_path.name}")

        except Exception as e:
            logger.error(f"Error extracting EPUB annotations: {e}")

        return annotations

    def _parse_epub_annotations_xml(self, xml_content: str) -> List[ExtractedAnnotation]:
        """Parse common EPUB annotation XML formats.

        Every ``annotation``, ``highlight`` and ``note`` element with
        non-empty text becomes one annotation; ``annotation`` elements are
        reported as notes. Any failure (including BeautifulSoup being
        unavailable) is logged at debug level and yields what was parsed so far.
        """
        annotations = []

        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(xml_content, 'xml')

            for annot in soup.find_all(['annotation', 'highlight', 'note']):
                content = annot.get_text(strip=True)
                if not content:
                    continue

                annot_type = 'note' if annot.name == 'annotation' else annot.name

                annotations.append(ExtractedAnnotation(
                    annotation_type=annot_type,
                    content=content,
                    page_number=None,
                    color=annot.get('color')
                ))

        except Exception as e:
            logger.debug(f"Error parsing EPUB annotations XML: {e}")

        return annotations

    def _parse_open_annotation_json(self, data: Any) -> List[ExtractedAnnotation]:
        """Parse Open Annotation (W3C Web Annotation) format.

        Args:
            data: Decoded JSON; a single annotation object or a list of them.
                Entries that are not objects, or that carry no textual body,
                are skipped individually.

        Returns:
            One annotation per entry with a non-empty textual body.
        """
        annotations = []
        items = data if isinstance(data, list) else [data]

        for item in items:
            if not isinstance(item, dict):
                continue

            content = self._open_annotation_text(item.get('body', {}))
            if not content:
                continue

            # A missing motivation is treated as a highlight.
            annot_type = self._open_annotation_type(item.get('motivation', 'highlighting'))

            annotations.append(ExtractedAnnotation(
                annotation_type=annot_type,
                content=content,
                page_number=None
            ))

        return annotations

    def _open_annotation_text(self, body: Any) -> str:
        """Return the first non-empty text in a ``body`` (string, object or list), else ''."""
        if isinstance(body, str):
            return body
        if isinstance(body, dict):
            for key in ('value', 'text'):
                value = body.get(key)
                if isinstance(value, str) and value:
                    return value
            return ""
        if isinstance(body, list):
            # The data model allows several bodies; use the first textual one.
            for entry in body:
                text = self._open_annotation_text(entry)
                if text:
                    return text
        return ""

    def _open_annotation_type(self, motivation: Any) -> str:
        """Map a ``motivation`` (string or list of strings) to an ebk type; default 'note'."""
        candidates = motivation if isinstance(motivation, list) else [motivation]
        for candidate in candidates:
            # isinstance guard: unhashable values must not reach the dict lookup.
            if isinstance(candidate, str) and candidate in self._MOTIVATION_MAPPING:
                return self._MOTIVATION_MAPPING[candidate]
        return 'note'
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def extract_annotations_from_book(
    book,
    library_path: Path,
    file_format: Optional[str] = None
) -> List[ExtractedAnnotation]:
    """
    Collect the annotations found in a book's files; nothing is persisted.

    Args:
        book: Object exposing a ``files`` iterable whose entries have
            ``format`` and ``path`` attributes
        library_path: Library root directory; each file's ``path`` is joined
            onto it
        file_format: When given, only files whose format matches it
            (case-insensitively) are processed, e.g. 'pdf'

    Returns:
        Annotations from all processed files, in file order
    """
    extractor = AnnotationExtractionService(library_path)
    collected = []

    for book_file in book.files:
        # No filter means every file qualifies; otherwise compare formats
        # ignoring case.
        wanted = not file_format or book_file.format.lower() == file_format.lower()
        if wanted:
            collected.extend(extractor.extract_annotations(library_path / book_file.path))

    return collected
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def extract_and_save_annotations(
    library,
    book_id: int,
    file_format: Optional[str] = None
) -> int:
    """
    Extract a book's annotations and store the ones not already present.

    Args:
        library: Library instance providing ``get_book``, ``library_path``
            and ``add_annotation``
        book_id: Book ID to extract annotations for
        file_format: Optional specific format to extract from (e.g., 'pdf')

    Returns:
        Number of annotations saved; 0 when the book does not exist
    """
    book = library.get_book(book_id)
    if not book:
        logger.error(f"Book {book_id} not found")
        return 0

    saved_count = 0

    for extracted in extract_annotations_from_book(book, library.library_path, file_format):
        # An annotation counts as a duplicate when content, page and type
        # all match one already attached to the book.
        already_stored = any(
            stored.content == extracted.content
            and stored.page_number == extracted.page_number
            and stored.annotation_type == extracted.annotation_type
            for stored in book.annotations
        )
        if already_stored:
            continue

        # NOTE: the extracted annotation's `note` field is not forwarded here.
        library.add_annotation(
            book_id=book_id,
            content=extracted.content,
            annotation_type=extracted.annotation_type,
            page_number=extracted.page_number,
            position=extracted.position,
            color=extracted.color
        )
        saved_count += 1

    return saved_count
|