ebk 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,393 @@
1
+ """
2
+ Extract text and structured content from various ebook formats.
3
+ """
4
+
5
+ import re
6
+ from pathlib import Path
7
+ from typing import List, Dict, Any, Optional
8
+ import fitz # PyMuPDF
9
+ import ebooklib
10
+ from ebooklib import epub
11
+ from bs4 import BeautifulSoup
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ChapterExtractor:
    """Extract chapters and structured content from books.

    Supports PDF (via PyMuPDF/fitz) and EPUB (via ebooklib). Unsupported
    formats and per-file extraction errors yield an empty chapter list
    rather than raising.
    """

    def extract(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a book file.

        Args:
            file_path: Path to the book file (``.pdf`` or ``.epub``).

        Returns:
            A list of chapter dicts with keys ``title``, ``level``,
            ``page_start``, ``page_end``, and ``content``. Empty list for
            unsupported formats or on extraction errors.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_chapters(file_path)
        elif suffix == '.epub':
            return self._extract_epub_chapters(file_path)
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return []

    def _extract_pdf_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a PDF file.

        Uses the document's table of contents when present; otherwise falls
        back to heuristic detection of "Chapter N" headings.
        """
        chapters = []

        try:
            # BUG FIX: the original called pdf.close() only on the success
            # path, leaking the document handle whenever get_toc() or text
            # extraction raised. The context manager closes it either way.
            with fitz.open(str(file_path)) as pdf:
                toc = pdf.get_toc()  # [(level, title, 1-based page), ...]

                if toc:
                    for i, (level, title, page_num) in enumerate(toc):
                        if level == 1:  # Main chapters only
                            start_page = page_num - 1  # TOC pages are 1-based
                            # NOTE(review): the chapter is assumed to end at
                            # the *next* TOC entry regardless of its level, so
                            # a chapter with subsections ends at its first
                            # subsection — confirm this is intended.
                            end_page = toc[i + 1][2] - 1 if i + 1 < len(toc) else len(pdf)

                            text = ""
                            for page_idx in range(start_page, min(end_page, len(pdf))):
                                page = pdf[page_idx]
                                text += page.get_text()

                            chapters.append({
                                'title': title,
                                'level': level,
                                'page_start': start_page + 1,
                                'page_end': end_page,
                                'content': text.strip()
                            })
                else:
                    # No TOC available: detect chapters by heading patterns.
                    chapters = self._detect_pdf_chapters_by_pattern(pdf)

        except Exception as e:
            logger.error(f"Error extracting PDF chapters: {e}")

        return chapters

    def _detect_pdf_chapters_by_pattern(self, pdf) -> List[Dict[str, Any]]:
        """Detect chapters in a TOC-less PDF by "Chapter N"-style headings.

        Args:
            pdf: An open fitz document.

        Returns:
            Chapter dicts in the same shape as ``_extract_pdf_chapters``.
            Text that precedes the first detected heading is discarded, and
            content is attributed at whole-page granularity.
        """
        chapters = []
        # Matches e.g. "Chapter 3: Title", "CHAPTER IV", "Ch. 2 - Intro".
        chapter_pattern = re.compile(
            r'^(Chapter|CHAPTER|Ch\.|CH\.?)\s+(\d+|[IVX]+)[\s:\-]*(.*)$',
            re.MULTILINE
        )

        current_chapter = None
        chapter_text = []

        for page_num in range(len(pdf)):
            page = pdf[page_num]
            text = page.get_text()

            # Each heading on this page closes out the running chapter and
            # starts a new one.
            matches = chapter_pattern.finditer(text)
            for match in matches:
                if current_chapter:
                    current_chapter['content'] = '\n'.join(chapter_text).strip()
                    current_chapter['page_end'] = page_num
                    chapters.append(current_chapter)
                    chapter_text = []

                current_chapter = {
                    'title': match.group(3).strip() or f"Chapter {match.group(2)}",
                    'level': 1,
                    'page_start': page_num + 1,
                    'page_end': None,
                    'content': ''
                }

            # The whole page belongs to the most recently started chapter.
            if current_chapter:
                chapter_text.append(text)

        # Flush the final chapter.
        if current_chapter:
            current_chapter['content'] = '\n'.join(chapter_text).strip()
            current_chapter['page_end'] = len(pdf)
            chapters.append(current_chapter)

        return chapters

    def _extract_epub_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from an EPUB file.

        Each document item becomes one chapter; the title is taken from the
        first h1/h2/h3 heading, falling back to the item's file name. EPUBs
        have no fixed pagination, so ``page_start``/``page_end`` are None.
        """
        chapters = []

        try:
            book = epub.read_epub(str(file_path))

            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')

                    # Prefer the most prominent heading as the chapter title.
                    title = None
                    for heading in ['h1', 'h2', 'h3']:
                        heading_elem = soup.find(heading)
                        if heading_elem:
                            title = heading_elem.get_text().strip()
                            break

                    if not title:
                        title = item.get_name()

                    text = soup.get_text(separator='\n').strip()

                    if text:  # Skip empty documents (covers, blank pages)
                        chapters.append({
                            'title': title,
                            'level': 1,
                            'page_start': None,  # EPUB doesn't have pages
                            'page_end': None,
                            'content': text
                        })

        except Exception as e:
            logger.error(f"Error extracting EPUB chapters: {e}")

        return chapters
159
+
160
+
161
class TextExtractor:
    """Extract and process text from ebooks for knowledge extraction.

    Handles PDF, EPUB, and plain-text/Markdown inputs; per-format errors
    are logged and yield empty results rather than propagating.
    """

    def __init__(self):
        # Shared extractor for chapter-structured operations.
        self.chapter_extractor = ChapterExtractor()

    def extract_full_text(self, file_path: Path) -> str:
        """Extract complete text from a book.

        Returns an empty string for unsupported formats.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_text(file_path)
        elif suffix == '.epub':
            return self._extract_epub_text(file_path)
        elif suffix in ['.txt', '.md']:
            return file_path.read_text(encoding='utf-8')
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return ""

    def extract_key_passages(self, file_path: Path,
                             keywords: Optional[List[str]] = None,
                             context_size: int = 200) -> List[Dict[str, Any]]:
        """
        Extract passages containing specific keywords or important concepts.

        Args:
            file_path: Book to scan.
            keywords: Terms marking a sentence as important; when None a
                built-in list of emphasis words is used.
            context_size: Currently unused by the implementation; kept for
                interface stability.

        Returns:
            Passage dicts (chapter, page, sentence, context,
            importance_score) sorted by descending importance_score.
        """
        chapters = self.chapter_extractor.extract(file_path)
        passages = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            sentences = self._split_into_sentences(content)

            for i, sentence in enumerate(sentences):
                if self._is_important_passage(sentence, keywords):
                    # Context window: up to two sentences before and after.
                    start = max(0, i - 2)
                    end = min(len(sentences), i + 3)
                    context = ' '.join(sentences[start:end])

                    passages.append({
                        'chapter': chapter['title'],
                        'page': chapter.get('page_start'),
                        'sentence': sentence,
                        'context': context,
                        'importance_score': self._calculate_importance(sentence, keywords)
                    })

        # Most important passages first.
        passages.sort(key=lambda x: x['importance_score'], reverse=True)
        return passages

    def extract_quotes(self, file_path: Path) -> List[Dict[str, str]]:
        """Extract quoted text (30-500 chars) from a book.

        Duplicates are dropped, keeping first-seen order.
        """
        text = self.extract_full_text(file_path)
        quotes = []

        # BUG FIX: the "smart quotes" entry previously repeated the straight
        # double-quote pattern; it now matches curly quotes as its comment
        # always claimed.
        quote_patterns = [
            r'"([^"]+)"',    # Straight double quotes
            r"'([^']+)'",    # Straight single quotes
            r'“([^”]+)”',    # Curly (smart) double quotes
            r'«([^»]+)»'     # French guillemets
        ]

        for pattern in quote_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if 30 < len(match) < 500:  # Reasonable quote length
                    quotes.append({
                        'text': match,
                        'length': len(match)
                    })

        # Remove duplicates while preserving first-seen order.
        seen = set()
        unique_quotes = []
        for quote in quotes:
            if quote['text'] not in seen:
                seen.add(quote['text'])
                unique_quotes.append(quote)

        return unique_quotes

    def extract_definitions(self, file_path: Path) -> List[Dict[str, str]]:
        """Extract definitions and explanations from text.

        Matches phrasings such as "X is defined as ...", "X means ...",
        "X refers to ...". Only definitions longer than 20 characters are
        kept. Results are heuristic and may include false positives
        (the "term: definition" pattern in particular is broad).
        """
        text = self.extract_full_text(file_path)
        definitions = []

        # Patterns that indicate definitions; group 1 = term, group 2 = body.
        definition_patterns = [
            r'(\w+) is defined as ([^.]+)',
            r'(\w+) means ([^.]+)',
            r'(\w+): ([^.]+)',
            r'define (\w+) as ([^.]+)',
            r'(\w+) refers to ([^.]+)',
        ]

        for pattern in definition_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for term, definition in matches:
                if len(definition) > 20:  # Meaningful definition
                    definitions.append({
                        'term': term.strip(),
                        'definition': definition.strip()
                    })

        return definitions

    def extract_summaries(self, file_path: Path,
                          summary_length: int = 500) -> List[Dict[str, str]]:
        """
        Extract or generate chapter summaries.

        Looks for explicit summary sections; otherwise stitches one from the
        first and last paragraphs (or the chapter head when there are too few
        paragraphs).

        Args:
            file_path: Book to summarize.
            summary_length: Target character budget for generated summaries.

        Returns:
            Dicts with keys ``chapter``, ``summary``, and ``page``.
        """
        chapters = self.chapter_extractor.extract(file_path)
        summaries = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            # Prefer an explicit summary section when the text has one.
            summary_text = self._find_summary_section(content)

            if not summary_text:
                # Fall back to first + last paragraph, half the budget each.
                paragraphs = content.split('\n\n')
                if len(paragraphs) > 3:
                    summary_text = paragraphs[0][:summary_length // 2] + "..." + \
                                   paragraphs[-1][:summary_length // 2]
                else:
                    summary_text = content[:summary_length]

            summaries.append({
                'chapter': chapter['title'],
                'summary': summary_text.strip(),
                'page': chapter.get('page_start')
            })

        return summaries

    def _extract_pdf_text(self, file_path: Path) -> str:
        """Extract plain text from a PDF; empty string on error."""
        text = ""
        try:
            # BUG FIX: the original explicit close() was skipped when a page
            # read raised, leaking the document handle; the context manager
            # closes it on all paths.
            with fitz.open(str(file_path)) as pdf:
                for page in pdf:
                    text += page.get_text()
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
        return text

    def _extract_epub_text(self, file_path: Path) -> str:
        """Extract plain text from all EPUB document items; empty on error."""
        text = ""
        try:
            book = epub.read_epub(str(file_path))
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')
                    text += soup.get_text(separator='\n')
        except Exception as e:
            logger.error(f"Error extracting EPUB text: {e}")
        return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences.

        Simple terminator-based splitter (".", "!", "?" followed by
        whitespace) — can be improved with NLTK. Terminators are consumed,
        and empty fragments are dropped.
        """
        sentences = re.split(r'[.!?]\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _is_important_passage(self, sentence: str,
                              keywords: Optional[List[str]] = None) -> bool:
        """Return True when the sentence contains any keyword (case-insensitive)."""
        if not keywords:
            # Default emphasis/conclusion indicators.
            keywords = ['therefore', 'thus', 'consequently', 'important',
                        'key', 'critical', 'essential', 'fundamental']

        sentence_lower = sentence.lower()
        return any(kw.lower() in sentence_lower for kw in keywords)

    def _calculate_importance(self, sentence: str,
                              keywords: Optional[List[str]] = None) -> float:
        """Score a sentence's importance in [0, 1].

        Additive heuristic: medium length (+0.2), each caller keyword hit
        (+0.3), each built-in emphasis word (+0.2), a conclusion marker
        (+0.3); capped at 1.0.
        """
        score = 0.0

        # Length factor: prefer substantive but not run-on sentences.
        if 50 < len(sentence) < 300:
            score += 0.2

        # Caller-supplied keyword presence.
        if keywords:
            sentence_lower = sentence.lower()
            for kw in keywords:
                if kw.lower() in sentence_lower:
                    score += 0.3

        # Built-in emphasis indicators.
        importance_indicators = ['important', 'key', 'critical', 'essential',
                                 'fundamental', 'significant', 'crucial']
        for indicator in importance_indicators:
            if indicator in sentence.lower():
                score += 0.2

        # Conclusion indicators.
        if any(word in sentence.lower() for word in ['therefore', 'thus', 'consequently', 'in conclusion']):
            score += 0.3

        return min(score, 1.0)

    def _find_summary_section(self, text: str) -> Optional[str]:
        """Find an explicit summary section in text.

        Returns the captured summary body of the first matching pattern,
        or None when no summary marker is present.
        """
        # BUG FIX: the original raw strings used `[^\\n]` and `\\n`, which a
        # regex reads as "any char except backslash or the letter n" and the
        # two-character sequence backslash+n — they never matched newlines.
        summary_patterns = [
            r'Summary[:\s]+([^\n]+(?:\n[^\n]+)*)',
            r'In summary[,:\s]+([^.]+\.)',
            r'To summarize[,:\s]+([^.]+\.)',
            r'Key points[:\s]+([^\n]+(?:\n[^\n]+)*)'
        ]

        for pattern in summary_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None
ebk/calibre_import.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ Calibre library import functionality.
3
+
4
+ Provides functions to import books from a Calibre library into an ebk library.
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Optional, Dict, Any
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def import_calibre_library(
    calibre_path: Path,
    library,
    limit: Optional[int] = None
) -> Dict[str, Any]:
    """
    Import books from a Calibre library.

    Args:
        calibre_path: Path to the Calibre library folder
        library: An open ebk Library instance (must expose add_calibre_book)
        limit: Maximum number of books to import; None means no limit

    Returns:
        Dictionary with import results:
        - total: Number of books found (after applying limit)
        - imported: Number of books successfully imported
        - failed: Number of books that failed to import
        - errors: List of error messages
    """
    results = {
        "total": 0,
        "imported": 0,
        "failed": 0,
        "errors": []
    }

    # Every Calibre book folder contains a metadata.opf file, so these
    # mark the books to import.
    opf_files = list(calibre_path.rglob("metadata.opf"))

    # BUG FIX: `if limit:` silently ignored an explicit limit of 0; compare
    # against None so limit=0 means "import nothing", not "import everything".
    if limit is not None:
        opf_files = opf_files[:limit]

    results["total"] = len(opf_files)

    if not opf_files:
        results["errors"].append("No books found. Make sure this is a Calibre library directory.")
        return results

    for opf_path in opf_files:
        try:
            book = library.add_calibre_book(opf_path)
            if book:
                results["imported"] += 1
            else:
                # add_calibre_book signalled failure without raising.
                results["failed"] += 1
                results["errors"].append(f"Failed to import: {opf_path.parent.name}")
        except Exception as e:
            # Best-effort import: record the error and keep going.
            results["failed"] += 1
            results["errors"].append(f"{opf_path.parent.name}: {str(e)}")
            logger.debug(f"Failed to import {opf_path.parent.name}: {e}")

    return results