ebk 0.1.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ebk has been flagged as potentially problematic; consult the registry's advisory page for more details.

Files changed (84) hide show
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +443 -0
  4. ebk/ai/llm_providers/__init__.py +21 -0
  5. ebk/ai/llm_providers/base.py +230 -0
  6. ebk/ai/llm_providers/ollama.py +362 -0
  7. ebk/ai/metadata_enrichment.py +396 -0
  8. ebk/ai/question_generator.py +328 -0
  9. ebk/ai/reading_companion.py +224 -0
  10. ebk/ai/semantic_search.py +434 -0
  11. ebk/ai/text_extractor.py +394 -0
  12. ebk/cli.py +2828 -680
  13. ebk/config.py +260 -22
  14. ebk/db/__init__.py +37 -0
  15. ebk/db/migrations.py +180 -0
  16. ebk/db/models.py +526 -0
  17. ebk/db/session.py +144 -0
  18. ebk/decorators.py +132 -0
  19. ebk/exports/base_exporter.py +218 -0
  20. ebk/exports/html_library.py +1390 -0
  21. ebk/exports/html_utils.py +117 -0
  22. ebk/exports/hugo.py +7 -3
  23. ebk/exports/jinja_export.py +287 -0
  24. ebk/exports/multi_facet_export.py +164 -0
  25. ebk/exports/symlink_dag.py +479 -0
  26. ebk/extract_metadata.py +76 -7
  27. ebk/library_db.py +899 -0
  28. ebk/plugins/__init__.py +42 -0
  29. ebk/plugins/base.py +502 -0
  30. ebk/plugins/hooks.py +444 -0
  31. ebk/plugins/registry.py +500 -0
  32. ebk/repl/__init__.py +9 -0
  33. ebk/repl/find.py +126 -0
  34. ebk/repl/grep.py +174 -0
  35. ebk/repl/shell.py +1677 -0
  36. ebk/repl/text_utils.py +320 -0
  37. ebk/search_parser.py +413 -0
  38. ebk/server.py +1633 -0
  39. ebk/services/__init__.py +11 -0
  40. ebk/services/import_service.py +442 -0
  41. ebk/services/tag_service.py +282 -0
  42. ebk/services/text_extraction.py +317 -0
  43. ebk/similarity/__init__.py +77 -0
  44. ebk/similarity/base.py +154 -0
  45. ebk/similarity/core.py +445 -0
  46. ebk/similarity/extractors.py +168 -0
  47. ebk/similarity/metrics.py +376 -0
  48. ebk/vfs/__init__.py +101 -0
  49. ebk/vfs/base.py +301 -0
  50. ebk/vfs/library_vfs.py +124 -0
  51. ebk/vfs/nodes/__init__.py +54 -0
  52. ebk/vfs/nodes/authors.py +196 -0
  53. ebk/vfs/nodes/books.py +480 -0
  54. ebk/vfs/nodes/files.py +155 -0
  55. ebk/vfs/nodes/metadata.py +385 -0
  56. ebk/vfs/nodes/root.py +100 -0
  57. ebk/vfs/nodes/similar.py +165 -0
  58. ebk/vfs/nodes/subjects.py +184 -0
  59. ebk/vfs/nodes/tags.py +371 -0
  60. ebk/vfs/resolver.py +228 -0
  61. ebk-0.3.2.dist-info/METADATA +755 -0
  62. ebk-0.3.2.dist-info/RECORD +69 -0
  63. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
  64. ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
  65. ebk/imports/__init__.py +0 -0
  66. ebk/imports/calibre.py +0 -144
  67. ebk/imports/ebooks.py +0 -116
  68. ebk/llm.py +0 -58
  69. ebk/manager.py +0 -44
  70. ebk/merge.py +0 -308
  71. ebk/streamlit/__init__.py +0 -0
  72. ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
  73. ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
  74. ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
  75. ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
  76. ebk/streamlit/app.py +0 -185
  77. ebk/streamlit/display.py +0 -168
  78. ebk/streamlit/filters.py +0 -151
  79. ebk/streamlit/utils.py +0 -58
  80. ebk/utils.py +0 -311
  81. ebk-0.1.0.dist-info/METADATA +0 -457
  82. ebk-0.1.0.dist-info/RECORD +0 -29
  83. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
  84. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,394 @@
1
+ """
2
+ Extract text and structured content from various ebook formats.
3
+ """
4
+
5
+ import re
6
+ import json
7
+ from pathlib import Path
8
+ from typing import List, Dict, Any, Optional, Tuple
9
+ import fitz # PyMuPDF
10
+ import ebooklib
11
+ from ebooklib import epub
12
+ from bs4 import BeautifulSoup
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ChapterExtractor:
    """Extract chapters and structured content from books.

    Each chapter is a dict with keys ``title``, ``level``, ``page_start``,
    ``page_end`` and ``content``.  Page numbers are 1-based; EPUB chapters
    carry no page information.
    """

    def extract(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a book file, dispatching on the extension.

        Returns an empty list for unsupported formats.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_chapters(file_path)
        elif suffix == '.epub':
            return self._extract_epub_chapters(file_path)
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return []

    def _extract_pdf_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a PDF file.

        Uses the document's table of contents when present, otherwise falls
        back to heading-pattern detection.  Returns [] on error.
        """
        chapters: List[Dict[str, Any]] = []
        pdf = None

        try:
            pdf = fitz.open(str(file_path))
            toc = pdf.get_toc()  # entries are (level, title, 1-based page)

            if toc:
                for i, (level, title, page_num) in enumerate(toc):
                    if level != 1:  # only top-level entries start a chapter
                        continue
                    start_page = page_num - 1  # 0-based page index

                    # BUGFIX: a chapter runs until the next *level-1* TOC
                    # entry.  The previous code stopped at the very next
                    # entry of any level, truncating chapters at their
                    # first sub-section heading.
                    end_page = len(pdf)
                    for next_level, _next_title, next_page in toc[i + 1:]:
                        if next_level == 1:
                            end_page = next_page - 1
                            break

                    text = ""
                    for page_idx in range(start_page, min(end_page, len(pdf))):
                        text += pdf[page_idx].get_text()

                    chapters.append({
                        'title': title,
                        'level': level,
                        'page_start': start_page + 1,  # back to 1-based
                        'page_end': end_page,
                        'content': text.strip()
                    })
            else:
                # No TOC available: detect chapters by heading patterns.
                chapters = self._detect_pdf_chapters_by_pattern(pdf)

        except Exception as e:
            logger.error(f"Error extracting PDF chapters: {e}")
        finally:
            # BUGFIX: close the document even when extraction raises;
            # the previous code leaked the file handle on error.
            if pdf is not None:
                pdf.close()

        return chapters

    def _detect_pdf_chapters_by_pattern(self, pdf) -> List[Dict[str, Any]]:
        """Detect chapters in a PDF by scanning for "Chapter N" headings.

        Pages seen before the first heading are ignored.  Each heading
        starts a new chapter whose content is the full text of every page
        from the heading's page up to the page before the next heading.
        """
        chapters: List[Dict[str, Any]] = []
        # Matches e.g. "Chapter 7: Title", "CHAPTER IV", "Ch. 2 - Title".
        chapter_pattern = re.compile(
            r'^(Chapter|CHAPTER|Ch\.|CH\.?)\s+(\d+|[IVX]+)[\s:\-]*(.*)$',
            re.MULTILINE
        )

        current_chapter = None
        chapter_text: List[str] = []

        for page_num in range(len(pdf)):
            page = pdf[page_num]
            text = page.get_text()

            # Look for chapter headings on this page.
            for match in chapter_pattern.finditer(text):
                if current_chapter:
                    # Close out the previous chapter before starting anew.
                    current_chapter['content'] = '\n'.join(chapter_text).strip()
                    current_chapter['page_end'] = page_num
                    chapters.append(current_chapter)
                    chapter_text = []

                current_chapter = {
                    # Fall back to a generated title when the heading line
                    # carries no text after the chapter number.
                    'title': match.group(3).strip() or f"Chapter {match.group(2)}",
                    'level': 1,
                    'page_start': page_num + 1,
                    'page_end': None,
                    'content': ''
                }

            if current_chapter:
                chapter_text.append(text)

        # Flush the final chapter, if any heading was ever found.
        if current_chapter:
            current_chapter['content'] = '\n'.join(chapter_text).strip()
            current_chapter['page_end'] = len(pdf)
            chapters.append(current_chapter)

        return chapters

    def _extract_epub_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from an EPUB file.

        Each document item becomes one chapter; the title is taken from the
        first h1/h2/h3 heading, falling back to the item's file name.
        Returns [] on error.
        """
        chapters: List[Dict[str, Any]] = []

        try:
            book = epub.read_epub(str(file_path))

            # Extract text from each document item in reading order.
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')

                    # Prefer the most prominent heading as the title.
                    title = None
                    for heading in ['h1', 'h2', 'h3']:
                        heading_elem = soup.find(heading)
                        if heading_elem:
                            title = heading_elem.get_text().strip()
                            break

                    if not title:
                        title = item.get_name()

                    text = soup.get_text(separator='\n').strip()

                    if text:
                        chapters.append({
                            'title': title,
                            'level': 1,
                            'page_start': None,  # EPUB has no page concept
                            'page_end': None,
                            'content': text
                        })

        except Exception as e:
            logger.error(f"Error extracting EPUB chapters: {e}")

        return chapters
160
+
161
+
162
class TextExtractor:
    """Extract and process text from ebooks for knowledge extraction."""

    def __init__(self):
        # Delegates chapter segmentation to ChapterExtractor.
        self.chapter_extractor = ChapterExtractor()

    def extract_full_text(self, file_path: Path) -> str:
        """Extract the complete text of a book.

        Supports .pdf, .epub, .txt and .md; other formats return "".
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_text(file_path)
        elif suffix == '.epub':
            return self._extract_epub_text(file_path)
        elif suffix in ['.txt', '.md']:
            return file_path.read_text(encoding='utf-8')
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return ""

    def extract_key_passages(self, file_path: Path,
                             keywords: Optional[List[str]] = None,
                             context_size: int = 200) -> List[Dict[str, Any]]:
        """Extract passages containing specific keywords or important concepts.

        Each passage records its chapter, page, the matching sentence, a few
        sentences of surrounding context and an importance score in [0, 1].
        Results are sorted by descending importance.

        NOTE(review): ``context_size`` is currently unused — context is a
        fixed window of +-2 sentences; confirm intended semantics.
        """
        chapters = self.chapter_extractor.extract(file_path)
        passages = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            sentences = self._split_into_sentences(content)

            for i, sentence in enumerate(sentences):
                # Keep only sentences flagged as important for the keywords.
                if self._is_important_passage(sentence, keywords):
                    # Context window: two sentences on either side.
                    start = max(0, i - 2)
                    end = min(len(sentences), i + 3)
                    context = ' '.join(sentences[start:end])

                    passages.append({
                        'chapter': chapter['title'],
                        'page': chapter.get('page_start'),
                        'sentence': sentence,
                        'context': context,
                        'importance_score': self._calculate_importance(sentence, keywords)
                    })

        # Most important passages first.
        passages.sort(key=lambda x: x['importance_score'], reverse=True)
        return passages

    def extract_quotes(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract quoted text (31-499 chars) from a book, de-duplicated."""
        text = self.extract_full_text(file_path)
        quotes = []

        quote_patterns = [
            r'"([^"]+)"',                # ASCII double quotes
            r"'([^']+)'",                # ASCII single quotes
            # BUGFIX: this entry had been reduced (apparent mojibake) to a
            # duplicate of the ASCII double-quote pattern; restore the
            # curly "smart" quotes it was commented as matching.
            '\u201c([^\u201d]+)\u201d',  # smart quotes
            r'«([^»]+)»'                 # French quotes
        ]

        for pattern in quote_patterns:
            for match in re.findall(pattern, text):
                if 30 < len(match) < 500:  # reasonable quote length
                    quotes.append({
                        'text': match,
                        'length': len(match)
                    })

        # Remove duplicates while preserving first-seen order.
        seen = set()
        unique_quotes = []
        for quote in quotes:
            if quote['text'] not in seen:
                seen.add(quote['text'])
                unique_quotes.append(quote)

        return unique_quotes

    def extract_definitions(self, file_path: Path) -> List[Dict[str, str]]:
        """Extract definitions and explanations from text.

        Matches common definitional phrasings ("X is defined as ...",
        "X means ...", etc.).  Definitions of 20 characters or fewer are
        discarded as noise.
        """
        text = self.extract_full_text(file_path)
        definitions = []

        definition_patterns = [
            r'(\w+) is defined as ([^.]+)',
            r'(\w+) means ([^.]+)',
            r'(\w+): ([^.]+)',
            r'define (\w+) as ([^.]+)',
            r'(\w+) refers to ([^.]+)',
        ]

        for pattern in definition_patterns:
            for term, definition in re.findall(pattern, text, re.IGNORECASE):
                if len(definition) > 20:  # keep only meaningful definitions
                    definitions.append({
                        'term': term.strip(),
                        'definition': definition.strip()
                    })

        return definitions

    def extract_summaries(self, file_path: Path,
                          summary_length: int = 500) -> List[Dict[str, Any]]:
        """Extract or generate chapter summaries.

        Looks for an explicit summary section in each chapter; when none is
        found, builds one from the chapter's first and last paragraphs
        (or its first ``summary_length`` characters for short chapters).
        """
        chapters = self.chapter_extractor.extract(file_path)
        summaries = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            # Prefer an explicit "Summary"/"Key points" section.
            summary_text = self._find_summary_section(content)

            if not summary_text:
                # Fallback: stitch together the opening and closing
                # paragraphs, half the budget each.
                paragraphs = content.split('\n\n')
                if len(paragraphs) > 3:
                    summary_text = paragraphs[0][:summary_length // 2] + "..." + \
                                   paragraphs[-1][:summary_length // 2]
                else:
                    summary_text = content[:summary_length]

            summaries.append({
                'chapter': chapter['title'],
                'summary': summary_text.strip(),
                'page': chapter.get('page_start')
            })

        return summaries

    def _extract_pdf_text(self, file_path: Path) -> str:
        """Extract all page text from a PDF; returns "" on open failure."""
        text = ""
        pdf = None
        try:
            pdf = fitz.open(str(file_path))
            for page in pdf:
                text += page.get_text()
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
        finally:
            # BUGFIX: close the document even if a page read raises;
            # the previous code leaked the handle on error.
            if pdf is not None:
                pdf.close()
        return text

    def _extract_epub_text(self, file_path: Path) -> str:
        """Extract all document text from an EPUB; returns "" on error."""
        text = ""
        try:
            book = epub.read_epub(str(file_path))
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')
                    text += soup.get_text(separator='\n')
        except Exception as e:
            logger.error(f"Error extracting EPUB text: {e}")
        return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences.

        Simple punctuation-based splitter (could be improved with NLTK);
        terminal punctuation followed by whitespace is consumed.
        """
        sentences = re.split(r'[.!?]\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _is_important_passage(self, sentence: str, keywords: Optional[List[str]] = None) -> bool:
        """Return True if the sentence contains any keyword (case-insensitive).

        With no keywords given, a default set of emphasis/conclusion
        indicator words is used.
        """
        if not keywords:
            keywords = ['therefore', 'thus', 'consequently', 'important',
                        'key', 'critical', 'essential', 'fundamental']

        sentence_lower = sentence.lower()
        return any(kw.lower() in sentence_lower for kw in keywords)

    def _calculate_importance(self, sentence: str, keywords: Optional[List[str]] = None) -> float:
        """Score a sentence's importance in [0, 1].

        Heuristic: +0.2 for a readable length, +0.3 per matched keyword,
        +0.2 per emphasis word, +0.3 for a conclusion marker; capped at 1.0.
        """
        score = 0.0

        # Length factor: neither a fragment nor a run-on.
        if 50 < len(sentence) < 300:
            score += 0.2

        # Caller-supplied keyword presence.
        if keywords:
            sentence_lower = sentence.lower()
            for kw in keywords:
                if kw.lower() in sentence_lower:
                    score += 0.3

        # Emphasis indicators.
        importance_indicators = ['important', 'key', 'critical', 'essential',
                                 'fundamental', 'significant', 'crucial']
        for indicator in importance_indicators:
            if indicator in sentence.lower():
                score += 0.2

        # Conclusion indicators.
        if any(word in sentence.lower() for word in ['therefore', 'thus', 'consequently', 'in conclusion']):
            score += 0.3

        return min(score, 1.0)

    def _find_summary_section(self, text: str) -> Optional[str]:
        """Find an explicit summary section in text, or return None.

        BUGFIX: the multi-line patterns previously used ``[^\\n]`` inside
        raw strings, which matches "anything except a backslash or the
        letter n" rather than "anything except a newline"; they now use a
        real newline character class, so multi-line summaries are captured.
        """
        summary_patterns = [
            r'Summary[:\s]+([^\n]+(?:\n[^\n]+)*)',
            r'In summary[,:\s]+([^.]+\.)',
            r'To summarize[,:\s]+([^.]+\.)',
            r'Key points[:\s]+([^\n]+(?:\n[^\n]+)*)'
        ]

        for pattern in summary_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None