ebk 0.1.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +2828 -680
- ebk/config.py +260 -22
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +132 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +7 -3
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/extract_metadata.py +76 -7
- ebk/library_db.py +899 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +444 -0
- ebk/plugins/registry.py +500 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +1633 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk-0.3.2.dist-info/METADATA +755 -0
- ebk-0.3.2.dist-info/RECORD +69 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
- ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +0 -144
- ebk/imports/ebooks.py +0 -116
- ebk/llm.py +0 -58
- ebk/manager.py +0 -44
- ebk/merge.py +0 -308
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +0 -185
- ebk/streamlit/display.py +0 -168
- ebk/streamlit/filters.py +0 -151
- ebk/streamlit/utils.py +0 -58
- ebk/utils.py +0 -311
- ebk-0.1.0.dist-info/METADATA +0 -457
- ebk-0.1.0.dist-info/RECORD +0 -29
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
ebk/ai/text_extractor.py
ADDED
@@ -0,0 +1,394 @@
"""
Extract text and structured content from various ebook formats.
"""

import re
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import fitz  # PyMuPDF
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import logging

logger = logging.getLogger(__name__)


class ChapterExtractor:
    """Extract chapters and structured content from books."""

    def extract(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a book file."""
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_chapters(file_path)
        elif suffix == '.epub':
            return self._extract_epub_chapters(file_path)
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return []

    def _extract_pdf_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from a PDF file."""
        chapters = []

        try:
            pdf = fitz.open(str(file_path))
            toc = pdf.get_toc()  # Table of contents

            if toc:
                # Use TOC if available
                for i, (level, title, page_num) in enumerate(toc):
                    if level == 1:  # Main chapters
                        # Get chapter content
                        start_page = page_num - 1
                        end_page = toc[i + 1][2] - 1 if i + 1 < len(toc) else len(pdf)
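                        # Caveat: toc[i + 1] is the next outline entry at *any*
                        # level, so a chapter with nested TOC entries ends at
                        # its first subsection rather than at the next chapter.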

                        text = ""
                        for page_idx in range(start_page, min(end_page, len(pdf))):
                            page = pdf[page_idx]
                            text += page.get_text()

                        chapters.append({
                            'title': title,
                            'level': level,
                            'page_start': start_page + 1,
                            'page_end': end_page,
                            'content': text.strip()
                        })
            else:
                # Fallback: Try to detect chapters by patterns
                chapters = self._detect_pdf_chapters_by_pattern(pdf)

            pdf.close()

        except Exception as e:
            logger.error(f"Error extracting PDF chapters: {e}")

        return chapters

    def _detect_pdf_chapters_by_pattern(self, pdf) -> List[Dict[str, Any]]:
        """Detect chapters in PDF by common patterns."""
        chapters = []
        chapter_pattern = re.compile(
            r'^(Chapter|CHAPTER|Ch\.|CH\.?)\s+(\d+|[IVX]+)[\s:\-]*(.*)$',
            re.MULTILINE
        )
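        # Matches headings such as "Chapter 12: Title" or "CHAPTER IV" at the
        # start of a line (re.MULTILINE anchors ^ at every line break).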

        current_chapter = None
        chapter_text = []

        for page_num in range(len(pdf)):
            page = pdf[page_num]
            text = page.get_text()

            # Look for chapter headings
            matches = chapter_pattern.finditer(text)
            for match in matches:
                if current_chapter:
                    # Save previous chapter
                    current_chapter['content'] = '\n'.join(chapter_text).strip()
                    current_chapter['page_end'] = page_num
                    chapters.append(current_chapter)
                    chapter_text = []

                # Start new chapter
                current_chapter = {
                    'title': match.group(3).strip() or f"Chapter {match.group(2)}",
                    'level': 1,
                    'page_start': page_num + 1,
                    'page_end': None,
                    'content': ''
                }

            if current_chapter:
                chapter_text.append(text)

        # Save last chapter
        if current_chapter:
            current_chapter['content'] = '\n'.join(chapter_text).strip()
            current_chapter['page_end'] = len(pdf)
            chapters.append(current_chapter)

        return chapters

    def _extract_epub_chapters(self, file_path: Path) -> List[Dict[str, Any]]:
        """Extract chapters from an EPUB file."""
        chapters = []

        try:
            book = epub.read_epub(str(file_path))

            # Get table of contents
            toc = book.toc

            # Extract text from each chapter
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')

                    # Try to find chapter title
                    title = None
                    for heading in ['h1', 'h2', 'h3']:
                        heading_elem = soup.find(heading)
                        if heading_elem:
                            title = heading_elem.get_text().strip()
                            break

                    if not title:
                        title = item.get_name()

                    # Extract text content
                    text = soup.get_text(separator='\n').strip()

                    if text:
                        chapters.append({
                            'title': title,
                            'level': 1,
                            'page_start': None,  # EPUB doesn't have pages
                            'page_end': None,
                            'content': text
                        })

        except Exception as e:
            logger.error(f"Error extracting EPUB chapters: {e}")

        return chapters


class TextExtractor:
    """Extract and process text from ebooks for knowledge extraction."""

    def __init__(self):
        self.chapter_extractor = ChapterExtractor()

    def extract_full_text(self, file_path: Path) -> str:
        """Extract complete text from a book."""
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_pdf_text(file_path)
        elif suffix == '.epub':
            return self._extract_epub_text(file_path)
        elif suffix in ['.txt', '.md']:
            return file_path.read_text(encoding='utf-8')
        else:
            logger.warning(f"Unsupported format: {suffix}")
            return ""

    def extract_key_passages(self, file_path: Path,
                             keywords: List[str] = None,
                             context_size: int = 200) -> List[Dict[str, Any]]:
        """
        Extract passages containing specific keywords or important concepts.
        """
        chapters = self.chapter_extractor.extract(file_path)
        passages = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            # Split into sentences
            sentences = self._split_into_sentences(content)

            for i, sentence in enumerate(sentences):
                # Check if sentence contains keywords or is important
                if self._is_important_passage(sentence, keywords):
                    # Get context
                    start = max(0, i - 2)
                    end = min(len(sentences), i + 3)
                    context = ' '.join(sentences[start:end])

                    passages.append({
                        'chapter': chapter['title'],
                        'page': chapter.get('page_start'),
                        'sentence': sentence,
                        'context': context,
                        'importance_score': self._calculate_importance(sentence, keywords)
                    })

        # Sort by importance
        passages.sort(key=lambda x: x['importance_score'], reverse=True)
        return passages

    def extract_quotes(self, file_path: Path) -> List[Dict[str, str]]:
        """Extract quoted text from a book."""
        text = self.extract_full_text(file_path)
        quotes = []

        # Pattern for quotes
        quote_patterns = [
            r'"([^"]+)"',   # Double quotes
            r"'([^']+)'",   # Single quotes (also matches apostrophe-to-apostrophe spans)
            r'“([^”]+)”',   # Smart quotes
            r'«([^»]+)»'    # French quotes
        ]

        for pattern in quote_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if len(match) > 30 and len(match) < 500:  # Reasonable quote length
                    quotes.append({
                        'text': match,
                        'length': len(match)
                    })

        # Remove duplicates
        seen = set()
        unique_quotes = []
        for quote in quotes:
            if quote['text'] not in seen:
                seen.add(quote['text'])
                unique_quotes.append(quote)

        return unique_quotes

    def extract_definitions(self, file_path: Path) -> List[Dict[str, str]]:
        """Extract definitions and explanations from text."""
        text = self.extract_full_text(file_path)
        definitions = []

        # Patterns that indicate definitions
        definition_patterns = [
            r'(\w+) is defined as ([^.]+)',
            r'(\w+) means ([^.]+)',
            r'(\w+): ([^.]+)',
            r'define (\w+) as ([^.]+)',
            r'(\w+) refers to ([^.]+)',
        ]

        for pattern in definition_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for term, definition in matches:
                if len(definition) > 20:  # Meaningful definition
                    definitions.append({
                        'term': term.strip(),
                        'definition': definition.strip()
                    })

        return definitions

    def extract_summaries(self, file_path: Path,
                          summary_length: int = 500) -> List[Dict[str, str]]:
        """
        Extract or generate chapter summaries.
        Looks for explicit summaries or creates them from beginning/end of chapters.
        """
        chapters = self.chapter_extractor.extract(file_path)
        summaries = []

        for chapter in chapters:
            content = chapter.get('content', '')
            if not content:
                continue

            # Look for explicit summary sections
            summary_text = self._find_summary_section(content)

            if not summary_text:
                # Create summary from first and last paragraphs
                paragraphs = content.split('\n\n')
                if len(paragraphs) > 3:
                    summary_text = paragraphs[0][:summary_length // 2] + "..." + \
                                   paragraphs[-1][:summary_length // 2]
                else:
                    summary_text = content[:summary_length]

            summaries.append({
                'chapter': chapter['title'],
                'summary': summary_text.strip(),
                'page': chapter.get('page_start')
            })

        return summaries

    def _extract_pdf_text(self, file_path: Path) -> str:
        """Extract text from PDF."""
        text = ""
        try:
            pdf = fitz.open(str(file_path))
            for page in pdf:
                text += page.get_text()
            pdf.close()
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
        return text

    def _extract_epub_text(self, file_path: Path) -> str:
        """Extract text from EPUB."""
        text = ""
        try:
            book = epub.read_epub(str(file_path))
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.content, 'html.parser')
                    text += soup.get_text(separator='\n')
        except Exception as e:
            logger.error(f"Error extracting EPUB text: {e}")
        return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences."""
        # Simple sentence splitter - can be improved with NLTK
        sentences = re.split(r'[.!?]\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _is_important_passage(self, sentence: str, keywords: List[str] = None) -> bool:
        """Determine if a passage is important."""
        if not keywords:
            # Default important indicators
            keywords = ['therefore', 'thus', 'consequently', 'important',
                        'key', 'critical', 'essential', 'fundamental']

        sentence_lower = sentence.lower()
        return any(kw.lower() in sentence_lower for kw in keywords)

    def _calculate_importance(self, sentence: str, keywords: List[str] = None) -> float:
        """Calculate importance score for a sentence."""
        score = 0.0

        # Length factor
        if 50 < len(sentence) < 300:
            score += 0.2

        # Keyword presence
        if keywords:
            sentence_lower = sentence.lower()
            for kw in keywords:
                if kw.lower() in sentence_lower:
                    score += 0.3

        # Importance indicators
        importance_indicators = ['important', 'key', 'critical', 'essential',
                                 'fundamental', 'significant', 'crucial']
        for indicator in importance_indicators:
            if indicator in sentence.lower():
                score += 0.2

        # Conclusion indicators
        if any(word in sentence.lower() for word in ['therefore', 'thus', 'consequently', 'in conclusion']):
            score += 0.3

        return min(score, 1.0)

    def _find_summary_section(self, text: str) -> Optional[str]:
        """Find explicit summary section in text."""
        summary_patterns = [
            r'Summary[:\s]+([^\n]+(?:\n[^\n]+)*)',
            r'In summary[,:\s]+([^.]+\.)',
            r'To summarize[,:\s]+([^.]+\.)',
            r'Key points[:\s]+([^\n]+(?:\n[^\n]+)*)'
        ]

        for pattern in summary_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None
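For orientation, here is a minimal usage sketch of the API added in this file. It is not taken from the package's docs: it assumes the installed wheel exposes `ebk.ai.text_extractor` (per the file list above), and the file path and keyword below are placeholders.

    from pathlib import Path

    from ebk.ai.text_extractor import ChapterExtractor, TextExtractor

    book = Path("example.epub")  # placeholder; extract_full_text also handles .txt/.md

    # Chapter structure: list of dicts with title/level/page_start/page_end/content
    chapters = ChapterExtractor().extract(book)

    extractor = TextExtractor()
    full_text = extractor.extract_full_text(book)
    passages = extractor.extract_key_passages(book, keywords=["entropy"])  # placeholder keyword
    summaries = extractor.extract_summaries(book, summary_length=300)

    for p in passages[:5]:
        print(p['chapter'], round(p['importance_score'], 2), p['sentence'][:80])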