ebk 0.1.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of ebk might be problematic.

Files changed (84)
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +443 -0
  4. ebk/ai/llm_providers/__init__.py +21 -0
  5. ebk/ai/llm_providers/base.py +230 -0
  6. ebk/ai/llm_providers/ollama.py +362 -0
  7. ebk/ai/metadata_enrichment.py +396 -0
  8. ebk/ai/question_generator.py +328 -0
  9. ebk/ai/reading_companion.py +224 -0
  10. ebk/ai/semantic_search.py +434 -0
  11. ebk/ai/text_extractor.py +394 -0
  12. ebk/cli.py +2828 -680
  13. ebk/config.py +260 -22
  14. ebk/db/__init__.py +37 -0
  15. ebk/db/migrations.py +180 -0
  16. ebk/db/models.py +526 -0
  17. ebk/db/session.py +144 -0
  18. ebk/decorators.py +132 -0
  19. ebk/exports/base_exporter.py +218 -0
  20. ebk/exports/html_library.py +1390 -0
  21. ebk/exports/html_utils.py +117 -0
  22. ebk/exports/hugo.py +7 -3
  23. ebk/exports/jinja_export.py +287 -0
  24. ebk/exports/multi_facet_export.py +164 -0
  25. ebk/exports/symlink_dag.py +479 -0
  26. ebk/extract_metadata.py +76 -7
  27. ebk/library_db.py +899 -0
  28. ebk/plugins/__init__.py +42 -0
  29. ebk/plugins/base.py +502 -0
  30. ebk/plugins/hooks.py +444 -0
  31. ebk/plugins/registry.py +500 -0
  32. ebk/repl/__init__.py +9 -0
  33. ebk/repl/find.py +126 -0
  34. ebk/repl/grep.py +174 -0
  35. ebk/repl/shell.py +1677 -0
  36. ebk/repl/text_utils.py +320 -0
  37. ebk/search_parser.py +413 -0
  38. ebk/server.py +1633 -0
  39. ebk/services/__init__.py +11 -0
  40. ebk/services/import_service.py +442 -0
  41. ebk/services/tag_service.py +282 -0
  42. ebk/services/text_extraction.py +317 -0
  43. ebk/similarity/__init__.py +77 -0
  44. ebk/similarity/base.py +154 -0
  45. ebk/similarity/core.py +445 -0
  46. ebk/similarity/extractors.py +168 -0
  47. ebk/similarity/metrics.py +376 -0
  48. ebk/vfs/__init__.py +101 -0
  49. ebk/vfs/base.py +301 -0
  50. ebk/vfs/library_vfs.py +124 -0
  51. ebk/vfs/nodes/__init__.py +54 -0
  52. ebk/vfs/nodes/authors.py +196 -0
  53. ebk/vfs/nodes/books.py +480 -0
  54. ebk/vfs/nodes/files.py +155 -0
  55. ebk/vfs/nodes/metadata.py +385 -0
  56. ebk/vfs/nodes/root.py +100 -0
  57. ebk/vfs/nodes/similar.py +165 -0
  58. ebk/vfs/nodes/subjects.py +184 -0
  59. ebk/vfs/nodes/tags.py +371 -0
  60. ebk/vfs/resolver.py +228 -0
  61. ebk-0.3.2.dist-info/METADATA +755 -0
  62. ebk-0.3.2.dist-info/RECORD +69 -0
  63. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
  64. ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
  65. ebk/imports/__init__.py +0 -0
  66. ebk/imports/calibre.py +0 -144
  67. ebk/imports/ebooks.py +0 -116
  68. ebk/llm.py +0 -58
  69. ebk/manager.py +0 -44
  70. ebk/merge.py +0 -308
  71. ebk/streamlit/__init__.py +0 -0
  72. ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
  73. ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
  74. ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
  75. ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
  76. ebk/streamlit/app.py +0 -185
  77. ebk/streamlit/display.py +0 -168
  78. ebk/streamlit/filters.py +0 -151
  79. ebk/streamlit/utils.py +0 -58
  80. ebk/utils.py +0 -311
  81. ebk-0.1.0.dist-info/METADATA +0 -457
  82. ebk-0.1.0.dist-info/RECORD +0 -29
  83. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
  84. {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
ebk/ai/semantic_search.py (new file)
@@ -0,0 +1,434 @@
+ """
+ Semantic search using vector embeddings for intelligent content discovery.
+ """
+
+ import json
+ import pickle
+ import numpy as np
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional, Tuple
+ from dataclasses import dataclass
+ import hashlib
+ import logging
+ from collections import defaultdict
+
+ # Use sentence-transformers for embeddings if available
+ try:
+     from sentence_transformers import SentenceTransformer
+     HAS_SENTENCE_TRANSFORMERS = True
+ except ImportError:
+     HAS_SENTENCE_TRANSFORMERS = False
+     logging.warning("sentence-transformers not installed. Using fallback embedding method.")
+
+ # Fallback: simple TF-IDF based embeddings
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class EmbeddedDocument:
+     """A document with its embedding vector."""
+     id: str
+     text: str
+     embedding: np.ndarray
+     metadata: Dict[str, Any]
+
+     def similarity(self, other_embedding: np.ndarray) -> float:
+         """Calculate cosine similarity with another embedding."""
+         return float(cosine_similarity(
+             self.embedding.reshape(1, -1),
+             other_embedding.reshape(1, -1)
+         )[0, 0])
+
+
+ class EmbeddingStore:
+     """
+     Store and retrieve document embeddings for semantic search.
+     Provides a simple vector database for similarity search.
+     """
+
+     def __init__(self, library_path: Path, model_name: str = "all-MiniLM-L6-v2"):
+         self.library_path = Path(library_path)
+         self.store_path = self.library_path / '.embeddings'
+         self.store_path.mkdir(exist_ok=True)
+
+         self.model_name = model_name
+         self.embeddings: Dict[str, EmbeddedDocument] = {}
+         self.index_metadata: Dict[str, Any] = {}
+
+         # Initialize embedding model
+         if HAS_SENTENCE_TRANSFORMERS:
+             try:
+                 self.model = SentenceTransformer(model_name)
+                 self.embedding_dim = self.model.get_sentence_embedding_dimension()
+                 self.use_transformer = True
+             except Exception as e:
+                 logger.warning(f"Failed to load SentenceTransformer: {e}. Using TF-IDF fallback.")
+                 self._init_tfidf()
+         else:
+             self._init_tfidf()
+
+         self.load_embeddings()
+
+     def _init_tfidf(self):
+         """Initialize TF-IDF vectorizer as fallback."""
+         self.vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
+         self.use_transformer = False
+         self.embedding_dim = 768
+         self.fitted_texts = []
+
+     def add_document(self, text: str, metadata: Dict[str, Any] = None) -> str:
+         """Add a document and compute its embedding."""
+         # Generate ID
+         doc_id = self._generate_id(text)
+
+         # Compute embedding
+         embedding = self._compute_embedding(text)
+
+         # Store document
+         self.embeddings[doc_id] = EmbeddedDocument(
+             id=doc_id,
+             text=text,
+             embedding=embedding,
+             metadata=metadata or {}
+         )
+
+         return doc_id
+
+     def add_batch(self, texts: List[str], metadata_list: List[Dict[str, Any]] = None) -> List[str]:
+         """Add multiple documents efficiently."""
+         if metadata_list is None:
+             metadata_list = [{}] * len(texts)
+
+         # Compute embeddings in batch
+         embeddings = self._compute_embeddings_batch(texts)
+
+         doc_ids = []
+         for text, embedding, metadata in zip(texts, embeddings, metadata_list):
+             doc_id = self._generate_id(text)
+             self.embeddings[doc_id] = EmbeddedDocument(
+                 id=doc_id,
+                 text=text,
+                 embedding=embedding,
+                 metadata=metadata
+             )
+             doc_ids.append(doc_id)
+
+         return doc_ids
+
+     def search(self, query: str, top_k: int = 10,
+                min_similarity: float = 0.0,
+                filter_metadata: Dict[str, Any] = None) -> List[Tuple[EmbeddedDocument, float]]:
+         """
+         Search for similar documents using semantic similarity.
+         """
+         # Compute query embedding
+         query_embedding = self._compute_embedding(query)
+
+         # Calculate similarities
+         results = []
+         for doc_id, doc in self.embeddings.items():
+             # Apply metadata filter
+             if filter_metadata:
+                 if not self._matches_filter(doc.metadata, filter_metadata):
+                     continue
+
+             # Calculate similarity
+             similarity = doc.similarity(query_embedding)
+
+             if similarity >= min_similarity:
+                 results.append((doc, similarity))
+
+         # Sort by similarity
+         results.sort(key=lambda x: x[1], reverse=True)
+
+         return results[:top_k]
+
+     def find_similar(self, doc_id: str, top_k: int = 10,
+                      min_similarity: float = 0.0) -> List[Tuple[EmbeddedDocument, float]]:
+         """Find documents similar to a given document."""
+         if doc_id not in self.embeddings:
+             return []
+
+         source_doc = self.embeddings[doc_id]
+         results = []
+
+         for other_id, other_doc in self.embeddings.items():
+             if other_id == doc_id:
+                 continue
+
+             similarity = source_doc.similarity(other_doc.embedding)
+
+             if similarity >= min_similarity:
+                 results.append((other_doc, similarity))
+
+         results.sort(key=lambda x: x[1], reverse=True)
+         return results[:top_k]
+
+     def get_document(self, doc_id: str) -> Optional[EmbeddedDocument]:
+         """Retrieve a document by ID."""
+         return self.embeddings.get(doc_id)
+
+     def remove_document(self, doc_id: str) -> bool:
+         """Remove a document from the store."""
+         if doc_id in self.embeddings:
+             del self.embeddings[doc_id]
+             return True
+         return False
+
+     def save_embeddings(self):
+         """Save embeddings to disk."""
+         # Save embeddings
+         embeddings_file = self.store_path / 'embeddings.pkl'
+         with open(embeddings_file, 'wb') as f:
+             pickle.dump(self.embeddings, f)
+
+         # Save metadata
+         metadata_file = self.store_path / 'metadata.json'
+         with open(metadata_file, 'w') as f:
+             json.dump({
+                 'model_name': self.model_name,
+                 'use_transformer': self.use_transformer,
+                 'embedding_dim': self.embedding_dim,
+                 'num_documents': len(self.embeddings)
+             }, f, indent=2)
+
+         # Save TF-IDF vectorizer if used
+         if not self.use_transformer and hasattr(self, 'vectorizer'):
+             vectorizer_file = self.store_path / 'vectorizer.pkl'
+             with open(vectorizer_file, 'wb') as f:
+                 pickle.dump(self.vectorizer, f)
+
+     def load_embeddings(self):
+         """Load embeddings from disk."""
+         embeddings_file = self.store_path / 'embeddings.pkl'
+         metadata_file = self.store_path / 'metadata.json'
+
+         if embeddings_file.exists():
+             with open(embeddings_file, 'rb') as f:
+                 self.embeddings = pickle.load(f)
+
+         if metadata_file.exists():
+             with open(metadata_file, 'r') as f:
+                 self.index_metadata = json.load(f)
+
+         # Load TF-IDF vectorizer if needed
+         if not self.use_transformer:
+             vectorizer_file = self.store_path / 'vectorizer.pkl'
+             if vectorizer_file.exists():
+                 with open(vectorizer_file, 'rb') as f:
+                     self.vectorizer = pickle.load(f)
+
+     def _compute_embedding(self, text: str) -> np.ndarray:
+         """Compute embedding for a single text."""
+         if self.use_transformer:
+             return self.model.encode(text, convert_to_numpy=True)
+         else:
+             # TF-IDF fallback
+             if not hasattr(self, 'vectorizer') or not self.fitted_texts:
+                 # First text - fit the vectorizer
+                 self.fitted_texts = [text]
+                 embeddings = self.vectorizer.fit_transform([text])
+             else:
+                 # Transform using existing vocabulary
+                 try:
+                     embeddings = self.vectorizer.transform([text])
+                 except:
+                     # Refit with all texts if vocabulary changed
+                     self.fitted_texts.append(text)
+                     embeddings = self.vectorizer.fit_transform(self.fitted_texts)
+
+             return embeddings.toarray()[0]
+
+     def _compute_embeddings_batch(self, texts: List[str]) -> List[np.ndarray]:
+         """Compute embeddings for multiple texts efficiently."""
+         if self.use_transformer:
+             return self.model.encode(texts, convert_to_numpy=True)
+         else:
+             # TF-IDF fallback
+             if not hasattr(self, 'vectorizer') or not self.fitted_texts:
+                 self.fitted_texts = texts
+                 embeddings = self.vectorizer.fit_transform(texts)
+             else:
+                 try:
+                     embeddings = self.vectorizer.transform(texts)
+                 except:
+                     self.fitted_texts.extend(texts)
+                     embeddings = self.vectorizer.fit_transform(self.fitted_texts)
+
+             return [embeddings[i].toarray()[0] for i in range(len(texts))]
+
+     def _generate_id(self, text: str) -> str:
+         """Generate unique ID for a document."""
+         return hashlib.md5(text.encode()).hexdigest()[:16]
+
+     def _matches_filter(self, metadata: Dict[str, Any], filter_dict: Dict[str, Any]) -> bool:
+         """Check if metadata matches filter criteria."""
+         for key, value in filter_dict.items():
+             if key not in metadata:
+                 return False
+             if isinstance(value, list):
+                 if metadata[key] not in value:
+                     return False
+             else:
+                 if metadata[key] != value:
+                     return False
+         return True
+
+
+ class SemanticSearch:
+     """
+     High-level semantic search interface for ebook libraries.
+     """
+
+     def __init__(self, library_path: Path):
+         self.library_path = Path(library_path)
+         self.embedding_store = EmbeddingStore(library_path)
+         self.book_chunks: Dict[str, List[str]] = {}  # book_id -> chunk_ids
+
+     def index_book(self, book_id: str, text: str, chunk_size: int = 500):
+         """
+         Index a book by splitting into chunks and computing embeddings.
+         """
+         # Split text into chunks
+         chunks = self._split_into_chunks(text, chunk_size)
+
+         # Add chunks to embedding store
+         chunk_ids = []
+         for i, chunk in enumerate(chunks):
+             metadata = {
+                 'book_id': book_id,
+                 'chunk_index': i,
+                 'chunk_total': len(chunks)
+             }
+             chunk_id = self.embedding_store.add_document(chunk, metadata)
+             chunk_ids.append(chunk_id)
+
+         self.book_chunks[book_id] = chunk_ids
+         self.embedding_store.save_embeddings()
+
+     def search_library(self, query: str, top_k: int = 10,
+                        book_ids: List[str] = None) -> List[Dict[str, Any]]:
+         """
+         Search across the entire library or specific books.
+         """
+         # Prepare filter
+         filter_metadata = None
+         if book_ids:
+             filter_metadata = {'book_id': book_ids}
+
+         # Perform search
+         results = self.embedding_store.search(
+             query, top_k=top_k, filter_metadata=filter_metadata
+         )
+
+         # Format results
+         formatted_results = []
+         for doc, similarity in results:
+             formatted_results.append({
+                 'book_id': doc.metadata.get('book_id'),
+                 'text': doc.text,
+                 'similarity': similarity,
+                 'chunk_index': doc.metadata.get('chunk_index'),
+                 'metadata': doc.metadata
+             })
+
+         return formatted_results
+
+     def find_cross_references(self, book_id: str, passage: str,
+                               other_books: List[str] = None) -> List[Dict[str, Any]]:
+         """
+         Find similar passages in other books (cross-references).
+         """
+         # Search in other books
+         filter_metadata = None
+         if other_books:
+             filter_metadata = {'book_id': other_books}
+         else:
+             # Search all books except the source
+             filter_metadata = {}
+
+         results = self.embedding_store.search(
+             passage, top_k=10, filter_metadata=filter_metadata
+         )
+
+         # Filter out results from the same book
+         cross_refs = []
+         for doc, similarity in results:
+             if doc.metadata.get('book_id') != book_id:
+                 cross_refs.append({
+                     'book_id': doc.metadata.get('book_id'),
+                     'text': doc.text,
+                     'similarity': similarity,
+                     'metadata': doc.metadata
+                 })
+
+         return cross_refs
+
+     def get_book_summary_vectors(self, book_ids: List[str]) -> Dict[str, np.ndarray]:
+         """
+         Get summary embedding vectors for books (average of all chunks).
+         """
+         book_vectors = {}
+
+         for book_id in book_ids:
+             if book_id not in self.book_chunks:
+                 continue
+
+             # Get all chunk embeddings
+             embeddings = []
+             for chunk_id in self.book_chunks[book_id]:
+                 doc = self.embedding_store.get_document(chunk_id)
+                 if doc:
+                     embeddings.append(doc.embedding)
+
+             if embeddings:
+                 # Average embeddings
+                 book_vectors[book_id] = np.mean(embeddings, axis=0)
+
+         return book_vectors
+
+     def find_similar_books(self, book_id: str, top_k: int = 5) -> List[Tuple[str, float]]:
+         """
+         Find books similar to a given book based on content similarity.
+         """
+         # Get summary vector for source book
+         source_vectors = self.get_book_summary_vectors([book_id])
+         if book_id not in source_vectors:
+             return []
+
+         source_vector = source_vectors[book_id]
+
+         # Get vectors for all other books
+         all_book_ids = list(self.book_chunks.keys())
+         all_book_ids.remove(book_id) if book_id in all_book_ids else None
+
+         other_vectors = self.get_book_summary_vectors(all_book_ids)
+
+         # Calculate similarities
+         similarities = []
+         for other_id, other_vector in other_vectors.items():
+             similarity = float(cosine_similarity(
+                 source_vector.reshape(1, -1),
+                 other_vector.reshape(1, -1)
+             )[0, 0])
+             similarities.append((other_id, similarity))
+
+         # Sort and return top-k
+         similarities.sort(key=lambda x: x[1], reverse=True)
+         return similarities[:top_k]
+
+     def _split_into_chunks(self, text: str, chunk_size: int) -> List[str]:
+         """Split text into overlapping chunks."""
+         words = text.split()
+         chunks = []
+         overlap = chunk_size // 4  # 25% overlap
+
+         for i in range(0, len(words), chunk_size - overlap):
+             chunk = ' '.join(words[i:i + chunk_size])
+             if chunk:
+                 chunks.append(chunk)
+
+         return chunks
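
For orientation, here is a minimal usage sketch of the semantic-search API added in this hunk. It assumes the module is importable as ebk.ai.semantic_search and that either sentence-transformers or scikit-learn is installed; the library path, book IDs, and text files are hypothetical, not an official example from the package.

from pathlib import Path

from ebk.ai.semantic_search import SemanticSearch

# Hypothetical library directory; EmbeddingStore creates a .embeddings/ folder inside it.
search = SemanticSearch(Path.home() / "ebk-library")

# Index two books: each is split into ~500-word overlapping chunks, embedded
# (sentence-transformers if available, otherwise TF-IDF), and persisted to disk.
search.index_book("book-001", Path("dune.txt").read_text())
search.index_book("book-002", Path("foundation.txt").read_text())

# Query the whole library; each hit carries book_id, chunk text, and a similarity score.
for hit in search.search_library("ecology of a desert planet", top_k=5):
    print(f"{hit['book_id']}  {hit['similarity']:.3f}  {hit['text'][:80]}")

# Content-based recommendations from averaged chunk embeddings.
print(search.find_similar_books("book-001", top_k=3))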