ebk-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
ebk/ai/semantic_search.py
@@ -0,0 +1,433 @@
"""
Semantic search using vector embeddings for intelligent content discovery.
"""

import json
import pickle
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import hashlib
import logging

# Use sentence-transformers for embeddings if available
try:
    from sentence_transformers import SentenceTransformer
    HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False
    logging.warning("sentence-transformers not installed. Using fallback embedding method.")

# Fallback: simple TF-IDF based embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

logger = logging.getLogger(__name__)


@dataclass
class EmbeddedDocument:
    """A document with its embedding vector."""
    id: str
    text: str
    embedding: np.ndarray
    metadata: Dict[str, Any]

    def similarity(self, other_embedding: np.ndarray) -> float:
        """Calculate cosine similarity with another embedding."""
        return float(cosine_similarity(
            self.embedding.reshape(1, -1),
            other_embedding.reshape(1, -1)
        )[0, 0])

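As a quick sanity check on the dataclass above, here is an editorial sketch (not part of the packaged module); the vectors are made up and far smaller than real model embeddings:

# --- Editorial example, not part of ebk/ai/semantic_search.py ---
import numpy as np

doc = EmbeddedDocument(
    id="doc-1",
    text="An introduction to gradient descent.",
    embedding=np.array([0.1, 0.7, 0.2]),
    metadata={"book_id": "ml-101"},
)
print(doc.similarity(np.array([0.1, 0.6, 0.3])))  # ~0.98: nearly parallel vectors
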
class EmbeddingStore:
    """
    Store and retrieve document embeddings for semantic search.
    Provides a simple vector database for similarity search.
    """

    def __init__(self, library_path: Path, model_name: str = "all-MiniLM-L6-v2"):
        self.library_path = Path(library_path)
        self.store_path = self.library_path / '.embeddings'
        self.store_path.mkdir(exist_ok=True)

        self.model_name = model_name
        self.embeddings: Dict[str, EmbeddedDocument] = {}
        self.index_metadata: Dict[str, Any] = {}

        # Initialize embedding model
        if HAS_SENTENCE_TRANSFORMERS:
            try:
                self.model = SentenceTransformer(model_name)
                self.embedding_dim = self.model.get_sentence_embedding_dimension()
                self.use_transformer = True
            except Exception as e:
                logger.warning(f"Failed to load SentenceTransformer: {e}. Using TF-IDF fallback.")
                self._init_tfidf()
        else:
            self._init_tfidf()

        self.load_embeddings()

    def _init_tfidf(self):
        """Initialize TF-IDF vectorizer as fallback."""
        self.vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
        self.use_transformer = False
        self.embedding_dim = 768
        self.fitted_texts = []

    def add_document(self, text: str, metadata: Dict[str, Any] = None) -> str:
        """Add a document and compute its embedding."""
        # Generate ID
        doc_id = self._generate_id(text)

        # Compute embedding
        embedding = self._compute_embedding(text)

        # Store document
        self.embeddings[doc_id] = EmbeddedDocument(
            id=doc_id,
            text=text,
            embedding=embedding,
            metadata=metadata or {}
        )

        return doc_id

    def add_batch(self, texts: List[str], metadata_list: List[Dict[str, Any]] = None) -> List[str]:
        """Add multiple documents efficiently."""
        if metadata_list is None:
            # One dict per document; a shared dict literal would alias across documents.
            metadata_list = [{} for _ in texts]

        # Compute embeddings in batch
        embeddings = self._compute_embeddings_batch(texts)

        doc_ids = []
        for text, embedding, metadata in zip(texts, embeddings, metadata_list):
            doc_id = self._generate_id(text)
            self.embeddings[doc_id] = EmbeddedDocument(
                id=doc_id,
                text=text,
                embedding=embedding,
                metadata=metadata
            )
            doc_ids.append(doc_id)

        return doc_ids

    def search(self, query: str, top_k: int = 10,
               min_similarity: float = 0.0,
               filter_metadata: Dict[str, Any] = None) -> List[Tuple[EmbeddedDocument, float]]:
        """
        Search for similar documents using semantic similarity.
        """
        # Compute query embedding
        query_embedding = self._compute_embedding(query)

        # Calculate similarities
        results = []
        for doc_id, doc in self.embeddings.items():
            # Apply metadata filter
            if filter_metadata:
                if not self._matches_filter(doc.metadata, filter_metadata):
                    continue

            # Calculate similarity
            similarity = doc.similarity(query_embedding)

            if similarity >= min_similarity:
                results.append((doc, similarity))

        # Sort by similarity
        results.sort(key=lambda x: x[1], reverse=True)

        return results[:top_k]

    def find_similar(self, doc_id: str, top_k: int = 10,
                     min_similarity: float = 0.0) -> List[Tuple[EmbeddedDocument, float]]:
        """Find documents similar to a given document."""
        if doc_id not in self.embeddings:
            return []

        source_doc = self.embeddings[doc_id]
        results = []

        for other_id, other_doc in self.embeddings.items():
            if other_id == doc_id:
                continue

            similarity = source_doc.similarity(other_doc.embedding)

            if similarity >= min_similarity:
                results.append((other_doc, similarity))

        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    def get_document(self, doc_id: str) -> Optional[EmbeddedDocument]:
        """Retrieve a document by ID."""
        return self.embeddings.get(doc_id)

    def remove_document(self, doc_id: str) -> bool:
        """Remove a document from the store."""
        if doc_id in self.embeddings:
            del self.embeddings[doc_id]
            return True
        return False

    def save_embeddings(self):
        """Save embeddings to disk."""
        # Save embeddings
        embeddings_file = self.store_path / 'embeddings.pkl'
        with open(embeddings_file, 'wb') as f:
            pickle.dump(self.embeddings, f)

        # Save metadata
        metadata_file = self.store_path / 'metadata.json'
        with open(metadata_file, 'w') as f:
            json.dump({
                'model_name': self.model_name,
                'use_transformer': self.use_transformer,
                'embedding_dim': self.embedding_dim,
                'num_documents': len(self.embeddings)
            }, f, indent=2)

        # Save TF-IDF vectorizer if used
        if not self.use_transformer and hasattr(self, 'vectorizer'):
            vectorizer_file = self.store_path / 'vectorizer.pkl'
            with open(vectorizer_file, 'wb') as f:
                pickle.dump(self.vectorizer, f)

    def load_embeddings(self):
        """Load embeddings from disk."""
        embeddings_file = self.store_path / 'embeddings.pkl'
        metadata_file = self.store_path / 'metadata.json'

        if embeddings_file.exists():
            with open(embeddings_file, 'rb') as f:
                self.embeddings = pickle.load(f)

        if metadata_file.exists():
            with open(metadata_file, 'r') as f:
                self.index_metadata = json.load(f)

        # Load TF-IDF vectorizer if needed
        if not self.use_transformer:
            vectorizer_file = self.store_path / 'vectorizer.pkl'
            if vectorizer_file.exists():
                with open(vectorizer_file, 'rb') as f:
                    self.vectorizer = pickle.load(f)

    def _compute_embedding(self, text: str) -> np.ndarray:
        """Compute embedding for a single text."""
        if self.use_transformer:
            return self.model.encode(text, convert_to_numpy=True)
        else:
            # TF-IDF fallback
            if not hasattr(self, 'vectorizer') or not self.fitted_texts:
                # First text - fit the vectorizer
                self.fitted_texts = [text]
                embeddings = self.vectorizer.fit_transform([text])
            else:
                # Transform using existing vocabulary
                try:
                    embeddings = self.vectorizer.transform([text])
                except Exception:
                    # Refit with all texts and keep only the row for the new text.
                    # Note: refitting changes the vocabulary, so previously stored
                    # embeddings are no longer strictly comparable.
                    self.fitted_texts.append(text)
                    embeddings = self.vectorizer.fit_transform(self.fitted_texts)[-1]

            return embeddings.toarray()[0]

    def _compute_embeddings_batch(self, texts: List[str]) -> List[np.ndarray]:
        """Compute embeddings for multiple texts efficiently."""
        if self.use_transformer:
            return self.model.encode(texts, convert_to_numpy=True)
        else:
            # TF-IDF fallback
            if not hasattr(self, 'vectorizer') or not self.fitted_texts:
                self.fitted_texts = texts
                embeddings = self.vectorizer.fit_transform(texts)
            else:
                try:
                    embeddings = self.vectorizer.transform(texts)
                except Exception:
                    # Refit, then keep only the rows for the newly added texts.
                    self.fitted_texts.extend(texts)
                    embeddings = self.vectorizer.fit_transform(self.fitted_texts)[-len(texts):]

            return [embeddings[i].toarray()[0] for i in range(len(texts))]

    def _generate_id(self, text: str) -> str:
        """Generate unique ID for a document."""
        return hashlib.md5(text.encode()).hexdigest()[:16]

    def _matches_filter(self, metadata: Dict[str, Any], filter_dict: Dict[str, Any]) -> bool:
        """Check if metadata matches filter criteria."""
        for key, value in filter_dict.items():
            if key not in metadata:
                return False
            if isinstance(value, list):
                if metadata[key] not in value:
                    return False
            else:
                if metadata[key] != value:
                    return False
        return True

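Taken together, the store above behaves like a small on-disk vector database. A minimal usage sketch follows (editorial illustration, not part of the packaged module); the library path, texts, and metadata keys are invented, and the TF-IDF fallback is used automatically when sentence-transformers is unavailable:

# --- Editorial example, not part of ebk/ai/semantic_search.py ---
from pathlib import Path

lib = Path("/tmp/ebk-demo-library")   # placeholder library directory
lib.mkdir(exist_ok=True)

store = EmbeddingStore(lib)           # reloads any previously saved embeddings
store.add_document("Heat flows from hot to cold bodies.", {"book_id": "thermo"})
store.add_batch(
    ["Entropy never decreases in an isolated system.",
     "Statistical mechanics links microstates to temperature."],
    [{"book_id": "thermo"}, {"book_id": "statmech"}],
)
store.save_embeddings()               # writes embeddings.pkl and metadata.json under .embeddings/

# Query with an optional metadata filter; results are (EmbeddedDocument, similarity) pairs.
for doc, score in store.search("why does entropy increase", top_k=3,
                               filter_metadata={"book_id": "thermo"}):
    print(f"{score:.3f}  {doc.metadata['book_id']}  {doc.text[:50]}")
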
class SemanticSearch:
    """
    High-level semantic search interface for ebook libraries.
    """

    def __init__(self, library_path: Path):
        self.library_path = Path(library_path)
        self.embedding_store = EmbeddingStore(library_path)
        self.book_chunks: Dict[str, List[str]] = {}  # book_id -> chunk_ids

    def index_book(self, book_id: str, text: str, chunk_size: int = 500):
        """
        Index a book by splitting into chunks and computing embeddings.
        """
        # Split text into chunks
        chunks = self._split_into_chunks(text, chunk_size)

        # Add chunks to embedding store
        chunk_ids = []
        for i, chunk in enumerate(chunks):
            metadata = {
                'book_id': book_id,
                'chunk_index': i,
                'chunk_total': len(chunks)
            }
            chunk_id = self.embedding_store.add_document(chunk, metadata)
            chunk_ids.append(chunk_id)

        self.book_chunks[book_id] = chunk_ids
        self.embedding_store.save_embeddings()

311
+ def search_library(self, query: str, top_k: int = 10,
312
+ book_ids: List[str] = None) -> List[Dict[str, Any]]:
313
+ """
314
+ Search across the entire library or specific books.
315
+ """
316
+ # Prepare filter
317
+ filter_metadata = None
318
+ if book_ids:
319
+ filter_metadata = {'book_id': book_ids}
320
+
321
+ # Perform search
322
+ results = self.embedding_store.search(
323
+ query, top_k=top_k, filter_metadata=filter_metadata
324
+ )
325
+
326
+ # Format results
327
+ formatted_results = []
328
+ for doc, similarity in results:
329
+ formatted_results.append({
330
+ 'book_id': doc.metadata.get('book_id'),
331
+ 'text': doc.text,
332
+ 'similarity': similarity,
333
+ 'chunk_index': doc.metadata.get('chunk_index'),
334
+ 'metadata': doc.metadata
335
+ })
336
+
337
+ return formatted_results
338
+
    def find_cross_references(self, book_id: str, passage: str,
                              other_books: List[str] = None) -> List[Dict[str, Any]]:
        """
        Find similar passages in other books (cross-references).
        """
        # Restrict to the requested books; otherwise search everything and drop
        # hits from the source book below.
        filter_metadata = None
        if other_books:
            filter_metadata = {'book_id': other_books}

        results = self.embedding_store.search(
            passage, top_k=10, filter_metadata=filter_metadata
        )

        # Filter out results from the same book
        cross_refs = []
        for doc, similarity in results:
            if doc.metadata.get('book_id') != book_id:
                cross_refs.append({
                    'book_id': doc.metadata.get('book_id'),
                    'text': doc.text,
                    'similarity': similarity,
                    'metadata': doc.metadata
                })

        return cross_refs

    def get_book_summary_vectors(self, book_ids: List[str]) -> Dict[str, np.ndarray]:
        """
        Get summary embedding vectors for books (average of all chunks).
        """
        book_vectors = {}

        for book_id in book_ids:
            if book_id not in self.book_chunks:
                continue

            # Get all chunk embeddings
            embeddings = []
            for chunk_id in self.book_chunks[book_id]:
                doc = self.embedding_store.get_document(chunk_id)
                if doc:
                    embeddings.append(doc.embedding)

            if embeddings:
                # Average embeddings
                book_vectors[book_id] = np.mean(embeddings, axis=0)

        return book_vectors

    def find_similar_books(self, book_id: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """
        Find books similar to a given book based on content similarity.
        """
        # Get summary vector for source book
        source_vectors = self.get_book_summary_vectors([book_id])
        if book_id not in source_vectors:
            return []

        source_vector = source_vectors[book_id]

        # Get vectors for all other books
        all_book_ids = list(self.book_chunks.keys())
        if book_id in all_book_ids:
            all_book_ids.remove(book_id)

        other_vectors = self.get_book_summary_vectors(all_book_ids)

        # Calculate similarities
        similarities = []
        for other_id, other_vector in other_vectors.items():
            similarity = float(cosine_similarity(
                source_vector.reshape(1, -1),
                other_vector.reshape(1, -1)
            )[0, 0])
            similarities.append((other_id, similarity))

        # Sort and return top-k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def _split_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Split text into overlapping chunks."""
        words = text.split()
        chunks = []
        overlap = chunk_size // 4  # 25% overlap

        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            if chunk:
                chunks.append(chunk)

        return chunks
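
Finally, a sketch of driving the high-level facade above (editorial illustration, not part of the packaged module); the book IDs and toy text are placeholders, and real callers would pass full extracted book text:

# --- Editorial example, not part of ebk/ai/semantic_search.py ---
from pathlib import Path

lib = Path("/tmp/ebk-demo-library")
lib.mkdir(exist_ok=True)

search = SemanticSearch(lib)

# index_book() splits the text into ~500-word chunks with 25% overlap,
# embeds each chunk, and persists the store.
search.index_book("thermo", "Heat flows from hot to cold bodies. " * 400)
search.index_book("statmech", "Microstates explain macroscopic temperature. " * 400)

# Query the whole library, or restrict to specific books via book_ids.
for hit in search.search_library("why entropy increases", top_k=3):
    print(hit["book_id"], round(hit["similarity"], 3), hit["text"][:50])

# Whole-book similarity via mean chunk vectors.
print(search.find_similar_books("thermo", top_k=3))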