ebk-0.1.0-py3-none-any.whl → ebk-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +2828 -680
- ebk/config.py +260 -22
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +132 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +7 -3
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/extract_metadata.py +76 -7
- ebk/library_db.py +899 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +444 -0
- ebk/plugins/registry.py +500 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +1633 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk-0.3.2.dist-info/METADATA +755 -0
- ebk-0.3.2.dist-info/RECORD +69 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
- ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +0 -144
- ebk/imports/ebooks.py +0 -116
- ebk/llm.py +0 -58
- ebk/manager.py +0 -44
- ebk/merge.py +0 -308
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +0 -185
- ebk/streamlit/display.py +0 -168
- ebk/streamlit/filters.py +0 -151
- ebk/streamlit/utils.py +0 -58
- ebk/utils.py +0 -311
- ebk-0.1.0.dist-info/METADATA +0 -457
- ebk-0.1.0.dist-info/RECORD +0 -29
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
ebk/ai/semantic_search.py (new file)
@@ -0,0 +1,434 @@
+"""
+Semantic search using vector embeddings for intelligent content discovery.
+"""
+
+import json
+import pickle
+import numpy as np
+from pathlib import Path
+from typing import List, Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+import hashlib
+import logging
+from collections import defaultdict
+
+# Use sentence-transformers for embeddings if available
+try:
+    from sentence_transformers import SentenceTransformer
+    HAS_SENTENCE_TRANSFORMERS = True
+except ImportError:
+    HAS_SENTENCE_TRANSFORMERS = False
+    logging.warning("sentence-transformers not installed. Using fallback embedding method.")
+
+# Fallback: simple TF-IDF based embeddings
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class EmbeddedDocument:
+    """A document with its embedding vector."""
+    id: str
+    text: str
+    embedding: np.ndarray
+    metadata: Dict[str, Any]
+
+    def similarity(self, other_embedding: np.ndarray) -> float:
+        """Calculate cosine similarity with another embedding."""
+        return float(cosine_similarity(
+            self.embedding.reshape(1, -1),
+            other_embedding.reshape(1, -1)
+        )[0, 0])
+
+
+class EmbeddingStore:
+    """
+    Store and retrieve document embeddings for semantic search.
+    Provides a simple vector database for similarity search.
+    """
+
+    def __init__(self, library_path: Path, model_name: str = "all-MiniLM-L6-v2"):
+        self.library_path = Path(library_path)
+        self.store_path = self.library_path / '.embeddings'
+        self.store_path.mkdir(exist_ok=True)
+
+        self.model_name = model_name
+        self.embeddings: Dict[str, EmbeddedDocument] = {}
+        self.index_metadata: Dict[str, Any] = {}
+
+        # Initialize embedding model
+        if HAS_SENTENCE_TRANSFORMERS:
+            try:
+                self.model = SentenceTransformer(model_name)
+                self.embedding_dim = self.model.get_sentence_embedding_dimension()
+                self.use_transformer = True
+            except Exception as e:
+                logger.warning(f"Failed to load SentenceTransformer: {e}. Using TF-IDF fallback.")
+                self._init_tfidf()
+        else:
+            self._init_tfidf()
+
+        self.load_embeddings()
+
+    def _init_tfidf(self):
+        """Initialize TF-IDF vectorizer as fallback."""
+        self.vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
+        self.use_transformer = False
+        self.embedding_dim = 768
+        self.fitted_texts = []
+
+    def add_document(self, text: str, metadata: Dict[str, Any] = None) -> str:
+        """Add a document and compute its embedding."""
+        # Generate ID
+        doc_id = self._generate_id(text)
+
+        # Compute embedding
+        embedding = self._compute_embedding(text)
+
+        # Store document
+        self.embeddings[doc_id] = EmbeddedDocument(
+            id=doc_id,
+            text=text,
+            embedding=embedding,
+            metadata=metadata or {}
+        )
+
+        return doc_id
+
+    def add_batch(self, texts: List[str], metadata_list: List[Dict[str, Any]] = None) -> List[str]:
+        """Add multiple documents efficiently."""
+        if metadata_list is None:
+            metadata_list = [{}] * len(texts)
+
+        # Compute embeddings in batch
+        embeddings = self._compute_embeddings_batch(texts)
+
+        doc_ids = []
+        for text, embedding, metadata in zip(texts, embeddings, metadata_list):
+            doc_id = self._generate_id(text)
+            self.embeddings[doc_id] = EmbeddedDocument(
+                id=doc_id,
+                text=text,
+                embedding=embedding,
+                metadata=metadata
+            )
+            doc_ids.append(doc_id)
+
+        return doc_ids
+
+    def search(self, query: str, top_k: int = 10,
+               min_similarity: float = 0.0,
+               filter_metadata: Dict[str, Any] = None) -> List[Tuple[EmbeddedDocument, float]]:
+        """
+        Search for similar documents using semantic similarity.
+        """
+        # Compute query embedding
+        query_embedding = self._compute_embedding(query)
+
+        # Calculate similarities
+        results = []
+        for doc_id, doc in self.embeddings.items():
+            # Apply metadata filter
+            if filter_metadata:
+                if not self._matches_filter(doc.metadata, filter_metadata):
+                    continue
+
+            # Calculate similarity
+            similarity = doc.similarity(query_embedding)
+
+            if similarity >= min_similarity:
+                results.append((doc, similarity))
+
+        # Sort by similarity
+        results.sort(key=lambda x: x[1], reverse=True)
+
+        return results[:top_k]
+
+    def find_similar(self, doc_id: str, top_k: int = 10,
+                     min_similarity: float = 0.0) -> List[Tuple[EmbeddedDocument, float]]:
+        """Find documents similar to a given document."""
+        if doc_id not in self.embeddings:
+            return []
+
+        source_doc = self.embeddings[doc_id]
+        results = []
+
+        for other_id, other_doc in self.embeddings.items():
+            if other_id == doc_id:
+                continue
+
+            similarity = source_doc.similarity(other_doc.embedding)
+
+            if similarity >= min_similarity:
+                results.append((other_doc, similarity))
+
+        results.sort(key=lambda x: x[1], reverse=True)
+        return results[:top_k]
+
+    def get_document(self, doc_id: str) -> Optional[EmbeddedDocument]:
+        """Retrieve a document by ID."""
+        return self.embeddings.get(doc_id)
+
+    def remove_document(self, doc_id: str) -> bool:
+        """Remove a document from the store."""
+        if doc_id in self.embeddings:
+            del self.embeddings[doc_id]
+            return True
+        return False
+
+    def save_embeddings(self):
+        """Save embeddings to disk."""
+        # Save embeddings
+        embeddings_file = self.store_path / 'embeddings.pkl'
+        with open(embeddings_file, 'wb') as f:
+            pickle.dump(self.embeddings, f)
+
+        # Save metadata
+        metadata_file = self.store_path / 'metadata.json'
+        with open(metadata_file, 'w') as f:
+            json.dump({
+                'model_name': self.model_name,
+                'use_transformer': self.use_transformer,
+                'embedding_dim': self.embedding_dim,
+                'num_documents': len(self.embeddings)
+            }, f, indent=2)
+
+        # Save TF-IDF vectorizer if used
+        if not self.use_transformer and hasattr(self, 'vectorizer'):
+            vectorizer_file = self.store_path / 'vectorizer.pkl'
+            with open(vectorizer_file, 'wb') as f:
+                pickle.dump(self.vectorizer, f)
+
+    def load_embeddings(self):
+        """Load embeddings from disk."""
+        embeddings_file = self.store_path / 'embeddings.pkl'
+        metadata_file = self.store_path / 'metadata.json'
+
+        if embeddings_file.exists():
+            with open(embeddings_file, 'rb') as f:
+                self.embeddings = pickle.load(f)
+
+        if metadata_file.exists():
+            with open(metadata_file, 'r') as f:
+                self.index_metadata = json.load(f)
+
+        # Load TF-IDF vectorizer if needed
+        if not self.use_transformer:
+            vectorizer_file = self.store_path / 'vectorizer.pkl'
+            if vectorizer_file.exists():
+                with open(vectorizer_file, 'rb') as f:
+                    self.vectorizer = pickle.load(f)
+
+    def _compute_embedding(self, text: str) -> np.ndarray:
+        """Compute embedding for a single text."""
+        if self.use_transformer:
+            return self.model.encode(text, convert_to_numpy=True)
+        else:
+            # TF-IDF fallback
+            if not hasattr(self, 'vectorizer') or not self.fitted_texts:
+                # First text - fit the vectorizer
+                self.fitted_texts = [text]
+                embeddings = self.vectorizer.fit_transform([text])
+            else:
+                # Transform using existing vocabulary
+                try:
+                    embeddings = self.vectorizer.transform([text])
+                except:
+                    # Refit with all texts if vocabulary changed
+                    self.fitted_texts.append(text)
+                    embeddings = self.vectorizer.fit_transform(self.fitted_texts)
+
+            return embeddings.toarray()[0]
+
+    def _compute_embeddings_batch(self, texts: List[str]) -> List[np.ndarray]:
+        """Compute embeddings for multiple texts efficiently."""
+        if self.use_transformer:
+            return self.model.encode(texts, convert_to_numpy=True)
+        else:
+            # TF-IDF fallback
+            if not hasattr(self, 'vectorizer') or not self.fitted_texts:
+                self.fitted_texts = texts
+                embeddings = self.vectorizer.fit_transform(texts)
+            else:
+                try:
+                    embeddings = self.vectorizer.transform(texts)
+                except:
+                    self.fitted_texts.extend(texts)
+                    embeddings = self.vectorizer.fit_transform(self.fitted_texts)
+
+            return [embeddings[i].toarray()[0] for i in range(len(texts))]
+
+    def _generate_id(self, text: str) -> str:
+        """Generate unique ID for a document."""
+        return hashlib.md5(text.encode()).hexdigest()[:16]
+
+    def _matches_filter(self, metadata: Dict[str, Any], filter_dict: Dict[str, Any]) -> bool:
+        """Check if metadata matches filter criteria."""
+        for key, value in filter_dict.items():
+            if key not in metadata:
+                return False
+            if isinstance(value, list):
+                if metadata[key] not in value:
+                    return False
+            else:
+                if metadata[key] != value:
+                    return False
+        return True
+
+
+class SemanticSearch:
+    """
+    High-level semantic search interface for ebook libraries.
+    """
+
+    def __init__(self, library_path: Path):
+        self.library_path = Path(library_path)
+        self.embedding_store = EmbeddingStore(library_path)
+        self.book_chunks: Dict[str, List[str]] = {}  # book_id -> chunk_ids
+
+    def index_book(self, book_id: str, text: str, chunk_size: int = 500):
+        """
+        Index a book by splitting into chunks and computing embeddings.
+        """
+        # Split text into chunks
+        chunks = self._split_into_chunks(text, chunk_size)
+
+        # Add chunks to embedding store
+        chunk_ids = []
+        for i, chunk in enumerate(chunks):
+            metadata = {
+                'book_id': book_id,
+                'chunk_index': i,
+                'chunk_total': len(chunks)
+            }
+            chunk_id = self.embedding_store.add_document(chunk, metadata)
+            chunk_ids.append(chunk_id)
+
+        self.book_chunks[book_id] = chunk_ids
+        self.embedding_store.save_embeddings()
+
+    def search_library(self, query: str, top_k: int = 10,
+                       book_ids: List[str] = None) -> List[Dict[str, Any]]:
+        """
+        Search across the entire library or specific books.
+        """
+        # Prepare filter
+        filter_metadata = None
+        if book_ids:
+            filter_metadata = {'book_id': book_ids}
+
+        # Perform search
+        results = self.embedding_store.search(
+            query, top_k=top_k, filter_metadata=filter_metadata
+        )
+
+        # Format results
+        formatted_results = []
+        for doc, similarity in results:
+            formatted_results.append({
+                'book_id': doc.metadata.get('book_id'),
+                'text': doc.text,
+                'similarity': similarity,
+                'chunk_index': doc.metadata.get('chunk_index'),
+                'metadata': doc.metadata
+            })
+
+        return formatted_results
+
+    def find_cross_references(self, book_id: str, passage: str,
+                              other_books: List[str] = None) -> List[Dict[str, Any]]:
+        """
+        Find similar passages in other books (cross-references).
+        """
+        # Search in other books
+        filter_metadata = None
+        if other_books:
+            filter_metadata = {'book_id': other_books}
+        else:
+            # Search all books except the source
+            filter_metadata = {}
+
+        results = self.embedding_store.search(
+            passage, top_k=10, filter_metadata=filter_metadata
+        )
+
+        # Filter out results from the same book
+        cross_refs = []
+        for doc, similarity in results:
+            if doc.metadata.get('book_id') != book_id:
+                cross_refs.append({
+                    'book_id': doc.metadata.get('book_id'),
+                    'text': doc.text,
+                    'similarity': similarity,
+                    'metadata': doc.metadata
+                })
+
+        return cross_refs
+
+    def get_book_summary_vectors(self, book_ids: List[str]) -> Dict[str, np.ndarray]:
+        """
+        Get summary embedding vectors for books (average of all chunks).
+        """
+        book_vectors = {}
+
+        for book_id in book_ids:
+            if book_id not in self.book_chunks:
+                continue
+
+            # Get all chunk embeddings
+            embeddings = []
+            for chunk_id in self.book_chunks[book_id]:
+                doc = self.embedding_store.get_document(chunk_id)
+                if doc:
+                    embeddings.append(doc.embedding)
+
+            if embeddings:
+                # Average embeddings
+                book_vectors[book_id] = np.mean(embeddings, axis=0)
+
+        return book_vectors
+
+    def find_similar_books(self, book_id: str, top_k: int = 5) -> List[Tuple[str, float]]:
+        """
+        Find books similar to a given book based on content similarity.
+        """
+        # Get summary vector for source book
+        source_vectors = self.get_book_summary_vectors([book_id])
+        if book_id not in source_vectors:
+            return []
+
+        source_vector = source_vectors[book_id]
+
+        # Get vectors for all other books
+        all_book_ids = list(self.book_chunks.keys())
+        if book_id in all_book_ids:
+            all_book_ids.remove(book_id)
+
+        other_vectors = self.get_book_summary_vectors(all_book_ids)
+
+        # Calculate similarities
+        similarities = []
+        for other_id, other_vector in other_vectors.items():
+            similarity = float(cosine_similarity(
+                source_vector.reshape(1, -1),
+                other_vector.reshape(1, -1)
+            )[0, 0])
+            similarities.append((other_id, similarity))
+
+        # Sort and return top-k
+        similarities.sort(key=lambda x: x[1], reverse=True)
+        return similarities[:top_k]
+
+    def _split_into_chunks(self, text: str, chunk_size: int) -> List[str]:
+        """Split text into overlapping chunks."""
+        words = text.split()
+        chunks = []
+        overlap = chunk_size // 4  # 25% overlap
+
+        for i in range(0, len(words), chunk_size - overlap):
+            chunk = ' '.join(words[i:i + chunk_size])
+            if chunk:
+                chunks.append(chunk)
+
+        return chunks