ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ebk might be problematic.

Files changed (61)
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/similarity/base.py ADDED
@@ -0,0 +1,154 @@
"""Base classes for the similarity system.

This module defines the core abstractions:
- Extractor: Extracts values from books
- Metric: Computes similarity between values
- Feature: Combines an extractor and a metric
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, Generic, TypeVar

from ebk.db.models import Book

T = TypeVar("T")


class Extractor(ABC, Generic[T]):
    """Extracts a value from a book for similarity comparison.

    Examples:
        - ContentExtractor: Extracts full text
        - AuthorsExtractor: Extracts set of author names
        - SubjectsExtractor: Extracts set of subjects
        - PublicationYearExtractor: Extracts publication year
    """

    @abstractmethod
    def extract(self, book: Book) -> T:
        """Extract a value from the book.

        Args:
            book: Book to extract value from

        Returns:
            Extracted value (type depends on extractor)
        """
        pass


class Metric(ABC, Generic[T]):
    """Computes similarity between two values.

    All similarity scores must be normalized to [0, 1] where:
    - 0 = completely dissimilar
    - 1 = identical

    Examples:
        - TfidfMetric: Computes cosine similarity of TF-IDF vectors
        - JaccardMetric: Computes set overlap
        - ExactMatchMetric: Returns 1 if equal, 0 otherwise
        - TemporalDecayMetric: Gaussian decay based on time difference
    """

    @abstractmethod
    def similarity(self, value1: T, value2: T) -> float:
        """Compute similarity between two values.

        Args:
            value1: First value
            value2: Second value

        Returns:
            Similarity score in [0, 1]
        """
        pass

    def fit(self, data: Dict[int, T]) -> None:
        """Fit metric on a corpus (optional).

        Override this for metrics that need pre-computation, such as:
        - TF-IDF: Fit vectorizer and cache vectors
        - Embeddings: Compute and cache embeddings

        Default implementation is a no-op for metrics that don't need fitting
        (e.g., Jaccard, exact match, temporal decay).

        Args:
            data: Dictionary mapping book IDs to extracted values
        """
        pass  # No-op by default

    def save(self, path: Path) -> None:
        """Save fitted state to disk (optional).

        Override this for metrics that cache expensive computations.
        Default implementation is a no-op.

        Args:
            path: Path to save fitted state
        """
        pass  # No-op by default

    def load(self, path: Path) -> None:
        """Load fitted state from disk (optional).

        Override this for metrics that cache expensive computations.
        Default implementation is a no-op.

        Args:
            path: Path to load fitted state from
        """
        pass  # No-op by default


class Feature:
    """Combines an extractor and a metric with a weight.

    A Feature represents one aspect of book similarity, such as:
    - Content similarity (text + TF-IDF)
    - Author overlap (authors + Jaccard)
    - Temporal proximity (pub year + Gaussian decay)

    Attributes:
        extractor: Extractor for getting values from books
        metric: Metric for computing similarity between values
        weight: Weight for this feature (default 1.0)
        name: Optional name for this feature
    """

    def __init__(
        self,
        extractor: Extractor,
        metric: Metric,
        weight: float = 1.0,
        name: str = None,
    ):
        """Initialize a feature.

        Args:
            extractor: Extractor for getting values from books
            metric: Metric for computing similarity between values
            weight: Weight for this feature (default 1.0)
            name: Optional name for this feature
        """
        self.extractor = extractor
        self.metric = metric
        self.weight = weight
        self.name = name or f"{extractor.__class__.__name__}+{metric.__class__.__name__}"

    def similarity(self, book1: Book, book2: Book) -> float:
        """Compute weighted similarity between two books.

        Args:
            book1: First book
            book2: Second book

        Returns:
            Weighted similarity score
        """
        value1 = self.extractor.extract(book1)
        value2 = self.extractor.extract(book2)
        sim = self.metric.similarity(value1, value2)
        return sim * self.weight
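
For illustration only (not part of the ebk 0.3.2 diff), here is a minimal sketch of how these abstractions compose: a custom extractor and metric plugged into a Feature. The SeriesExtractor and OverlapMetric names, and the Book.series attribute, are invented for the example; only Extractor, Metric, Feature, and Book come from the package.

# Hypothetical sketch -- SeriesExtractor and OverlapMetric are illustrative names,
# not part of ebk; only Extractor, Metric, Feature, and Book ship with the package.
from ebk.db.models import Book
from ebk.similarity.base import Extractor, Feature, Metric


class SeriesExtractor(Extractor[set]):
    """Extract the set of series names a book belongs to (assumes a Book.series field)."""

    def extract(self, book: Book) -> set:
        series = getattr(book, "series", None)  # hedge: field may not exist on Book
        return {series} if series else set()


class OverlapMetric(Metric[set]):
    """Return 1.0 if the two sets share any element, else 0.0 (already in [0, 1])."""

    def similarity(self, value1: set, value2: set) -> float:
        return 1.0 if value1 & value2 else 0.0


# Combine into a weighted feature; Feature.similarity() multiplies the raw score by the weight.
series_feature = Feature(SeriesExtractor(), OverlapMetric(), weight=1.5, name="series")

Because OverlapMetric needs no corpus statistics, the inherited no-op fit()/save()/load() are enough; a TF-IDF-style metric would override them to cache its fitted state.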
ebk/similarity/core.py ADDED
@@ -0,0 +1,445 @@
"""Core BookSimilarity class with fluent API."""

from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

from ebk.db.models import Book
from ebk.similarity.base import Feature, Metric
from ebk.similarity.extractors import (
    AuthorsExtractor,
    ContentExtractor,
    DescriptionExtractor,
    LanguageExtractor,
    PageCountExtractor,
    PublicationYearExtractor,
    PublisherExtractor,
    SubjectsExtractor,
)
from ebk.similarity.metrics import (
    CosineMetric,
    ExactMatchMetric,
    JaccardMetric,
    NumericProximityMetric,
    TemporalDecayMetric,
    TfidfMetric,
)


class BookSimilarity:
    """Compute similarity between books using multiple features.

    This class uses a fluent API for configuration:

    Example:
        >>> sim = (BookSimilarity()
        ...     .content(weight=4.0)
        ...     .authors(weight=2.0)
        ...     .subjects(weight=1.0)
        ...     .temporal(weight=0.5))
        >>> sim.fit(books)
        >>> score = sim.similarity(book1, book2)

    Each method adds a feature (extractor + metric + weight) to the similarity
    computation. The final similarity is the weighted average of all features.

    Three-tier API:
        - Tier 1: Presets (.balanced(), .content_only())
        - Tier 2: Semantic methods (.content(), .authors()) with defaults
        - Tier 3: Escape hatch (.custom()) for power users
    """

    def __init__(self):
        """Initialize empty similarity configuration."""
        self.features: List[Feature] = []
        self._fitted = False

    # ===== Tier 1: Presets =====

    def balanced(self) -> "BookSimilarity":
        """Balanced preset with reasonable defaults.

        Weights:
            - Content (TF-IDF): 4.0
            - Authors (Jaccard): 2.0
            - Subjects (Jaccard): 1.0
            - Temporal (Gaussian): 0.5

        Returns:
            Self for chaining
        """
        return (
            self.content(weight=4.0)
            .authors(weight=2.0)
            .subjects(weight=1.0)
            .temporal(weight=0.5)
        )

    def content_only(self, metric: Optional[Metric] = None) -> "BookSimilarity":
        """Content-only preset (pure semantic similarity).

        Uses TF-IDF by default, but the metric can be overridden.

        Args:
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        return self.content(weight=1.0, metric=metric)

    def metadata_only(self) -> "BookSimilarity":
        """Metadata-only preset (no content similarity).

        Weights:
            - Authors (Jaccard): 3.0
            - Subjects (Jaccard): 2.0
            - Temporal (Gaussian): 1.0
            - Language (Exact): 1.0
            - Publisher (Exact): 0.5

        Returns:
            Self for chaining
        """
        return (
            self.authors(weight=3.0)
            .subjects(weight=2.0)
            .temporal(weight=1.0)
            .language(weight=1.0)
            .publisher(weight=0.5)
        )

    # ===== Tier 2: Semantic Methods =====

    def content(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add content similarity (full text).

        Default metric: TfidfMetric (cosine similarity of TF-IDF vectors)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        metric = metric or TfidfMetric()
        extractor = ContentExtractor()
        self.features.append(Feature(extractor, metric, weight, "content"))
        return self

    def description(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add description similarity (book summary/blurb).

        Default metric: TfidfMetric

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        metric = metric or TfidfMetric()
        extractor = DescriptionExtractor()
        self.features.append(Feature(extractor, metric, weight, "description"))
        return self

    def authors(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add author overlap similarity.

        Default metric: JaccardMetric (set overlap)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default JaccardMetric)

        Returns:
            Self for chaining
        """
        metric = metric or JaccardMetric()
        extractor = AuthorsExtractor()
        self.features.append(Feature(extractor, metric, weight, "authors"))
        return self

    def subjects(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add subject/tag overlap similarity.

        Default metric: JaccardMetric (set overlap)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default JaccardMetric)

        Returns:
            Self for chaining
        """
        metric = metric or JaccardMetric()
        extractor = SubjectsExtractor()
        self.features.append(Feature(extractor, metric, weight, "subjects"))
        return self

    def temporal(
        self, weight: float = 1.0, metric: Optional[Metric] = None, sigma: float = 10.0
    ) -> "BookSimilarity":
        """Add temporal proximity similarity (publication date).

        Default metric: TemporalDecayMetric (Gaussian decay)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TemporalDecayMetric)
            sigma: Standard deviation in years for Gaussian decay (default 10.0)

        Returns:
            Self for chaining
        """
        metric = metric or TemporalDecayMetric(sigma=sigma)
        extractor = PublicationYearExtractor()
        self.features.append(Feature(extractor, metric, weight, "temporal"))
        return self

    def language(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add language match similarity.

        Default metric: ExactMatchMetric (1 if same language, 0 otherwise)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default ExactMatchMetric)

        Returns:
            Self for chaining
        """
        metric = metric or ExactMatchMetric()
        extractor = LanguageExtractor()
        self.features.append(Feature(extractor, metric, weight, "language"))
        return self

    def publisher(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add publisher match similarity.

        Default metric: ExactMatchMetric (1 if same publisher, 0 otherwise)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default ExactMatchMetric)

        Returns:
            Self for chaining
        """
        metric = metric or ExactMatchMetric()
        extractor = PublisherExtractor()
        self.features.append(Feature(extractor, metric, weight, "publisher"))
        return self

    def page_count(
        self,
        weight: float = 1.0,
        metric: Optional[Metric] = None,
        max_diff: float = 1000.0,
    ) -> "BookSimilarity":
        """Add page count proximity similarity.

        Default metric: NumericProximityMetric

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default NumericProximityMetric)
            max_diff: Maximum expected difference in pages (default 1000)

        Returns:
            Self for chaining
        """
        metric = metric or NumericProximityMetric(max_diff=max_diff)
        extractor = PageCountExtractor()
        self.features.append(Feature(extractor, metric, weight, "page_count"))
        return self

    # ===== Tier 3: Escape Hatch =====

    def custom(
        self, feature: Feature, name: Optional[str] = None
    ) -> "BookSimilarity":
        """Add a custom feature for power users.

        Args:
            feature: Custom Feature (extractor + metric + weight)
            name: Optional name for this feature

        Returns:
            Self for chaining
        """
        if name:
            feature.name = name
        self.features.append(feature)
        return self

    # ===== Core Functionality =====

    def fit(self, books: List[Book]) -> "BookSimilarity":
        """Fit all metrics on the corpus.

        This pre-computes expensive features (e.g., TF-IDF vectors) for
        dramatic performance improvements.

        Args:
            books: List of books to fit on

        Returns:
            Self for chaining
        """
        if not books:
            return self

        # For each feature, extract values and fit metric
        for feature in self.features:
            # Extract values for all books
            data = {}
            for book in books:
                try:
                    value = feature.extractor.extract(book)
                    data[book.id] = value
                except Exception:
                    # Skip books that fail extraction
                    continue

            # Fit metric (no-op for most metrics)
            feature.metric.fit(data)

        self._fitted = True
        return self

    def similarity(self, book1: Book, book2: Book) -> float:
        """Compute similarity between two books.

        Returns weighted average of all feature similarities.

        Args:
            book1: First book
            book2: Second book

        Returns:
            Similarity score in [0, 1]
        """
        if not self.features:
            raise ValueError("No features configured. Use .content(), .authors(), etc.")

        total_weighted_sim = 0.0
        total_weight = 0.0

        for feature in self.features:
            try:
                weighted_sim = feature.similarity(book1, book2)
                total_weighted_sim += weighted_sim
                total_weight += feature.weight
            except Exception:
                # Skip features that fail
                continue

        if total_weight == 0:
            return 0.0

        return total_weighted_sim / total_weight

    def similarity_matrix(self, books: List[Book]) -> np.ndarray:
        """Compute pairwise similarity matrix for all books.

        Returns NxN matrix where matrix[i][j] = similarity(books[i], books[j])

        This is much faster than computing similarities one by one.

        Args:
            books: List of books

        Returns:
            NxN numpy array of similarities
        """
        n = len(books)
        matrix = np.zeros((n, n))

        # Diagonal is always 1.0 (book is identical to itself)
        np.fill_diagonal(matrix, 1.0)

        # Compute upper triangle (matrix is symmetric)
        for i in range(n):
            for j in range(i + 1, n):
                sim = self.similarity(books[i], books[j])
                matrix[i][j] = sim
                matrix[j][i] = sim  # Symmetric

        return matrix

    def find_similar(
        self, book: Book, candidates: List[Book], top_k: int = 10
    ) -> List[Tuple[Book, float]]:
        """Find top-k most similar books from candidates.

        Args:
            book: Query book
            candidates: Candidate books to compare against
            top_k: Number of results to return (default 10)

        Returns:
            List of (book, similarity) tuples, sorted by similarity descending
        """
        # Compute similarities
        similarities = []
        for candidate in candidates:
            if candidate.id == book.id:
                continue  # Skip self

            sim = self.similarity(book, candidate)
            similarities.append((candidate, sim))

        # Sort by similarity descending
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return top-k
        return similarities[:top_k]

    def save(self, path: Path) -> None:
        """Save fitted state to disk.

        Args:
            path: Directory to save to (will create multiple files)
        """
        if not self._fitted:
            raise RuntimeError("Must call fit() before save()")

        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save each feature's metric
        for i, feature in enumerate(self.features):
            metric_path = path / f"metric_{i}_{feature.name}.pkl"
            feature.metric.save(metric_path)

    def load(self, path: Path) -> None:
        """Load fitted state from disk.

        Args:
            path: Directory to load from
        """
        path = Path(path)

        # Load each feature's metric
        for i, feature in enumerate(self.features):
            metric_path = path / f"metric_{i}_{feature.name}.pkl"
            if metric_path.exists():
                feature.metric.load(metric_path)

        self._fitted = True
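
To make the fluent API concrete, a hedged usage sketch follows. The BookSimilarity methods (balanced(), fit(), similarity(), find_similar(), save(), load()) and the Book.id attribute are taken from the code above; how the list of Book rows is obtained is not shown in this diff, so the sketch takes it as a parameter, and the cache directory path is an arbitrary example.

# Usage sketch (illustrative, not part of the package): `books` is assumed to be a
# list of ebk.db.models.Book instances loaded by the caller from the ebk database.
from pathlib import Path

from ebk.similarity.core import BookSimilarity


def demo(books):
    # Tier 1 preset, then fit the corpus so expensive metrics (TF-IDF) cache their state.
    sim = BookSimilarity().balanced().fit(books)

    # Per-pair score: weighted average of the content, authors, subjects, temporal features.
    score = sim.similarity(books[0], books[1])
    print(f"pair score: {score:.3f}")

    # Top-10 neighbours of the first book, sorted by descending similarity (self is skipped).
    for book, s in sim.find_similar(books[0], books, top_k=10):
        print(f"{s:.3f}  book id {book.id}")

    # Persist fitted metric state (one pickle per feature), then reload it into a
    # fresh instance configured with the same features.
    cache_dir = Path("similarity_cache")  # example path, not an ebk convention
    sim.save(cache_dir)
    sim2 = BookSimilarity().balanced()
    sim2.load(cache_dir)
    return sim2

The final score is the sum of weight times feature similarity divided by the sum of weights, so with the balanced preset (weights 4.0 + 2.0 + 1.0 + 0.5 = 7.5) a perfect content match alone contributes at most 4.0 / 7.5 ≈ 0.53; note that features whose extraction or comparison raises are silently skipped and their weight dropped from the denominator.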