ebk 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ebk might be problematic.
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +1097 -9
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +59 -0
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/exports/zip.py +25 -0
- ebk/library_db.py +155 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
- ebk-0.3.2.dist-info/RECORD +69 -0
- ebk-0.3.2.dist-info/entry_points.txt +2 -0
- ebk-0.3.2.dist-info/top_level.txt +1 -0
- ebk-0.3.1.dist-info/RECORD +0 -19
- ebk-0.3.1.dist-info/entry_points.txt +0 -6
- ebk-0.3.1.dist-info/top_level.txt +0 -2
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/similarity/base.py
ADDED
@@ -0,0 +1,154 @@
"""Base classes for the similarity system.

This module defines the core abstractions:
- Extractor: Extracts values from books
- Metric: Computes similarity between values
- Feature: Combines an extractor and a metric
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, Generic, TypeVar

from ebk.db.models import Book

T = TypeVar("T")


class Extractor(ABC, Generic[T]):
    """Extracts a value from a book for similarity comparison.

    Examples:
        - ContentExtractor: Extracts full text
        - AuthorsExtractor: Extracts set of author names
        - SubjectsExtractor: Extracts set of subjects
        - PublicationYearExtractor: Extracts publication year
    """

    @abstractmethod
    def extract(self, book: Book) -> T:
        """Extract a value from the book.

        Args:
            book: Book to extract value from

        Returns:
            Extracted value (type depends on extractor)
        """
        pass


class Metric(ABC, Generic[T]):
    """Computes similarity between two values.

    All similarity scores must be normalized to [0, 1] where:
    - 0 = completely dissimilar
    - 1 = identical

    Examples:
        - TfidfMetric: Computes cosine similarity of TF-IDF vectors
        - JaccardMetric: Computes set overlap
        - ExactMatchMetric: Returns 1 if equal, 0 otherwise
        - TemporalDecayMetric: Gaussian decay based on time difference
    """

    @abstractmethod
    def similarity(self, value1: T, value2: T) -> float:
        """Compute similarity between two values.

        Args:
            value1: First value
            value2: Second value

        Returns:
            Similarity score in [0, 1]
        """
        pass

    def fit(self, data: Dict[int, T]) -> None:
        """Fit metric on a corpus (optional).

        Override this for metrics that need pre-computation, such as:
        - TF-IDF: Fit vectorizer and cache vectors
        - Embeddings: Compute and cache embeddings

        Default implementation is no-op for metrics that don't need fitting
        (e.g., Jaccard, exact match, temporal decay).

        Args:
            data: Dictionary mapping book IDs to extracted values
        """
        pass  # No-op by default

    def save(self, path: Path) -> None:
        """Save fitted state to disk (optional).

        Override this for metrics that cache expensive computations.
        Default implementation is no-op.

        Args:
            path: Path to save fitted state
        """
        pass  # No-op by default

    def load(self, path: Path) -> None:
        """Load fitted state from disk (optional).

        Override this for metrics that cache expensive computations.
        Default implementation is no-op.

        Args:
            path: Path to load fitted state from
        """
        pass  # No-op by default


class Feature:
    """Combines an extractor and a metric with a weight.

    A Feature represents one aspect of book similarity, such as:
    - Content similarity (text + TF-IDF)
    - Author overlap (authors + Jaccard)
    - Temporal proximity (pub year + Gaussian decay)

    Attributes:
        extractor: Extractor for getting values from books
        metric: Metric for computing similarity between values
        weight: Weight for this feature (default 1.0)
        name: Optional name for this feature
    """

    def __init__(
        self,
        extractor: Extractor,
        metric: Metric,
        weight: float = 1.0,
        name: str = None,
    ):
        """Initialize a feature.

        Args:
            extractor: Extractor for getting values from books
            metric: Metric for computing similarity between values
            weight: Weight for this feature (default 1.0)
            name: Optional name for this feature
        """
        self.extractor = extractor
        self.metric = metric
        self.weight = weight
        self.name = name or f"{extractor.__class__.__name__}+{metric.__class__.__name__}"

    def similarity(self, book1: Book, book2: Book) -> float:
        """Compute weighted similarity between two books.

        Args:
            book1: First book
            book2: Second book

        Returns:
            Weighted similarity score
        """
        value1 = self.extractor.extract(book1)
        value2 = self.extractor.extract(book2)
        sim = self.metric.similarity(value1, value2)
        return sim * self.weight
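To make these extension points concrete, here is a minimal sketch of a custom feature built on the abstractions above. The TitleExtractor and TokenOverlapMetric names are hypothetical (they are not part of ebk), and the sketch assumes ebk.db.models.Book exposes a title attribute.

from ebk.db.models import Book
from ebk.similarity.base import Extractor, Feature, Metric


class TitleExtractor(Extractor[str]):
    """Hypothetical extractor: returns a book's title (assumes Book.title exists)."""

    def extract(self, book: Book) -> str:
        return book.title or ""


class TokenOverlapMetric(Metric[str]):
    """Hypothetical metric: Jaccard overlap of lowercase tokens, normalized to [0, 1]."""

    def similarity(self, value1: str, value2: str) -> float:
        tokens1 = set(value1.lower().split())
        tokens2 = set(value2.lower().split())
        if not tokens1 or not tokens2:
            return 0.0
        return len(tokens1 & tokens2) / len(tokens1 | tokens2)


# Feature pairs the extractor with the metric and scales the score by the weight,
# so title_feature.similarity(book1, book2) returns 0.5 * the token overlap.
title_feature = Feature(TitleExtractor(), TokenOverlapMetric(), weight=0.5, name="title")

Only fit(), save(), and load() need overriding when a metric caches expensive state; their defaults are no-ops, so this metric works without them.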
ebk/similarity/core.py
ADDED
@@ -0,0 +1,445 @@
"""Core BookSimilarity class with fluent API."""

from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

from ebk.db.models import Book
from ebk.similarity.base import Feature, Metric
from ebk.similarity.extractors import (
    AuthorsExtractor,
    ContentExtractor,
    DescriptionExtractor,
    LanguageExtractor,
    PageCountExtractor,
    PublicationYearExtractor,
    PublisherExtractor,
    SubjectsExtractor,
)
from ebk.similarity.metrics import (
    CosineMetric,
    ExactMatchMetric,
    JaccardMetric,
    NumericProximityMetric,
    TemporalDecayMetric,
    TfidfMetric,
)


class BookSimilarity:
    """Compute similarity between books using multiple features.

    This class uses a fluent API for configuration:

    Example:
        >>> sim = (BookSimilarity()
        ...     .content(weight=4.0)
        ...     .authors(weight=2.0)
        ...     .subjects(weight=1.0)
        ...     .temporal(weight=0.5))
        >>> sim.fit(books)
        >>> score = sim.similarity(book1, book2)

    Each method adds a feature (extractor + metric + weight) to the similarity
    computation. The final similarity is the weighted average of all features.

    Three-tier API:
    - Tier 1: Presets (.balanced(), .content_only())
    - Tier 2: Semantic methods (.content(), .authors()) with defaults
    - Tier 3: Escape hatch (.custom()) for power users
    """

    def __init__(self):
        """Initialize empty similarity configuration."""
        self.features: List[Feature] = []
        self._fitted = False

    # ===== Tier 1: Presets =====

    def balanced(self) -> "BookSimilarity":
        """Balanced preset with reasonable defaults.

        Weights:
        - Content (TF-IDF): 4.0
        - Authors (Jaccard): 2.0
        - Subjects (Jaccard): 1.0
        - Temporal (Gaussian): 0.5

        Returns:
            Self for chaining
        """
        return (
            self.content(weight=4.0)
            .authors(weight=2.0)
            .subjects(weight=1.0)
            .temporal(weight=0.5)
        )

    def content_only(self, metric: Optional[Metric] = None) -> "BookSimilarity":
        """Content-only preset (pure semantic similarity).

        Uses TF-IDF by default, but can override metric.

        Args:
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        return self.content(weight=1.0, metric=metric)

    def metadata_only(self) -> "BookSimilarity":
        """Metadata-only preset (no content similarity).

        Weights:
        - Authors (Jaccard): 3.0
        - Subjects (Jaccard): 2.0
        - Temporal (Gaussian): 1.0
        - Language (Exact): 1.0
        - Publisher (Exact): 0.5

        Returns:
            Self for chaining
        """
        return (
            self.authors(weight=3.0)
            .subjects(weight=2.0)
            .temporal(weight=1.0)
            .language(weight=1.0)
            .publisher(weight=0.5)
        )

    # ===== Tier 2: Semantic Methods =====

    def content(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add content similarity (full text).

        Default metric: TfidfMetric (cosine similarity of TF-IDF vectors)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        metric = metric or TfidfMetric()
        extractor = ContentExtractor()
        self.features.append(Feature(extractor, metric, weight, "content"))
        return self

    def description(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add description similarity (book summary/blurb).

        Default metric: TfidfMetric (delegates to content provider)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TfidfMetric)

        Returns:
            Self for chaining
        """
        metric = metric or TfidfMetric()
        extractor = DescriptionExtractor()
        self.features.append(Feature(extractor, metric, weight, "description"))
        return self

    def authors(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add author overlap similarity.

        Default metric: JaccardMetric (set overlap)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default JaccardMetric)

        Returns:
            Self for chaining
        """
        metric = metric or JaccardMetric()
        extractor = AuthorsExtractor()
        self.features.append(Feature(extractor, metric, weight, "authors"))
        return self

    def subjects(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add subject/tag overlap similarity.

        Default metric: JaccardMetric (set overlap)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default JaccardMetric)

        Returns:
            Self for chaining
        """
        metric = metric or JaccardMetric()
        extractor = SubjectsExtractor()
        self.features.append(Feature(extractor, metric, weight, "subjects"))
        return self

    def temporal(
        self, weight: float = 1.0, metric: Optional[Metric] = None, sigma: float = 10.0
    ) -> "BookSimilarity":
        """Add temporal proximity similarity (publication date).

        Default metric: TemporalDecayMetric (Gaussian decay)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default TemporalDecayMetric)
            sigma: Standard deviation in years for Gaussian decay (default 10.0)

        Returns:
            Self for chaining
        """
        metric = metric or TemporalDecayMetric(sigma=sigma)
        extractor = PublicationYearExtractor()
        self.features.append(Feature(extractor, metric, weight, "temporal"))
        return self

    def language(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add language match similarity.

        Default metric: ExactMatchMetric (1 if same language, 0 otherwise)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default ExactMatchMetric)

        Returns:
            Self for chaining
        """
        metric = metric or ExactMatchMetric()
        extractor = LanguageExtractor()
        self.features.append(Feature(extractor, metric, weight, "language"))
        return self

    def publisher(
        self, weight: float = 1.0, metric: Optional[Metric] = None
    ) -> "BookSimilarity":
        """Add publisher match similarity.

        Default metric: ExactMatchMetric (1 if same publisher, 0 otherwise)

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default ExactMatchMetric)

        Returns:
            Self for chaining
        """
        metric = metric or ExactMatchMetric()
        extractor = PublisherExtractor()
        self.features.append(Feature(extractor, metric, weight, "publisher"))
        return self

    def page_count(
        self,
        weight: float = 1.0,
        metric: Optional[Metric] = None,
        max_diff: float = 1000.0,
    ) -> "BookSimilarity":
        """Add page count proximity similarity.

        Default metric: NumericProximityMetric

        Args:
            weight: Weight for this feature (default 1.0)
            metric: Optional custom metric (default NumericProximityMetric)
            max_diff: Maximum expected difference in pages (default 1000)

        Returns:
            Self for chaining
        """
        metric = metric or NumericProximityMetric(max_diff=max_diff)
        extractor = PageCountExtractor()
        self.features.append(Feature(extractor, metric, weight, "page_count"))
        return self

    # ===== Tier 3: Escape Hatch =====

    def custom(
        self, feature: Feature, name: Optional[str] = None
    ) -> "BookSimilarity":
        """Add a custom feature for power users.

        Args:
            feature: Custom Feature (extractor + metric + weight)
            name: Optional name for this feature

        Returns:
            Self for chaining
        """
        if name:
            feature.name = name
        self.features.append(feature)
        return self

    # ===== Core Functionality =====

    def fit(self, books: List[Book]) -> "BookSimilarity":
        """Fit all metrics on the corpus.

        This pre-computes expensive features (e.g., TF-IDF vectors) for
        dramatic performance improvements.

        Args:
            books: List of books to fit on

        Returns:
            Self for chaining
        """
        if not books:
            return self

        # For each feature, extract values and fit metric
        for feature in self.features:
            # Extract values for all books
            data = {}
            for book in books:
                try:
                    value = feature.extractor.extract(book)
                    data[book.id] = value
                except Exception:
                    # Skip books that fail extraction
                    continue

            # Fit metric (no-op for most metrics)
            feature.metric.fit(data)

        self._fitted = True
        return self

    def similarity(self, book1: Book, book2: Book) -> float:
        """Compute similarity between two books.

        Returns weighted average of all feature similarities.

        Args:
            book1: First book
            book2: Second book

        Returns:
            Similarity score in [0, 1]
        """
        if not self.features:
            raise ValueError("No features configured. Use .content(), .authors(), etc.")

        total_weighted_sim = 0.0
        total_weight = 0.0

        for feature in self.features:
            try:
                weighted_sim = feature.similarity(book1, book2)
                total_weighted_sim += weighted_sim
                total_weight += feature.weight
            except Exception:
                # Skip features that fail
                continue

        if total_weight == 0:
            return 0.0

        return total_weighted_sim / total_weight

    def similarity_matrix(self, books: List[Book]) -> np.ndarray:
        """Compute pairwise similarity matrix for all books.

        Returns NxN matrix where matrix[i][j] = similarity(books[i], books[j])

        This is much faster than computing similarities one by one.

        Args:
            books: List of books

        Returns:
            NxN numpy array of similarities
        """
        n = len(books)
        matrix = np.zeros((n, n))

        # Diagonal is always 1.0 (book is identical to itself)
        np.fill_diagonal(matrix, 1.0)

        # Compute upper triangle (matrix is symmetric)
        for i in range(n):
            for j in range(i + 1, n):
                sim = self.similarity(books[i], books[j])
                matrix[i][j] = sim
                matrix[j][i] = sim  # Symmetric

        return matrix

    def find_similar(
        self, book: Book, candidates: List[Book], top_k: int = 10
    ) -> List[Tuple[Book, float]]:
        """Find top-k most similar books from candidates.

        Args:
            book: Query book
            candidates: Candidate books to compare against
            top_k: Number of results to return (default 10)

        Returns:
            List of (book, similarity) tuples, sorted by similarity descending
        """
        # Compute similarities
        similarities = []
        for candidate in candidates:
            if candidate.id == book.id:
                continue  # Skip self

            sim = self.similarity(book, candidate)
            similarities.append((candidate, sim))

        # Sort by similarity descending
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return top-k
        return similarities[:top_k]

    def save(self, path: Path) -> None:
        """Save fitted state to disk.

        Args:
            path: Directory to save to (will create multiple files)
        """
        if not self._fitted:
            raise RuntimeError("Must call fit() before save()")

        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save each feature's metric
        for i, feature in enumerate(self.features):
            metric_path = path / f"metric_{i}_{feature.name}.pkl"
            feature.metric.save(metric_path)

    def load(self, path: Path) -> None:
        """Load fitted state from disk.

        Args:
            path: Directory to load from
        """
        path = Path(path)

        # Load each feature's metric
        for i, feature in enumerate(self.features):
            metric_path = path / f"metric_{i}_{feature.name}.pkl"
            if metric_path.exists():
                feature.metric.load(metric_path)

        self._fitted = True
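For reference, a minimal usage sketch mirroring the class docstring's example. The books, book1, and book2 variables are assumed to be ebk.db.models.Book instances loaded elsewhere (for example through the library's database session).

from ebk.similarity.core import BookSimilarity

sim = (
    BookSimilarity()
    .content(weight=4.0)    # full text, TF-IDF cosine similarity
    .authors(weight=2.0)    # Jaccard overlap of author sets
    .subjects(weight=1.0)   # Jaccard overlap of subjects/tags
    .temporal(weight=0.5)   # Gaussian decay on publication year
)

sim.fit(books)                          # pre-computes TF-IDF vectors for the corpus
score = sim.similarity(book1, book2)    # weighted average of features, in [0, 1]
top = sim.find_similar(book1, candidates=books, top_k=5)

The same configuration is what the balanced() preset produces, and fitted state can be persisted to a directory with save() and restored later with load().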