ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ebk might be problematic.
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +1097 -9
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +59 -0
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/exports/zip.py +25 -0
- ebk/library_db.py +155 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
- ebk-0.3.2.dist-info/RECORD +69 -0
- ebk-0.3.2.dist-info/entry_points.txt +2 -0
- ebk-0.3.2.dist-info/top_level.txt +1 -0
- ebk-0.3.1.dist-info/RECORD +0 -19
- ebk-0.3.1.dist-info/entry_points.txt +0 -6
- ebk-0.3.1.dist-info/top_level.txt +0 -2
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/similarity/extractors.py
ADDED

@@ -0,0 +1,168 @@
"""Concrete extractor implementations."""

from typing import Optional, Set

from ebk.db.models import Book
from ebk.similarity.base import Extractor


class ContentExtractor(Extractor[str]):
    """Extracts full text content from a book.

    Uses extracted_text from primary file if available, otherwise combines
    title and description.
    """

    def extract(self, book: Book) -> str:
        """Extract text content from book.

        Args:
            book: Book to extract from

        Returns:
            Full text content as string
        """
        # Try to get extracted text from primary file
        if book.files:
            for file in book.files:
                if file.extracted_text and file.extracted_text.full_text:
                    return file.extracted_text.full_text

        # Fallback to title + description
        parts = []
        if book.title:
            parts.append(book.title)
        if book.description:
            parts.append(book.description)

        return " ".join(parts)


class AuthorsExtractor(Extractor[Set[str]]):
    """Extracts set of author names from a book."""

    def extract(self, book: Book) -> Set[str]:
        """Extract author names from book.

        Args:
            book: Book to extract from

        Returns:
            Set of author names (normalized to lowercase)
        """
        if not book.authors:
            return set()

        return {author.name.lower() for author in book.authors}


class SubjectsExtractor(Extractor[Set[str]]):
    """Extracts set of subjects/tags from a book."""

    def extract(self, book: Book) -> Set[str]:
        """Extract subjects from book.

        Args:
            book: Book to extract from

        Returns:
            Set of subject names (normalized to lowercase)
        """
        if not book.subjects:
            return set()

        return {subject.name.lower() for subject in book.subjects}


class PublicationYearExtractor(Extractor[Optional[int]]):
    """Extracts publication year from a book."""

    def extract(self, book: Book) -> Optional[int]:
        """Extract publication year from book.

        Args:
            book: Book to extract from

        Returns:
            Publication year as int, or None if not available
        """
        if not book.publication_date:
            return None

        # Handle various date formats
        date_str = str(book.publication_date)

        # Try to extract year
        if len(date_str) >= 4:
            try:
                return int(date_str[:4])
            except ValueError:
                return None

        return None


class LanguageExtractor(Extractor[Optional[str]]):
    """Extracts language code from a book."""

    def extract(self, book: Book) -> Optional[str]:
        """Extract language from book.

        Args:
            book: Book to extract from

        Returns:
            Language code (normalized to lowercase), or None
        """
        if not book.language:
            return None

        return book.language.lower()


class PublisherExtractor(Extractor[Optional[str]]):
    """Extracts publisher name from a book."""

    def extract(self, book: Book) -> Optional[str]:
        """Extract publisher from book.

        Args:
            book: Book to extract from

        Returns:
            Publisher name (normalized to lowercase), or None
        """
        if not book.publisher:
            return None

        return book.publisher.lower()


class PageCountExtractor(Extractor[Optional[int]]):
    """Extracts page count from a book."""

    def extract(self, book: Book) -> Optional[int]:
        """Extract page count from book.

        Args:
            book: Book to extract from

        Returns:
            Page count, or None if not available
        """
        return book.page_count


class DescriptionExtractor(Extractor[str]):
    """Extracts description/summary from a book."""

    def extract(self, book: Book) -> str:
        """Extract description from book.

        Args:
            book: Book to extract from

        Returns:
            Description text (empty string if not available)
        """
        return book.description or ""
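All of the extractors above follow one pattern: subclass `Extractor[T]` and implement `extract(book) -> T`. As a hedged illustration of that extension point, here is a minimal sketch of a custom extractor in the same style. The `series` relationship on `Book` is assumed purely for illustration and is not confirmed by this diff.

```python
from typing import Optional

from ebk.db.models import Book
from ebk.similarity.base import Extractor


class SeriesExtractor(Extractor[Optional[str]]):
    """Hypothetical: extracts a series name from a book.

    Assumes Book exposes a `series` relationship with a `.name`
    attribute, which this diff does not confirm.
    """

    def extract(self, book: Book) -> Optional[str]:
        series = getattr(book, "series", None)  # hypothetical attribute
        if not series:
            return None
        # Follow the lowercase normalization used by the shipped extractors
        return series.name.lower()
```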
ebk/similarity/metrics.py
ADDED

@@ -0,0 +1,376 @@
"""Concrete metric implementations."""

import math
import pickle
from pathlib import Path
from typing import Dict, Optional, Set

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from ebk.similarity.base import Metric


class JaccardMetric(Metric[Set[str]]):
    """Jaccard similarity for sets.

    Computes |A ∩ B| / |A ∪ B|

    Returns:
        1.0 if sets are identical
        0.0 if sets have no overlap
    """

    def similarity(self, value1: Set[str], value2: Set[str]) -> float:
        """Compute Jaccard similarity between two sets.

        Args:
            value1: First set
            value2: Second set

        Returns:
            Jaccard similarity in [0, 1]
        """
        if not value1 and not value2:
            return 1.0  # Both empty = identical

        if not value1 or not value2:
            return 0.0  # One empty = no overlap

        intersection = len(value1 & value2)
        union = len(value1 | value2)

        if union == 0:
            return 0.0

        return intersection / union


class ExactMatchMetric(Metric):
    """Exact match metric for any comparable values.

    Returns:
        1.0 if values are equal
        0.0 if values are different
    """

    def similarity(self, value1, value2) -> float:
        """Compute exact match similarity.

        Args:
            value1: First value
            value2: Second value

        Returns:
            1.0 if equal, 0.0 otherwise
        """
        if value1 is None or value2 is None:
            return 0.0

        return 1.0 if value1 == value2 else 0.0


class TemporalDecayMetric(Metric[Optional[int]]):
    """Gaussian decay based on time difference.

    Similarity decays as Gaussian: exp(-((y1 - y2) / sigma)^2)

    Attributes:
        sigma: Standard deviation for Gaussian (controls decay rate)
            Default 10 years means ~37% similarity for a 10-year gap
    """

    def __init__(self, sigma: float = 10.0):
        """Initialize temporal decay metric.

        Args:
            sigma: Standard deviation in years (default 10.0)
        """
        self.sigma = sigma

    def similarity(self, value1: Optional[int], value2: Optional[int]) -> float:
        """Compute temporal similarity with Gaussian decay.

        Args:
            value1: First year
            value2: Second year

        Returns:
            Similarity in [0, 1] based on Gaussian decay
        """
        if value1 is None or value2 is None:
            return 0.0

        diff = abs(value1 - value2)
        return math.exp(-((diff / self.sigma) ** 2))


class NumericProximityMetric(Metric[Optional[int]]):
    """Similarity based on numeric proximity with normalization.

    Computes: 1 - |v1 - v2| / max_diff

    Useful for page counts, ratings, etc.

    Attributes:
        max_diff: Maximum expected difference for normalization
    """

    def __init__(self, max_diff: float):
        """Initialize numeric proximity metric.

        Args:
            max_diff: Maximum expected difference (e.g., 1000 pages)
        """
        self.max_diff = max_diff

    def similarity(self, value1: Optional[int], value2: Optional[int]) -> float:
        """Compute numeric proximity similarity.

        Args:
            value1: First value
            value2: Second value

        Returns:
            Similarity in [0, 1] based on proximity
        """
        if value1 is None or value2 is None:
            return 0.0

        diff = abs(value1 - value2)
        normalized = min(diff / self.max_diff, 1.0)
        return 1.0 - normalized


class TfidfMetric(Metric[str]):
    """TF-IDF cosine similarity for text.

    This metric needs fitting to build vocabulary and cache vectors.

    Attributes:
        max_features: Maximum number of features for TF-IDF (default 5000)
        min_df: Minimum document frequency (default 2)
        max_df: Maximum document frequency (default 0.95)
    """

    def __init__(
        self,
        max_features: int = 5000,
        min_df: int = 2,
        max_df: float = 0.95,
    ):
        """Initialize TF-IDF metric.

        Args:
            max_features: Maximum number of features (default 5000)
            min_df: Minimum document frequency (default 2)
            max_df: Maximum document frequency (default 0.95)
        """
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df

        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            stop_words="english",
        )

        self._vectors: Dict[int, np.ndarray] = {}
        self._fitted = False

    def fit(self, data: Dict[int, str]) -> None:
        """Fit vectorizer and cache all vectors.

        This dramatically speeds up similarity computation by pre-computing
        TF-IDF vectors for all books.

        Args:
            data: Dictionary mapping book IDs to text content
        """
        if not data:
            return

        # Fit vectorizer on all texts
        book_ids = list(data.keys())
        texts = [data[book_id] for book_id in book_ids]

        # Fit and transform
        vectors = self.vectorizer.fit_transform(texts)

        # Cache sparse vectors by book_id
        for book_id, vector in zip(book_ids, vectors):
            self._vectors[book_id] = vector

        self._fitted = True

    def similarity(self, value1: str, value2: str) -> float:
        """Compute TF-IDF cosine similarity.

        If not fitted, transforms texts on-the-fly (slow).
        If fitted, uses cached vectors (fast).

        Args:
            value1: First text
            value2: Second text

        Returns:
            Cosine similarity in [0, 1]
        """
        if not value1 or not value2:
            return 0.0

        # Transform texts to vectors
        if not self._fitted:
            # Not fitted - transform on the fly (slow path)
            try:
                vectors = self.vectorizer.fit_transform([value1, value2])
                v1, v2 = vectors[0], vectors[1]
            except ValueError:
                # Empty vocabulary
                return 0.0
        else:
            # Fitted - transform using learned vocabulary
            try:
                v1 = self.vectorizer.transform([value1])
                v2 = self.vectorizer.transform([value2])
            except ValueError:
                return 0.0

        # Compute cosine similarity
        sim = cosine_similarity(v1, v2)[0, 0]

        # Ensure [0, 1] range (cosine can be negative for sparse vectors)
        return max(0.0, min(1.0, sim))

    def similarity_from_cache(self, book1_id: int, book2_id: int) -> float:
        """Fast similarity using pre-computed vectors.

        Args:
            book1_id: ID of first book
            book2_id: ID of second book

        Returns:
            Cosine similarity in [0, 1]

        Raises:
            RuntimeError: If the metric has not been fitted yet
            KeyError: If book IDs not in cache (need to call fit() first)
        """
        if not self._fitted:
            raise RuntimeError("Must call fit() before similarity_from_cache()")

        v1 = self._vectors[book1_id]
        v2 = self._vectors[book2_id]

        sim = cosine_similarity(v1, v2)[0, 0]
        return max(0.0, min(1.0, sim))

    def save(self, path: Path) -> None:
        """Save fitted state to disk.

        Args:
            path: Path to save to (will create .pkl file)
        """
        if not self._fitted:
            raise RuntimeError("Must call fit() before save()")

        state = {
            "vectorizer": self.vectorizer,
            "vectors": self._vectors,
            "max_features": self.max_features,
            "min_df": self.min_df,
            "max_df": self.max_df,
        }

        with open(path, "wb") as f:
            pickle.dump(state, f)

    def load(self, path: Path) -> None:
        """Load fitted state from disk.

        Args:
            path: Path to load from
        """
        with open(path, "rb") as f:
            state = pickle.load(f)

        self.vectorizer = state["vectorizer"]
        self._vectors = state["vectors"]
        self.max_features = state["max_features"]
        self.min_df = state["min_df"]
        self.max_df = state["max_df"]
        self._fitted = True


class CosineMetric(Metric[str]):
    """Simple cosine similarity without TF-IDF weighting.

    Uses CountVectorizer instead of TF-IDF. Faster but less accurate
    than TfidfMetric.
    """

    def __init__(self, max_features: int = 5000):
        """Initialize cosine metric.

        Args:
            max_features: Maximum number of features (default 5000)
        """
        from sklearn.feature_extraction.text import CountVectorizer

        self.max_features = max_features
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            stop_words="english",
        )
        self._vectors: Dict[int, np.ndarray] = {}
        self._fitted = False

    def fit(self, data: Dict[int, str]) -> None:
        """Fit vectorizer and cache all vectors.

        Args:
            data: Dictionary mapping book IDs to text content
        """
        if not data:
            return

        book_ids = list(data.keys())
        texts = [data[book_id] for book_id in book_ids]

        vectors = self.vectorizer.fit_transform(texts)

        for book_id, vector in zip(book_ids, vectors):
            self._vectors[book_id] = vector

        self._fitted = True

    def similarity(self, value1: str, value2: str) -> float:
        """Compute cosine similarity.

        Args:
            value1: First text
            value2: Second text

        Returns:
            Cosine similarity in [0, 1]
        """
        if not value1 or not value2:
            return 0.0

        if not self._fitted:
            try:
                vectors = self.vectorizer.fit_transform([value1, value2])
                v1, v2 = vectors[0], vectors[1]
            except ValueError:
                return 0.0
        else:
            try:
                v1 = self.vectorizer.transform([value1])
                v2 = self.vectorizer.transform([value2])
            except ValueError:
                return 0.0

        sim = cosine_similarity(v1, v2)[0, 0]
        return max(0.0, min(1.0, sim))
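To make the behavior of these metrics concrete, here is a minimal usage sketch, assuming only what the hunk above shows. The sample sets, years, texts, and book IDs are invented for illustration; `min_df=1` is passed so the toy corpus is not pruned away by the default `min_df=2`.

```python
from ebk.similarity.metrics import (
    JaccardMetric,
    NumericProximityMetric,
    TemporalDecayMetric,
    TfidfMetric,
)

# Standalone metrics need no fitting.
jaccard = JaccardMetric()
print(jaccard.similarity({"ai", "ml"}, {"ml", "stats"}))  # 1/3 ≈ 0.333

decay = TemporalDecayMetric(sigma=10.0)
print(decay.similarity(1990, 2000))                       # exp(-1) ≈ 0.368

pages = NumericProximityMetric(max_diff=1000)
print(pages.similarity(300, 500))                         # 1 - 200/1000 = 0.8

# TfidfMetric is corpus-backed: fit once over all texts, then use the
# cached vectors for fast pairwise lookups by book ID.
corpus = {
    1: "an introduction to machine learning and statistics",
    2: "machine learning with python for practitioners",
    3: "a field guide to birds of north america",
}
tfidf = TfidfMetric(min_df=1)  # relax min_df for this tiny toy corpus
tfidf.fit(corpus)
print(tfidf.similarity_from_cache(1, 2))  # shared terms -> relatively high
print(tfidf.similarity_from_cache(1, 3))  # no shared terms -> 0.0
```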
ebk/vfs/__init__.py
ADDED

@@ -0,0 +1,101 @@
"""Virtual File System for navigating the library database.

The VFS provides a filesystem-like interface for browsing and interacting
with the ebook library. It maps database entities to a hierarchical structure
that can be navigated with familiar shell commands.

Architecture:

```
/                        # Root (RootNode)
├── books/               # All books (BooksDirectoryNode)
│   ├── 1/               # Book 1 (BookNode)
│   │   ├── title        # Metadata file (TitleFileNode)
│   │   ├── authors      # Metadata file (AuthorsFileNode)
│   │   ├── description  # Metadata file
│   │   ├── text         # Extracted text (TextFileNode)
│   │   ├── files/       # Physical files (FilesDirectoryNode)
│   │   │   ├── book.pdf
│   │   │   └── book.epub
│   │   ├── similar/     # Similar books (SimilarDirectoryNode)
│   │   ├── annotations/ # User annotations
│   │   └── covers/      # Cover images
│   └── 2/
├── authors/             # Browse by author (AuthorsDirectoryNode)
│   └── knuth-donald/    # Books by this author
├── subjects/            # Browse by subject
└── series/              # Browse by series
```

Node Types:

- Node: Base class for all VFS entries
- DirectoryNode: Can contain children (cd into them)
- FileNode: Leaf nodes with content (cat them)
- VirtualNode: Dynamically computed (e.g., /books/, /similar/)
- SymlinkNode: Links to other nodes

Path Resolution:

The PathResolver handles navigation:
- Absolute paths: /books/42/title
- Relative paths: ../other, ./files
- Special: ., .., ~ (home = /)
- Symlink following
- Tab completion support

Usage Example:

```python
from ebk.library_db import Library
from ebk.vfs import LibraryVFS

# Create VFS for a library
lib = Library.open("/path/to/library")
vfs = LibraryVFS(lib)

# Navigate
root = vfs.root
books_dir = vfs.resolver.resolve("/books", root)
book_node = vfs.resolver.resolve("/books/42", root)

# List children
children = books_dir.list_children()  # All books
for child in children:
    print(child.name, child.get_info())

# Read file content
title_node = vfs.resolver.resolve("/books/42/title", root)
if isinstance(title_node, FileNode):
    content = title_node.read_content()
    print(content)
```
"""

from ebk.vfs.base import (
    Node,
    DirectoryNode,
    FileNode,
    VirtualNode,
    SymlinkNode,
    NodeType,
)
from ebk.vfs.resolver import PathResolver, PathError, NotADirectoryError, NotFoundError
from ebk.vfs.library_vfs import LibraryVFS

__all__ = [
    # Main entry point
    "LibraryVFS",
    # Core classes
    "Node",
    "DirectoryNode",
    "FileNode",
    "VirtualNode",
    "SymlinkNode",
    "NodeType",
    # Path resolution
    "PathResolver",
    "PathError",
    "NotADirectoryError",
    "NotFoundError",
]
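Building on the docstring's own usage example, here is a hedged sketch of defensive path resolution with the exported error types. Whether `resolve()` raises `NotFoundError` for a missing entry is an assumption inferred from the exports, not confirmed by this diff, and the library path is a placeholder.

```python
from ebk.library_db import Library
from ebk.vfs import LibraryVFS, NotFoundError, PathError

lib = Library.open("/path/to/library")  # placeholder path
vfs = LibraryVFS(lib)

try:
    # Resolve a metadata file for a book that may not exist
    node = vfs.resolver.resolve("/books/9999/title", vfs.root)
    print(node.read_content())
except NotFoundError:
    # Assumption: the resolver raises NotFoundError for missing entries
    print("no such book")
except PathError as exc:
    # PathError is exported as the broader path-resolution failure type
    print(f"bad path: {exc}")
```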