ebk 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (61)
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/similarity/extractors.py ADDED
@@ -0,0 +1,168 @@
+ """Concrete extractor implementations."""
+
+ from typing import Optional, Set
+
+ from ebk.db.models import Book
+ from ebk.similarity.base import Extractor
+
+
+ class ContentExtractor(Extractor[str]):
+     """Extracts full text content from a book.
+
+     Uses extracted_text from primary file if available, otherwise combines
+     title and description.
+     """
+
+     def extract(self, book: Book) -> str:
+         """Extract text content from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Full text content as string
+         """
+         # Try to get extracted text from primary file
+         if book.files:
+             for file in book.files:
+                 if file.extracted_text and file.extracted_text.full_text:
+                     return file.extracted_text.full_text
+
+         # Fallback to title + description
+         parts = []
+         if book.title:
+             parts.append(book.title)
+         if book.description:
+             parts.append(book.description)
+
+         return " ".join(parts)
+
+
+ class AuthorsExtractor(Extractor[Set[str]]):
+     """Extracts set of author names from a book."""
+
+     def extract(self, book: Book) -> Set[str]:
+         """Extract author names from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Set of author names (normalized to lowercase)
+         """
+         if not book.authors:
+             return set()
+
+         return {author.name.lower() for author in book.authors}
+
+
+ class SubjectsExtractor(Extractor[Set[str]]):
+     """Extracts set of subjects/tags from a book."""
+
+     def extract(self, book: Book) -> Set[str]:
+         """Extract subjects from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Set of subject names (normalized to lowercase)
+         """
+         if not book.subjects:
+             return set()
+
+         return {subject.name.lower() for subject in book.subjects}
+
+
+ class PublicationYearExtractor(Extractor[Optional[int]]):
+     """Extracts publication year from a book."""
+
+     def extract(self, book: Book) -> Optional[int]:
+         """Extract publication year from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Publication year as int, or None if not available
+         """
+         if not book.publication_date:
+             return None
+
+         # Handle various date formats
+         date_str = str(book.publication_date)
+
+         # Try to extract year
+         if len(date_str) >= 4:
+             try:
+                 return int(date_str[:4])
+             except ValueError:
+                 return None
+
+         return None
+
+
+ class LanguageExtractor(Extractor[Optional[str]]):
+     """Extracts language code from a book."""
+
+     def extract(self, book: Book) -> Optional[str]:
+         """Extract language from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Language code (normalized to lowercase), or None
+         """
+         if not book.language:
+             return None
+
+         return book.language.lower()
+
+
+ class PublisherExtractor(Extractor[Optional[str]]):
+     """Extracts publisher name from a book."""
+
+     def extract(self, book: Book) -> Optional[str]:
+         """Extract publisher from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Publisher name (normalized to lowercase), or None
+         """
+         if not book.publisher:
+             return None
+
+         return book.publisher.lower()
+
+
+ class PageCountExtractor(Extractor[Optional[int]]):
+     """Extracts page count from a book."""
+
+     def extract(self, book: Book) -> Optional[int]:
+         """Extract page count from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Page count, or None if not available
+         """
+         return book.page_count
+
+
+ class DescriptionExtractor(Extractor[str]):
+     """Extracts description/summary from a book."""
+
+     def extract(self, book: Book) -> str:
+         """Extract description from book.
+
+         Args:
+             book: Book to extract from
+
+         Returns:
+             Description text (empty string if not available)
+         """
+         return book.description or ""
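
Taken together, these extractors give the similarity layer one uniform way to pull comparable values out of a `Book`. A minimal sketch of that usage, with a stand-in object instead of the real SQLAlchemy model from `ebk/db/models.py` (the field values below are made up, and it assumes the extractors can be instantiated directly, as the diff suggests):

```python
# Hedged sketch: exercising two of the new extractors on a stand-in record.
# SimpleNamespace substitutes for the SQLAlchemy Book model; real code would
# pass ORM instances loaded from the library database.
from types import SimpleNamespace

from ebk.similarity.extractors import AuthorsExtractor, PublicationYearExtractor

book = SimpleNamespace(
    authors=[SimpleNamespace(name="Donald Knuth")],
    publication_date="1968-07-01",
)

print(AuthorsExtractor().extract(book))          # {'donald knuth'}
print(PublicationYearExtractor().extract(book))  # 1968
```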
ebk/similarity/metrics.py ADDED
@@ -0,0 +1,376 @@
+ """Concrete metric implementations."""
+
+ import math
+ import pickle
+ from pathlib import Path
+ from typing import Dict, Optional, Set
+
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from ebk.similarity.base import Metric
+
+
+ class JaccardMetric(Metric[Set[str]]):
+     """Jaccard similarity for sets.
+
+     Computes |A ∩ B| / |A ∪ B|
+
+     Returns:
+         1.0 if sets are identical
+         0.0 if sets have no overlap
+     """
+
+     def similarity(self, value1: Set[str], value2: Set[str]) -> float:
+         """Compute Jaccard similarity between two sets.
+
+         Args:
+             value1: First set
+             value2: Second set
+
+         Returns:
+             Jaccard similarity in [0, 1]
+         """
+         if not value1 and not value2:
+             return 1.0  # Both empty = identical
+
+         if not value1 or not value2:
+             return 0.0  # One empty = no overlap
+
+         intersection = len(value1 & value2)
+         union = len(value1 | value2)
+
+         if union == 0:
+             return 0.0
+
+         return intersection / union
+
+
+ class ExactMatchMetric(Metric):
+     """Exact match metric for any comparable values.
+
+     Returns:
+         1.0 if values are equal
+         0.0 if values are different
+     """
+
+     def similarity(self, value1, value2) -> float:
+         """Compute exact match similarity.
+
+         Args:
+             value1: First value
+             value2: Second value
+
+         Returns:
+             1.0 if equal, 0.0 otherwise
+         """
+         if value1 is None or value2 is None:
+             return 0.0
+
+         return 1.0 if value1 == value2 else 0.0
+
+
+ class TemporalDecayMetric(Metric[Optional[int]]):
+     """Gaussian decay based on time difference.
+
+     Similarity decays as Gaussian: exp(-((y1 - y2) / sigma)^2)
+
+     Attributes:
+         sigma: Standard deviation for Gaussian (controls decay rate)
+             Default 10 years means ~37% similarity for a 10-year gap
+     """
+
+     def __init__(self, sigma: float = 10.0):
+         """Initialize temporal decay metric.
+
+         Args:
+             sigma: Standard deviation in years (default 10.0)
+         """
+         self.sigma = sigma
+
+     def similarity(self, value1: Optional[int], value2: Optional[int]) -> float:
+         """Compute temporal similarity with Gaussian decay.
+
+         Args:
+             value1: First year
+             value2: Second year
+
+         Returns:
+             Similarity in [0, 1] based on Gaussian decay
+         """
+         if value1 is None or value2 is None:
+             return 0.0
+
+         diff = abs(value1 - value2)
+         return math.exp(-((diff / self.sigma) ** 2))
+
+
+ class NumericProximityMetric(Metric[Optional[int]]):
+     """Similarity based on numeric proximity with normalization.
+
+     Computes: 1 - |v1 - v2| / max_diff
+
+     Useful for page counts, ratings, etc.
+
+     Attributes:
+         max_diff: Maximum expected difference for normalization
+     """
+
+     def __init__(self, max_diff: float):
+         """Initialize numeric proximity metric.
+
+         Args:
+             max_diff: Maximum expected difference (e.g., 1000 pages)
+         """
+         self.max_diff = max_diff
+
+     def similarity(self, value1: Optional[int], value2: Optional[int]) -> float:
+         """Compute numeric proximity similarity.
+
+         Args:
+             value1: First value
+             value2: Second value
+
+         Returns:
+             Similarity in [0, 1] based on proximity
+         """
+         if value1 is None or value2 is None:
+             return 0.0
+
+         diff = abs(value1 - value2)
+         normalized = min(diff / self.max_diff, 1.0)
+         return 1.0 - normalized
+
+
+ class TfidfMetric(Metric[str]):
+     """TF-IDF cosine similarity for text.
+
+     This metric needs fitting to build vocabulary and cache vectors.
+
+     Attributes:
+         max_features: Maximum number of features for TF-IDF (default 5000)
+         min_df: Minimum document frequency (default 2)
+         max_df: Maximum document frequency (default 0.95)
+     """
+
+     def __init__(
+         self,
+         max_features: int = 5000,
+         min_df: int = 2,
+         max_df: float = 0.95,
+     ):
+         """Initialize TF-IDF metric.
+
+         Args:
+             max_features: Maximum number of features (default 5000)
+             min_df: Minimum document frequency (default 2)
+             max_df: Maximum document frequency (default 0.95)
+         """
+         self.max_features = max_features
+         self.min_df = min_df
+         self.max_df = max_df
+
+         self.vectorizer = TfidfVectorizer(
+             max_features=max_features,
+             min_df=min_df,
+             max_df=max_df,
+             stop_words="english",
+         )
+
+         self._vectors: Dict[int, np.ndarray] = {}
+         self._fitted = False
+
+     def fit(self, data: Dict[int, str]) -> None:
+         """Fit vectorizer and cache all vectors.
+
+         This dramatically speeds up similarity computation by pre-computing
+         TF-IDF vectors for all books.
+
+         Args:
+             data: Dictionary mapping book IDs to text content
+         """
+         if not data:
+             return
+
+         # Fit vectorizer on all texts
+         book_ids = list(data.keys())
+         texts = [data[book_id] for book_id in book_ids]
+
+         # Fit and transform
+         vectors = self.vectorizer.fit_transform(texts)
+
+         # Cache sparse vectors by book_id
+         for book_id, vector in zip(book_ids, vectors):
+             self._vectors[book_id] = vector
+
+         self._fitted = True
+
+     def similarity(self, value1: str, value2: str) -> float:
+         """Compute TF-IDF cosine similarity.
+
+         If not fitted, transforms texts on-the-fly (slow).
+         If fitted, uses cached vectors (fast).
+
+         Args:
+             value1: First text
+             value2: Second text
+
+         Returns:
+             Cosine similarity in [0, 1]
+         """
+         if not value1 or not value2:
+             return 0.0
+
+         # Transform texts to vectors
+         if not self._fitted:
+             # Not fitted - transform on the fly (slow path)
+             try:
+                 vectors = self.vectorizer.fit_transform([value1, value2])
+                 v1, v2 = vectors[0], vectors[1]
+             except ValueError:
+                 # Empty vocabulary
+                 return 0.0
+         else:
+             # Fitted - transform using learned vocabulary
+             try:
+                 v1 = self.vectorizer.transform([value1])
+                 v2 = self.vectorizer.transform([value2])
+             except ValueError:
+                 return 0.0
+
+         # Compute cosine similarity
+         sim = cosine_similarity(v1, v2)[0, 0]
+
+         # Ensure [0, 1] range (cosine can be negative for sparse vectors)
+         return max(0.0, min(1.0, sim))
+
+     def similarity_from_cache(self, book1_id: int, book2_id: int) -> float:
+         """Fast similarity using pre-computed vectors.
+
+         Args:
+             book1_id: ID of first book
+             book2_id: ID of second book
+
+         Returns:
+             Cosine similarity in [0, 1]
+
+         Raises:
+             KeyError: If book IDs not in cache (need to call fit() first)
+         """
+         if not self._fitted:
+             raise RuntimeError("Must call fit() before similarity_from_cache()")
+
+         v1 = self._vectors[book1_id]
+         v2 = self._vectors[book2_id]
+
+         sim = cosine_similarity(v1, v2)[0, 0]
+         return max(0.0, min(1.0, sim))
+
+     def save(self, path: Path) -> None:
+         """Save fitted state to disk.
+
+         Args:
+             path: Path to save to (will create .pkl file)
+         """
+         if not self._fitted:
+             raise RuntimeError("Must call fit() before save()")
+
+         state = {
+             "vectorizer": self.vectorizer,
+             "vectors": self._vectors,
+             "max_features": self.max_features,
+             "min_df": self.min_df,
+             "max_df": self.max_df,
+         }
+
+         with open(path, "wb") as f:
+             pickle.dump(state, f)
+
+     def load(self, path: Path) -> None:
+         """Load fitted state from disk.
+
+         Args:
+             path: Path to load from
+         """
+         with open(path, "rb") as f:
+             state = pickle.load(f)
+
+         self.vectorizer = state["vectorizer"]
+         self._vectors = state["vectors"]
+         self.max_features = state["max_features"]
+         self.min_df = state["min_df"]
+         self.max_df = state["max_df"]
+         self._fitted = True
+
+
+ class CosineMetric(Metric[str]):
+     """Simple cosine similarity without TF-IDF weighting.
+
+     Uses CountVectorizer instead of TF-IDF. Faster but less accurate
+     than TfidfMetric.
+     """
+
+     def __init__(self, max_features: int = 5000):
+         """Initialize cosine metric.
+
+         Args:
+             max_features: Maximum number of features (default 5000)
+         """
+         from sklearn.feature_extraction.text import CountVectorizer
+
+         self.max_features = max_features
+         self.vectorizer = CountVectorizer(
+             max_features=max_features,
+             stop_words="english",
+         )
+         self._vectors: Dict[int, np.ndarray] = {}
+         self._fitted = False
+
+     def fit(self, data: Dict[int, str]) -> None:
+         """Fit vectorizer and cache all vectors.
+
+         Args:
+             data: Dictionary mapping book IDs to text content
+         """
+         if not data:
+             return
+
+         book_ids = list(data.keys())
+         texts = [data[book_id] for book_id in book_ids]
+
+         vectors = self.vectorizer.fit_transform(texts)
+
+         for book_id, vector in zip(book_ids, vectors):
+             self._vectors[book_id] = vector
+
+         self._fitted = True
+
+     def similarity(self, value1: str, value2: str) -> float:
+         """Compute cosine similarity.
+
+         Args:
+             value1: First text
+             value2: Second text
+
+         Returns:
+             Cosine similarity in [0, 1]
+         """
+         if not value1 or not value2:
+             return 0.0
+
+         if not self._fitted:
+             try:
+                 vectors = self.vectorizer.fit_transform([value1, value2])
+                 v1, v2 = vectors[0], vectors[1]
+             except ValueError:
+                 return 0.0
+         else:
+             try:
+                 v1 = self.vectorizer.transform([value1])
+                 v2 = self.vectorizer.transform([value2])
+             except ValueError:
+                 return 0.0
+
+         sim = cosine_similarity(v1, v2)[0, 0]
+         return max(0.0, min(1.0, sim))
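
A short sketch of how these metrics behave on toy inputs (the corpus is made up, and `min_df`/`max_df` are relaxed from their defaults so the tiny corpus survives the vectorizer's frequency filters):

```python
# Hedged sketch: toy inputs for the metrics above.
import math

from ebk.similarity.metrics import JaccardMetric, TemporalDecayMetric, TfidfMetric

print(JaccardMetric().similarity({"math", "cs"}, {"cs", "art"}))  # 1/3 ≈ 0.333

# Gaussian decay: exp(-((1968 - 1978) / 10) ** 2) = exp(-1) ≈ 0.368
decay = TemporalDecayMetric(sigma=10.0)
assert math.isclose(decay.similarity(1968, 1978), math.exp(-1))

# TF-IDF with cached vectors; thresholds relaxed for a three-document corpus
tfidf = TfidfMetric(min_df=1, max_df=1.0)
tfidf.fit({
    1: "algorithms and data structures",
    2: "practical data structures",
    3: "a novel about the sea",
})
print(tfidf.similarity_from_cache(1, 2))  # > 0: shares "data", "structures"
print(tfidf.similarity_from_cache(1, 3))  # 0.0: no vocabulary overlap
```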
ebk/vfs/__init__.py ADDED
@@ -0,0 +1,101 @@
+ """Virtual File System for navigating the library database.
+
+ The VFS provides a filesystem-like interface for browsing and interacting
+ with the ebook library. It maps database entities to a hierarchical structure
+ that can be navigated with familiar shell commands.
+
+ Architecture:
+
+ ```
+ /                           # Root (RootNode)
+ ├── books/                  # All books (BooksDirectoryNode)
+ │   ├── 1/                  # Book 1 (BookNode)
+ │   │   ├── title           # Metadata file (TitleFileNode)
+ │   │   ├── authors         # Metadata file (AuthorsFileNode)
+ │   │   ├── description     # Metadata file
+ │   │   ├── text            # Extracted text (TextFileNode)
+ │   │   ├── files/          # Physical files (FilesDirectoryNode)
+ │   │   │   ├── book.pdf
+ │   │   │   └── book.epub
+ │   │   ├── similar/        # Similar books (SimilarDirectoryNode)
+ │   │   ├── annotations/    # User annotations
+ │   │   └── covers/         # Cover images
+ │   └── 2/
+ ├── authors/                # Browse by author (AuthorsDirectoryNode)
+ │   └── knuth-donald/       # Books by this author
+ ├── subjects/               # Browse by subject
+ └── series/                 # Browse by series
+ ```
+
+ Node Types:
+
+ - Node: Base class for all VFS entries
+ - DirectoryNode: Can contain children (cd into them)
+ - FileNode: Leaf nodes with content (cat them)
+ - VirtualNode: Dynamically computed (e.g., /books/, /similar/)
+ - SymlinkNode: Links to other nodes
+
+ Path Resolution:
+
+ The PathResolver handles navigation:
+ - Absolute paths: /books/42/title
+ - Relative paths: ../other, ./files
+ - Special: ., .., ~ (home = /)
+ - Symlink following
+ - Tab completion support
+
+ Usage Example:
+
+ ```python
+ from ebk.library_db import Library
+ from ebk.vfs import LibraryVFS, FileNode
+
+ # Create VFS for a library
+ lib = Library.open("/path/to/library")
+ vfs = LibraryVFS(lib)
+
+ # Navigate
+ root = vfs.root
+ books_dir = vfs.resolver.resolve("/books", root)
+ book_node = vfs.resolver.resolve("/books/42", root)
+
+ # List children
+ children = books_dir.list_children()  # All books
+ for child in children:
+     print(child.name, child.get_info())
+
+ # Read file content
+ title_node = vfs.resolver.resolve("/books/42/title", root)
+ if isinstance(title_node, FileNode):
+     content = title_node.read_content()
+     print(content)
+ ```
+ """
+
+ from ebk.vfs.base import (
+     Node,
+     DirectoryNode,
+     FileNode,
+     VirtualNode,
+     SymlinkNode,
+     NodeType,
+ )
+ from ebk.vfs.resolver import PathResolver, PathError, NotADirectoryError, NotFoundError
+ from ebk.vfs.library_vfs import LibraryVFS
+
+ __all__ = [
+     # Main entry point
+     "LibraryVFS",
+     # Core classes
+     "Node",
+     "DirectoryNode",
+     "FileNode",
+     "VirtualNode",
+     "SymlinkNode",
+     "NodeType",
+     # Path resolution
+     "PathResolver",
+     "PathError",
+     "NotADirectoryError",
+     "NotFoundError",
+ ]
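
The docstring's example only resolves absolute paths; the relative forms it lists (`.`, `..`, `./files`) resolve against whichever node is passed as the starting point. A hedged sketch of that, assuming `resolve(path, start_node)` behaves as the docstring's absolute-path calls imply:

```python
# Hedged sketch: relative-path resolution, assuming resolve(path, start_node)
# works as the docstring's absolute-path calls suggest.
from ebk.library_db import Library
from ebk.vfs import LibraryVFS

lib = Library.open("/path/to/library")
vfs = LibraryVFS(lib)

book = vfs.resolver.resolve("/books/42", vfs.root)
files_dir = vfs.resolver.resolve("./files", book)  # descend into the book's files
parent = vfs.resolver.resolve("..", book)          # climb back up to /books
print(files_dir.name, parent.name)
```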