ebk-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
ebk/services/tag_service.py
@@ -0,0 +1,281 @@
+ """Service for managing hierarchical user tags.
+
+ Tags provide user-defined organization separate from bibliographic subjects.
+ """
+
+ from typing import List, Optional
+ from sqlalchemy.orm import Session
+
+ from ebk.db.models import Tag, Book, book_tags
+
+
+ class TagService:
+     """Service for CRUD operations on hierarchical tags."""
+
+     def __init__(self, session: Session):
+         """Initialize tag service.
+
+         Args:
+             session: SQLAlchemy session
+         """
+         self.session = session
+
+     def get_or_create_tag(self, path: str, description: Optional[str] = None,
+                           color: Optional[str] = None) -> Tag:
+         """Get existing tag or create new one with full hierarchy.
+
+         Args:
+             path: Full tag path (e.g., "Work/Project-2024")
+             description: Optional description
+             color: Optional hex color code
+
+         Returns:
+             Tag instance
+
+         Examples:
+             >>> service.get_or_create_tag("Work/Project-2024")
+             # Creates: "Work" and "Work/Project-2024" if they don't exist
+         """
+         # Check if tag already exists
+         existing = self.session.query(Tag).filter_by(path=path).first()
+         if existing:
+             return existing
+
+         # Parse path into components
+         parts = path.split('/')
+         parent_tag = None
+         current_path = ""
+
+         # Create hierarchy from root to leaf
+         for i, name in enumerate(parts):
+             # Build current path
+             if current_path:
+                 current_path += f"/{name}"
+             else:
+                 current_path = name
+
+             # Check if this level exists
+             tag = self.session.query(Tag).filter_by(path=current_path).first()
+
+             if not tag:
+                 # Create new tag at this level
+                 tag = Tag(
+                     name=name,
+                     path=current_path,
+                     parent_id=parent_tag.id if parent_tag else None
+                 )
+
+                 # Only set description and color on the leaf node
+                 if i == len(parts) - 1:
+                     tag.description = description
+                     tag.color = color
+
+                 self.session.add(tag)
+
+             parent_tag = tag
+
+         self.session.commit()
+         return parent_tag
+
+     def get_tag(self, path: str) -> Optional[Tag]:
+         """Get tag by path.
+
+         Args:
+             path: Full tag path
+
+         Returns:
+             Tag instance or None
+         """
+         return self.session.query(Tag).filter_by(path=path).first()
+
+     def get_all_tags(self) -> List[Tag]:
+         """Get all tags ordered by path.
+
+         Returns:
+             List of all tags
+         """
+         return self.session.query(Tag).order_by(Tag.path).all()
+
+     def get_root_tags(self) -> List[Tag]:
+         """Get top-level tags (no parent).
+
+         Returns:
+             List of root tags
+         """
+         return self.session.query(Tag).filter(Tag.parent_id.is_(None)).order_by(Tag.name).all()
+
+     def get_children(self, tag: Tag) -> List[Tag]:
+         """Get immediate children of a tag.
+
+         Args:
+             tag: Parent tag
+
+         Returns:
+             List of child tags
+         """
+         return self.session.query(Tag).filter_by(parent_id=tag.id).order_by(Tag.name).all()
+
+     def delete_tag(self, path: str, delete_children: bool = False) -> bool:
+         """Delete a tag.
+
+         Args:
+             path: Full tag path
+             delete_children: If True, delete children and all descendants too
+
+         Returns:
+             True if deleted, False if not found
+         """
+         tag = self.get_tag(path)
+         if not tag:
+             return False
+
+         # Check if tag has children
+         children = self.get_children(tag)
+         if children and not delete_children:
+             raise ValueError(f"Tag '{path}' has {len(children)} children. "
+                              "Use delete_children=True to delete them too.")
+
+         # If delete_children=True, explicitly delete all descendants
+         if delete_children:
+             # Find all tags that start with this path + "/"
+             descendants = self.session.query(Tag).filter(
+                 Tag.path.like(f"{path}/%")
+             ).all()
+             for desc in descendants:
+                 self.session.delete(desc)
+
+         self.session.delete(tag)
+         self.session.commit()
+         return True
+
+     def rename_tag(self, old_path: str, new_path: str) -> Tag:
+         """Rename a tag and update all descendant paths.
+
+         Args:
+             old_path: Current tag path
+             new_path: New tag path
+
+         Returns:
+             Updated tag
+
+         Raises:
+             ValueError: If tag doesn't exist or new path already exists
+         """
+         tag = self.get_tag(old_path)
+         if not tag:
+             raise ValueError(f"Tag '{old_path}' not found")
+
+         # Check if new path already exists
+         if self.get_tag(new_path):
+             raise ValueError(f"Tag '{new_path}' already exists")
+
+         # Update this tag
+         old_name = tag.name
+         new_parts = new_path.split('/')
+         tag.name = new_parts[-1]
+         tag.path = new_path
+
+         # Update all descendant paths
+         descendants = self.session.query(Tag).filter(
+             Tag.path.like(f"{old_path}/%")
+         ).all()
+
+         for desc in descendants:
+             # Replace the old path prefix with new path
+             desc.path = desc.path.replace(old_path, new_path, 1)
+
+         self.session.commit()
+         return tag
+
+     def add_tag_to_book(self, book: Book, tag_path: str) -> Tag:
+         """Add a tag to a book (creates tag if it doesn't exist).
+
+         Args:
+             book: Book instance
+             tag_path: Full tag path
+
+         Returns:
+             Tag instance
+         """
+         tag = self.get_or_create_tag(tag_path)
+
+         if tag not in book.tags:
+             book.tags.append(tag)
+             self.session.commit()
+
+         return tag
+
+     def remove_tag_from_book(self, book: Book, tag_path: str) -> bool:
+         """Remove a tag from a book.
+
+         Args:
+             book: Book instance
+             tag_path: Full tag path
+
+         Returns:
+             True if removed, False if book didn't have that tag
+         """
+         tag = self.get_tag(tag_path)
+         if not tag:
+             return False
+
+         if tag in book.tags:
+             book.tags.remove(tag)
+             self.session.commit()
+             return True
+
+         return False
+
+     def get_books_with_tag(self, tag_path: str, include_subtags: bool = False) -> List[Book]:
+         """Get all books with a specific tag.
+
+         Args:
+             tag_path: Full tag path
+             include_subtags: If True, include books from descendant tags
+
+         Returns:
+             List of books
+         """
+         tag = self.get_tag(tag_path)
+         if not tag:
+             return []
+
+         if not include_subtags:
+             return tag.books
+
+         # Get all descendant tags
+         descendant_paths = self.session.query(Tag.id).filter(
+             Tag.path.like(f"{tag_path}/%")
+         ).all()
+
+         all_tag_ids = [tag.id] + [t[0] for t in descendant_paths]
+
+         # Get books with any of these tags
+         books = self.session.query(Book).join(book_tags).filter(
+             book_tags.c.tag_id.in_(all_tag_ids)
+         ).distinct().all()
+
+         return books
+
+     def get_tag_stats(self, tag_path: str) -> dict:
+         """Get statistics for a tag.
+
+         Args:
+             tag_path: Full tag path
+
+         Returns:
+             Dict with stats: {book_count, subtag_count, depth}
+         """
+         tag = self.get_tag(tag_path)
+         if not tag:
+             return {}
+
+         children = self.get_children(tag)
+
+         return {
+             'path': tag.path,
+             'book_count': len(tag.books),
+             'subtag_count': len(children),
+             'depth': tag.depth,
+             'created_at': tag.created_at,
+         }
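For orientation, here is a minimal usage sketch of the TagService API above. The engine and session setup is illustrative only, and it assumes the ebk schema already exists in the database (this release also ships ebk/db/migrations.py, so a real install presumably initializes tables through ebk itself):

# Hypothetical TagService usage sketch; only the TagService calls are taken
# from this diff. Assumes library.db already contains the ebk schema.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from ebk.services.tag_service import TagService

engine = create_engine("sqlite:///library.db")
session = sessionmaker(bind=engine)()

svc = TagService(session)

# One call creates the whole hierarchy: "Work", then "Work/Project-2024".
tag = svc.get_or_create_tag("Work/Project-2024", color="#ff8800")
print(tag.path)                               # Work/Project-2024
print([t.path for t in svc.get_root_tags()])  # ['Work']

# Renames rewrite the tag's path and every descendant path in one commit.
svc.rename_tag("Work/Project-2024", "Work/Project-2025")
print(svc.get_tag("Work/Project-2025") is not None)  # True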
ebk/services/text_extraction.py
@@ -0,0 +1,317 @@
+ """
+ Text extraction service for ebook files.
+
+ Handles extraction from PDF, EPUB, TXT, MD and stores in database with FTS indexing.
+ """
+
+ import re
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+ import logging
+
+ import pypdf
+ import fitz  # PyMuPDF
+ from ebooklib import epub
+ from bs4 import BeautifulSoup
+
+ from ..db.models import File, ExtractedText, TextChunk
+ from sqlalchemy.orm import Session
+ from sqlalchemy import text
+
+ logger = logging.getLogger(__name__)
+
+
+ class TextExtractionService:
+     """Service for extracting and chunking text from ebook files."""
+
+     def __init__(self, library_root: Path):
+         self.library_root = Path(library_root)
+
+     def extract_full_text(self, file: File, session: Session) -> Optional[ExtractedText]:
+         """
+         Extract complete text from ebook file and store in database.
+
+         Args:
+             file: File model instance
+             session: Database session
+
+         Returns:
+             ExtractedText instance or None if extraction failed
+         """
+         file_path = self.library_root / file.path
+
+         if not file_path.exists():
+             logger.error(f"File not found: {file_path}")
+             return None
+
+         try:
+             # Extract based on format
+             if file.format.lower() in ['txt', 'md', 'text']:
+                 text = self._extract_plaintext(file_path)
+             elif file.format.lower() == 'pdf':
+                 text = self._extract_pdf_text(file_path)
+             elif file.format.lower() == 'epub':
+                 text = self._extract_epub_text(file_path)
+             else:
+                 logger.warning(f"Unsupported format for text extraction: {file.format}")
+                 return None
+
+             if not text or len(text.strip()) < 100:
+                 logger.warning(f"Extracted text too short for {file.path}")
+                 return None
+
+             # Store in database
+             extracted = ExtractedText(
+                 file_id=file.id,
+                 content=text,
+                 content_hash=self._hash_text(text)
+             )
+             session.add(extracted)
+
+             # Update file status
+             file.text_extracted = True
+             file.extraction_date = extracted.extracted_at
+
+             # Update FTS index
+             self._update_fts_index(session, file.book_id, text)
+
+             logger.info(f"Extracted {len(text)} chars from {file.path}")
+             return extracted
+
+         except Exception as e:
+             logger.error(f"Error extracting text from {file.path}: {e}")
+             return None
+
+     def create_chunks(self, extracted: ExtractedText, file: File,
+                       session: Session, chunk_size: int = 500,
+                       overlap: int = 100) -> List[TextChunk]:
+         """
+         Split extracted text into overlapping chunks for semantic search.
+
+         Args:
+             extracted: ExtractedText instance
+             file: File instance
+             session: Database session
+             chunk_size: Number of words per chunk
+             overlap: Number of overlapping words between chunks
+
+         Returns:
+             List of TextChunk instances
+         """
+         text = extracted.content
+         words = text.split()
+
+         chunks = []
+         for i in range(0, len(words), chunk_size - overlap):
+             chunk_words = words[i:i + chunk_size]
+             chunk_text = ' '.join(chunk_words)
+
+             if len(chunk_text.strip()) < 50:  # Skip tiny chunks
+                 continue
+
+             chunk = TextChunk(
+                 file_id=file.id,
+                 chunk_index=len(chunks),
+                 content=chunk_text,
+                 has_embedding=False
+             )
+             chunks.append(chunk)
+
+         session.add_all(chunks)
+         logger.info(f"Created {len(chunks)} chunks from {file.path}")
+         return chunks
+
+     def _extract_plaintext(self, file_path: Path) -> str:
+         """Extract text from plain text files."""
+         try:
+             return file_path.read_text(encoding='utf-8')
+         except UnicodeDecodeError:
+             # Try with different encoding
+             return file_path.read_text(encoding='latin-1')
+
+     def _extract_pdf_text(self, file_path: Path) -> str:
+         """
+         Extract text from PDF using PyMuPDF (primary) with pypdf fallback.
+         """
+         try:
+             # Try PyMuPDF first (better quality)
+             doc = fitz.open(str(file_path))
+             text = ""
+             for page in doc:
+                 text += page.get_text()
+             doc.close()
+
+             if text.strip():
+                 return self._clean_text(text)
+
+         except Exception as e:
+             logger.warning(f"PyMuPDF extraction failed: {e}, trying pypdf")
+
+         try:
+             # Fallback to pypdf
+             with open(file_path, 'rb') as f:
+                 reader = pypdf.PdfReader(f)
+                 text = ""
+                 for page in reader.pages:
+                     text += page.extract_text()
+
+             return self._clean_text(text)
+
+         except Exception as e:
+             logger.error(f"PDF text extraction failed: {e}")
+             return ""
+
+     def _extract_epub_text(self, file_path: Path) -> str:
+         """Extract text from EPUB file."""
+         try:
+             book = epub.read_epub(str(file_path))
+             text_parts = []
+
+             for item in book.get_items():
+                 # Handle different ebooklib versions
+                 # Type 9 is ITEM_DOCUMENT in ebooklib
+                 item_type = item.get_type()
+
+                 # Check if this is a document item (HTML/XHTML content)
+                 is_document = False
+                 if hasattr(epub, 'ITEM_DOCUMENT'):
+                     is_document = item_type == epub.ITEM_DOCUMENT
+                 else:
+                     # Fallback: type 9 is document, or check media type
+                     is_document = (item_type == 9 or
+                                    'html' in item.get_name().lower() or
+                                    (hasattr(item, 'media_type') and
+                                     item.media_type and
+                                     'html' in item.media_type.lower()))
+
+                 if is_document:
+                     try:
+                         soup = BeautifulSoup(item.content, 'html.parser')
+
+                         # Remove script and style elements
+                         for script in soup(["script", "style"]):
+                             script.decompose()
+
+                         text = soup.get_text(separator='\n')
+                         text_parts.append(text)
+                     except Exception as e:
+                         logger.debug(f"Failed to extract text from item {item.get_name()}: {e}")
+                         continue
+
+             full_text = '\n\n'.join(text_parts)
+             return self._clean_text(full_text)
+
+         except Exception as e:
+             logger.error(f"EPUB text extraction failed: {e}")
+             return ""
+
+     def _clean_text(self, text: str) -> str:
+         """Clean extracted text."""
+         # Remove excessive whitespace
+         text = re.sub(r'\n\s*\n', '\n\n', text)
+         text = re.sub(r' +', ' ', text)
+
+         # Remove page headers/footers (common patterns)
+         text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
+
+         # Strip leading/trailing whitespace
+         text = text.strip()
+
+         return text
+
+     def _hash_text(self, text: str) -> str:
+         """Generate hash of text content."""
+         import hashlib
+         return hashlib.sha256(text.encode()).hexdigest()
+
+     def _update_fts_index(self, session: Session, book_id: int, extracted_text: str):
+         """
+         Update full-text search index.
+
+         Args:
+             session: Database session
+             book_id: Book ID
+             extracted_text: Extracted text content
+         """
+         try:
+             # Get book title and description for FTS
+             from ..db.models import Book
+             book = session.get(Book, book_id)
+
+             if not book:
+                 return
+
+             # Delete existing FTS entry if exists
+             session.execute(
+                 text("DELETE FROM books_fts WHERE book_id = :book_id"),
+                 {"book_id": book_id}
+             )
+
+             # Insert into FTS table
+             session.execute(
+                 text("""
+                     INSERT INTO books_fts (book_id, title, description, extracted_text)
+                     VALUES (:book_id, :title, :description, :extracted_text)
+                 """),
+                 {
+                     "book_id": book_id,
+                     "title": book.title or '',
+                     "description": book.description or '',
+                     "extracted_text": extracted_text[:50000]  # Limit FTS content to first 50k chars
+                 }
+             )
+
+             logger.info(f"Updated FTS index for book {book_id}")
+
+         except Exception as e:
+             logger.error(f"Error updating FTS index: {e}")
+
+     def extract_page_content(self, file_path: Path, page_number: int) -> Optional[str]:
+         """
+         Extract text from a specific page (PDF only).
+
+         Args:
+             file_path: Path to PDF file
+             page_number: Page number (0-indexed)
+
+         Returns:
+             Page text or None
+         """
+         try:
+             if file_path.suffix.lower() == '.pdf':
+                 doc = fitz.open(str(file_path))
+                 if 0 <= page_number < len(doc):
+                     page_text = doc[page_number].get_text()
+                     doc.close()
+                     return self._clean_text(page_text)
+                 doc.close()
+         except Exception as e:
+             logger.error(f"Error extracting page {page_number}: {e}")
+
+         return None
+
+     def get_word_count(self, text: str) -> int:
+         """Get word count from text."""
+         return len(text.split())
+
+     def extract_and_chunk_all(self, file: File, session: Session,
+                               chunk_size: int = 500) -> Tuple[Optional[ExtractedText], List[TextChunk]]:
+         """
+         Extract full text and create chunks in one operation.
+
+         Args:
+             file: File instance
+             session: Database session
+             chunk_size: Words per chunk
+
+         Returns:
+             Tuple of (ExtractedText, List[TextChunk])
+         """
+         extracted = self.extract_full_text(file, session)
+
+         if not extracted:
+             return None, []
+
+         chunks = self.create_chunks(extracted, file, session, chunk_size)
+
+         return extracted, chunks
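Worth noting: create_chunks advances its window by chunk_size - overlap words rather than chunk_size, so consecutive chunks share overlap words of context. A self-contained sketch of that stride with the defaults (500-word chunks, 100-word overlap), using a stand-in word list rather than real extracted text:

# Stride arithmetic from create_chunks above, shown on 1,200 placeholder words.
words = [f"w{n}" for n in range(1200)]   # stand-in for extracted.content.split()
chunk_size, overlap = 500, 100

starts = list(range(0, len(words), chunk_size - overlap))
print(starts)                            # [0, 400, 800] -> three chunks

for i in starts:
    chunk = words[i:i + chunk_size]
    print(i, len(chunk))                 # 0 500 / 400 500 / 800 400 (tail chunk)

# Overlap check: chunk 2 begins with the last 100 words of chunk 1.
assert words[0:500][-100:] == words[400:900][:100]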
ebk/services/view_service.py
@@ -0,0 +1,12 @@
+ """
+ Views Service - High-level API for managing views.
+
+ Provides CRUD operations and convenience methods for working with views.
+ This is a re-export from ebk.views.service for the services layer.
+ """
+
+ # Re-export ViewService from its original location
+ # This maintains backward compatibility while providing access via services layer
+ from ..views.service import ViewService
+
+ __all__ = ['ViewService']
ebk/similarity/__init__.py
@@ -0,0 +1,77 @@
+ """Book similarity system.
+
+ This module provides a flexible system for computing similarity between books
+ using multiple features (content, metadata, etc.).
+
+ Basic usage:
+     >>> from ebk.similarity import BookSimilarity
+     >>>
+     >>> # Configure similarity
+     >>> sim = BookSimilarity().balanced()
+     >>>
+     >>> # Fit on corpus for performance
+     >>> sim.fit(all_books)
+     >>>
+     >>> # Find similar books
+     >>> similar = sim.find_similar(my_book, all_books, top_k=10)
+
+ Advanced usage:
+     >>> # Custom configuration
+     >>> sim = (BookSimilarity()
+     ...     .content(weight=4.0)
+     ...     .authors(weight=2.0, metric=CustomMetric())
+     ...     .temporal(weight=1.0, sigma=5.0))
+     >>>
+     >>> # Compute similarity matrix for batch processing
+     >>> matrix = sim.similarity_matrix(books)
+     >>>
+     >>> # Save/load fitted state
+     >>> sim.save(Path("cache/similarity"))
+     >>> sim.load(Path("cache/similarity"))
+ """
+
+ from ebk.similarity.base import Extractor, Feature, Metric
+ from ebk.similarity.core import BookSimilarity
+ from ebk.similarity.extractors import (
+     AuthorsExtractor,
+     ContentExtractor,
+     DescriptionExtractor,
+     LanguageExtractor,
+     PageCountExtractor,
+     PublicationYearExtractor,
+     PublisherExtractor,
+     SubjectsExtractor,
+ )
+ from ebk.similarity.metrics import (
+     CosineMetric,
+     ExactMatchMetric,
+     JaccardMetric,
+     NumericProximityMetric,
+     TemporalDecayMetric,
+     TfidfMetric,
+ )
+
+ __all__ = [
+     # Core
+     "BookSimilarity",
+     # Base classes
+     "Extractor",
+     "Metric",
+     "Feature",
+     # Extractors
+     "ContentExtractor",
+     "DescriptionExtractor",
+     "AuthorsExtractor",
+     "SubjectsExtractor",
+     "PublicationYearExtractor",
+     "LanguageExtractor",
+     "PublisherExtractor",
+     "PageCountExtractor",
+     # Metrics
+     "TfidfMetric",
+     "CosineMetric",
+     "JaccardMetric",
+     "ExactMatchMetric",
+     "TemporalDecayMetric",
+     "NumericProximityMetric",
+ ]
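To tie the exports together, here is the module docstring's workflow re-assembled as one script-style sketch. Only calls shown in the docstring above are used; all_books and my_book are placeholders for whatever book records the caller's library provides, whose shape this diff does not specify:

# Sketch assembled from the docstring; all_books / my_book are placeholders.
from pathlib import Path

from ebk.similarity import BookSimilarity

sim = BookSimilarity().balanced()    # preset feature weighting
sim.fit(all_books)                   # fit once on the corpus (e.g. TF-IDF state)

similar = sim.find_similar(my_book, all_books, top_k=10)
matrix = sim.similarity_matrix(all_books)   # pairwise scores for batch jobs

sim.save(Path("cache/similarity"))   # persist fitted state between runs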