ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ebk might be problematic.
Files changed (61)
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/services/tag_service.py
@@ -0,0 +1,282 @@
+"""Service for managing hierarchical user tags.
+
+Tags provide user-defined organization separate from bibliographic subjects.
+"""
+
+from typing import List, Optional
+from sqlalchemy.orm import Session
+from datetime import datetime
+
+from ebk.db.models import Tag, Book, book_tags
+
+
+class TagService:
+    """Service for CRUD operations on hierarchical tags."""
+
+    def __init__(self, session: Session):
+        """Initialize tag service.
+
+        Args:
+            session: SQLAlchemy session
+        """
+        self.session = session
+
+    def get_or_create_tag(self, path: str, description: Optional[str] = None,
+                          color: Optional[str] = None) -> Tag:
+        """Get existing tag or create new one with full hierarchy.
+
+        Args:
+            path: Full tag path (e.g., "Work/Project-2024")
+            description: Optional description
+            color: Optional hex color code
+
+        Returns:
+            Tag instance
+
+        Examples:
+            >>> service.get_or_create_tag("Work/Project-2024")
+            # Creates: "Work" and "Work/Project-2024" if they don't exist
+        """
+        # Check if tag already exists
+        existing = self.session.query(Tag).filter_by(path=path).first()
+        if existing:
+            return existing
+
+        # Parse path into components
+        parts = path.split('/')
+        parent_tag = None
+        current_path = ""
+
+        # Create hierarchy from root to leaf
+        for i, name in enumerate(parts):
+            # Build current path
+            if current_path:
+                current_path += f"/{name}"
+            else:
+                current_path = name
+
+            # Check if this level exists
+            tag = self.session.query(Tag).filter_by(path=current_path).first()
+
+            if not tag:
+                # Create new tag at this level
+                tag = Tag(
+                    name=name,
+                    path=current_path,
+                    parent_id=parent_tag.id if parent_tag else None
+                )
+
+                # Only set description and color on the leaf node
+                if i == len(parts) - 1:
+                    tag.description = description
+                    tag.color = color
+
+                self.session.add(tag)
+
+            parent_tag = tag
+
+        self.session.commit()
+        return parent_tag
+
+    def get_tag(self, path: str) -> Optional[Tag]:
+        """Get tag by path.
+
+        Args:
+            path: Full tag path
+
+        Returns:
+            Tag instance or None
+        """
+        return self.session.query(Tag).filter_by(path=path).first()
+
+    def get_all_tags(self) -> List[Tag]:
+        """Get all tags ordered by path.
+
+        Returns:
+            List of all tags
+        """
+        return self.session.query(Tag).order_by(Tag.path).all()
+
+    def get_root_tags(self) -> List[Tag]:
+        """Get top-level tags (no parent).
+
+        Returns:
+            List of root tags
+        """
+        return self.session.query(Tag).filter(Tag.parent_id.is_(None)).order_by(Tag.name).all()
+
+    def get_children(self, tag: Tag) -> List[Tag]:
+        """Get immediate children of a tag.
+
+        Args:
+            tag: Parent tag
+
+        Returns:
+            List of child tags
+        """
+        return self.session.query(Tag).filter_by(parent_id=tag.id).order_by(Tag.name).all()
+
+    def delete_tag(self, path: str, delete_children: bool = False) -> bool:
+        """Delete a tag.
+
+        Args:
+            path: Full tag path
+            delete_children: If True, delete children and all descendants too
+
+        Returns:
+            True if deleted, False if not found
+        """
+        tag = self.get_tag(path)
+        if not tag:
+            return False
+
+        # Check if tag has children
+        children = self.get_children(tag)
+        if children and not delete_children:
+            raise ValueError(f"Tag '{path}' has {len(children)} children. "
+                             "Use delete_children=True to delete them too.")
+
+        # If delete_children=True, explicitly delete all descendants
+        if delete_children:
+            # Find all tags that start with this path + "/"
+            descendants = self.session.query(Tag).filter(
+                Tag.path.like(f"{path}/%")
+            ).all()
+            for desc in descendants:
+                self.session.delete(desc)
+
+        self.session.delete(tag)
+        self.session.commit()
+        return True
+
+    def rename_tag(self, old_path: str, new_path: str) -> Tag:
+        """Rename a tag and update all descendant paths.
+
+        Args:
+            old_path: Current tag path
+            new_path: New tag path
+
+        Returns:
+            Updated tag
+
+        Raises:
+            ValueError: If tag doesn't exist or new path already exists
+        """
+        tag = self.get_tag(old_path)
+        if not tag:
+            raise ValueError(f"Tag '{old_path}' not found")
+
+        # Check if new path already exists
+        if self.get_tag(new_path):
+            raise ValueError(f"Tag '{new_path}' already exists")
+
+        # Update this tag
+        old_name = tag.name
+        new_parts = new_path.split('/')
+        tag.name = new_parts[-1]
+        tag.path = new_path
+
+        # Update all descendant paths
+        descendants = self.session.query(Tag).filter(
+            Tag.path.like(f"{old_path}/%")
+        ).all()
+
+        for desc in descendants:
+            # Replace the old path prefix with new path
+            desc.path = desc.path.replace(old_path, new_path, 1)
+
+        self.session.commit()
+        return tag
+
+    def add_tag_to_book(self, book: Book, tag_path: str) -> Tag:
+        """Add a tag to a book (creates tag if it doesn't exist).
+
+        Args:
+            book: Book instance
+            tag_path: Full tag path
+
+        Returns:
+            Tag instance
+        """
+        tag = self.get_or_create_tag(tag_path)
+
+        if tag not in book.tags:
+            book.tags.append(tag)
+            self.session.commit()
+
+        return tag
+
+    def remove_tag_from_book(self, book: Book, tag_path: str) -> bool:
+        """Remove a tag from a book.
+
+        Args:
+            book: Book instance
+            tag_path: Full tag path
+
+        Returns:
+            True if removed, False if book didn't have that tag
+        """
+        tag = self.get_tag(tag_path)
+        if not tag:
+            return False
+
+        if tag in book.tags:
+            book.tags.remove(tag)
+            self.session.commit()
+            return True
+
+        return False
+
+    def get_books_with_tag(self, tag_path: str, include_subtags: bool = False) -> List[Book]:
+        """Get all books with a specific tag.
+
+        Args:
+            tag_path: Full tag path
+            include_subtags: If True, include books from descendant tags
+
+        Returns:
+            List of books
+        """
+        tag = self.get_tag(tag_path)
+        if not tag:
+            return []
+
+        if not include_subtags:
+            return tag.books
+
+        # Get all descendant tags
+        descendant_paths = self.session.query(Tag.id).filter(
+            Tag.path.like(f"{tag_path}/%")
+        ).all()
+
+        all_tag_ids = [tag.id] + [t[0] for t in descendant_paths]
+
+        # Get books with any of these tags
+        books = self.session.query(Book).join(book_tags).filter(
+            book_tags.c.tag_id.in_(all_tag_ids)
+        ).distinct().all()
+
+        return books
+
+    def get_tag_stats(self, tag_path: str) -> dict:
+        """Get statistics for a tag.
+
+        Args:
+            tag_path: Full tag path
+
+        Returns:
+            Dict with stats: {path, book_count, subtag_count, depth, created_at}
+        """
+        tag = self.get_tag(tag_path)
+        if not tag:
+            return {}
+
+        children = self.get_children(tag)
+
+        return {
+            'path': tag.path,
+            'book_count': len(tag.books),
+            'subtag_count': len(children),
+            'depth': tag.depth,
+            'created_at': tag.created_at,
+        }
ebk/services/text_extraction.py
@@ -0,0 +1,317 @@
+"""
+Text extraction service for ebook files.
+
+Handles extraction from PDF, EPUB, TXT, MD and stores in database with FTS indexing.
+"""
+
+import re
+from pathlib import Path
+from typing import List, Optional, Tuple
+import logging
+
+import pypdf
+import fitz  # PyMuPDF
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+from ..db.models import File, ExtractedText, TextChunk
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+
+logger = logging.getLogger(__name__)
+
+
+class TextExtractionService:
+    """Service for extracting and chunking text from ebook files."""
+
+    def __init__(self, library_root: Path):
+        self.library_root = Path(library_root)
+
+    def extract_full_text(self, file: File, session: Session) -> Optional[ExtractedText]:
+        """
+        Extract complete text from ebook file and store in database.
+
+        Args:
+            file: File model instance
+            session: Database session
+
+        Returns:
+            ExtractedText instance or None if extraction failed
+        """
+        file_path = self.library_root / file.path
+
+        if not file_path.exists():
+            logger.error(f"File not found: {file_path}")
+            return None
+
+        try:
+            # Extract based on format
+            if file.format.lower() in ['txt', 'md', 'text']:
+                text = self._extract_plaintext(file_path)
+            elif file.format.lower() == 'pdf':
+                text = self._extract_pdf_text(file_path)
+            elif file.format.lower() == 'epub':
+                text = self._extract_epub_text(file_path)
+            else:
+                logger.warning(f"Unsupported format for text extraction: {file.format}")
+                return None
+
+            if not text or len(text.strip()) < 100:
+                logger.warning(f"Extracted text too short for {file.path}")
+                return None
+
+            # Store in database
+            extracted = ExtractedText(
+                file_id=file.id,
+                content=text,
+                content_hash=self._hash_text(text)
+            )
+            session.add(extracted)
+
+            # Update file status
+            file.text_extracted = True
+            file.extraction_date = extracted.extracted_at
+
+            # Update FTS index
+            self._update_fts_index(session, file.book_id, text)
+
+            logger.info(f"Extracted {len(text)} chars from {file.path}")
+            return extracted
+
+        except Exception as e:
+            logger.error(f"Error extracting text from {file.path}: {e}")
+            return None
+
+    def create_chunks(self, extracted: ExtractedText, file: File,
+                      session: Session, chunk_size: int = 500,
+                      overlap: int = 100) -> List[TextChunk]:
+        """
+        Split extracted text into overlapping chunks for semantic search.
+
+        Args:
+            extracted: ExtractedText instance
+            file: File instance
+            session: Database session
+            chunk_size: Number of words per chunk
+            overlap: Number of overlapping words between chunks
+
+        Returns:
+            List of TextChunk instances
+        """
+        text = extracted.content
+        words = text.split()
+
+        chunks = []
+        for i in range(0, len(words), chunk_size - overlap):
+            chunk_words = words[i:i + chunk_size]
+            chunk_text = ' '.join(chunk_words)
+
+            if len(chunk_text.strip()) < 50:  # Skip tiny chunks
+                continue
+
+            chunk = TextChunk(
+                file_id=file.id,
+                chunk_index=len(chunks),
+                content=chunk_text,
+                has_embedding=False
+            )
+            chunks.append(chunk)
+
+        session.add_all(chunks)
+        logger.info(f"Created {len(chunks)} chunks from {file.path}")
+        return chunks
+
+    def _extract_plaintext(self, file_path: Path) -> str:
+        """Extract text from plain text files."""
+        try:
+            return file_path.read_text(encoding='utf-8')
+        except UnicodeDecodeError:
+            # Try with different encoding
+            return file_path.read_text(encoding='latin-1')
+
+    def _extract_pdf_text(self, file_path: Path) -> str:
+        """
+        Extract text from PDF using PyMuPDF (primary) with pypdf fallback.
+        """
+        try:
+            # Try PyMuPDF first (better quality)
+            doc = fitz.open(str(file_path))
+            text = ""
+            for page in doc:
+                text += page.get_text()
+            doc.close()
+
+            if text.strip():
+                return self._clean_text(text)
+
+        except Exception as e:
+            logger.warning(f"PyMuPDF extraction failed: {e}, trying pypdf")
+
+        try:
+            # Fallback to pypdf
+            with open(file_path, 'rb') as f:
+                reader = pypdf.PdfReader(f)
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text()
+
+            return self._clean_text(text)
+
+        except Exception as e:
+            logger.error(f"PDF text extraction failed: {e}")
+            return ""
+
+    def _extract_epub_text(self, file_path: Path) -> str:
+        """Extract text from EPUB file."""
+        try:
+            book = epub.read_epub(str(file_path))
+            text_parts = []
+
+            for item in book.get_items():
+                # Handle different ebooklib versions
+                # Type 9 is ITEM_DOCUMENT in ebooklib
+                item_type = item.get_type()
+
+                # Check if this is a document item (HTML/XHTML content)
+                is_document = False
+                if hasattr(epub, 'ITEM_DOCUMENT'):
+                    is_document = item_type == epub.ITEM_DOCUMENT
+                else:
+                    # Fallback: type 9 is document, or check media type
+                    is_document = (item_type == 9 or
+                                   'html' in item.get_name().lower() or
+                                   (hasattr(item, 'media_type') and
+                                    item.media_type and
+                                    'html' in item.media_type.lower()))
+
+                if is_document:
+                    try:
+                        soup = BeautifulSoup(item.content, 'html.parser')
+
+                        # Remove script and style elements
+                        for script in soup(["script", "style"]):
+                            script.decompose()
+
+                        text = soup.get_text(separator='\n')
+                        text_parts.append(text)
+                    except Exception as e:
+                        logger.debug(f"Failed to extract text from item {item.get_name()}: {e}")
+                        continue
+
+            full_text = '\n\n'.join(text_parts)
+            return self._clean_text(full_text)
+
+        except Exception as e:
+            logger.error(f"EPUB text extraction failed: {e}")
+            return ""
+
+    def _clean_text(self, text: str) -> str:
+        """Clean extracted text."""
+        # Remove excessive whitespace
+        text = re.sub(r'\n\s*\n', '\n\n', text)
+        text = re.sub(r' +', ' ', text)
+
+        # Remove page headers/footers (common patterns)
+        text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
+
+        # Strip leading/trailing whitespace
+        text = text.strip()
+
+        return text
+
+    def _hash_text(self, text: str) -> str:
+        """Generate hash of text content."""
+        import hashlib
+        return hashlib.sha256(text.encode()).hexdigest()
+
+    def _update_fts_index(self, session: Session, book_id: int, extracted_text: str):
+        """
+        Update full-text search index.
+
+        Args:
+            session: Database session
+            book_id: Book ID
+            extracted_text: Extracted text content
+        """
+        try:
+            # Get book title and description for FTS
+            from ..db.models import Book
+            book = session.query(Book).get(book_id)
+
+            if not book:
+                return
+
+            # Delete existing FTS entry if exists
+            session.execute(
+                text("DELETE FROM books_fts WHERE book_id = :book_id"),
+                {"book_id": book_id}
+            )
+
+            # Insert into FTS table
+            session.execute(
+                text("""
+                    INSERT INTO books_fts (book_id, title, description, extracted_text)
+                    VALUES (:book_id, :title, :description, :extracted_text)
+                """),
+                {
+                    "book_id": book_id,
+                    "title": book.title or '',
+                    "description": book.description or '',
+                    "extracted_text": extracted_text[:50000]  # Limit FTS content to first 50k chars
+                }
+            )
+
+            logger.info(f"Updated FTS index for book {book_id}")
+
+        except Exception as e:
+            logger.error(f"Error updating FTS index: {e}")
+
+    def extract_page_content(self, file_path: Path, page_number: int) -> Optional[str]:
+        """
+        Extract text from a specific page (PDF only).
+
+        Args:
+            file_path: Path to PDF file
+            page_number: Page number (0-indexed)
+
+        Returns:
+            Page text or None
+        """
+        try:
+            if file_path.suffix.lower() == '.pdf':
+                doc = fitz.open(str(file_path))
+                if 0 <= page_number < len(doc):
+                    page_text = doc[page_number].get_text()
+                    doc.close()
+                    return self._clean_text(page_text)
+                doc.close()
+        except Exception as e:
+            logger.error(f"Error extracting page {page_number}: {e}")
+
+        return None
+
+    def get_word_count(self, text: str) -> int:
+        """Get word count from text."""
+        return len(text.split())
+
+    def extract_and_chunk_all(self, file: File, session: Session,
+                              chunk_size: int = 500) -> Tuple[Optional[ExtractedText], List[TextChunk]]:
+        """
+        Extract full text and create chunks in one operation.
+
+        Args:
+            file: File instance
+            session: Database session
+            chunk_size: Words per chunk
+
+        Returns:
+            Tuple of (ExtractedText, List[TextChunk])
+        """
+        extracted = self.extract_full_text(file, session)
+
+        if not extracted:
+            return None, []
+
+        chunks = self.create_chunks(extracted, file, session, chunk_size)
+
+        return extracted, chunks
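
Two things worth seeing in use: the one-call extract-and-chunk flow, and the window arithmetic behind create_chunks(). A minimal sketch; the library root path and the file/session objects are placeholders rather than names from the package.

# Sketch only: `file` and `session` come from ebk.db; the path is a placeholder.
from pathlib import Path
from ebk.services.text_extraction import TextExtractionService

def index_file(file, session):
    service = TextExtractionService(Path("~/ebk-library").expanduser())
    extracted, chunks = service.extract_and_chunk_all(file, session, chunk_size=500)
    session.commit()  # assumption: the caller owns the transaction boundary
    return extracted, chunks

# The window arithmetic in create_chunks(): the start index advances by
# chunk_size - overlap, so neighbouring chunks share `overlap` words.
chunk_size, overlap, n_words = 500, 100, 1200
starts = list(range(0, n_words, chunk_size - overlap))
assert starts == [0, 400, 800]  # chunks cover words 0-499, 400-899, 800-1199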
ebk/similarity/__init__.py
@@ -0,0 +1,77 @@
+"""Book similarity system.
+
+This module provides a flexible system for computing similarity between books
+using multiple features (content, metadata, etc.).
+
+Basic usage:
+    >>> from ebk.similarity import BookSimilarity
+    >>>
+    >>> # Configure similarity
+    >>> sim = BookSimilarity().balanced()
+    >>>
+    >>> # Fit on corpus for performance
+    >>> sim.fit(all_books)
+    >>>
+    >>> # Find similar books
+    >>> similar = sim.find_similar(my_book, all_books, top_k=10)
+
+Advanced usage:
+    >>> # Custom configuration
+    >>> sim = (BookSimilarity()
+    ...        .content(weight=4.0)
+    ...        .authors(weight=2.0, metric=CustomMetric())
+    ...        .temporal(weight=1.0, sigma=5.0))
+    >>>
+    >>> # Compute similarity matrix for batch processing
+    >>> matrix = sim.similarity_matrix(books)
+    >>>
+    >>> # Save/load fitted state
+    >>> sim.save(Path("cache/similarity"))
+    >>> sim.load(Path("cache/similarity"))
+"""
+
+from ebk.similarity.base import Extractor, Feature, Metric
+from ebk.similarity.core import BookSimilarity
+from ebk.similarity.extractors import (
+    AuthorsExtractor,
+    ContentExtractor,
+    DescriptionExtractor,
+    LanguageExtractor,
+    PageCountExtractor,
+    PublicationYearExtractor,
+    PublisherExtractor,
+    SubjectsExtractor,
+)
+from ebk.similarity.metrics import (
+    CosineMetric,
+    ExactMatchMetric,
+    JaccardMetric,
+    NumericProximityMetric,
+    TemporalDecayMetric,
+    TfidfMetric,
+)
+
+__all__ = [
+    # Core
+    "BookSimilarity",
+    # Base classes
+    "Extractor",
+    "Metric",
+    "Feature",
+    # Extractors
+    "ContentExtractor",
+    "DescriptionExtractor",
+    "AuthorsExtractor",
+    "SubjectsExtractor",
+    "PublicationYearExtractor",
+    "LanguageExtractor",
+    "PublisherExtractor",
+    "PageCountExtractor",
+    # Metrics
+    "TfidfMetric",
+    "CosineMetric",
+    "JaccardMetric",
+    "ExactMatchMetric",
+    "TemporalDecayMetric",
+    "NumericProximityMetric",
+]
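
Pulling the exports together, here is the docstring's advanced configuration with a concrete exported metric substituted for the hypothetical CustomMetric. Using JaccardMetric for authors is an assumption about a sensible pairing for a set-valued feature; `books` and `query_book` are placeholders.

# Assembled from the docstring examples above; JaccardMetric-for-authors is
# an assumption, and the arguments are placeholders for real Book objects.
from pathlib import Path
from ebk.similarity import BookSimilarity, JaccardMetric

def rank_similar(books, query_book):
    sim = (BookSimilarity()
           .content(weight=4.0)
           .authors(weight=2.0, metric=JaccardMetric())
           .temporal(weight=1.0, sigma=5.0))
    sim.fit(books)                      # fit once on the corpus
    sim.save(Path("cache/similarity"))  # persist fitted state for reuse
    return sim.find_similar(query_book, books, top_k=10)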