ebk 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +1097 -9
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +59 -0
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/exports/zip.py +25 -0
- ebk/library_db.py +155 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
- ebk-0.3.2.dist-info/RECORD +69 -0
- ebk-0.3.2.dist-info/entry_points.txt +2 -0
- ebk-0.3.2.dist-info/top_level.txt +1 -0
- ebk-0.3.1.dist-info/RECORD +0 -19
- ebk-0.3.1.dist-info/entry_points.txt +0 -6
- ebk-0.3.1.dist-info/top_level.txt +0 -2
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
- {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/services/tag_service.py
@@ -0,0 +1,282 @@
"""Service for managing hierarchical user tags.

Tags provide user-defined organization separate from bibliographic subjects.
"""

from typing import List, Optional
from sqlalchemy.orm import Session
from datetime import datetime

from ebk.db.models import Tag, Book, book_tags


class TagService:
    """Service for CRUD operations on hierarchical tags."""

    def __init__(self, session: Session):
        """Initialize tag service.

        Args:
            session: SQLAlchemy session
        """
        self.session = session

    def get_or_create_tag(self, path: str, description: Optional[str] = None,
                          color: Optional[str] = None) -> Tag:
        """Get existing tag or create new one with full hierarchy.

        Args:
            path: Full tag path (e.g., "Work/Project-2024")
            description: Optional description
            color: Optional hex color code

        Returns:
            Tag instance

        Examples:
            >>> service.get_or_create_tag("Work/Project-2024")
            # Creates: "Work" and "Work/Project-2024" if they don't exist
        """
        # Check if tag already exists
        existing = self.session.query(Tag).filter_by(path=path).first()
        if existing:
            return existing

        # Parse path into components
        parts = path.split('/')
        parent_tag = None
        current_path = ""

        # Create hierarchy from root to leaf
        for i, name in enumerate(parts):
            # Build current path
            if current_path:
                current_path += f"/{name}"
            else:
                current_path = name

            # Check if this level exists
            tag = self.session.query(Tag).filter_by(path=current_path).first()

            if not tag:
                # Create new tag at this level
                tag = Tag(
                    name=name,
                    path=current_path,
                    parent_id=parent_tag.id if parent_tag else None
                )

                # Only set description and color on the leaf node
                if i == len(parts) - 1:
                    tag.description = description
                    tag.color = color

                self.session.add(tag)

            parent_tag = tag

        self.session.commit()
        return parent_tag

    def get_tag(self, path: str) -> Optional[Tag]:
        """Get tag by path.

        Args:
            path: Full tag path

        Returns:
            Tag instance or None
        """
        return self.session.query(Tag).filter_by(path=path).first()

    def get_all_tags(self) -> List[Tag]:
        """Get all tags ordered by path.

        Returns:
            List of all tags
        """
        return self.session.query(Tag).order_by(Tag.path).all()

    def get_root_tags(self) -> List[Tag]:
        """Get top-level tags (no parent).

        Returns:
            List of root tags
        """
        return self.session.query(Tag).filter(Tag.parent_id.is_(None)).order_by(Tag.name).all()

    def get_children(self, tag: Tag) -> List[Tag]:
        """Get immediate children of a tag.

        Args:
            tag: Parent tag

        Returns:
            List of child tags
        """
        return self.session.query(Tag).filter_by(parent_id=tag.id).order_by(Tag.name).all()

    def delete_tag(self, path: str, delete_children: bool = False) -> bool:
        """Delete a tag.

        Args:
            path: Full tag path
            delete_children: If True, delete children and all descendants too

        Returns:
            True if deleted, False if not found
        """
        tag = self.get_tag(path)
        if not tag:
            return False

        # Check if tag has children
        children = self.get_children(tag)
        if children and not delete_children:
            raise ValueError(f"Tag '{path}' has {len(children)} children. "
                             "Use delete_children=True to delete them too.")

        # If delete_children=True, explicitly delete all descendants
        if delete_children:
            # Find all tags that start with this path + "/"
            descendants = self.session.query(Tag).filter(
                Tag.path.like(f"{path}/%")
            ).all()
            for desc in descendants:
                self.session.delete(desc)

        self.session.delete(tag)
        self.session.commit()
        return True

    def rename_tag(self, old_path: str, new_path: str) -> Tag:
        """Rename a tag and update all descendant paths.

        Args:
            old_path: Current tag path
            new_path: New tag path

        Returns:
            Updated tag

        Raises:
            ValueError: If tag doesn't exist or new path already exists
        """
        tag = self.get_tag(old_path)
        if not tag:
            raise ValueError(f"Tag '{old_path}' not found")

        # Check if new path already exists
        if self.get_tag(new_path):
            raise ValueError(f"Tag '{new_path}' already exists")

        # Update this tag
        old_name = tag.name
        new_parts = new_path.split('/')
        tag.name = new_parts[-1]
        tag.path = new_path

        # Update all descendant paths
        descendants = self.session.query(Tag).filter(
            Tag.path.like(f"{old_path}/%")
        ).all()

        for desc in descendants:
            # Replace the old path prefix with new path
            desc.path = desc.path.replace(old_path, new_path, 1)

        self.session.commit()
        return tag

    def add_tag_to_book(self, book: Book, tag_path: str) -> Tag:
        """Add a tag to a book (creates tag if it doesn't exist).

        Args:
            book: Book instance
            tag_path: Full tag path

        Returns:
            Tag instance
        """
        tag = self.get_or_create_tag(tag_path)

        if tag not in book.tags:
            book.tags.append(tag)
            self.session.commit()

        return tag

    def remove_tag_from_book(self, book: Book, tag_path: str) -> bool:
        """Remove a tag from a book.

        Args:
            book: Book instance
            tag_path: Full tag path

        Returns:
            True if removed, False if book didn't have that tag
        """
        tag = self.get_tag(tag_path)
        if not tag:
            return False

        if tag in book.tags:
            book.tags.remove(tag)
            self.session.commit()
            return True

        return False

    def get_books_with_tag(self, tag_path: str, include_subtags: bool = False) -> List[Book]:
        """Get all books with a specific tag.

        Args:
            tag_path: Full tag path
            include_subtags: If True, include books from descendant tags

        Returns:
            List of books
        """
        tag = self.get_tag(tag_path)
        if not tag:
            return []

        if not include_subtags:
            return tag.books

        # Get all descendant tags
        descendant_paths = self.session.query(Tag.id).filter(
            Tag.path.like(f"{tag_path}/%")
        ).all()

        all_tag_ids = [tag.id] + [t[0] for t in descendant_paths]

        # Get books with any of these tags
        books = self.session.query(Book).join(book_tags).filter(
            book_tags.c.tag_id.in_(all_tag_ids)
        ).distinct().all()

        return books

    def get_tag_stats(self, tag_path: str) -> dict:
        """Get statistics for a tag.

        Args:
            tag_path: Full tag path

        Returns:
            Dict with stats: {book_count, subtag_count, depth}
        """
        tag = self.get_tag(tag_path)
        if not tag:
            return {}

        children = self.get_children(tag)

        return {
            'path': tag.path,
            'book_count': len(tag.books),
            'subtag_count': len(children),
            'depth': tag.depth,
            'created_at': tag.created_at,
        }
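Not part of the wheel: a minimal sketch of driving the TagService above against an in-memory database, based only on the methods shown in this diff. It assumes ebk.db.models exposes a declarative Base; in the released package, sessions are normally obtained via ebk/db/session.py.

# Illustrative only -- Base and the SQLite URL are assumptions, not documented API.
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from ebk.db.models import Base          # assumed declarative base
from ebk.services.tag_service import TagService

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    service = TagService(session)
    # One call creates the whole hierarchy: "Work", then "Work/Project-2024"
    tag = service.get_or_create_tag("Work/Project-2024", color="#ff8800")
    print(tag.path)                                   # Work/Project-2024
    print([t.path for t in service.get_root_tags()])  # ['Work']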
ebk/services/text_extraction.py
@@ -0,0 +1,317 @@
"""
Text extraction service for ebook files.

Handles extraction from PDF, EPUB, TXT, MD and stores in database with FTS indexing.
"""

import re
from pathlib import Path
from typing import List, Optional, Tuple
import logging

import pypdf
import fitz  # PyMuPDF
from ebooklib import epub
from bs4 import BeautifulSoup

from ..db.models import File, ExtractedText, TextChunk
from sqlalchemy.orm import Session
from sqlalchemy import text

logger = logging.getLogger(__name__)


class TextExtractionService:
    """Service for extracting and chunking text from ebook files."""

    def __init__(self, library_root: Path):
        self.library_root = Path(library_root)

    def extract_full_text(self, file: File, session: Session) -> Optional[ExtractedText]:
        """
        Extract complete text from ebook file and store in database.

        Args:
            file: File model instance
            session: Database session

        Returns:
            ExtractedText instance or None if extraction failed
        """
        file_path = self.library_root / file.path

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return None

        try:
            # Extract based on format
            if file.format.lower() in ['txt', 'md', 'text']:
                text = self._extract_plaintext(file_path)
            elif file.format.lower() == 'pdf':
                text = self._extract_pdf_text(file_path)
            elif file.format.lower() == 'epub':
                text = self._extract_epub_text(file_path)
            else:
                logger.warning(f"Unsupported format for text extraction: {file.format}")
                return None

            if not text or len(text.strip()) < 100:
                logger.warning(f"Extracted text too short for {file.path}")
                return None

            # Store in database
            extracted = ExtractedText(
                file_id=file.id,
                content=text,
                content_hash=self._hash_text(text)
            )
            session.add(extracted)

            # Update file status
            file.text_extracted = True
            file.extraction_date = extracted.extracted_at

            # Update FTS index
            self._update_fts_index(session, file.book_id, text)

            logger.info(f"Extracted {len(text)} chars from {file.path}")
            return extracted

        except Exception as e:
            logger.error(f"Error extracting text from {file.path}: {e}")
            return None

    def create_chunks(self, extracted: ExtractedText, file: File,
                      session: Session, chunk_size: int = 500,
                      overlap: int = 100) -> List[TextChunk]:
        """
        Split extracted text into overlapping chunks for semantic search.

        Args:
            extracted: ExtractedText instance
            file: File instance
            session: Database session
            chunk_size: Number of words per chunk
            overlap: Number of overlapping words between chunks

        Returns:
            List of TextChunk instances
        """
        text = extracted.content
        words = text.split()

        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunk_words = words[i:i + chunk_size]
            chunk_text = ' '.join(chunk_words)

            if len(chunk_text.strip()) < 50:  # Skip tiny chunks
                continue

            chunk = TextChunk(
                file_id=file.id,
                chunk_index=len(chunks),
                content=chunk_text,
                has_embedding=False
            )
            chunks.append(chunk)

        session.add_all(chunks)
        logger.info(f"Created {len(chunks)} chunks from {file.path}")
        return chunks

    def _extract_plaintext(self, file_path: Path) -> str:
        """Extract text from plain text files."""
        try:
            return file_path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            # Try with different encoding
            return file_path.read_text(encoding='latin-1')

    def _extract_pdf_text(self, file_path: Path) -> str:
        """
        Extract text from PDF using PyMuPDF (primary) with pypdf fallback.
        """
        try:
            # Try PyMuPDF first (better quality)
            doc = fitz.open(str(file_path))
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()

            if text.strip():
                return self._clean_text(text)

        except Exception as e:
            logger.warning(f"PyMuPDF extraction failed: {e}, trying pypdf")

        try:
            # Fallback to pypdf
            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()

            return self._clean_text(text)

        except Exception as e:
            logger.error(f"PDF text extraction failed: {e}")
            return ""

    def _extract_epub_text(self, file_path: Path) -> str:
        """Extract text from EPUB file."""
        try:
            book = epub.read_epub(str(file_path))
            text_parts = []

            for item in book.get_items():
                # Handle different ebooklib versions
                # Type 9 is ITEM_DOCUMENT in ebooklib
                item_type = item.get_type()

                # Check if this is a document item (HTML/XHTML content)
                is_document = False
                if hasattr(epub, 'ITEM_DOCUMENT'):
                    is_document = item_type == epub.ITEM_DOCUMENT
                else:
                    # Fallback: type 9 is document, or check media type
                    is_document = (item_type == 9 or
                                   'html' in item.get_name().lower() or
                                   (hasattr(item, 'media_type') and
                                    item.media_type and
                                    'html' in item.media_type.lower()))

                if is_document:
                    try:
                        soup = BeautifulSoup(item.content, 'html.parser')

                        # Remove script and style elements
                        for script in soup(["script", "style"]):
                            script.decompose()

                        text = soup.get_text(separator='\n')
                        text_parts.append(text)
                    except Exception as e:
                        logger.debug(f"Failed to extract text from item {item.get_name()}: {e}")
                        continue

            full_text = '\n\n'.join(text_parts)
            return self._clean_text(full_text)

        except Exception as e:
            logger.error(f"EPUB text extraction failed: {e}")
            return ""

    def _clean_text(self, text: str) -> str:
        """Clean extracted text."""
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = re.sub(r' +', ' ', text)

        # Remove page headers/footers (common patterns)
        text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

    def _hash_text(self, text: str) -> str:
        """Generate hash of text content."""
        import hashlib
        return hashlib.sha256(text.encode()).hexdigest()

    def _update_fts_index(self, session: Session, book_id: int, extracted_text: str):
        """
        Update full-text search index.

        Args:
            session: Database session
            book_id: Book ID
            extracted_text: Extracted text content
        """
        try:
            # Get book title and description for FTS
            from ..db.models import Book
            book = session.query(Book).get(book_id)

            if not book:
                return

            # Delete existing FTS entry if exists
            session.execute(
                text("DELETE FROM books_fts WHERE book_id = :book_id"),
                {"book_id": book_id}
            )

            # Insert into FTS table
            session.execute(
                text("""
                    INSERT INTO books_fts (book_id, title, description, extracted_text)
                    VALUES (:book_id, :title, :description, :extracted_text)
                """),
                {
                    "book_id": book_id,
                    "title": book.title or '',
                    "description": book.description or '',
                    "extracted_text": extracted_text[:50000]  # Limit FTS content to first 50k chars
                }
            )

            logger.info(f"Updated FTS index for book {book_id}")

        except Exception as e:
            logger.error(f"Error updating FTS index: {e}")

    def extract_page_content(self, file_path: Path, page_number: int) -> Optional[str]:
        """
        Extract text from a specific page (PDF only).

        Args:
            file_path: Path to PDF file
            page_number: Page number (0-indexed)

        Returns:
            Page text or None
        """
        try:
            if file_path.suffix.lower() == '.pdf':
                doc = fitz.open(str(file_path))
                if 0 <= page_number < len(doc):
                    page_text = doc[page_number].get_text()
                    doc.close()
                    return self._clean_text(page_text)
                doc.close()
        except Exception as e:
            logger.error(f"Error extracting page {page_number}: {e}")

        return None

    def get_word_count(self, text: str) -> int:
        """Get word count from text."""
        return len(text.split())

    def extract_and_chunk_all(self, file: File, session: Session,
                              chunk_size: int = 500) -> Tuple[Optional[ExtractedText], List[TextChunk]]:
        """
        Extract full text and create chunks in one operation.

        Args:
            file: File instance
            session: Database session
            chunk_size: Words per chunk

        Returns:
            Tuple of (ExtractedText, List[TextChunk])
        """
        extracted = self.extract_full_text(file, session)

        if not extracted:
            return None, []

        chunks = self.create_chunks(extracted, file, session, chunk_size)

        return extracted, chunks
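A note on the defaults in create_chunks above: with chunk_size=500 and overlap=100 the window advances 400 words per step, so consecutive chunks share 100 words. A standalone sketch of that stride, independent of the ebk models:

# Illustrative only: the same stride create_chunks uses, applied to a plain word list.
def chunk_words(words, chunk_size=500, overlap=100):
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        window = words[i:i + chunk_size]
        if len(window) < 10:              # analogous to the "skip tiny chunks" guard
            continue
        chunks.append(" ".join(window))
    return chunks

sample = ("word " * 1200).split()
# Windows cover words 0-499, 400-899, and 800-1199 -> 3 chunks
print(len(chunk_words(sample)))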
ebk/similarity/__init__.py
@@ -0,0 +1,77 @@
"""Book similarity system.

This module provides a flexible system for computing similarity between books
using multiple features (content, metadata, etc.).

Basic usage:
    >>> from ebk.similarity import BookSimilarity
    >>>
    >>> # Configure similarity
    >>> sim = BookSimilarity().balanced()
    >>>
    >>> # Fit on corpus for performance
    >>> sim.fit(all_books)
    >>>
    >>> # Find similar books
    >>> similar = sim.find_similar(my_book, all_books, top_k=10)

Advanced usage:
    >>> # Custom configuration
    >>> sim = (BookSimilarity()
    ...        .content(weight=4.0)
    ...        .authors(weight=2.0, metric=CustomMetric())
    ...        .temporal(weight=1.0, sigma=5.0))
    >>>
    >>> # Compute similarity matrix for batch processing
    >>> matrix = sim.similarity_matrix(books)
    >>>
    >>> # Save/load fitted state
    >>> sim.save(Path("cache/similarity"))
    >>> sim.load(Path("cache/similarity"))
"""

from ebk.similarity.base import Extractor, Feature, Metric
from ebk.similarity.core import BookSimilarity
from ebk.similarity.extractors import (
    AuthorsExtractor,
    ContentExtractor,
    DescriptionExtractor,
    LanguageExtractor,
    PageCountExtractor,
    PublicationYearExtractor,
    PublisherExtractor,
    SubjectsExtractor,
)
from ebk.similarity.metrics import (
    CosineMetric,
    ExactMatchMetric,
    JaccardMetric,
    NumericProximityMetric,
    TemporalDecayMetric,
    TfidfMetric,
)

__all__ = [
    # Core
    "BookSimilarity",
    # Base classes
    "Extractor",
    "Metric",
    "Feature",
    # Extractors
    "ContentExtractor",
    "DescriptionExtractor",
    "AuthorsExtractor",
    "SubjectsExtractor",
    "PublicationYearExtractor",
    "LanguageExtractor",
    "PublisherExtractor",
    "PageCountExtractor",
    # Metrics
    "TfidfMetric",
    "CosineMetric",
    "JaccardMetric",
    "ExactMatchMetric",
    "TemporalDecayMetric",
    "NumericProximityMetric",
]
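The module docstring above already sketches the intended workflow; for completeness, one combined sketch that composes only the documented calls (illustrative only — the shape of the book objects passed to fit/find_similar and the default metric used by .authors() are assumptions, not documented in this diff):

# Illustrative only: composes the calls listed in the docstring above.
from pathlib import Path
from ebk.similarity import BookSimilarity

sim = (BookSimilarity()
       .content(weight=4.0)
       .authors(weight=2.0)               # default metric assumed
       .temporal(weight=1.0, sigma=5.0))

sim.fit(all_books)                        # all_books: corpus of book objects (placeholder)
matrix = sim.similarity_matrix(all_books)
similar = sim.find_similar(my_book, all_books, top_k=10)

sim.save(Path("cache/similarity"))        # persist fitted state for a later load()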