ebk 0.1.0-py3-none-any.whl → 0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +2828 -680
- ebk/config.py +260 -22
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +132 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +7 -3
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/extract_metadata.py +76 -7
- ebk/library_db.py +899 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +444 -0
- ebk/plugins/registry.py +500 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +1633 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk-0.3.2.dist-info/METADATA +755 -0
- ebk-0.3.2.dist-info/RECORD +69 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
- ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +0 -144
- ebk/imports/ebooks.py +0 -116
- ebk/llm.py +0 -58
- ebk/manager.py +0 -44
- ebk/merge.py +0 -308
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +0 -185
- ebk/streamlit/display.py +0 -168
- ebk/streamlit/filters.py +0 -151
- ebk/streamlit/utils.py +0 -58
- ebk/utils.py +0 -311
- ebk-0.1.0.dist-info/METADATA +0 -457
- ebk-0.1.0.dist-info/RECORD +0 -29
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
ebk/ai/metadata_enrichment.py

@@ -0,0 +1,396 @@
"""
Metadata Enrichment Service using LLM providers.

Provides:
- Metadata inference from text
- Auto-tagging
- Subject categorization
- Description generation
- Difficulty level assessment
"""

import json
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from pathlib import Path

from .llm_providers.base import BaseLLMProvider


@dataclass
class EnrichedMetadata:
    """Enriched metadata generated by LLM."""

    # Inferred metadata
    title: Optional[str] = None
    authors: Optional[List[str]] = None
    subjects: Optional[List[str]] = None
    description: Optional[str] = None
    keywords: Optional[List[str]] = None

    # Auto-generated tags
    tags: Optional[List[str]] = None
    categories: Optional[List[str]] = None

    # Content analysis
    difficulty_level: Optional[str] = None  # beginner, intermediate, advanced
    reading_time_minutes: Optional[int] = None
    target_audience: Optional[str] = None

    # Quality metrics
    confidence_score: float = 0.0  # 0.0 to 1.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            k: v for k, v in {
                "title": self.title,
                "authors": self.authors,
                "subjects": self.subjects,
                "description": self.description,
                "keywords": self.keywords,
                "tags": self.tags,
                "categories": self.categories,
                "difficulty_level": self.difficulty_level,
                "reading_time_minutes": self.reading_time_minutes,
                "target_audience": self.target_audience,
                "confidence_score": self.confidence_score,
            }.items() if v is not None
        }


class MetadataEnrichmentService:
    """
    Service for enriching book metadata using LLM.

    Can work with:
    - Extracted text from PDFs/EPUBs
    - Existing metadata (title, authors, etc.)
    - Combination of both
    """

    def __init__(self, provider: BaseLLMProvider):
        """
        Initialize service with LLM provider.

        Args:
            provider: LLM provider instance (e.g., OllamaProvider)
        """
        self.provider = provider

    async def infer_metadata_from_text(
        self,
        text: str,
        existing_metadata: Optional[Dict[str, Any]] = None,
        max_text_length: int = 5000
    ) -> EnrichedMetadata:
        """
        Infer metadata from extracted text.

        Args:
            text: Extracted text from book (truncated if too long)
            existing_metadata: Any existing metadata to enhance
            max_text_length: Maximum text length to send to LLM

        Returns:
            EnrichedMetadata with inferred fields
        """
        # Truncate text if too long
        if len(text) > max_text_length:
            text = text[:max_text_length] + "..."

        # Build prompt
        prompt = self._build_metadata_inference_prompt(text, existing_metadata)

        # Define expected schema
        schema = {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "authors": {"type": "array", "items": {"type": "string"}},
                "subjects": {"type": "array", "items": {"type": "string"}},
                "description": {"type": "string"},
                "keywords": {"type": "array", "items": {"type": "string"}},
                "difficulty_level": {"type": "string", "enum": ["beginner", "intermediate", "advanced"]},
                "target_audience": {"type": "string"},
                "confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
            }
        }

        try:
            result = await self.provider.complete_json(
                prompt=prompt,
                schema=schema,
                temperature=0.3,  # Lower temperature for more consistent metadata
            )

            return EnrichedMetadata(
                title=result.get("title"),
                authors=result.get("authors"),
                subjects=result.get("subjects"),
                description=result.get("description"),
                keywords=result.get("keywords"),
                difficulty_level=result.get("difficulty_level"),
                target_audience=result.get("target_audience"),
                confidence_score=result.get("confidence_score", 0.7),
            )

        except Exception as e:
            print(f"Metadata inference failed: {e}")
            return EnrichedMetadata(confidence_score=0.0)

    async def generate_tags(
        self,
        title: str,
        authors: Optional[List[str]] = None,
        subjects: Optional[List[str]] = None,
        description: Optional[str] = None,
        text_sample: Optional[str] = None,
        max_tags: int = 15
    ) -> List[str]:
        """
        Generate relevant tags for a book.

        Args:
            title: Book title
            authors: List of authors
            subjects: Existing subjects/topics
            description: Book description
            text_sample: Sample text from book
            max_tags: Maximum number of tags to generate

        Returns:
            List of tags
        """
        prompt = f"""Generate up to {max_tags} relevant tags for this book.

Title: {title}
Authors: {', '.join(authors or [])}
Subjects: {', '.join(subjects or [])}
Description: {description or 'N/A'}

{f'Text Sample: {text_sample[:1000]}...' if text_sample else ''}

Generate tags that would help someone find this book. Include:
- Genre (fiction, non-fiction, textbook, reference, etc.)
- Topics and themes
- Reading level (introductory, advanced, comprehensive, etc.)
- Format type (handbook, guide, manual, tutorial, etc.)
- Domain (programming, mathematics, history, science, etc.)

Return as JSON array: ["tag1", "tag2", ...]"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.5)

            # Handle both array and object responses
            if isinstance(result, list):
                tags = result
            elif isinstance(result, dict) and "tags" in result:
                tags = result["tags"]
            else:
                tags = []

            # Clean and deduplicate tags
            tags = [str(tag).strip().lower() for tag in tags if tag]
            tags = list(dict.fromkeys(tags))  # Deduplicate while preserving order

            return tags[:max_tags]

        except Exception as e:
            print(f"Tag generation failed: {e}")
            return []

    async def categorize(
        self,
        title: str,
        subjects: Optional[List[str]] = None,
        description: Optional[str] = None,
    ) -> List[str]:
        """
        Categorize book into standard categories.

        Args:
            title: Book title
            subjects: Existing subjects
            description: Book description

        Returns:
            List of categories (e.g., ['Computer Science', 'Programming', 'Software Engineering'])
        """
        prompt = f"""Categorize this book into standard academic/library categories.

Title: {title}
Subjects: {', '.join(subjects or [])}
Description: {description or 'N/A'}

Use standard categories like:
- Computer Science, Mathematics, Physics, Chemistry, Biology
- Engineering, Medicine, Law, Business
- History, Philosophy, Psychology, Sociology
- Literature, Art, Music
- Programming, Data Science, Machine Learning, AI

Return 2-5 relevant categories as JSON array: ["Category1", "Category2", ...]"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.3)

            if isinstance(result, list):
                categories = result
            elif isinstance(result, dict) and "categories" in result:
                categories = result["categories"]
            else:
                categories = []

            return [str(cat).strip() for cat in categories if cat]

        except Exception as e:
            print(f"Categorization failed: {e}")
            return []

    async def enhance_description(
        self,
        title: str,
        existing_description: Optional[str] = None,
        text_sample: Optional[str] = None,
        max_length: int = 500
    ) -> Optional[str]:
        """
        Generate or enhance book description.

        Args:
            title: Book title
            existing_description: Current description (if any)
            text_sample: Sample text from book
            max_length: Maximum description length

        Returns:
            Enhanced description
        """
        if existing_description and len(existing_description) > max_length:
            # Already have a good description
            return existing_description

        prompt = f"""Write a clear, informative description for this book.

Title: {title}
{f'Current Description: {existing_description}' if existing_description else ''}
{f'Text Sample: {text_sample[:2000]}...' if text_sample else ''}

Write a {max_length}-character description that:
1. Explains what the book is about
2. Identifies the target audience
3. Highlights key topics covered
4. Mentions the approach/style (e.g., practical guide, theoretical text, reference manual)

Return just the description text, no JSON."""

        try:
            response = await self.provider.complete(prompt, temperature=0.5)
            description = response.content.strip()

            # Remove quotes if LLM wrapped it
            if description.startswith('"') and description.endswith('"'):
                description = description[1:-1]

            return description[:max_length]

        except Exception as e:
            print(f"Description generation failed: {e}")
            return existing_description

    async def assess_difficulty(
        self,
        text_sample: str,
        subjects: Optional[List[str]] = None
    ) -> str:
        """
        Assess difficulty level of book.

        Args:
            text_sample: Sample text from book
            subjects: Book subjects/topics

        Returns:
            Difficulty level: 'beginner', 'intermediate', or 'advanced'
        """
        prompt = f"""Assess the difficulty level of this text.

Subjects: {', '.join(subjects or [])}

Text Sample:
{text_sample[:2000]}

Consider:
- Vocabulary complexity
- Concept difficulty
- Prerequisites assumed
- Mathematical/technical content

Return one of: "beginner", "intermediate", "advanced"

Return as JSON: {{"level": "..."}}"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.2)
            level = result.get("level", "intermediate")

            if level not in ["beginner", "intermediate", "advanced"]:
                level = "intermediate"

            return level

        except Exception as e:
            print(f"Difficulty assessment failed: {e}")
            return "intermediate"

    def _build_metadata_inference_prompt(
        self,
        text: str,
        existing_metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """Build prompt for metadata inference."""

        prompt = """Analyze this text and infer metadata about the book.

Text Sample:
---
{text}
---

{existing}

Extract or infer:
1. Title (if not provided)
2. Authors (if mentioned)
3. Subjects/topics (main themes and topics covered)
4. Description (2-3 sentence summary)
5. Keywords (10-15 relevant search terms)
6. Difficulty level (beginner/intermediate/advanced)
7. Target audience (who should read this)

Respond with JSON matching this schema:
{{
    "title": "string",
    "authors": ["author1", "author2"],
    "subjects": ["subject1", "subject2"],
    "description": "string",
    "keywords": ["keyword1", "keyword2"],
    "difficulty_level": "beginner|intermediate|advanced",
    "target_audience": "string",
    "confidence_score": 0.0-1.0
}}

Only include fields you can confidently infer. Set confidence_score based on how much information was available."""

        existing_str = ""
        if existing_metadata:
            existing_str = f"""
Existing Metadata (enhance if possible):
Title: {existing_metadata.get('title', 'Unknown')}
Authors: {', '.join(existing_metadata.get('authors', []))}
Subjects: {', '.join(existing_metadata.get('subjects', []))}
"""

        return prompt.format(text=text, existing=existing_str)
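For orientation, here is a minimal sketch of how the new enrichment service might be driven. Only MetadataEnrichmentService, EnrichedMetadata, and the method signatures in the hunk above are confirmed by this diff; the OllamaProvider import path comes from the file list, while its constructor arguments and the sample inputs are assumptions.

# Minimal usage sketch; OllamaProvider arguments are hypothetical,
# service calls match the hunk above.
import asyncio

from ebk.ai.llm_providers.ollama import OllamaProvider  # path per file list; exact API assumed
from ebk.ai.metadata_enrichment import MetadataEnrichmentService


async def main() -> None:
    # Hypothetical constructor: the service only requires a BaseLLMProvider.
    provider = OllamaProvider(model="llama3.2")
    service = MetadataEnrichmentService(provider)

    text = "...first few pages extracted from a PDF or EPUB..."

    # Infer metadata from raw text; returns an EnrichedMetadata dataclass.
    enriched = await service.infer_metadata_from_text(
        text,
        existing_metadata={"title": "Known Title", "authors": ["Jane Doe"]},
    )
    print(enriched.to_dict())  # to_dict() drops None-valued fields

    # Generate up to 10 normalized, deduplicated tags.
    tags = await service.generate_tags(
        title=enriched.title or "Known Title",
        description=enriched.description,
        text_sample=text,
        max_tags=10,
    )
    print(tags)

    # Difficulty assessment falls back to "intermediate" on provider errors.
    print(await service.assess_difficulty(text, subjects=enriched.subjects))


asyncio.run(main())

Note that every method in the hunk catches provider exceptions and degrades to a default (empty lists, "intermediate", confidence_score=0.0) rather than raising, so callers should inspect confidence_score and empty results instead of relying on try/except.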