ebk-0.3.1-py3-none-any.whl → ebk-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ebk might be problematic.

Files changed (61)
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
ebk/ai/metadata_enrichment.py
@@ -0,0 +1,396 @@
"""
Metadata Enrichment Service using LLM providers.

Provides:
- Metadata inference from text
- Auto-tagging
- Subject categorization
- Description generation
- Difficulty level assessment
"""

import json
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from pathlib import Path

from .llm_providers.base import BaseLLMProvider


@dataclass
class EnrichedMetadata:
    """Enriched metadata generated by LLM."""

    # Inferred metadata
    title: Optional[str] = None
    authors: Optional[List[str]] = None
    subjects: Optional[List[str]] = None
    description: Optional[str] = None
    keywords: Optional[List[str]] = None

    # Auto-generated tags
    tags: Optional[List[str]] = None
    categories: Optional[List[str]] = None

    # Content analysis
    difficulty_level: Optional[str] = None  # beginner, intermediate, advanced
    reading_time_minutes: Optional[int] = None
    target_audience: Optional[str] = None

    # Quality metrics
    confidence_score: float = 0.0  # 0.0 to 1.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            k: v for k, v in {
                "title": self.title,
                "authors": self.authors,
                "subjects": self.subjects,
                "description": self.description,
                "keywords": self.keywords,
                "tags": self.tags,
                "categories": self.categories,
                "difficulty_level": self.difficulty_level,
                "reading_time_minutes": self.reading_time_minutes,
                "target_audience": self.target_audience,
                "confidence_score": self.confidence_score,
            }.items() if v is not None
        }


class MetadataEnrichmentService:
    """
    Service for enriching book metadata using LLM.

    Can work with:
    - Extracted text from PDFs/EPUBs
    - Existing metadata (title, authors, etc.)
    - Combination of both
    """

    def __init__(self, provider: BaseLLMProvider):
        """
        Initialize service with LLM provider.

        Args:
            provider: LLM provider instance (e.g., OllamaProvider)
        """
        self.provider = provider

    async def infer_metadata_from_text(
        self,
        text: str,
        existing_metadata: Optional[Dict[str, Any]] = None,
        max_text_length: int = 5000
    ) -> EnrichedMetadata:
        """
        Infer metadata from extracted text.

        Args:
            text: Extracted text from book (truncated if too long)
            existing_metadata: Any existing metadata to enhance
            max_text_length: Maximum text length to send to LLM

        Returns:
            EnrichedMetadata with inferred fields
        """
        # Truncate text if too long
        if len(text) > max_text_length:
            text = text[:max_text_length] + "..."

        # Build prompt
        prompt = self._build_metadata_inference_prompt(text, existing_metadata)

        # Define expected schema
        schema = {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "authors": {"type": "array", "items": {"type": "string"}},
                "subjects": {"type": "array", "items": {"type": "string"}},
                "description": {"type": "string"},
                "keywords": {"type": "array", "items": {"type": "string"}},
                "difficulty_level": {"type": "string", "enum": ["beginner", "intermediate", "advanced"]},
                "target_audience": {"type": "string"},
                "confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
            }
        }

        try:
            result = await self.provider.complete_json(
                prompt=prompt,
                schema=schema,
                temperature=0.3,  # Lower temperature for more consistent metadata
            )

            return EnrichedMetadata(
                title=result.get("title"),
                authors=result.get("authors"),
                subjects=result.get("subjects"),
                description=result.get("description"),
                keywords=result.get("keywords"),
                difficulty_level=result.get("difficulty_level"),
                target_audience=result.get("target_audience"),
                confidence_score=result.get("confidence_score", 0.7),
            )

        except Exception as e:
            print(f"Metadata inference failed: {e}")
            return EnrichedMetadata(confidence_score=0.0)

    async def generate_tags(
        self,
        title: str,
        authors: Optional[List[str]] = None,
        subjects: Optional[List[str]] = None,
        description: Optional[str] = None,
        text_sample: Optional[str] = None,
        max_tags: int = 15
    ) -> List[str]:
        """
        Generate relevant tags for a book.

        Args:
            title: Book title
            authors: List of authors
            subjects: Existing subjects/topics
            description: Book description
            text_sample: Sample text from book
            max_tags: Maximum number of tags to generate

        Returns:
            List of tags
        """
        prompt = f"""Generate up to {max_tags} relevant tags for this book.

Title: {title}
Authors: {', '.join(authors or [])}
Subjects: {', '.join(subjects or [])}
Description: {description or 'N/A'}

{f'Text Sample: {text_sample[:1000]}...' if text_sample else ''}

Generate tags that would help someone find this book. Include:
- Genre (fiction, non-fiction, textbook, reference, etc.)
- Topics and themes
- Reading level (introductory, advanced, comprehensive, etc.)
- Format type (handbook, guide, manual, tutorial, etc.)
- Domain (programming, mathematics, history, science, etc.)

Return as JSON array: ["tag1", "tag2", ...]"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.5)

            # Handle both array and object responses
            if isinstance(result, list):
                tags = result
            elif isinstance(result, dict) and "tags" in result:
                tags = result["tags"]
            else:
                tags = []

            # Clean and deduplicate tags
            tags = [str(tag).strip().lower() for tag in tags if tag]
            tags = list(dict.fromkeys(tags))  # Deduplicate while preserving order

            return tags[:max_tags]

        except Exception as e:
            print(f"Tag generation failed: {e}")
            return []

    async def categorize(
        self,
        title: str,
        subjects: Optional[List[str]] = None,
        description: Optional[str] = None,
    ) -> List[str]:
        """
        Categorize book into standard categories.

        Args:
            title: Book title
            subjects: Existing subjects
            description: Book description

        Returns:
            List of categories (e.g., ['Computer Science', 'Programming', 'Software Engineering'])
        """
        prompt = f"""Categorize this book into standard academic/library categories.

Title: {title}
Subjects: {', '.join(subjects or [])}
Description: {description or 'N/A'}

Use standard categories like:
- Computer Science, Mathematics, Physics, Chemistry, Biology
- Engineering, Medicine, Law, Business
- History, Philosophy, Psychology, Sociology
- Literature, Art, Music
- Programming, Data Science, Machine Learning, AI

Return 2-5 relevant categories as JSON array: ["Category1", "Category2", ...]"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.3)

            if isinstance(result, list):
                categories = result
            elif isinstance(result, dict) and "categories" in result:
                categories = result["categories"]
            else:
                categories = []

            return [str(cat).strip() for cat in categories if cat]

        except Exception as e:
            print(f"Categorization failed: {e}")
            return []

    async def enhance_description(
        self,
        title: str,
        existing_description: Optional[str] = None,
        text_sample: Optional[str] = None,
        max_length: int = 500
    ) -> Optional[str]:
        """
        Generate or enhance book description.

        Args:
            title: Book title
            existing_description: Current description (if any)
            text_sample: Sample text from book
            max_length: Maximum description length

        Returns:
            Enhanced description
        """
        if existing_description and len(existing_description) > max_length:
            # Already have a good description
            return existing_description

        prompt = f"""Write a clear, informative description for this book.

Title: {title}
{f'Current Description: {existing_description}' if existing_description else ''}
{f'Text Sample: {text_sample[:2000]}...' if text_sample else ''}

Write a {max_length}-character description that:
1. Explains what the book is about
2. Identifies the target audience
3. Highlights key topics covered
4. Mentions the approach/style (e.g., practical guide, theoretical text, reference manual)

Return just the description text, no JSON."""

        try:
            response = await self.provider.complete(prompt, temperature=0.5)
            description = response.content.strip()

            # Remove quotes if LLM wrapped it
            if description.startswith('"') and description.endswith('"'):
                description = description[1:-1]

            return description[:max_length]

        except Exception as e:
            print(f"Description generation failed: {e}")
            return existing_description

    async def assess_difficulty(
        self,
        text_sample: str,
        subjects: Optional[List[str]] = None
    ) -> str:
        """
        Assess difficulty level of book.

        Args:
            text_sample: Sample text from book
            subjects: Book subjects/topics

        Returns:
            Difficulty level: 'beginner', 'intermediate', or 'advanced'
        """
        prompt = f"""Assess the difficulty level of this text.

Subjects: {', '.join(subjects or [])}

Text Sample:
{text_sample[:2000]}

Consider:
- Vocabulary complexity
- Concept difficulty
- Prerequisites assumed
- Mathematical/technical content

Return one of: "beginner", "intermediate", "advanced"

Return as JSON: {{"level": "..."}}"""

        try:
            result = await self.provider.complete_json(prompt, temperature=0.2)
            level = result.get("level", "intermediate")

            if level not in ["beginner", "intermediate", "advanced"]:
                level = "intermediate"

            return level

        except Exception as e:
            print(f"Difficulty assessment failed: {e}")
            return "intermediate"

    def _build_metadata_inference_prompt(
        self,
        text: str,
        existing_metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """Build prompt for metadata inference."""

        prompt = """Analyze this text and infer metadata about the book.

Text Sample:
---
{text}
---

{existing}

Extract or infer:
1. Title (if not provided)
2. Authors (if mentioned)
3. Subjects/topics (main themes and topics covered)
4. Description (2-3 sentence summary)
5. Keywords (10-15 relevant search terms)
6. Difficulty level (beginner/intermediate/advanced)
7. Target audience (who should read this)

Respond with JSON matching this schema:
{{
    "title": "string",
    "authors": ["author1", "author2"],
    "subjects": ["subject1", "subject2"],
    "description": "string",
    "keywords": ["keyword1", "keyword2"],
    "difficulty_level": "beginner|intermediate|advanced",
    "target_audience": "string",
    "confidence_score": 0.0-1.0
}}

Only include fields you can confidently infer. Set confidence_score based on how much information was available."""

        existing_str = ""
        if existing_metadata:
            existing_str = f"""
Existing Metadata (enhance if possible):
Title: {existing_metadata.get('title', 'Unknown')}
Authors: {', '.join(existing_metadata.get('authors', []))}
Subjects: {', '.join(existing_metadata.get('subjects', []))}
"""

        return prompt.format(text=text, existing=existing_str)
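
For context, a minimal usage sketch of the new service (not part of the diff). It assumes OllamaProvider, which the __init__ docstring names and which ships in ebk/ai/llm_providers/ollama.py, can be constructed with no arguments; its real constructor signature is not shown in this hunk. The method calls and return types match the code above.

import asyncio

from ebk.ai.llm_providers.ollama import OllamaProvider
from ebk.ai.metadata_enrichment import MetadataEnrichmentService

async def main() -> None:
    # Hypothetical: OllamaProvider's constructor arguments live in
    # ebk/ai/llm_providers/ollama.py and are not shown in this hunk.
    provider = OllamaProvider()
    service = MetadataEnrichmentService(provider)

    # Text sample extracted from a book (e.g., via ebk/ai/text_extractor.py).
    sample = "An introduction to graph algorithms for working programmers..."

    # Infer metadata, then use it to drive tagging and difficulty assessment.
    enriched = await service.infer_metadata_from_text(sample)
    tags = await service.generate_tags(
        title=enriched.title or "Unknown",
        subjects=enriched.subjects,
        description=enriched.description,
        text_sample=sample,
    )
    level = await service.assess_difficulty(sample, subjects=enriched.subjects)

    print(enriched.to_dict())
    print(tags, level)

asyncio.run(main())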