ebk 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. ebk/__init__.py +35 -0
  2. ebk/ai/__init__.py +23 -0
  3. ebk/ai/knowledge_graph.py +450 -0
  4. ebk/ai/llm_providers/__init__.py +26 -0
  5. ebk/ai/llm_providers/anthropic.py +209 -0
  6. ebk/ai/llm_providers/base.py +295 -0
  7. ebk/ai/llm_providers/gemini.py +285 -0
  8. ebk/ai/llm_providers/ollama.py +294 -0
  9. ebk/ai/metadata_enrichment.py +394 -0
  10. ebk/ai/question_generator.py +328 -0
  11. ebk/ai/reading_companion.py +224 -0
  12. ebk/ai/semantic_search.py +433 -0
  13. ebk/ai/text_extractor.py +393 -0
  14. ebk/calibre_import.py +66 -0
  15. ebk/cli.py +6433 -0
  16. ebk/config.py +230 -0
  17. ebk/db/__init__.py +37 -0
  18. ebk/db/migrations.py +507 -0
  19. ebk/db/models.py +725 -0
  20. ebk/db/session.py +144 -0
  21. ebk/decorators.py +1 -0
  22. ebk/exports/__init__.py +0 -0
  23. ebk/exports/base_exporter.py +218 -0
  24. ebk/exports/echo_export.py +279 -0
  25. ebk/exports/html_library.py +1743 -0
  26. ebk/exports/html_utils.py +87 -0
  27. ebk/exports/hugo.py +59 -0
  28. ebk/exports/jinja_export.py +286 -0
  29. ebk/exports/multi_facet_export.py +159 -0
  30. ebk/exports/opds_export.py +232 -0
  31. ebk/exports/symlink_dag.py +479 -0
  32. ebk/exports/zip.py +25 -0
  33. ebk/extract_metadata.py +341 -0
  34. ebk/ident.py +89 -0
  35. ebk/library_db.py +1440 -0
  36. ebk/opds.py +748 -0
  37. ebk/plugins/__init__.py +42 -0
  38. ebk/plugins/base.py +502 -0
  39. ebk/plugins/hooks.py +442 -0
  40. ebk/plugins/registry.py +499 -0
  41. ebk/repl/__init__.py +9 -0
  42. ebk/repl/find.py +126 -0
  43. ebk/repl/grep.py +173 -0
  44. ebk/repl/shell.py +1677 -0
  45. ebk/repl/text_utils.py +320 -0
  46. ebk/search_parser.py +413 -0
  47. ebk/server.py +3608 -0
  48. ebk/services/__init__.py +28 -0
  49. ebk/services/annotation_extraction.py +351 -0
  50. ebk/services/annotation_service.py +380 -0
  51. ebk/services/export_service.py +577 -0
  52. ebk/services/import_service.py +447 -0
  53. ebk/services/personal_metadata_service.py +347 -0
  54. ebk/services/queue_service.py +253 -0
  55. ebk/services/tag_service.py +281 -0
  56. ebk/services/text_extraction.py +317 -0
  57. ebk/services/view_service.py +12 -0
  58. ebk/similarity/__init__.py +77 -0
  59. ebk/similarity/base.py +154 -0
  60. ebk/similarity/core.py +471 -0
  61. ebk/similarity/extractors.py +168 -0
  62. ebk/similarity/metrics.py +376 -0
  63. ebk/skills/SKILL.md +182 -0
  64. ebk/skills/__init__.py +1 -0
  65. ebk/vfs/__init__.py +101 -0
  66. ebk/vfs/base.py +298 -0
  67. ebk/vfs/library_vfs.py +122 -0
  68. ebk/vfs/nodes/__init__.py +54 -0
  69. ebk/vfs/nodes/authors.py +196 -0
  70. ebk/vfs/nodes/books.py +480 -0
  71. ebk/vfs/nodes/files.py +155 -0
  72. ebk/vfs/nodes/metadata.py +385 -0
  73. ebk/vfs/nodes/root.py +100 -0
  74. ebk/vfs/nodes/similar.py +165 -0
  75. ebk/vfs/nodes/subjects.py +184 -0
  76. ebk/vfs/nodes/tags.py +371 -0
  77. ebk/vfs/resolver.py +228 -0
  78. ebk/vfs_router.py +275 -0
  79. ebk/views/__init__.py +32 -0
  80. ebk/views/dsl.py +668 -0
  81. ebk/views/service.py +619 -0
  82. ebk-0.4.4.dist-info/METADATA +755 -0
  83. ebk-0.4.4.dist-info/RECORD +87 -0
  84. ebk-0.4.4.dist-info/WHEEL +5 -0
  85. ebk-0.4.4.dist-info/entry_points.txt +2 -0
  86. ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
  87. ebk-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,341 @@
1
+ import os
2
+ import xmltodict
3
+ from typing import Dict, Optional
4
+ import pypdf
5
+ from ebooklib import epub
6
+
7
def _opf_text(node) -> Optional[str]:
    """
    Return the stripped text of a parsed OPF node, or None if it has none.

    Handles the shapes this module receives from xmltodict: a plain string,
    a dict carrying the text under "#text" (element with attributes), and a
    list (repeated element), for which the first non-empty text is used.
    """
    if isinstance(node, list):
        for item in node:
            text = _opf_text(item)
            if text:
                return text
        return None
    if isinstance(node, dict):
        node = node.get("#text")
    if isinstance(node, str):
        return node.strip() or None
    return None


def _opf_items(value) -> list:
    """Normalize a parsed OPF value to a list (a missing value becomes [])."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def extract_metadata_from_opf(opf_file: str) -> Dict:
    """
    Parse a Calibre OPF file into a simplified dictionary structure (Dublin Core).

    Returns an empty dict if the file cannot be read or parsed (the error is
    printed). Otherwise returns a dict with keys:
      - title, description, language, date, publisher, rights, source
        (stripped text, or None when absent)
      - creators: list of names, or None
      - subjects: list of subjects, or None
      - identifiers: dict mapping scheme -> value ("unknown" when the
        element has no opf:scheme attribute), or None
      - contributors: list of {"name", "role", "file_as"} dicts, or None
      - series / series_index: taken from the calibre:series and
        calibre:series_index <meta> tags, or None
    """
    try:
        with open(opf_file, "r", encoding="utf-8") as f:
            opf_dict = xmltodict.parse(f.read(), process_namespaces=False)
    except Exception as e:
        # Best effort: an unreadable or malformed OPF must not abort an import.
        print(f"[extract_metadata_from_opf] Error reading '{opf_file}': {e}")
        return {}

    package = opf_dict.get("package") or {}
    metadata = package.get("metadata") if isinstance(package, dict) else None
    if not isinstance(metadata, dict):
        # An empty or text-only <metadata> element does not parse to a dict.
        metadata = {}

    def dc(name):
        # Prefer the namespaced Dublin Core tag, fall back to the bare tag.
        return metadata.get(f"dc:{name}", metadata.get(name))

    simplified = {
        "title": _opf_text(dc("title")),
        "creators": None,
        "contributors": None,
        "subjects": None,
        "description": _opf_text(dc("description")),
        "language": _opf_text(dc("language")),
        "date": _opf_text(dc("date")),
        "publisher": _opf_text(dc("publisher")),
        "identifiers": None,
        "rights": _opf_text(dc("rights")),
        "source": _opf_text(dc("source")),
        "series": None,
        "series_index": None,
    }

    # -- Creators
    creator_names = [
        name for name in map(_opf_text, _opf_items(dc("creator"))) if name
    ]
    if creator_names:
        simplified["creators"] = creator_names

    # -- Subjects (items may be plain strings or dicts when they carry attributes)
    subject_names = [
        name for name in map(_opf_text, _opf_items(dc("subject"))) if name
    ]
    if subject_names:
        simplified["subjects"] = subject_names

    # -- Identifiers: a single element, a list of elements and plain strings
    #    are all handled the same way.
    identifiers = {}
    for ident in _opf_items(dc("identifier")):
        text = _opf_text(ident)
        if not text:
            continue
        scheme = "unknown"
        if isinstance(ident, dict):
            scheme = ident.get("@opf:scheme", "unknown")
        identifiers[scheme] = text
    if identifiers:
        simplified["identifiers"] = identifiers

    # -- Contributors (editors, translators, etc)
    contributors_raw = dc("contributor")
    if contributors_raw:
        contributors = []
        for contrib in _opf_items(contributors_raw):
            name = _opf_text(contrib)
            if not name:
                continue
            if isinstance(contrib, dict):
                role = contrib.get("@opf:role", "contributor")
                file_as = contrib.get("@opf:file-as", "")
            else:
                role, file_as = "contributor", ""
            contributors.append({
                "name": name,
                "role": role,
                "file_as": file_as,
            })
        simplified["contributors"] = contributors

    # -- Calibre-specific metadata (series, etc), stored in <meta name=... content=...>
    for meta in _opf_items(metadata.get("meta")):
        if not isinstance(meta, dict):
            continue
        meta_name = meta.get("@name", "")
        meta_content = meta.get("@content", "")

        if meta_name == "calibre:series" and meta_content:
            simplified["series"] = meta_content
        elif meta_name == "calibre:series_index" and meta_content:
            try:
                simplified["series_index"] = float(meta_content)
            except (ValueError, TypeError):
                # Leave series_index as None when the value is not numeric.
                pass

    return simplified
135
+
136
+
137
def _pdf_text(value) -> Optional[str]:
    """
    Coerce a PDF info value to a stripped string.

    Returns None for missing, empty, or non-textual values. Byte strings are
    decoded as UTF-8 with replacement so later str operations cannot fail.
    """
    if isinstance(value, bytes):
        value = value.decode("utf-8", errors="replace")
    if isinstance(value, str):
        return value.strip() or None
    return None


def _pdf_date_to_iso(raw: str) -> Optional[str]:
    """
    Convert a PDF date ('D:YYYYMMDDhhmmss...', 'D:' optional) to 'YYYY-MM-DD'.

    Returns None when the first eight characters after the optional prefix
    are not all digits.
    """
    digits = raw[2:10] if raw.startswith("D:") else raw[:8]
    if len(digits) == 8 and all(ch in "0123456789" for ch in digits):
        return f"{digits[:4]}-{digits[4:6]}-{digits[6:]}"
    return None


def extract_metadata_from_pdf(pdf_path: str) -> Dict:
    """
    Extract metadata from a PDF file using pypdf.

    Returns a dictionary with keys title, creators, subjects, description,
    language, date, publisher, identifiers, keywords and creator_application
    (all None by default). When the file is read successfully, "language" is
    set to the "unknown-language" placeholder, "identifiers" records the file
    path under "pdf:identifier", and a "file_paths" key is added.

    Any error while reading is printed and the values collected so far are
    returned.
    """
    metadata = {
        "title": None,
        "creators": None,
        "subjects": None,
        "description": None,
        "language": None,
        "date": None,
        "publisher": None,
        "identifiers": None,
        "keywords": None,
        "creator_application": None,
    }

    try:
        with open(pdf_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            info = reader.metadata or {}

            def field(pdf_key, alt_key):
                # NOTE: Depending on pypdf version, metadata keys can differ
                # e.g. '/Title' vs 'title'
                return _pdf_text(info.get(pdf_key, None) or info.get(alt_key, None))

            pdf_title = field("/Title", "title")
            pdf_author = field("/Author", "author")
            pdf_subject = field("/Subject", "subject")
            pdf_keywords = field("/Keywords", "keywords")
            pdf_creator = field("/Creator", "creator")  # Application used
            pdf_producer = field("/Producer", "producer")
            pdf_publisher = field("/Publisher", "publisher")
            pdf_creation_date = _pdf_text(info.get("/CreationDate", None))

        if pdf_title:
            metadata["title"] = pdf_title
        if pdf_author:
            metadata["creators"] = [pdf_author]
        if pdf_subject:
            # The subject doubles as a comma-separated subject list and as
            # the description.
            metadata["subjects"] = [
                sub.strip() for sub in pdf_subject.split(",") if sub.strip()
            ]
            metadata["description"] = pdf_subject

        if pdf_creation_date:
            metadata["date"] = _pdf_date_to_iso(pdf_creation_date)

        # Language not typically stored in PDF metadata
        metadata["language"] = "unknown-language"

        # For an "identifier," we don't really have a built-in PDF field,
        # so the path is used
        metadata["identifiers"] = {"pdf:identifier": pdf_path}

        if pdf_keywords:
            metadata["keywords"] = [
                kw.strip() for kw in pdf_keywords.split(",") if kw.strip()
            ]

        # Creator is the application that created the PDF (e.g., LaTeX, Word)
        if pdf_creator:
            metadata["creator_application"] = pdf_creator

        # Publisher: prefer explicit Publisher field, fallback to Producer
        if pdf_publisher:
            metadata["publisher"] = pdf_publisher
        elif pdf_producer and not pdf_creator:
            # Only use producer as publisher if there's no creator app
            metadata["publisher"] = pdf_producer

        metadata["file_paths"] = [pdf_path]

    except Exception as e:
        # Best effort: a broken PDF must not abort the caller.
        print(f"[extract_metadata_from_pdf] Error reading '{pdf_path}': {e}")

    return metadata
212
+
213
+
214
def extract_metadata_from_epub(epub_path: str) -> Dict:
    """
    Extract metadata from an EPUB file using ebooklib.

    Returns a dictionary with keys title, creators, subjects, description,
    language, date, publisher and identifiers. Scalar fields default to None,
    creators/subjects to [] and identifiers to {}.

    Any error while reading is printed and the values collected so far are
    returned.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": None,
        "language": None,
        "date": None,
        "publisher": None,
        "identifiers": {},
    }

    try:
        book = epub.read_epub(epub_path)

        def dc_values(name):
            # Each entry is a (value, attributes) tuple; keep non-empty values.
            return [
                entry[0]
                for entry in book.get_metadata("DC", name)
                if entry and entry[0]
            ]

        # Single-valued fields: use the first non-empty entry.
        for key in ("title", "description", "language", "date", "publisher"):
            found = dc_values(key)
            if found:
                metadata[key] = found[0]

        # Multi-valued fields
        metadata["creators"] = dc_values("creator")
        metadata["subjects"] = dc_values("subject")

        # Identifiers
        for ident_value, ident_attrs in book.get_metadata("DC", "identifier"):
            if not ident_value:
                continue
            scheme = "unknown"
            for attr_name, attr_value in (ident_attrs or {}).items():
                # Namespaced attributes are expected as "{namespace-uri}scheme",
                # so compare only the local part of the attribute name.
                if attr_name.rsplit("}", 1)[-1] == "scheme" and attr_value:
                    scheme = attr_value
                    break
            metadata["identifiers"][scheme] = ident_value
    except Exception as e:
        # Best effort: a broken EPUB must not abort the caller.
        print(f"[extract_metadata_from_epub] Error reading '{epub_path}': {e}")

    return metadata
274
+
275
+
276
def extract_metadata_from_path(file_path: str) -> Dict:
    """
    Fallback metadata extraction by interpreting the path as
    <...>/<author>/<title>/<file>.

    The title is the name of the directory containing the file and the
    creators are the comma-separated names in the directory above it. Both
    are counted from the end of the path, so absolute and arbitrarily deep
    paths work. Components that the path is too short to provide keep their
    defaults (title None, creators []).

    The remaining keys are placeholders: subjects [], description "",
    language "unknown-language", date "unknown-date", identifiers {}.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": "",
        "language": "unknown-language",
        "date": "unknown-date",
        "identifiers": {}
    }

    try:
        # path_parts: [..., 'author_dir', 'title', 'title - author.ext']
        path_parts = os.path.normpath(file_path).split(os.sep)
        if len(path_parts) >= 2 and path_parts[-2]:
            metadata["title"] = path_parts[-2]
        if len(path_parts) >= 3:
            metadata["creators"] = [
                name.strip()
                for name in path_parts[-3].split(",")
                if name.strip()
            ]
    except Exception as e:
        print(f"[extract_metadata_from_path] Error with '{file_path}': {e}")

    return metadata
302
+
303
def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
    """
    High-level function to extract metadata from:
      - the OPF file (if provided and present on disk)
      - the ebook_file itself (PDF or EPUB)
      - the ebook_file path (fallback)
    and merge them. For each key the first non-empty value wins, in the
    priority order OPF > ebook file > path.

    The result contains every key produced by any of the three sources. The
    path-based fallback always supplies:
      - title
      - creators
      - subjects
      - description
      - language
      - date
      - identifiers
    Further keys (e.g. publisher, contributors, series, keywords,
    file_paths) are present only when the OPF or ebook extractor provides them.
    """
    # 1. Extract from OPF if we have it
    opf_metadata = {}
    if opf_file and os.path.isfile(opf_file):
        opf_metadata = extract_metadata_from_opf(opf_file)

    # 2. Extract from the ebook file itself, based on its extension
    ebook_metadata = {}
    _, ext = os.path.splitext(ebook_file.lower())
    if ext == ".pdf":
        ebook_metadata = extract_metadata_from_pdf(ebook_file)
    elif ext == ".epub":
        ebook_metadata = extract_metadata_from_epub(ebook_file)

    # 3. Path-based fallback
    path_metadata = extract_metadata_from_path(ebook_file)

    # 4. Merge from lowest to highest priority. A later source only replaces
    #    an existing value when its own value is non-empty, so empty OPF or
    #    ebook fields never hide a usable fallback.
    merged = {}
    for source in (path_metadata, ebook_metadata, opf_metadata):
        for key, value in source.items():
            if value or key not in merged:
                merged[key] = value
    return merged
341
+
ebk/ident.py ADDED
@@ -0,0 +1,89 @@
1
+ import hashlib
2
+ import re
3
+ from typing import List, Dict
4
+ import uuid
5
+
6
def canonicalize_text(text: str) -> str:
    """
    Reduce text to a canonical token: lowercase it, drop every character
    that is neither a word character nor whitespace, collapse whitespace
    runs, trim the ends, and join the remaining words with underscores.
    """
    # Lowercase first, then strip punctuation.
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # Collapse whitespace runs to one space and trim both ends.
    collapsed = re.sub(r'\s+', ' ', without_punctuation).strip()
    # Words are separated by underscores in the canonical form.
    return collapsed.replace(' ', '_')
21
+
22
def canonicalize_creators(creators: List[str]) -> str:
    """
    Build one canonical string for a list of creators (authors).

    The names are sorted as given, each is passed through
    canonicalize_text, and the results are joined with underscores.

    NOTE: sorting happens before canonicalization, so the order is based on
    the raw names and is therefore sensitive to case and punctuation.
    """
    ordered = list(creators)
    ordered.sort()
    return '_'.join(map(canonicalize_text, ordered))
32
+
33
def generate_composite_string(entry: Dict) -> str:
    """
    Generate a composite string by concatenating the canonicalized values
    of language, creators, and title, in that order, separated by double
    underscores.

    Missing, None or empty fields are replaced by the placeholders
    'no_language', 'no_creators' and 'no_title'. A single creator given as
    a plain string is treated as a one-element list.

    Returns:
        The composite string, or None when all three fields are missing.
    """
    # `or` (rather than a .get default) also covers keys that are present
    # with a None value, which the metadata extractors can produce.
    language = (entry.get('language') or '').strip()
    title = (entry.get('title') or '').strip()
    creators = entry.get('creators') or []
    if isinstance(creators, str):
        # Avoid sorting/joining the individual characters of a lone name.
        creators = [creators]

    # Canonicalize each field
    language_c = canonicalize_text(language) if language else 'no_language'
    creators_c = canonicalize_creators(creators) if creators else 'no_creators'
    title_c = canonicalize_text(title) if title else 'no_title'

    if language_c == 'no_language' and creators_c == 'no_creators' and title_c == 'no_title':
        # Nothing to identify the entry by; let the caller pick a fallback.
        return None

    # Concatenate fields with double underscores as delimiters
    return f"{language_c}__{creators_c}__{title_c}"
55
+
56
def generate_hash_id(entry: Dict) -> str:
    """
    Compute a SHA-256 based ID for an eBook entry.

    The hash is taken over the entry's composite string. When no composite
    string is available, a freshly generated random UUID is hashed instead.

    Args:
        entry (Dict): The eBook entry metadata.

    Returns:
        str: The SHA-256 hash hexadecimal string.
    """
    # Fall back to a random UUID when the entry yields no composite string.
    source = generate_composite_string(entry) or str(uuid.uuid4())
    return hashlib.sha256(source.encode('utf-8')).hexdigest()
76
+
77
def add_unique_id(entry: Dict) -> Dict:
    """
    Store a hash ID on the eBook entry under the 'unique_id' key.

    The entry is modified in place and also returned.

    Args:
        entry (Dict): The original eBook entry metadata.

    Returns:
        Dict: The same entry, now carrying a 'unique_id' field.
    """
    entry['unique_id'] = generate_hash_id(entry)
    return entry