ebk-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +450 -0
- ebk/ai/llm_providers/__init__.py +26 -0
- ebk/ai/llm_providers/anthropic.py +209 -0
- ebk/ai/llm_providers/base.py +295 -0
- ebk/ai/llm_providers/gemini.py +285 -0
- ebk/ai/llm_providers/ollama.py +294 -0
- ebk/ai/metadata_enrichment.py +394 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +433 -0
- ebk/ai/text_extractor.py +393 -0
- ebk/calibre_import.py +66 -0
- ebk/cli.py +6433 -0
- ebk/config.py +230 -0
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +507 -0
- ebk/db/models.py +725 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +1 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/echo_export.py +279 -0
- ebk/exports/html_library.py +1743 -0
- ebk/exports/html_utils.py +87 -0
- ebk/exports/hugo.py +59 -0
- ebk/exports/jinja_export.py +286 -0
- ebk/exports/multi_facet_export.py +159 -0
- ebk/exports/opds_export.py +232 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/exports/zip.py +25 -0
- ebk/extract_metadata.py +341 -0
- ebk/ident.py +89 -0
- ebk/library_db.py +1440 -0
- ebk/opds.py +748 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +442 -0
- ebk/plugins/registry.py +499 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +173 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +3608 -0
- ebk/services/__init__.py +28 -0
- ebk/services/annotation_extraction.py +351 -0
- ebk/services/annotation_service.py +380 -0
- ebk/services/export_service.py +577 -0
- ebk/services/import_service.py +447 -0
- ebk/services/personal_metadata_service.py +347 -0
- ebk/services/queue_service.py +253 -0
- ebk/services/tag_service.py +281 -0
- ebk/services/text_extraction.py +317 -0
- ebk/services/view_service.py +12 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +471 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/skills/SKILL.md +182 -0
- ebk/skills/__init__.py +1 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +298 -0
- ebk/vfs/library_vfs.py +122 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk/vfs_router.py +275 -0
- ebk/views/__init__.py +32 -0
- ebk/views/dsl.py +668 -0
- ebk/views/service.py +619 -0
- ebk-0.4.4.dist-info/METADATA +755 -0
- ebk-0.4.4.dist-info/RECORD +87 -0
- ebk-0.4.4.dist-info/WHEEL +5 -0
- ebk-0.4.4.dist-info/entry_points.txt +2 -0
- ebk-0.4.4.dist-info/licenses/LICENSE +21 -0
- ebk-0.4.4.dist-info/top_level.txt +1 -0
ebk/extract_metadata.py
ADDED
@@ -0,0 +1,341 @@
import os
import xmltodict
from typing import Dict, Optional
import pypdf
from ebooklib import epub


def extract_metadata_from_opf(opf_file: str) -> Dict:
    """
    Parse a Calibre OPF file into a simplified dictionary structure (Dublin Core).
    Returns a dict with keys:
    - title
    - creators
    - subjects
    - description
    - language
    - date
    - identifiers
    - publisher
    """
    try:
        with open(opf_file, "r", encoding="utf-8") as f:
            opf_dict = xmltodict.parse(f.read(), process_namespaces=False)
    except Exception as e:
        print(f"[extract_metadata_from_opf] Error reading '{opf_file}': {e}")
        return {}

    package = opf_dict.get("package", {})
    metadata = package.get("metadata", {})

    # Prepare simplified structure
    simplified = {
        "title": metadata.get("dc:title", metadata.get("title")),
        "creators": None,
        "contributors": None,
        "subjects": None,
        "description": metadata.get("dc:description", metadata.get("description")),
        "language": metadata.get("dc:language", metadata.get("language")),
        "date": metadata.get("dc:date", metadata.get("date")),
        "publisher": metadata.get("dc:publisher", metadata.get("publisher")),
        "identifiers": None,
        "rights": metadata.get("dc:rights", metadata.get("rights")),
        "source": metadata.get("dc:source", metadata.get("source")),
        "series": None,
        "series_index": None
    }

    # -- Creators
    creators = metadata.get("dc:creator", metadata.get("creator"))
    if isinstance(creators, list):
        simplified["creators"] = [
            c.get("#text", "").strip() if isinstance(c, dict) else c
            for c in creators
        ]
    elif isinstance(creators, dict):
        simplified["creators"] = [creators.get("#text", "").strip()]
    elif isinstance(creators, str):
        simplified["creators"] = [creators.strip()]

    # -- Subjects
    subjects = metadata.get("dc:subject", metadata.get("subject"))
    if isinstance(subjects, list):
        simplified["subjects"] = [s.strip() for s in subjects]
    elif isinstance(subjects, str):
        simplified["subjects"] = [subjects.strip()]

    # -- Identifiers
    identifiers = metadata.get("dc:identifier", metadata.get("identifier"))
    if isinstance(identifiers, list):
        simplified["identifiers"] = {}
        for identifier in identifiers:
            if isinstance(identifier, dict):
                scheme = identifier.get("@opf:scheme", "unknown")
                text = identifier.get("#text", "").strip()
                simplified["identifiers"][scheme] = text
            else:
                simplified["identifiers"]["unknown"] = identifier
    elif isinstance(identifiers, dict):
        scheme = identifiers.get("@opf:scheme", "unknown")
        text = identifiers.get("#text", "").strip()
        # simplified["identifiers"] is still None on this branch, so create the dict here
        simplified["identifiers"] = {scheme: text}
    # -- Contributors (editors, translators, etc)
    contributors_raw = metadata.get("dc:contributor", metadata.get("contributor"))
    if contributors_raw:
        simplified["contributors"] = []
        if isinstance(contributors_raw, list):
            for contrib in contributors_raw:
                if isinstance(contrib, dict):
                    name = contrib.get("#text", "").strip()
                    role = contrib.get("@opf:role", "contributor")
                    file_as = contrib.get("@opf:file-as", "")
                    if name:
                        simplified["contributors"].append({
                            "name": name,
                            "role": role,
                            "file_as": file_as
                        })
                elif isinstance(contrib, str):
                    simplified["contributors"].append({
                        "name": contrib.strip(),
                        "role": "contributor",
                        "file_as": ""
                    })
        elif isinstance(contributors_raw, dict):
            name = contributors_raw.get("#text", "").strip()
            role = contributors_raw.get("@opf:role", "contributor")
            file_as = contributors_raw.get("@opf:file-as", "")
            if name:
                simplified["contributors"] = [{
                    "name": name,
                    "role": role,
                    "file_as": file_as
                }]

    # -- Calibre-specific metadata (series, etc)
    # Look for meta tags with name attributes
    meta_tags = metadata.get("meta", [])
    if not isinstance(meta_tags, list):
        meta_tags = [meta_tags] if meta_tags else []

    for meta in meta_tags:
        if isinstance(meta, dict):
            meta_name = meta.get("@name", "")
            meta_content = meta.get("@content", "")

            if meta_name == "calibre:series" and meta_content:
                simplified["series"] = meta_content
            elif meta_name == "calibre:series_index" and meta_content:
                try:
                    simplified["series_index"] = float(meta_content)
                except (ValueError, TypeError):
                    pass

    return simplified


def extract_metadata_from_pdf(pdf_path: str) -> Dict:
    """
    Extract metadata from a PDF file using pypdf.
    Returns a dictionary with the same keys as the OPF-based dict.
    """

    metadata = {
        "title": None,
        "creators": None,
        "subjects": None,
        "description": None,
        "language": None,
        "date": None,
        "publisher": None,
        "identifiers": None,
        "keywords": None,
        "creator_application": None,
    }

    try:
        with open(pdf_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            info = reader.metadata or {}

            # NOTE: Depending on pypdf version, metadata keys can differ
            # e.g. info.title vs info.get('/Title')
            pdf_title = info.get("/Title", None) or info.get("title", None)
            pdf_author = info.get("/Author", None) or info.get("author", None)
            pdf_subject = info.get("/Subject", None) or info.get("subject", None)
            pdf_keywords = info.get("/Keywords", None) or info.get("keywords", None)
            pdf_creator = info.get("/Creator", None) or info.get("creator", None)  # Application used
            pdf_producer = info.get("/Producer", None) or info.get("producer", None)
            pdf_publisher = info.get("/Publisher", None) or info.get("publisher", None)
            pdf_creation_date = info.get("/CreationDate", None)

            if pdf_title:
                metadata["title"] = pdf_title.strip()
            if pdf_author:
                metadata["creators"] = [pdf_author.strip()]
            if pdf_subject:
                metadata["subjects"] = [sub.strip() for sub in pdf_subject.split(",")]
                metadata["description"] = pdf_subject.strip()

            if pdf_creation_date and len(pdf_creation_date) >= 10:
                # Format: 'D:YYYYMMDDhhmmss'
                # We'll extract 'YYYY-MM-DD'
                date_str = pdf_creation_date[2:10]  # e.g., 20210101
                metadata["date"] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
            # Language not typically stored in PDF metadata
            metadata["language"] = "unknown-language"

            # For an "identifier," we don't really have a built-in PDF field, so it's optional
            metadata["identifiers"] = {"pdf:identifier": pdf_path}

            if pdf_keywords:
                metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",") if kw.strip()]

            # Creator is the application that created the PDF (e.g., LaTeX, Word)
            if pdf_creator:
                metadata["creator_application"] = pdf_creator.strip()

            # Publisher: prefer explicit Publisher field, fallback to Producer
            if pdf_publisher:
                metadata["publisher"] = pdf_publisher.strip()
            elif pdf_producer and not pdf_creator:
                # Only use producer as publisher if there's no creator app
                metadata["publisher"] = pdf_producer.strip()

            metadata["file_paths"] = [pdf_path]

    except Exception as e:
        print(f"[extract_metadata_from_pdf] Error reading '{pdf_path}': {e}")

    return metadata


def extract_metadata_from_epub(epub_path: str) -> Dict:
    """
    Extract metadata from an EPUB file using ebooklib.
    Returns a dictionary with the same keys as the OPF-based dict.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": None,
        "language": None,
        "date": None,
        "identifiers": {},
    }

    try:
        book = epub.read_epub(epub_path)

        # Title
        dc_title = book.get_metadata("DC", "title")
        if dc_title:
            metadata["title"] = dc_title[0][0]

        # Creators
        dc_creators = book.get_metadata("DC", "creator")
        if dc_creators:
            metadata["creators"] = [c[0] for c in dc_creators]

        # Subjects
        dc_subjects = book.get_metadata("DC", "subject")
        if dc_subjects:
            metadata["subjects"] = [s[0] for s in dc_subjects]

        # Description
        dc_description = book.get_metadata("DC", "description")
        if dc_description:
            metadata["description"] = dc_description[0][0]

        # Language
        dc_language = book.get_metadata("DC", "language")
        if dc_language:
            metadata["language"] = dc_language[0][0]

        # Date
        dc_date = book.get_metadata("DC", "date")
        if dc_date:
            metadata["date"] = dc_date[0][0]

        # Identifiers
        identifiers = book.get_metadata("DC", "identifier")
        if identifiers:
            for identifier in identifiers:
                # identifier is a tuple: (value, { 'scheme': '...' })
                ident_value, ident_attrs = identifier
                scheme = ident_attrs.get("scheme", "unknown")
                metadata["identifiers"][scheme] = ident_value
    except Exception as e:
        print(f"[extract_metadata_from_epub] Error reading '{epub_path}': {e}")

    return metadata


def extract_metadata_from_path(file_path: str) -> Dict:
    """
    Fallback metadata extraction by interpreting the path as <...>/<author>/<title>.
    Slugify them to remove invalid characters.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": "",
        "language": "unknown-language",
        "date": "unknown-date",
        "identifiers": {}
    }

    try:
        path_parts = file_path.split(os.sep)
        # path_parts: ['base_dir', 'author_dir', 'title', 'title - author.ext']
        title = path_parts[-2]
        creators = path_parts[1].split(",")
        metadata["title"] = title
        metadata["creators"] = creators
    except Exception as e:
        print(f"[extract_metadata_from_path] Error with '{file_path}': {e}")

    return metadata


def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
    """
    High-level function to extract metadata from either:
    - OPF file (if provided)
    - The ebook_file (PDF, EPUB, or fallback from path)
    Then merges them, giving priority to OPF data.

    Returns a final merged dictionary with keys:
    - title
    - creators
    - subjects
    - description
    - language
    - date
    - identifiers
    - cover_path
    - file_paths
    - virtual_libs
    - unique_id
    """

    # 1. Extract from OPF if we have it
    opf_metadata = {}
    if opf_file and os.path.isfile(opf_file):
        opf_metadata = extract_metadata_from_opf(opf_file)

    ebook_metadata = {}
    _, ext = os.path.splitext(ebook_file.lower())
    if ext == ".pdf":
        ebook_metadata = extract_metadata_from_pdf(ebook_file)
    elif ext == ".epub":
        ebook_metadata = extract_metadata_from_epub(ebook_file)

    path_metadata = extract_metadata_from_path(ebook_file)

    # Merge with priority: OPF metadata > format-specific (PDF/EPUB) metadata > path fallback.
    # Build up from the weakest source so stronger sources override and no keys are dropped.
    metadata = dict(path_metadata)
    metadata.update({key: value for key, value in ebook_metadata.items() if value})
    metadata.update({key: value for key, value in opf_metadata.items() if value})
    return metadata
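For orientation, a minimal usage sketch of the module above (not part of the published wheel). The library layout and the metadata.opf sidecar name are hypothetical; extract_metadata and its opf_file parameter come from the code as shown:

    from ebk.extract_metadata import extract_metadata

    # Hypothetical Calibre-style layout: <library>/<author>/<title>/<title - author>.pdf
    pdf_path = "library/Jane Doe/Example Book/Example Book - Jane Doe.pdf"
    opf_path = "library/Jane Doe/Example Book/metadata.opf"  # optional OPF sidecar

    meta = extract_metadata(pdf_path, opf_file=opf_path)
    print(meta.get("title"), meta.get("creators"), meta.get("identifiers"))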
ebk/ident.py
ADDED
@@ -0,0 +1,89 @@
import hashlib
import re
from typing import List, Dict
import uuid

def canonicalize_text(text: str) -> str:
    """
    Canonicalize text by converting to lowercase, removing punctuation,
    stripping whitespace, and replacing spaces with underscores.
    """
    text = text.lower()
    # Remove punctuation using regex
    text = re.sub(r'[^\w\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    text = text.strip()
    # Replace spaces with underscores
    text = text.replace(' ', '_')
    return text

def canonicalize_creators(creators: List[str]) -> str:
    """
    Canonicalize a list of creators (authors) by sorting them,
    canonicalizing each name, and joining with underscores.
    """
    # Sort creators alphabetically for consistency
    sorted_creators = sorted(creators)
    canonical_creators = [canonicalize_text(creator) for creator in sorted_creators]
    # Join multiple creators with underscores
    return '_'.join(canonical_creators)

def generate_composite_string(entry: Dict) -> str:
    """
    Generate a composite string by concatenating canonicalized values
    of language, creators, and title.

    The order is important for consistency.
    """
    language = entry.get('language', '').strip()
    creators = entry.get('creators', [])
    title = entry.get('title', '').strip()

    # Canonicalize each field
    language_c = canonicalize_text(language) if language else 'no_language'
    creators_c = canonicalize_creators(creators) if creators else 'no_creators'
    title_c = canonicalize_text(title) if title else 'no_title'

    if language_c == 'no_language' and creators_c == 'no_creators' and title_c == 'no_title':
        return None

    # Concatenate fields with double underscores as delimiters
    composite_string = f"{language_c}__{creators_c}__{title_c}"
    return composite_string

def generate_hash_id(entry: Dict) -> str:
    """
    Generate a unique hash ID for an eBook entry by hashing the composite string.

    Args:
        entry (Dict): The eBook entry metadata.

    Returns:
        str: The SHA-256 hash hexadecimal string.
    """
    composite_string = generate_composite_string(entry)
    if composite_string:
        composite_bytes = composite_string.encode('utf-8')
    else:
        composite_bytes = str(uuid.uuid4()).encode('utf-8')

    # Create SHA-256 hash
    hash_obj = hashlib.sha256(composite_bytes)
    hash_hex = hash_obj.hexdigest()
    return hash_hex

def add_unique_id(entry: Dict) -> Dict:
    """
    Add a unique hash ID to the eBook entry.

    Args:
        entry (Dict): The original eBook entry metadata.

    Returns:
        Dict: The eBook entry with an added 'unique_id' field.
    """
    unique_id = generate_hash_id(entry)
    entry['unique_id'] = unique_id
    return entry
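A minimal sketch of how these helpers combine (again not part of the wheel; the sample entry is made up):

    from ebk.ident import add_unique_id, generate_composite_string

    entry = {"title": "An Example Book", "creators": ["Doe, Jane"], "language": "en"}
    print(generate_composite_string(entry))   # en__doe_jane__an_example_book
    entry = add_unique_id(entry)
    print(entry["unique_id"])                 # SHA-256 hex digest of the composite string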