ebk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic. Click here for more details.
- ebk/__init__.py +0 -0
- ebk/cli.py +879 -0
- ebk/config.py +35 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/hugo.py +55 -0
- ebk/exports/zip.py +25 -0
- ebk/extract_metadata.py +273 -0
- ebk/ident.py +96 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +144 -0
- ebk/imports/ebooks.py +116 -0
- ebk/llm.py +58 -0
- ebk/manager.py +44 -0
- ebk/merge.py +308 -0
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +185 -0
- ebk/streamlit/display.py +168 -0
- ebk/streamlit/filters.py +151 -0
- ebk/streamlit/utils.py +58 -0
- ebk/utils.py +311 -0
- ebk-0.1.0.dist-info/METADATA +457 -0
- ebk-0.1.0.dist-info/RECORD +29 -0
- ebk-0.1.0.dist-info/WHEEL +5 -0
- ebk-0.1.0.dist-info/entry_points.txt +2 -0
- ebk-0.1.0.dist-info/top_level.txt +1 -0
ebk/config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import configparser
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
def load_ebkrc_config():
    """
    Load configuration from ~/.ebkrc.

    If using the LLM interface, expects a section [llm] with at least
    'endpoint' and 'api_key'.  If using the cloud interface (for generating
    complex networks), the section [cloud] may be used to specify various
    parameters.

    Returns:
        tuple: (endpoint, api_key, model) read from the [llm] section.
        'model' defaults to "gpt-3.5-turbo" when absent.

    Raises:
        FileNotFoundError: If ~/.ebkrc does not exist.
        ValueError: If the [llm] section or a required key is missing/empty.
    """
    config_path = os.path.expanduser("~/.ebkrc")
    parser = configparser.ConfigParser()

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Could not find config file at {config_path}")

    parser.read(config_path)

    if "llm" not in parser:
        # Bug fix: the messages previously referred to "~/.btkrc", but the
        # file actually read is ~/.ebkrc.
        raise ValueError(
            "Config file ~/.ebkrc is missing the [llm] section. "
            "Please add it with 'endpoint' and 'api_key' keys."
        )

    endpoint = parser["llm"].get("endpoint", "")
    api_key = parser["llm"].get("api_key", "")
    model = parser["llm"].get("model", "gpt-3.5-turbo")

    if not endpoint or not api_key or not model:
        raise ValueError(
            "Please make sure your [llm] section in ~/.ebkrc "
            "includes 'endpoint', 'api_key', and 'model' keys."
        )

    return endpoint, api_key, model
|
ebk/exports/__init__.py
ADDED
|
File without changes
|
ebk/exports/hugo.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
def export_hugo(lib_dir, hugo_dir):
    """
    Export an ebk library to Hugo-compatible Markdown files.

    Reads `metadata.json` from the library, writes one front-matter Markdown
    page per book under `<hugo_dir>/content/library`, and copies the ebook
    files and cover images into `<hugo_dir>/static/ebooks`.

    Args:
        lib_dir (str): Path to the ebk library directory to export (contains
            `metadata.json` and ebook-related files).
        hugo_dir (str): Path to the Hugo site directory.
    """
    import shutil  # local import: replaces the shell-based copies below

    lib_dir = Path(lib_dir)
    with open(lib_dir / "metadata.json", "r") as f:
        books = json.load(f)

    hugo_dir = Path(hugo_dir)

    content_dir = hugo_dir / "content" / "library"
    static_dir = hugo_dir / "static" / "ebooks"
    content_dir.mkdir(parents=True, exist_ok=True)
    static_dir.mkdir(parents=True, exist_ok=True)

    for book in books:
        # Bug fix: the original mixed nonexistent keys ('Tags', 'Title',
        # 'Author', 'File Path', 'Cover Path', 'file_path') with the
        # lowercase schema the importers actually emit (see
        # ebk.imports.calibre): title, creators, subjects, description,
        # date, file_paths (list, relative to lib_dir), cover_path.
        title = book.get("title") or "Untitled"
        creators = book.get("creators") or []
        subjects = book.get("subjects") or []
        file_paths = book.get("file_paths") or []
        cover_path = book.get("cover_path")
        primary_file = file_paths[0] if file_paths else None

        slug = title.replace(" ", "-").lower()
        md_file = content_dir / f"{slug}.md"

        with open(md_file, "w") as md:
            md.write("---\n")
            md.write(f"title: {title}\n")
            md.write(f"creators: [{', '.join(creators)}]\n")
            md.write(f"subjects: [{', '.join(subjects)}]\n")
            md.write(f"description: {book.get('description') or ''}\n")
            md.write(f"date: {book.get('date') or ''}\n")
            md.write(f"ebook_file: /ebooks/{Path(primary_file).name if primary_file else ''}\n")
            md.write(f"cover_image: /ebooks/{Path(cover_path).name if cover_path else ''}\n")
            md.write("---\n\n")
            md.write(f"# {title}\n\n")
            md.write(f"Author: {', '.join(creators)}\n\n")
            if primary_file:
                md.write(f"[Download eBook](/ebooks/{Path(primary_file).name})\n")

        # Copy eBook(s) and cover to the static directory.
        # shutil.copy avoids the shell-injection risk of os.system("cp '...'")
        # with titles containing quotes; paths are resolved against lib_dir
        # because the importer stores them relative to the library root.
        for rel_path in file_paths:
            src = lib_dir / rel_path
            if src.exists():
                shutil.copy(src, static_dir)
        if cover_path and (lib_dir / cover_path).exists():
            shutil.copy(lib_dir / cover_path, static_dir)

    logger.debug(f"Exported {len(books)} books to Hugo site at '{hugo_dir}'")
|
ebk/exports/zip.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import zipfile
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
def export_zipfile(lib_dir, zip_file):
    """
    Export an ebk library to a ZIP archive.

    Recursively adds every file under the library directory to the archive,
    storing paths relative to the library root.

    Args:
        lib_dir (str): Path to the ebk library directory to export (contains
            `metadata.json` and ebook-related files).
        zip_file (str): Path to the output ZIP file.
    """
    lib_dir = Path(lib_dir)

    # Zip the entire directory tree as-is.
    with zipfile.ZipFile(zip_file, "w") as z:
        for root, _, files in os.walk(lib_dir):
            for file in files:
                file_path = Path(root) / file
                # Bug fix: use the module-level logger; the original called
                # the root logging.debug, bypassing this module's logger.
                logger.debug(f"Adding file to zip: {file_path}")
                z.write(file_path, arcname=file_path.relative_to(lib_dir))
|
ebk/extract_metadata.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import xmltodict
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
from slugify import slugify
|
|
5
|
+
import PyPDF2
|
|
6
|
+
from ebooklib import epub
|
|
7
|
+
|
|
8
|
+
def extract_metadata_from_opf(opf_file: str) -> Dict:
    """
    Parse a Calibre OPF file into a simplified dictionary structure (Dublin Core).

    Returns a dict with keys:
        - title
        - creators
        - subjects
        - description
        - language
        - date
        - identifiers
        - publisher

    Returns an empty dict if the file cannot be read or parsed.
    """
    try:
        with open(opf_file, "r", encoding="utf-8") as f:
            opf_dict = xmltodict.parse(f.read(), process_namespaces=False)
    except Exception as e:
        print(f"[extract_metadata_from_opf] Error reading '{opf_file}': {e}")
        return {}

    package = opf_dict.get("package", {})
    metadata = package.get("metadata", {})

    # Prepare simplified structure; missing fields stay None.
    simplified = {
        "title": metadata.get("dc:title", metadata.get("title")),
        "creators": None,
        "subjects": None,
        "description": metadata.get("dc:description", metadata.get("description")),
        "language": metadata.get("dc:language", metadata.get("language")),
        "date": metadata.get("dc:date", metadata.get("date")),
        "publisher": metadata.get("dc:publisher", metadata.get("publisher")),
        "identifiers": None,
    }

    # -- Creators: xmltodict yields a list, a single dict, or a bare string.
    creators = metadata.get("dc:creator", metadata.get("creator"))
    if isinstance(creators, list):
        simplified["creators"] = [
            c.get("#text", "").strip() if isinstance(c, dict) else c
            for c in creators
        ]
    elif isinstance(creators, dict):
        simplified["creators"] = [creators.get("#text", "").strip()]
    elif isinstance(creators, str):
        simplified["creators"] = [creators.strip()]

    # -- Subjects
    subjects = metadata.get("dc:subject", metadata.get("subject"))
    if isinstance(subjects, list):
        simplified["subjects"] = [s.strip() for s in subjects]
    elif isinstance(subjects, str):
        simplified["subjects"] = [subjects.strip()]

    # -- Identifiers
    identifiers = metadata.get("dc:identifier", metadata.get("identifier"))
    if isinstance(identifiers, list):
        simplified["identifiers"] = {}
        for identifier in identifiers:
            if isinstance(identifier, dict):
                scheme = identifier.get("@opf:scheme", "unknown")
                text = identifier.get("#text", "").strip()
                simplified["identifiers"][scheme] = text
            else:
                simplified["identifiers"]["unknown"] = identifier
    elif isinstance(identifiers, dict):
        # Bug fix: the dict must be created before assigning into it — the
        # original indexed into simplified["identifiers"] while it was still
        # None, raising TypeError for any OPF with a single identifier.
        scheme = identifiers.get("@opf:scheme", "unknown")
        text = identifiers.get("#text", "").strip()
        simplified["identifiers"] = {scheme: text}
    elif isinstance(identifiers, str):
        # Bare-string identifier (no scheme attribute).
        simplified["identifiers"] = {"unknown": identifiers.strip()}

    return simplified
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def extract_metadata_from_pdf(pdf_path: str) -> Dict:
    """
    Extract metadata from a PDF file using PyPDF2.

    Returns a dictionary with the same keys as the OPF-based dict (plus
    'keywords' and 'file_paths'); fields that cannot be determined stay None.
    """
    metadata = {
        "title": None,
        "creators": None,
        "subjects": None,
        "description": None,
        "language": None,
        "date": None,
        "publisher": None,
        "identifiers": None,
        "keywords": None,
    }

    try:
        with open(pdf_path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            info = reader.metadata or {}

            # NOTE: metadata keys differ across PyPDF2 versions
            # (e.g. '/Title' vs 'title'), so probe both spellings.
            def _lookup(*keys):
                for key in keys:
                    value = info.get(key, None)
                    if value:
                        return value
                return None

            raw_title = _lookup("/Title", "title")
            raw_author = _lookup("/Author", "author")
            raw_subject = _lookup("/Subject", "subject")
            raw_keywords = _lookup("/Keywords", "keywords")
            raw_publisher = _lookup("/Producer", "producer",
                                    "/Publisher", "publisher")
            raw_creation = info.get("/CreationDate", None)

            if raw_title:
                metadata["title"] = raw_title.strip()
            if raw_author:
                metadata["creators"] = [raw_author.strip()]
            if raw_subject:
                metadata["subjects"] = [part.strip() for part in raw_subject.split(",")]
                metadata["description"] = raw_subject.strip()

            if raw_creation and len(raw_creation) >= 10:
                # PDF dates look like 'D:YYYYMMDDhhmmss'; keep 'YYYY-MM-DD'.
                digits = raw_creation[2:10]
                metadata["date"] = f"{digits[:4]}-{digits[4:6]}-{digits[6:]}"

            # Language is not typically stored in PDF metadata.
            metadata["language"] = "unknown-language"

            # PDFs have no built-in identifier field; fall back to the path.
            metadata["identifiers"] = {"pdf:identifier": pdf_path}

            if raw_keywords:
                metadata["keywords"] = [kw.strip() for kw in raw_keywords.split(",")]

            if raw_publisher:
                metadata["publisher"] = raw_publisher.strip()

            metadata["file_paths"] = [pdf_path]

    except Exception as e:
        print(f"[extract_metadata_from_pdf] Error reading '{pdf_path}': {e}")

    return metadata
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def extract_metadata_from_epub(epub_path: str) -> Dict:
    """
    Extract metadata from an EPUB file using ebooklib.

    Returns a dictionary with the same keys as the OPF-based dict; fields
    missing from the EPUB keep their defaults.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": None,
        "language": None,
        "date": None,
        "identifiers": {},
    }

    try:
        book = epub.read_epub(epub_path)

        # Single-valued Dublin Core fields: take the first entry's text.
        for field, dc_name in (
            ("title", "title"),
            ("description", "description"),
            ("language", "language"),
            ("date", "date"),
        ):
            values = book.get_metadata("DC", dc_name)
            if values:
                metadata[field] = values[0][0]

        # Multi-valued fields: keep every entry's text.
        creators = book.get_metadata("DC", "creator")
        if creators:
            metadata["creators"] = [c[0] for c in creators]

        subjects = book.get_metadata("DC", "subject")
        if subjects:
            metadata["subjects"] = [s[0] for s in subjects]

        # Identifiers arrive as (value, {'scheme': ...}) tuples.
        identifiers = book.get_metadata("DC", "identifier")
        if identifiers:
            for ident_value, ident_attrs in identifiers:
                scheme = ident_attrs.get("scheme", "unknown")
                metadata["identifiers"][scheme] = ident_value
    except Exception as e:
        print(f"[extract_metadata_from_epub] Error reading '{epub_path}': {e}")

    return metadata
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def extract_metadata_from_path(file_path: str) -> Dict:
    """
    Fallback metadata extraction by interpreting the path as
    <...>/<author>/<title>/<filename>.

    Args:
        file_path (str): Path of the ebook file.

    Returns:
        Dict: Metadata with best-effort 'title' and 'creators'; the other
        fields carry placeholder defaults.
    """
    metadata = {
        "title": None,
        "creators": [],
        "subjects": [],
        "description": "",
        "language": "unknown-language",
        "date": "unknown-date",
        "identifiers": {}
    }

    try:
        path_parts = file_path.split(os.sep)
        # Expected layout: <...>/<author_dir>/<title_dir>/<title - author.ext>
        if len(path_parts) >= 2:
            metadata["title"] = path_parts[-2]
        if len(path_parts) >= 3:
            # Bug fix: the author directory is the third component from the
            # end; the original indexed path_parts[1], which is only correct
            # for paths with exactly four relative components.
            metadata["creators"] = path_parts[-3].split(",")
    except Exception as e:
        print(f"[extract_metadata_from_path] Error with '{file_path}': {e}")

    return metadata
|
|
235
|
+
|
|
236
|
+
def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
    """
    High-level function to extract metadata from either:
      - OPF file (if provided)
      - The ebook_file (PDF, EPUB, or fallback from path)
    Then merges them, giving priority to OPF data, then format-specific
    data, then path-derived data.

    Returns a final merged dictionary with keys such as:
      - title, creators, subjects, description, language, date,
        identifiers, publisher, file_paths
    """
    # 1. Extract from OPF if we have it.
    opf_metadata = {}
    if opf_file and os.path.isfile(opf_file):
        opf_metadata = extract_metadata_from_opf(opf_file)

    # 2. Extract from the ebook file itself when the format is supported.
    # Bug fix: the original left ebook_metadata unbound (NameError) for any
    # extension other than .pdf/.epub.
    ebook_metadata = {}
    _, ext = os.path.splitext(ebook_file.lower())
    if ext == ".pdf":
        ebook_metadata = extract_metadata_from_pdf(ebook_file)
    elif ext == ".epub":
        ebook_metadata = extract_metadata_from_epub(ebook_file)

    # 3. Path-derived fallback values.
    path_metadata = extract_metadata_from_path(ebook_file)

    # Merge, preferring OPF > ebook-file > path for each key.  Iterating all
    # three sources also keeps OPF-only keys (e.g. 'publisher') that the
    # original's ebook-keyed merge silently dropped.
    metadata = dict(path_metadata)
    for source in (ebook_metadata, opf_metadata):
        for key, value in source.items():
            if value:
                metadata[key] = value
    return metadata
|
|
273
|
+
|
ebk/ident.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Dict
|
|
4
|
+
import uuid
|
|
5
|
+
|
|
6
|
+
def canonicalize_text(text: str) -> str:
    """
    Canonicalize text: lowercase it, drop punctuation, collapse runs of
    whitespace, trim the ends, and join the remaining words with underscores.
    """
    lowered = re.sub(r'[^\w\s]', '', text.lower())
    # str.split() both collapses internal whitespace and trims the ends, so
    # joining with '_' reproduces the strip/replace pipeline in one step.
    return '_'.join(lowered.split())
|
|
21
|
+
|
|
22
|
+
def canonicalize_creators(creators: List[str]) -> str:
    """
    Canonicalize a list of creators (authors): sort the names for a stable
    order, canonicalize each one, and join them with underscores.
    """
    # Sorting first makes the result independent of input order.
    return '_'.join(canonicalize_text(name) for name in sorted(creators))
|
|
32
|
+
|
|
33
|
+
def generate_composite_string(entry: Dict) -> str:
    """
    Build a composite identity string from the canonicalized language,
    creators, and title of an entry, joined with double underscores.

    The field order is fixed for consistency.  Returns None when none of
    the three fields is present (no stable identity can be formed).
    """
    language = entry.get('language', '').strip()
    creators = entry.get('creators', [])
    title = entry.get('title', '').strip()

    # Canonicalize each field, falling back to a placeholder when empty.
    parts = [
        canonicalize_text(language) if language else 'no_language',
        canonicalize_creators(creators) if creators else 'no_creators',
        canonicalize_text(title) if title else 'no_title',
    ]

    # All placeholders => nothing usable to identify the entry.
    if parts == ['no_language', 'no_creators', 'no_title']:
        return None

    return '__'.join(parts)
|
|
62
|
+
|
|
63
|
+
def generate_hash_id(entry: Dict) -> str:
    """
    Generate a unique hash ID for an eBook entry.

    Hashes the entry's composite identity string when one can be formed;
    otherwise hashes a fresh random UUID so the entry still gets an ID
    (just not a reproducible one).

    Args:
        entry (Dict): The eBook entry metadata.

    Returns:
        str: The SHA-256 hash hexadecimal string.
    """
    source = generate_composite_string(entry) or str(uuid.uuid4())
    return hashlib.sha256(source.encode('utf-8')).hexdigest()
|
|
83
|
+
|
|
84
|
+
def add_unique_id(entry: Dict) -> Dict:
    """
    Attach a unique hash ID to an eBook entry (mutates the entry in place).

    Args:
        entry (Dict): The original eBook entry metadata.

    Returns:
        Dict: The same entry with an added 'unique_id' field.
    """
    entry['unique_id'] = generate_hash_id(entry)
    return entry
|
ebk/imports/__init__.py
ADDED
|
File without changes
|
ebk/imports/calibre.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import json
|
|
4
|
+
from slugify import slugify
|
|
5
|
+
from typing import Dict
|
|
6
|
+
import logging
|
|
7
|
+
from ..extract_metadata import extract_metadata
|
|
8
|
+
from ..ident import add_unique_id
|
|
9
|
+
from ..utils import get_unique_filename
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# File extensions recognized as ebooks when scanning a Calibre library.
# Bug fix: ".pdb" was listed twice in the original tuple.
ebook_exts = (".pdf", ".epub", ".mobi", ".azw3", ".txt", ".docx", ".odt",
              ".html", ".rtf", ".md", ".fb2", ".cbz", ".cbr", ".djvu",
              ".xps", ".ibooks", ".azw", ".lit", ".pdb", ".prc", ".lrf",
              ".pml", ".rb", ".snb", ".tcr", ".txtz", ".azw1")
|
|
17
|
+
|
|
18
|
+
def import_calibre(calibre_dir: str,
                   output_dir: str,
                   ebook_exts: tuple = ebook_exts):
    """
    Import a Calibre library into an ebk library directory.

    Walks the Calibre directory tree; for each folder containing at least
    one recognized ebook file, extracts metadata (preferring metadata.opf
    when present), copies the ebook files and cover into `output_dir` under
    slugified names, assigns each entry a unique ID, and writes the
    collected entries to `<output_dir>/metadata.json`.

    Args:
        calibre_dir (str): Root of the Calibre library to import.
        output_dir (str): Destination ebk library directory (created if missing).
        ebook_exts (tuple): File extensions treated as ebooks.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    metadata_list = []

    for root, _, files in os.walk(calibre_dir):
        # Calibre writes a per-book metadata.opf alongside the ebook files.
        opf_file_path = os.path.join(root, "metadata.opf")

        # Gather valid ebook files.
        ebook_files = [f for f in files if f.lower().endswith(ebook_exts)]
        if not ebook_files:
            logger.debug(f"No recognized ebook files found in {root}. Skipping.")
            continue

        # Pick the "primary" ebook file. This is arbitrary and can be changed.
        primary_ebook_file = ebook_files[0]
        ebook_full_path = os.path.join(root, primary_ebook_file)

        # Extract metadata exactly once.  Bug fix: the original called
        # extract_metadata a second time unconditionally after this branch,
        # discarding the OPF-existence check and doubling the parsing work.
        if os.path.exists(opf_file_path):
            logger.debug(f"Found metadata.opf in {root}. Extracting metadata from OPF.")
            metadata = extract_metadata(ebook_full_path, opf_file_path)
        else:
            logger.warning(f"No metadata.opf found in {root}. Inferring metadata from ebook files.")
            metadata = extract_metadata(ebook_full_path)  # Only ebook file path is provided

        metadata["root"] = root
        metadata["source_folder"] = calibre_dir
        metadata["output_folder"] = output_dir
        metadata["imported_from"] = "calibre"
        metadata["virtual_libs"] = [slugify(output_dir)]

        # Build a slug-based base name: <title>__<first-creator>.
        title_slug = slugify(metadata.get("title", "unknown_title"))
        creator_slug = (slugify(metadata["creators"][0])
                        if metadata.get("creators") else "unknown_creator")
        base_name = f"{title_slug}__{creator_slug}"

        # Copy ebooks into the output directory, avoiding name collisions.
        file_paths = []
        for ebook_file in ebook_files:
            _, ext = os.path.splitext(ebook_file)
            src = os.path.join(root, ebook_file)
            dst = get_unique_filename(os.path.join(output_dir, f"{base_name}{ext}"))
            shutil.copy(src, dst)
            file_paths.append(os.path.relpath(dst, output_dir))

        # Optionally handle cover.jpg.
        if "cover.jpg" in files:
            cover_src = os.path.join(root, "cover.jpg")
            cover_dst = os.path.join(output_dir, f"{base_name}_cover.jpg")
            shutil.copy(cover_src, cover_dst)
            metadata["cover_path"] = os.path.relpath(cover_dst, output_dir)

        # Store relative paths in metadata.
        metadata["file_paths"] = file_paths
        metadata_list.append(metadata)

    for entry in metadata_list:
        add_unique_id(entry)

    # Write out metadata.json.
    output_json = os.path.join(output_dir, "metadata.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(metadata_list, f, indent=2, ensure_ascii=False)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def ensure_metadata_completeness(metadata: Dict) -> Dict:
    """
    Ensure that all required metadata fields are present.

    For each missing field that has a known default, fill it in.  Fields
    without an entry in the defaults table below ('source_folder',
    'output_folder', 'imported_from', 'virtual_libs') are checked but left
    untouched, matching the importer's existing behavior.

    Args:
        metadata (Dict): The metadata dictionary extracted from OPF or inferred.

    Returns:
        Dict: The updated metadata dictionary with all necessary fields.
    """
    required_fields = ["title", "creators",
                       "subjects", "description",
                       "language", "date", "identifiers",
                       "file_paths", "cover_path", "unique_id",
                       "source_folder", "output_folder",
                       "imported_from", "virtual_libs"]

    # Factories for the subset of fields we know how to backfill.
    defaults = {
        "title": lambda: "Unknown Title",
        "creators": lambda: ["Unknown Author"],
        "subjects": list,
        "description": lambda: "No description available.",
        "language": lambda: "en",  # Default to English
        "date": lambda: None,      # Unknown date
        "identifiers": dict,
        "file_paths": list,
        "cover_path": lambda: None,
        "unique_id": lambda: None,
    }

    for field in required_fields:
        if field not in metadata and field in defaults:
            metadata[field] = defaults[field]()
            logger.debug(f"Set default value for '{field}'.")

    return metadata
|