ebk 0.1.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic. Click here for more details.
- ebk/__init__.py +35 -0
- ebk/ai/__init__.py +23 -0
- ebk/ai/knowledge_graph.py +443 -0
- ebk/ai/llm_providers/__init__.py +21 -0
- ebk/ai/llm_providers/base.py +230 -0
- ebk/ai/llm_providers/ollama.py +362 -0
- ebk/ai/metadata_enrichment.py +396 -0
- ebk/ai/question_generator.py +328 -0
- ebk/ai/reading_companion.py +224 -0
- ebk/ai/semantic_search.py +434 -0
- ebk/ai/text_extractor.py +394 -0
- ebk/cli.py +2828 -680
- ebk/config.py +260 -22
- ebk/db/__init__.py +37 -0
- ebk/db/migrations.py +180 -0
- ebk/db/models.py +526 -0
- ebk/db/session.py +144 -0
- ebk/decorators.py +132 -0
- ebk/exports/base_exporter.py +218 -0
- ebk/exports/html_library.py +1390 -0
- ebk/exports/html_utils.py +117 -0
- ebk/exports/hugo.py +7 -3
- ebk/exports/jinja_export.py +287 -0
- ebk/exports/multi_facet_export.py +164 -0
- ebk/exports/symlink_dag.py +479 -0
- ebk/extract_metadata.py +76 -7
- ebk/library_db.py +899 -0
- ebk/plugins/__init__.py +42 -0
- ebk/plugins/base.py +502 -0
- ebk/plugins/hooks.py +444 -0
- ebk/plugins/registry.py +500 -0
- ebk/repl/__init__.py +9 -0
- ebk/repl/find.py +126 -0
- ebk/repl/grep.py +174 -0
- ebk/repl/shell.py +1677 -0
- ebk/repl/text_utils.py +320 -0
- ebk/search_parser.py +413 -0
- ebk/server.py +1633 -0
- ebk/services/__init__.py +11 -0
- ebk/services/import_service.py +442 -0
- ebk/services/tag_service.py +282 -0
- ebk/services/text_extraction.py +317 -0
- ebk/similarity/__init__.py +77 -0
- ebk/similarity/base.py +154 -0
- ebk/similarity/core.py +445 -0
- ebk/similarity/extractors.py +168 -0
- ebk/similarity/metrics.py +376 -0
- ebk/vfs/__init__.py +101 -0
- ebk/vfs/base.py +301 -0
- ebk/vfs/library_vfs.py +124 -0
- ebk/vfs/nodes/__init__.py +54 -0
- ebk/vfs/nodes/authors.py +196 -0
- ebk/vfs/nodes/books.py +480 -0
- ebk/vfs/nodes/files.py +155 -0
- ebk/vfs/nodes/metadata.py +385 -0
- ebk/vfs/nodes/root.py +100 -0
- ebk/vfs/nodes/similar.py +165 -0
- ebk/vfs/nodes/subjects.py +184 -0
- ebk/vfs/nodes/tags.py +371 -0
- ebk/vfs/resolver.py +228 -0
- ebk-0.3.2.dist-info/METADATA +755 -0
- ebk-0.3.2.dist-info/RECORD +69 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/WHEEL +1 -1
- ebk-0.3.2.dist-info/licenses/LICENSE +21 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +0 -144
- ebk/imports/ebooks.py +0 -116
- ebk/llm.py +0 -58
- ebk/manager.py +0 -44
- ebk/merge.py +0 -308
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +0 -185
- ebk/streamlit/display.py +0 -168
- ebk/streamlit/filters.py +0 -151
- ebk/streamlit/utils.py +0 -58
- ebk/utils.py +0 -311
- ebk-0.1.0.dist-info/METADATA +0 -457
- ebk-0.1.0.dist-info/RECORD +0 -29
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/entry_points.txt +0 -0
- {ebk-0.1.0.dist-info → ebk-0.3.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Export library as a navigable directory structure using symlinks to represent tag hierarchies.
|
|
3
|
+
|
|
4
|
+
This module creates a filesystem view of the library where:
|
|
5
|
+
- Tags are represented as directories in a hierarchy
|
|
6
|
+
- Books appear in all relevant tag directories via symlinks
|
|
7
|
+
- The DAG structure of tags is preserved through the directory tree
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import json
|
|
12
|
+
import shutil
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, List, Set, Optional, Tuple
|
|
15
|
+
import re
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SymlinkDAGExporter:
|
|
20
|
+
"""Creates a navigable directory structure using symlinks to represent tag hierarchies."""
|
|
21
|
+
|
|
22
|
+
def __init__(self):
|
|
23
|
+
self.tag_separator = "/" # Separator for hierarchical tags
|
|
24
|
+
self.books_dir_name = "_books" # Directory to store actual book files
|
|
25
|
+
|
|
26
|
+
def export(self, lib_dir: str, output_dir: str,
|
|
27
|
+
tag_field: str = "subjects",
|
|
28
|
+
include_files: bool = False, # Changed default to False
|
|
29
|
+
create_index: bool = True,
|
|
30
|
+
flatten: bool = False,
|
|
31
|
+
min_books: int = 0):
|
|
32
|
+
"""
|
|
33
|
+
Export library as symlink-based directory structure.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
lib_dir: Path to the ebk library
|
|
37
|
+
output_dir: Output directory for the symlink structure
|
|
38
|
+
tag_field: Field to use for tags (default: "subjects")
|
|
39
|
+
include_files: Whether to copy actual ebook files (default: False)
|
|
40
|
+
create_index: Whether to create index.html files in directories
|
|
41
|
+
flatten: Whether to create direct symlinks to files instead of _books structure
|
|
42
|
+
min_books: Minimum books per tag folder; smaller folders go to _misc (default: 0)
|
|
43
|
+
"""
|
|
44
|
+
lib_path = Path(lib_dir)
|
|
45
|
+
output_path = Path(output_dir)
|
|
46
|
+
|
|
47
|
+
# Load metadata
|
|
48
|
+
metadata_file = lib_path / "metadata.json"
|
|
49
|
+
with open(metadata_file, "r") as f:
|
|
50
|
+
entries = json.load(f)
|
|
51
|
+
|
|
52
|
+
# Create output directory
|
|
53
|
+
output_path.mkdir(parents=True, exist_ok=True)
|
|
54
|
+
|
|
55
|
+
# Create books directory for actual files (unless flattening)
|
|
56
|
+
if not flatten:
|
|
57
|
+
books_path = output_path / self.books_dir_name
|
|
58
|
+
books_path.mkdir(exist_ok=True)
|
|
59
|
+
|
|
60
|
+
# Process each entry
|
|
61
|
+
entry_paths = {} # Map entry ID to its path in _books
|
|
62
|
+
tag_entries = defaultdict(list) # Map tag to list of entries
|
|
63
|
+
|
|
64
|
+
for i, entry in enumerate(entries):
|
|
65
|
+
entry_id = entry.get("unique_id", f"entry_{i}")
|
|
66
|
+
|
|
67
|
+
if not flatten:
|
|
68
|
+
# Create entry directory in _books
|
|
69
|
+
entry_dir = books_path / self._sanitize_filename(entry_id)
|
|
70
|
+
entry_dir.mkdir(exist_ok=True)
|
|
71
|
+
entry_paths[entry_id] = entry_dir
|
|
72
|
+
|
|
73
|
+
# Save metadata
|
|
74
|
+
with open(entry_dir / "metadata.json", "w") as f:
|
|
75
|
+
json.dump(entry, f, indent=2)
|
|
76
|
+
|
|
77
|
+
# Handle files - either copy or symlink
|
|
78
|
+
if include_files:
|
|
79
|
+
self._copy_entry_files(entry, lib_path, entry_dir)
|
|
80
|
+
else:
|
|
81
|
+
# Create symlinks to original files
|
|
82
|
+
self._symlink_entry_files(entry, lib_path, entry_dir)
|
|
83
|
+
else:
|
|
84
|
+
# For flatten mode, store original file paths
|
|
85
|
+
entry_paths[entry_id] = entry.get("file_paths", [])
|
|
86
|
+
|
|
87
|
+
# Create a readable symlink name
|
|
88
|
+
title = entry.get("title", "Unknown Title")
|
|
89
|
+
creators = entry.get("creators", [])
|
|
90
|
+
if creators:
|
|
91
|
+
readable_name = f"{self._sanitize_filename(title)} - {self._sanitize_filename(creators[0])}"
|
|
92
|
+
else:
|
|
93
|
+
readable_name = self._sanitize_filename(title)
|
|
94
|
+
|
|
95
|
+
# Store readable name for later use
|
|
96
|
+
entry["_readable_name"] = readable_name
|
|
97
|
+
entry["_entry_id"] = entry_id
|
|
98
|
+
|
|
99
|
+
# Extract tags and build hierarchy
|
|
100
|
+
tags = entry.get(tag_field, [])
|
|
101
|
+
if isinstance(tags, str):
|
|
102
|
+
tags = [tags]
|
|
103
|
+
|
|
104
|
+
for tag in tags:
|
|
105
|
+
# Add to this tag and all parent tags
|
|
106
|
+
tag_parts = tag.split(self.tag_separator)
|
|
107
|
+
for i in range(len(tag_parts)):
|
|
108
|
+
parent_tag = self.tag_separator.join(tag_parts[:i+1])
|
|
109
|
+
tag_entries[parent_tag].append(entry)
|
|
110
|
+
|
|
111
|
+
# Consolidate small tag folders if min_books is set
|
|
112
|
+
if min_books > 0:
|
|
113
|
+
tag_entries = self._consolidate_small_tags(tag_entries, min_books)
|
|
114
|
+
|
|
115
|
+
# Create tag directory structure with symlinks
|
|
116
|
+
self._create_tag_structure(output_path, tag_entries, entry_paths, flatten, lib_path)
|
|
117
|
+
|
|
118
|
+
# Create root index if requested
|
|
119
|
+
if create_index:
|
|
120
|
+
self._create_index_files(output_path, tag_entries, entries)
|
|
121
|
+
|
|
122
|
+
# Create a README
|
|
123
|
+
self._create_readme(output_path, len(entries), len(tag_entries))
|
|
124
|
+
|
|
125
|
+
def _consolidate_small_tags(self, tag_entries: Dict[str, List[Dict]],
|
|
126
|
+
min_books: int) -> Dict[str, List[Dict]]:
|
|
127
|
+
"""Consolidate tags with fewer than min_books into a _misc folder."""
|
|
128
|
+
consolidated = defaultdict(list)
|
|
129
|
+
misc_entries = []
|
|
130
|
+
|
|
131
|
+
for tag, entries in tag_entries.items():
|
|
132
|
+
# Get unique entries for this tag
|
|
133
|
+
seen_ids = set()
|
|
134
|
+
unique_entries = []
|
|
135
|
+
for entry in entries:
|
|
136
|
+
entry_id = entry.get("_entry_id", entry.get("unique_id"))
|
|
137
|
+
if entry_id not in seen_ids:
|
|
138
|
+
seen_ids.add(entry_id)
|
|
139
|
+
unique_entries.append(entry)
|
|
140
|
+
|
|
141
|
+
# Check if this tag has enough unique books
|
|
142
|
+
if len(unique_entries) < min_books:
|
|
143
|
+
# Check if it's a leaf tag (no children with enough books)
|
|
144
|
+
tag_prefix = tag + self.tag_separator
|
|
145
|
+
has_large_children = any(
|
|
146
|
+
other_tag.startswith(tag_prefix) and
|
|
147
|
+
len(set(e.get("_entry_id", e.get("unique_id")) for e in tag_entries[other_tag])) >= min_books
|
|
148
|
+
for other_tag in tag_entries.keys()
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
if not has_large_children:
|
|
152
|
+
# Add to misc folder with tag prefix
|
|
153
|
+
for entry in unique_entries:
|
|
154
|
+
misc_entry = entry.copy()
|
|
155
|
+
# Store original tag for display in misc folder
|
|
156
|
+
misc_entry["_original_tag"] = tag
|
|
157
|
+
misc_entries.append(misc_entry)
|
|
158
|
+
else:
|
|
159
|
+
# Keep it as is because it has large children
|
|
160
|
+
consolidated[tag] = entries
|
|
161
|
+
else:
|
|
162
|
+
# Keep tags with enough books
|
|
163
|
+
consolidated[tag] = entries
|
|
164
|
+
|
|
165
|
+
# Add misc entries if any
|
|
166
|
+
if misc_entries:
|
|
167
|
+
consolidated["_misc"] = misc_entries
|
|
168
|
+
|
|
169
|
+
return dict(consolidated)
|
|
170
|
+
|
|
171
|
+
def _sanitize_filename(self, name: str) -> str:
|
|
172
|
+
"""Sanitize a string to be safe as a filename."""
|
|
173
|
+
# Replace problematic characters
|
|
174
|
+
name = re.sub(r'[<>:"/\\|?*]', '-', str(name))
|
|
175
|
+
# Remove leading/trailing spaces and dots
|
|
176
|
+
name = name.strip('. ')
|
|
177
|
+
# Limit length (being more conservative)
|
|
178
|
+
if len(name) > 150:
|
|
179
|
+
name = name[:147] + "..."
|
|
180
|
+
return name or "unnamed"
|
|
181
|
+
|
|
182
|
+
def _copy_entry_files(self, entry: Dict, lib_path: Path, entry_dir: Path):
|
|
183
|
+
"""Copy ebook and cover files for an entry."""
|
|
184
|
+
# Copy ebook files
|
|
185
|
+
for file_path in entry.get("file_paths", []):
|
|
186
|
+
src_file = lib_path / file_path
|
|
187
|
+
if src_file.exists():
|
|
188
|
+
dest_file = entry_dir / src_file.name
|
|
189
|
+
shutil.copy2(src_file, dest_file)
|
|
190
|
+
|
|
191
|
+
# Copy cover file
|
|
192
|
+
cover_path = entry.get("cover_path")
|
|
193
|
+
if cover_path:
|
|
194
|
+
src_cover = lib_path / cover_path
|
|
195
|
+
if src_cover.exists():
|
|
196
|
+
dest_cover = entry_dir / src_cover.name
|
|
197
|
+
shutil.copy2(src_cover, dest_cover)
|
|
198
|
+
|
|
199
|
+
def _symlink_entry_files(self, entry: Dict, lib_path: Path, entry_dir: Path):
|
|
200
|
+
"""Create symlinks to ebook and cover files for an entry."""
|
|
201
|
+
# Symlink ebook files
|
|
202
|
+
for file_path in entry.get("file_paths", []):
|
|
203
|
+
src_file = lib_path / file_path
|
|
204
|
+
if src_file.exists():
|
|
205
|
+
# Get absolute path of source file
|
|
206
|
+
abs_src = src_file.resolve()
|
|
207
|
+
dest_link = entry_dir / src_file.name
|
|
208
|
+
|
|
209
|
+
# Remove existing symlink if it exists
|
|
210
|
+
if dest_link.exists() or dest_link.is_symlink():
|
|
211
|
+
dest_link.unlink()
|
|
212
|
+
|
|
213
|
+
try:
|
|
214
|
+
# Create symlink using absolute path
|
|
215
|
+
dest_link.symlink_to(abs_src)
|
|
216
|
+
except OSError as e:
|
|
217
|
+
print(f"Warning: Could not create symlink for '{file_path}': {e}")
|
|
218
|
+
|
|
219
|
+
# Symlink cover file
|
|
220
|
+
cover_path = entry.get("cover_path")
|
|
221
|
+
if cover_path:
|
|
222
|
+
src_cover = lib_path / cover_path
|
|
223
|
+
if src_cover.exists():
|
|
224
|
+
# Get absolute path of source cover
|
|
225
|
+
abs_cover = src_cover.resolve()
|
|
226
|
+
dest_link = entry_dir / src_cover.name
|
|
227
|
+
|
|
228
|
+
if dest_link.exists() or dest_link.is_symlink():
|
|
229
|
+
dest_link.unlink()
|
|
230
|
+
|
|
231
|
+
try:
|
|
232
|
+
# Create symlink using absolute path
|
|
233
|
+
dest_link.symlink_to(abs_cover)
|
|
234
|
+
except OSError as e:
|
|
235
|
+
print(f"Warning: Could not create symlink for cover '{cover_path}': {e}")
|
|
236
|
+
|
|
237
|
+
def _create_tag_structure(self, output_path: Path,
|
|
238
|
+
tag_entries: Dict[str, List[Dict]],
|
|
239
|
+
entry_paths: Dict[str, Path],
|
|
240
|
+
flatten: bool = False,
|
|
241
|
+
lib_path: Path = None):
|
|
242
|
+
"""Create the hierarchical tag directory structure with symlinks."""
|
|
243
|
+
# Sort tags to ensure parents are created before children
|
|
244
|
+
sorted_tags = sorted(tag_entries.keys())
|
|
245
|
+
|
|
246
|
+
for tag in sorted_tags:
|
|
247
|
+
# Create tag directory path
|
|
248
|
+
tag_parts = tag.split(self.tag_separator)
|
|
249
|
+
tag_dir = output_path
|
|
250
|
+
for part in tag_parts:
|
|
251
|
+
tag_dir = tag_dir / self._sanitize_filename(part)
|
|
252
|
+
tag_dir.mkdir(parents=True, exist_ok=True)
|
|
253
|
+
|
|
254
|
+
# Get unique entries for this tag (avoid duplicates)
|
|
255
|
+
seen_ids = set()
|
|
256
|
+
unique_entries = []
|
|
257
|
+
for entry in tag_entries[tag]:
|
|
258
|
+
entry_id = entry["_entry_id"]
|
|
259
|
+
if entry_id not in seen_ids:
|
|
260
|
+
seen_ids.add(entry_id)
|
|
261
|
+
unique_entries.append(entry)
|
|
262
|
+
|
|
263
|
+
# Create symlinks to entries
|
|
264
|
+
for entry in unique_entries:
|
|
265
|
+
entry_id = entry["_entry_id"]
|
|
266
|
+
readable_name = entry["_readable_name"]
|
|
267
|
+
|
|
268
|
+
# For _misc folder, include original tag in the name
|
|
269
|
+
if tag == "_misc" and "_original_tag" in entry:
|
|
270
|
+
original_tag = entry["_original_tag"]
|
|
271
|
+
# Shorten the tag to avoid filesystem limits
|
|
272
|
+
tag_parts = original_tag.split(self.tag_separator)
|
|
273
|
+
if len(tag_parts) > 2:
|
|
274
|
+
# Use only the last two parts of hierarchical tags
|
|
275
|
+
short_tag = self.tag_separator.join(tag_parts[-2:])
|
|
276
|
+
else:
|
|
277
|
+
short_tag = original_tag
|
|
278
|
+
|
|
279
|
+
# Further limit tag length
|
|
280
|
+
if len(short_tag) > 50:
|
|
281
|
+
short_tag = short_tag[:47] + "..."
|
|
282
|
+
|
|
283
|
+
tag_prefix = f"[{short_tag.replace(self.tag_separator, '-')}] "
|
|
284
|
+
|
|
285
|
+
# Ensure the total name isn't too long
|
|
286
|
+
max_name_length = 200 # Safe limit for most filesystems
|
|
287
|
+
if len(tag_prefix + readable_name) > max_name_length:
|
|
288
|
+
# Truncate the readable name to fit
|
|
289
|
+
available_length = max_name_length - len(tag_prefix) - 3
|
|
290
|
+
readable_name = readable_name[:available_length] + "..."
|
|
291
|
+
|
|
292
|
+
if not flatten:
|
|
293
|
+
# Path to actual entry in _books
|
|
294
|
+
target_path = Path(*[".."] * len(tag_parts)) / self.books_dir_name / self._sanitize_filename(entry_id)
|
|
295
|
+
# Create symlink
|
|
296
|
+
symlink_path = tag_dir / readable_name
|
|
297
|
+
else:
|
|
298
|
+
# For flatten mode, create direct symlinks to original files
|
|
299
|
+
file_paths = entry_paths.get(entry_id, [])
|
|
300
|
+
if file_paths:
|
|
301
|
+
# Use the first file path (usually the main ebook file)
|
|
302
|
+
original_file = file_paths[0]
|
|
303
|
+
# Get absolute path to the original file
|
|
304
|
+
abs_file_path = (lib_path / original_file).resolve()
|
|
305
|
+
# Use original filename as symlink name
|
|
306
|
+
symlink_path = tag_dir / Path(original_file).name
|
|
307
|
+
target_path = abs_file_path
|
|
308
|
+
else:
|
|
309
|
+
continue # Skip if no files
|
|
310
|
+
|
|
311
|
+
# Remove existing symlink if it exists
|
|
312
|
+
if symlink_path.exists() or symlink_path.is_symlink():
|
|
313
|
+
symlink_path.unlink()
|
|
314
|
+
|
|
315
|
+
# Create relative symlink
|
|
316
|
+
try:
|
|
317
|
+
symlink_path.symlink_to(target_path)
|
|
318
|
+
except OSError as e:
|
|
319
|
+
# On Windows, creating symlinks might require admin privileges
|
|
320
|
+
print(f"Warning: Could not create symlink for '{readable_name}': {e}")
|
|
321
|
+
|
|
322
|
+
def _create_index_files(self, output_path: Path,
|
|
323
|
+
tag_entries: Dict[str, List[Dict]],
|
|
324
|
+
all_entries: List[Dict]):
|
|
325
|
+
"""Create index.html files in each directory for web browsing."""
|
|
326
|
+
# Create root index with tag counts
|
|
327
|
+
root_child_tags = {}
|
|
328
|
+
for tag, entries in tag_entries.items():
|
|
329
|
+
if self.tag_separator not in tag: # Top-level tags only
|
|
330
|
+
unique_count = len(set(e.get("_entry_id", e.get("unique_id"))
|
|
331
|
+
for e in entries))
|
|
332
|
+
root_child_tags[tag] = unique_count
|
|
333
|
+
self._write_index_file(output_path, "Library Root", all_entries, root_child_tags, output_path)
|
|
334
|
+
|
|
335
|
+
# Create index for each tag directory
|
|
336
|
+
for tag, entries in tag_entries.items():
|
|
337
|
+
tag_parts = tag.split(self.tag_separator)
|
|
338
|
+
tag_dir = output_path
|
|
339
|
+
for part in tag_parts:
|
|
340
|
+
tag_dir = tag_dir / self._sanitize_filename(part)
|
|
341
|
+
|
|
342
|
+
# Get child tags with counts
|
|
343
|
+
child_tags = {}
|
|
344
|
+
tag_prefix = tag + self.tag_separator
|
|
345
|
+
for other_tag, other_entries in tag_entries.items():
|
|
346
|
+
if other_tag.startswith(tag_prefix) and other_tag != tag:
|
|
347
|
+
# Check if it's a direct child
|
|
348
|
+
remaining = other_tag[len(tag_prefix):]
|
|
349
|
+
if self.tag_separator not in remaining:
|
|
350
|
+
# Count unique entries for this tag
|
|
351
|
+
unique_count = len(set(e.get("_entry_id", e.get("unique_id"))
|
|
352
|
+
for e in other_entries))
|
|
353
|
+
child_tags[other_tag] = unique_count
|
|
354
|
+
|
|
355
|
+
# Get unique entries
|
|
356
|
+
seen_ids = set()
|
|
357
|
+
unique_entries = []
|
|
358
|
+
for entry in entries:
|
|
359
|
+
entry_id = entry.get("_entry_id", entry.get("unique_id"))
|
|
360
|
+
if entry_id not in seen_ids:
|
|
361
|
+
seen_ids.add(entry_id)
|
|
362
|
+
unique_entries.append(entry)
|
|
363
|
+
|
|
364
|
+
self._write_index_file(tag_dir, tag, unique_entries, child_tags, output_path)
|
|
365
|
+
|
|
366
|
+
def _write_index_file(self, directory: Path, title: str,
|
|
367
|
+
entries: List[Dict], child_tags: Dict[str, int], output_path: Path):
|
|
368
|
+
"""Write an index.html file for a directory using Jinja2 template."""
|
|
369
|
+
from jinja2 import Environment, FileSystemLoader
|
|
370
|
+
import json
|
|
371
|
+
import re
|
|
372
|
+
|
|
373
|
+
# Prepare entries for JSON (clean and escape)
|
|
374
|
+
clean_entries = []
|
|
375
|
+
for entry in entries:
|
|
376
|
+
clean_entry = {}
|
|
377
|
+
for key, value in entry.items():
|
|
378
|
+
if isinstance(value, str):
|
|
379
|
+
# Remove problematic HTML from descriptions
|
|
380
|
+
if key == "description":
|
|
381
|
+
# Strip HTML tags from description for JSON
|
|
382
|
+
value = re.sub(r'<[^>]+>', '', value)
|
|
383
|
+
# Limit description length
|
|
384
|
+
if len(value) > 500:
|
|
385
|
+
value = value[:500] + "..."
|
|
386
|
+
clean_entry[key] = value
|
|
387
|
+
elif isinstance(value, list):
|
|
388
|
+
clean_entry[key] = [str(v) for v in value]
|
|
389
|
+
else:
|
|
390
|
+
clean_entry[key] = str(value)
|
|
391
|
+
clean_entries.append(clean_entry)
|
|
392
|
+
|
|
393
|
+
# Convert to JSON for JavaScript
|
|
394
|
+
entries_json = json.dumps(clean_entries, ensure_ascii=True)
|
|
395
|
+
|
|
396
|
+
# Set up Jinja2 environment
|
|
397
|
+
template_dir = Path(__file__).parent / "templates"
|
|
398
|
+
env = Environment(loader=FileSystemLoader(str(template_dir)))
|
|
399
|
+
template = env.get_template("advanced_index.html")
|
|
400
|
+
|
|
401
|
+
# Calculate if we're in a subdirectory (for proper _books path)
|
|
402
|
+
is_subdir = directory != output_path
|
|
403
|
+
|
|
404
|
+
# Render template
|
|
405
|
+
html_content = template.render(
|
|
406
|
+
title=title,
|
|
407
|
+
entries=entries,
|
|
408
|
+
entries_json=entries_json,
|
|
409
|
+
child_tags=child_tags,
|
|
410
|
+
tag_separator=self.tag_separator,
|
|
411
|
+
is_subdir=is_subdir
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
# Write the file
|
|
415
|
+
index_path = directory / "index.html"
|
|
416
|
+
with open(index_path, "w", encoding="utf-8") as f:
|
|
417
|
+
f.write(html_content)
|
|
418
|
+
|
|
419
|
+
    def _create_readme(self, output_path: Path, num_entries: int, num_tags: int):
        """Write a README.md at the export root explaining the layout.

        Args:
            output_path: Root directory of the exported symlink tree.
            num_entries: Total number of books in the library.
            num_tags: Total number of tag directories created (including
                any ``_misc`` consolidation folder).
        """
        # Markdown body; only the two statistics are interpolated.
        readme_content = f"""# EBK Library - Symlink Navigation Structure

This directory contains a navigable view of your ebook library organized by tags.

## Statistics
- Total books: {num_entries}
- Total tags/categories: {num_tags}

## Structure

- **_books/**: Contains the actual ebook files and metadata
- **Tag directories**: Each tag becomes a directory, with hierarchical tags creating nested directories
- **Symlinks**: Books appear in multiple tag directories via symbolic links

## Navigation

You can navigate this structure using:
1. Your file explorer (Finder, Windows Explorer, etc.)
2. Command line tools (cd, ls, etc.)
3. Web browser (open index.html files)

## Hierarchical Tags

Tags like "Programming/Python/Web" create a nested structure:
```
Programming/
  Python/
    Web/
      (books tagged with Programming/Python/Web)
    (books tagged with Programming/Python)
  (books tagged with Programming)
```

Books appear at each relevant level in the hierarchy.

## Notes

- This is a read-only view. Modifying files here won't affect the original library.
- Symlinks point to files in the _books directory.
- On Windows, you may need administrator privileges to create symlinks.

Generated by EBK - https://github.com/queelius/ebk
"""

        with open(output_path / "README.md", "w") as f:
            f.write(readme_content)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def export_symlink_dag(lib_dir: str, output_dir: str, **kwargs):
    """Export a library as a symlink DAG (module-level convenience wrapper).

    Args:
        lib_dir: Path to the ebk library.
        output_dir: Output directory for the symlink structure.
        **kwargs: Forwarded unchanged to :meth:`SymlinkDAGExporter.export`.
    """
    SymlinkDAGExporter().export(lib_dir, output_dir, **kwargs)
|
ebk/extract_metadata.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import xmltodict
|
|
3
3
|
from typing import Dict, Optional
|
|
4
4
|
from slugify import slugify
|
|
5
|
-
import
|
|
5
|
+
import pypdf
|
|
6
6
|
from ebooklib import epub
|
|
7
7
|
|
|
8
8
|
def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
@@ -32,12 +32,17 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
|
32
32
|
simplified = {
|
|
33
33
|
"title": metadata.get("dc:title", metadata.get("title")),
|
|
34
34
|
"creators": None,
|
|
35
|
+
"contributors": None,
|
|
35
36
|
"subjects": None,
|
|
36
37
|
"description": metadata.get("dc:description", metadata.get("description")),
|
|
37
38
|
"language": metadata.get("dc:language", metadata.get("language")),
|
|
38
39
|
"date": metadata.get("dc:date", metadata.get("date")),
|
|
39
40
|
"publisher": metadata.get("dc:publisher", metadata.get("publisher")),
|
|
40
|
-
"identifiers": None
|
|
41
|
+
"identifiers": None,
|
|
42
|
+
"rights": metadata.get("dc:rights", metadata.get("rights")),
|
|
43
|
+
"source": metadata.get("dc:source", metadata.get("source")),
|
|
44
|
+
"series": None,
|
|
45
|
+
"series_index": None
|
|
41
46
|
}
|
|
42
47
|
|
|
43
48
|
# -- Creators
|
|
@@ -75,12 +80,64 @@ def extract_metadata_from_opf(opf_file: str) -> Dict:
|
|
|
75
80
|
text = identifiers.get("#text", "").strip()
|
|
76
81
|
simplified["identifiers"][scheme] = text
|
|
77
82
|
|
|
83
|
+
# -- Contributors (editors, translators, etc)
|
|
84
|
+
contributors_raw = metadata.get("dc:contributor", metadata.get("contributor"))
|
|
85
|
+
if contributors_raw:
|
|
86
|
+
simplified["contributors"] = []
|
|
87
|
+
if isinstance(contributors_raw, list):
|
|
88
|
+
for contrib in contributors_raw:
|
|
89
|
+
if isinstance(contrib, dict):
|
|
90
|
+
name = contrib.get("#text", "").strip()
|
|
91
|
+
role = contrib.get("@opf:role", "contributor")
|
|
92
|
+
file_as = contrib.get("@opf:file-as", "")
|
|
93
|
+
if name:
|
|
94
|
+
simplified["contributors"].append({
|
|
95
|
+
"name": name,
|
|
96
|
+
"role": role,
|
|
97
|
+
"file_as": file_as
|
|
98
|
+
})
|
|
99
|
+
elif isinstance(contrib, str):
|
|
100
|
+
simplified["contributors"].append({
|
|
101
|
+
"name": contrib.strip(),
|
|
102
|
+
"role": "contributor",
|
|
103
|
+
"file_as": ""
|
|
104
|
+
})
|
|
105
|
+
elif isinstance(contributors_raw, dict):
|
|
106
|
+
name = contributors_raw.get("#text", "").strip()
|
|
107
|
+
role = contributors_raw.get("@opf:role", "contributor")
|
|
108
|
+
file_as = contributors_raw.get("@opf:file-as", "")
|
|
109
|
+
if name:
|
|
110
|
+
simplified["contributors"] = [{
|
|
111
|
+
"name": name,
|
|
112
|
+
"role": role,
|
|
113
|
+
"file_as": file_as
|
|
114
|
+
}]
|
|
115
|
+
|
|
116
|
+
# -- Calibre-specific metadata (series, etc)
|
|
117
|
+
# Look for meta tags with name attributes
|
|
118
|
+
meta_tags = metadata.get("meta", [])
|
|
119
|
+
if not isinstance(meta_tags, list):
|
|
120
|
+
meta_tags = [meta_tags] if meta_tags else []
|
|
121
|
+
|
|
122
|
+
for meta in meta_tags:
|
|
123
|
+
if isinstance(meta, dict):
|
|
124
|
+
meta_name = meta.get("@name", "")
|
|
125
|
+
meta_content = meta.get("@content", "")
|
|
126
|
+
|
|
127
|
+
if meta_name == "calibre:series" and meta_content:
|
|
128
|
+
simplified["series"] = meta_content
|
|
129
|
+
elif meta_name == "calibre:series_index" and meta_content:
|
|
130
|
+
try:
|
|
131
|
+
simplified["series_index"] = float(meta_content)
|
|
132
|
+
except (ValueError, TypeError):
|
|
133
|
+
pass
|
|
134
|
+
|
|
78
135
|
return simplified
|
|
79
136
|
|
|
80
137
|
|
|
81
138
|
def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
82
139
|
"""
|
|
83
|
-
Extract metadata from a PDF file using
|
|
140
|
+
Extract metadata from a PDF file using pypdf.
|
|
84
141
|
Returns a dictionary with the same keys as the OPF-based dict.
|
|
85
142
|
"""
|
|
86
143
|
|
|
@@ -94,20 +151,23 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
|
94
151
|
"publisher": None,
|
|
95
152
|
"identifiers": None,
|
|
96
153
|
"keywords": None,
|
|
154
|
+
"creator_application": None,
|
|
97
155
|
}
|
|
98
156
|
|
|
99
157
|
try:
|
|
100
158
|
with open(pdf_path, "rb") as f:
|
|
101
|
-
reader =
|
|
159
|
+
reader = pypdf.PdfReader(f)
|
|
102
160
|
info = reader.metadata or {}
|
|
103
161
|
|
|
104
|
-
# NOTE: Depending on
|
|
162
|
+
# NOTE: Depending on pypdf version, metadata keys can differ
|
|
105
163
|
# e.g. info.title vs info.get('/Title')
|
|
106
164
|
pdf_title = info.get("/Title", None) or info.get("title", None)
|
|
107
165
|
pdf_author = info.get("/Author", None) or info.get("author", None)
|
|
108
166
|
pdf_subject = info.get("/Subject", None) or info.get("subject", None)
|
|
109
167
|
pdf_keywords = info.get("/Keywords", None) or info.get("keywords", None)
|
|
110
|
-
|
|
168
|
+
pdf_creator = info.get("/Creator", None) or info.get("creator", None) # Application used
|
|
169
|
+
pdf_producer = info.get("/Producer", None) or info.get("producer", None)
|
|
170
|
+
pdf_publisher = info.get("/Publisher", None) or info.get("publisher", None)
|
|
111
171
|
pdf_creation_date = info.get("/CreationDate", None)
|
|
112
172
|
|
|
113
173
|
if pdf_title:
|
|
@@ -130,10 +190,18 @@ def extract_metadata_from_pdf(pdf_path: str) -> Dict:
|
|
|
130
190
|
metadata["identifiers"] = {"pdf:identifier": pdf_path}
|
|
131
191
|
|
|
132
192
|
if pdf_keywords:
|
|
133
|
-
metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",")]
|
|
193
|
+
metadata["keywords"] = [kw.strip() for kw in pdf_keywords.split(",") if kw.strip()]
|
|
194
|
+
|
|
195
|
+
# Creator is the application that created the PDF (e.g., LaTeX, Word)
|
|
196
|
+
if pdf_creator:
|
|
197
|
+
metadata["creator_application"] = pdf_creator.strip()
|
|
134
198
|
|
|
199
|
+
# Publisher: prefer explicit Publisher field, fallback to Producer
|
|
135
200
|
if pdf_publisher:
|
|
136
201
|
metadata["publisher"] = pdf_publisher.strip()
|
|
202
|
+
elif pdf_producer and not pdf_creator:
|
|
203
|
+
# Only use producer as publisher if there's no creator app
|
|
204
|
+
metadata["publisher"] = pdf_producer.strip()
|
|
137
205
|
|
|
138
206
|
metadata["file_paths"] = [pdf_path]
|
|
139
207
|
|
|
@@ -259,6 +327,7 @@ def extract_metadata(ebook_file: str, opf_file: Optional[str] = None) -> Dict:
|
|
|
259
327
|
if opf_file and os.path.isfile(opf_file):
|
|
260
328
|
opf_metadata = extract_metadata_from_opf(opf_file)
|
|
261
329
|
|
|
330
|
+
ebook_metadata = {}
|
|
262
331
|
_, ext = os.path.splitext(ebook_file.lower())
|
|
263
332
|
if ext == ".pdf":
|
|
264
333
|
ebook_metadata = extract_metadata_from_pdf(ebook_file)
|