ebk 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ebk might be problematic. Click here for more details.

Files changed (61) hide show
  1. ebk/ai/__init__.py +23 -0
  2. ebk/ai/knowledge_graph.py +443 -0
  3. ebk/ai/llm_providers/__init__.py +21 -0
  4. ebk/ai/llm_providers/base.py +230 -0
  5. ebk/ai/llm_providers/ollama.py +362 -0
  6. ebk/ai/metadata_enrichment.py +396 -0
  7. ebk/ai/question_generator.py +328 -0
  8. ebk/ai/reading_companion.py +224 -0
  9. ebk/ai/semantic_search.py +434 -0
  10. ebk/ai/text_extractor.py +394 -0
  11. ebk/cli.py +1097 -9
  12. ebk/db/__init__.py +37 -0
  13. ebk/db/migrations.py +180 -0
  14. ebk/db/models.py +526 -0
  15. ebk/db/session.py +144 -0
  16. ebk/exports/__init__.py +0 -0
  17. ebk/exports/base_exporter.py +218 -0
  18. ebk/exports/html_library.py +1390 -0
  19. ebk/exports/html_utils.py +117 -0
  20. ebk/exports/hugo.py +59 -0
  21. ebk/exports/jinja_export.py +287 -0
  22. ebk/exports/multi_facet_export.py +164 -0
  23. ebk/exports/symlink_dag.py +479 -0
  24. ebk/exports/zip.py +25 -0
  25. ebk/library_db.py +155 -0
  26. ebk/repl/__init__.py +9 -0
  27. ebk/repl/find.py +126 -0
  28. ebk/repl/grep.py +174 -0
  29. ebk/repl/shell.py +1677 -0
  30. ebk/repl/text_utils.py +320 -0
  31. ebk/services/__init__.py +11 -0
  32. ebk/services/import_service.py +442 -0
  33. ebk/services/tag_service.py +282 -0
  34. ebk/services/text_extraction.py +317 -0
  35. ebk/similarity/__init__.py +77 -0
  36. ebk/similarity/base.py +154 -0
  37. ebk/similarity/core.py +445 -0
  38. ebk/similarity/extractors.py +168 -0
  39. ebk/similarity/metrics.py +376 -0
  40. ebk/vfs/__init__.py +101 -0
  41. ebk/vfs/base.py +301 -0
  42. ebk/vfs/library_vfs.py +124 -0
  43. ebk/vfs/nodes/__init__.py +54 -0
  44. ebk/vfs/nodes/authors.py +196 -0
  45. ebk/vfs/nodes/books.py +480 -0
  46. ebk/vfs/nodes/files.py +155 -0
  47. ebk/vfs/nodes/metadata.py +385 -0
  48. ebk/vfs/nodes/root.py +100 -0
  49. ebk/vfs/nodes/similar.py +165 -0
  50. ebk/vfs/nodes/subjects.py +184 -0
  51. ebk/vfs/nodes/tags.py +371 -0
  52. ebk/vfs/resolver.py +228 -0
  53. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/METADATA +1 -1
  54. ebk-0.3.2.dist-info/RECORD +69 -0
  55. ebk-0.3.2.dist-info/entry_points.txt +2 -0
  56. ebk-0.3.2.dist-info/top_level.txt +1 -0
  57. ebk-0.3.1.dist-info/RECORD +0 -19
  58. ebk-0.3.1.dist-info/entry_points.txt +0 -6
  59. ebk-0.3.1.dist-info/top_level.txt +0 -2
  60. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/WHEEL +0 -0
  61. {ebk-0.3.1.dist-info → ebk-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,117 @@
1
+ """HTML sanitization utilities for secure template rendering."""
2
+
3
+ import json
4
+ import html
5
+ from typing import Any, Dict, List
6
+ import re
7
+
8
+
9
def sanitize_for_html(text: str) -> str:
    """
    Sanitize text for safe HTML output.

    Escapes HTML special characters (``&``, ``<``, ``>`` and both quote
    characters) to prevent XSS attacks.

    Args:
        text: Value to escape. Non-string values are converted with ``str()``.

    Returns:
        The escaped text. ``None``, the empty string and other empty values
        yield ``""``; numeric zero is rendered as ``"0"`` / ``"0.0"``.
    """
    # A plain truthiness test would also swallow a legitimate numeric 0, so
    # numbers (but not bools, which keep their previous handling) are exempt.
    is_number = isinstance(text, (int, float)) and not isinstance(text, bool)
    if not text and not is_number:
        return ""
    return html.escape(str(text))
18
+
19
+
20
def sanitize_for_javascript(obj: Any) -> str:
    """
    Safely encode data for embedding in JavaScript.

    The value is serialized as JSON and every character that could end or
    alter the surrounding ``<script>`` element is replaced by its ``\\uXXXX``
    escape. The result is still valid JSON and decodes to the same data.

    Args:
        obj: Any JSON-serializable value.

    Returns:
        A JSON document containing no raw ``<``, ``>``, ``&``, U+2028 or
        U+2029 characters.
    """
    json_str = json.dumps(obj, ensure_ascii=False)

    # In JSON these characters can only occur inside string literals, so
    # swapping them for \uXXXX escapes never changes the decoded value.
    # Escaping "<" and ">" covers </script> in any letter case as well as
    # <!-- and -->; U+2028/U+2029 are line terminators in older JS engines.
    script_unsafe = str.maketrans({
        '<': '\\u003c',
        '>': '\\u003e',
        '&': '\\u0026',
        '\u2028': '\\u2028',
        '\u2029': '\\u2029',
    })
    return json_str.translate(script_unsafe)
36
+
37
+
38
def sanitize_metadata(entry: Dict) -> Dict:
    """
    Sanitize metadata fields that will be displayed in HTML.

    Preserves structure but escapes string values, including strings nested
    inside lists and dicts.

    Args:
        entry: Metadata dictionary for one library entry.

    Returns:
        A new dictionary with the same keys; the input is not modified.
    """
    # File paths and IDs are not displayed as HTML, so they stay untouched
    # whatever their type. Checking the key first matters because
    # 'file_paths' is a list and would otherwise be escaped item by item.
    raw_keys = ('file_paths', 'cover_path', 'unique_id', '_entry_id')

    def _clean(value: Any) -> Any:
        """Escape every string reachable from ``value``."""
        if isinstance(value, str):
            return sanitize_for_html(value)
        if isinstance(value, list):
            return [_clean(item) for item in value]
        if isinstance(value, dict):
            return sanitize_metadata(value)
        return value

    return {
        key: value if key in raw_keys else _clean(value)
        for key, value in entry.items()
    }
66
+
67
+
68
def sanitize_entries_for_javascript(entries: List[Dict]) -> str:
    """
    Prepare entries for safe embedding in JavaScript.

    Builds a minimal copy of each entry with HTML-escaped display fields and
    serializes the list with ``sanitize_for_javascript``. Missing or ``None``
    fields become empty strings / empty lists.

    Args:
        entries: Metadata dictionaries, one per library entry.

    Returns:
        The JSON text produced by ``sanitize_for_javascript``.
    """
    def _text(entry: Dict, key: str) -> str:
        # Treat None like a missing key so it is not rendered as "None".
        value = entry.get(key)
        return "" if value is None else sanitize_for_html(str(value))

    def _text_list(entry: Dict, key: str) -> List[str]:
        values = entry.get(key) or []
        if isinstance(values, str):
            # A bare string is one item, not a sequence of characters.
            values = [values]
        return [sanitize_for_html(item) for item in values]

    sanitized_entries = []
    for entry in entries:
        # Paths and IDs are passed through unescaped, as before.
        sanitized_entries.append({
            'unique_id': entry.get('unique_id') or '',
            'title': _text(entry, 'title'),
            'creators': _text_list(entry, 'creators'),
            'subjects': _text_list(entry, 'subjects'),
            'language': _text(entry, 'language'),
            'date': _text(entry, 'date'),
            'publisher': _text(entry, 'publisher'),
            'description': _text(entry, 'description'),
            'cover_path': entry.get('cover_path') or '',
            'file_paths': entry.get('file_paths') or [],
            '_readable_name': _text(entry, '_readable_name'),
            '_entry_id': entry.get('_entry_id') or '',
        })

    return sanitize_for_javascript(sanitized_entries)
96
+
97
+
98
def create_safe_filename(text: str, max_length: int = 255) -> str:
    """
    Create a safe filename from text.

    Removes HTML tags and control characters, replaces characters that are
    unsafe in filenames with ``_`` and limits the length of the result.

    Args:
        text: Source text, e.g. a book title.
        max_length: Maximum length of the returned name.

    Returns:
        The cleaned name, never longer than ``max_length``. Over-long names
        end in ``...`` when ``max_length`` leaves room for it.
    """
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)

    # Replace unsafe characters
    safe_chars = re.sub(r'[<>:"/\\|?*]', '_', text)

    # Remove control characters, then surrounding whitespace, so that the
    # length check below sees the text that will actually be returned.
    safe_chars = ''.join(char for char in safe_chars if ord(char) >= 32).strip()

    if len(safe_chars) > max_length:
        ellipsis = '...'
        if max_length > len(ellipsis):
            safe_chars = safe_chars[:max_length - len(ellipsis)] + ellipsis
        else:
            # Too short for an ellipsis: a negative slice bound would make
            # the result longer than requested, so cut hard instead.
            safe_chars = safe_chars[:max(max_length, 0)]

    return safe_chars.strip()
ebk/exports/hugo.py ADDED
@@ -0,0 +1,59 @@
1
+ import json
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import List
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
def _hugo_first(book, *keys, default=""):
    """Return the first non-empty value stored under any of ``keys``."""
    for key in keys:
        value = book.get(key)
        if value:
            return value
    return default


def _hugo_list(value, sep=None):
    """Coerce a metadata value (list, string or None) to a list of strings."""
    if not value:
        return []
    if isinstance(value, str):
        parts = value.split(sep) if sep else [value]
        return [part.strip() for part in parts if part.strip()]
    return [str(item) for item in value if item]


def _hugo_yaml(value):
    """Render ``value`` as a double-quoted YAML scalar (a JSON string)."""
    return json.dumps(str(value), ensure_ascii=False)


def _hugo_yaml_list(values):
    """Render ``values`` as a YAML flow sequence of quoted scalars."""
    return "[" + ", ".join(_hugo_yaml(item) for item in values) + "]"


def _hugo_slug(title):
    """Build a file-name-safe slug from a title."""
    slug = title.replace(" ", "-").lower()
    slug = "".join(
        "-" if (ch in '<>:"/\\|?*' or ord(ch) < 32) else ch for ch in slug
    )
    return slug.strip("-.") or "untitled"


def _hugo_resolve(path_text, lib_dir):
    """Locate a file given as-is or relative to the library directory."""
    candidate = Path(path_text)
    if candidate.is_file():
        return candidate
    fallback = lib_dir / candidate
    return fallback if fallback.is_file() else None


def export_hugo(lib_dir, hugo_dir):
    """
    Export ebk library to Hugo-compatible Markdown files.

    One Markdown page per book is written to ``<hugo_dir>/content/library``
    and the ebook and cover files that can be found are copied to
    ``<hugo_dir>/static/ebooks``. Both the current metadata keys (``title``,
    ``creators``, ``file_paths``, ``cover_path``, ...) and the legacy ones
    (``Title``, ``Author``, ``Tags``, ``File Path``, ``Cover Path``) are
    accepted; missing fields are rendered empty instead of raising.

    Args:
        lib_dir (str): Path to the ebk library directory to export (contains `metadata.json` and ebook-related files)
        hugo_dir (str): Path to the Hugo site directory
    """
    lib_dir = Path(lib_dir)
    with open(lib_dir / "metadata.json", "r", encoding="utf-8") as f:
        books = json.load(f)

    hugo_dir = Path(hugo_dir)

    content_dir = hugo_dir / "content" / "library"
    static_dir = hugo_dir / "static" / "ebooks"
    content_dir.mkdir(parents=True, exist_ok=True)
    static_dir.mkdir(parents=True, exist_ok=True)

    for book in books:
        title = str(_hugo_first(book, "title", "Title", default="Untitled"))
        creators = _hugo_list(_hugo_first(book, "creators", "Author", default=[]))
        subjects = _hugo_list(_hugo_first(book, "subjects", default=[]))
        tags = _hugo_list(_hugo_first(book, "tags", "Tags", default=[]), sep=",")
        description = _hugo_first(book, "description")
        date = _hugo_first(book, "date")

        file_paths = _hugo_first(book, "file_paths", "file_path", "File Path", default=[])
        if isinstance(file_paths, str):
            file_paths = [file_paths]
        file_paths = [str(p) for p in file_paths if p]
        cover_path = str(_hugo_first(book, "cover_path", "Cover Path"))

        ebook_name = Path(file_paths[0]).name if file_paths else ""
        cover_name = Path(cover_path).name if cover_path else ""

        # Values are quoted so that colons, quotes or brackets in metadata
        # cannot break the YAML front matter.
        lines = [
            "---",
            f"title: {_hugo_yaml(title)}",
            f"creators: {_hugo_yaml_list(creators)}",
            f"subjects: {_hugo_yaml_list(subjects)}",
            f"description: {_hugo_yaml(description)}",
        ]
        if date:
            lines.append(f"date: {_hugo_yaml(date)}")
        lines += [
            f"tags: {_hugo_yaml_list(tags)}",
            f"ebook_file: {_hugo_yaml('/ebooks/' + ebook_name if ebook_name else '')}",
            f"cover_image: {_hugo_yaml('/ebooks/' + cover_name if cover_name else '')}",
            "---",
            "",
            f"# {title}",
            "",
            f"Author: {', '.join(creators)}",
            "",
        ]
        if ebook_name:
            lines.append(f"[Download eBook](/ebooks/{ebook_name})")

        md_file = content_dir / f"{_hugo_slug(title)}.md"
        with open(md_file, "w", encoding="utf-8") as md:
            md.write("\n".join(lines) + "\n")

        # Copy eBook and cover to static directory
        for path_text in file_paths + ([cover_path] if cover_path else []):
            source_file = _hugo_resolve(path_text, lib_dir)
            if source_file is not None:
                shutil.copy2(source_file, static_dir)

    logging.getLogger(__name__).debug(
        "Exported %d books to Hugo site at '%s'", len(books), hugo_dir
    )
59
+
@@ -0,0 +1,287 @@
1
+ """
2
+ Flexible Jinja2-based export system for ebk libraries.
3
+
4
+ This module provides a template-driven approach to exporting ebook metadata
5
+ in various formats, with Hugo as the primary implementation.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import shutil
11
+ from pathlib import Path
12
+ from typing import Dict, List, Optional, Any
13
+ from jinja2 import Environment, FileSystemLoader, select_autoescape
14
+ import logging
15
+ from slugify import slugify
16
+ from collections import defaultdict
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class JinjaExporter:
    """Flexible export system using Jinja2 templates.

    Templates are looked up under ``hugo/`` in the template directory:
    ``book.md`` for a single book, ``index.md`` for a group index and
    ``library.md`` for the main library index.
    """

    def __init__(self, template_dir: Optional[Path] = None):
        """
        Initialize the exporter with a template directory.

        Args:
            template_dir: Path to custom templates. If None, uses built-in templates.
        """
        if template_dir is None:
            template_dir = Path(__file__).parent / "templates"

        self.env = Environment(
            loader=FileSystemLoader(template_dir),
            autoescape=select_autoescape(['html', 'xml']),
            trim_blocks=True,
            lstrip_blocks=True
        )

        # Add custom filters
        self.env.filters['slugify'] = slugify
        self.env.filters['join_list'] = lambda x: ', '.join(x) if isinstance(x, list) else x
        self.env.filters['default_if_none'] = lambda x, default='': x if x is not None else default

    def export_hugo(self, lib_dir: str, hugo_dir: str,
                    organize_by: str = "flat",
                    create_indexes: bool = True,
                    copy_files: bool = True):
        """
        Export library to Hugo with flexible organization options.

        Args:
            lib_dir: Path to ebk library
            hugo_dir: Path to Hugo site directory
            organize_by: Organization method - "flat", "year", "language", "subject", "creator"
            create_indexes: Whether to create index pages for categories
            copy_files: Whether to copy ebook and cover files
        """
        lib_path = Path(lib_dir)
        hugo_path = Path(hugo_dir)

        # Load metadata (explicit encoding: the platform default may not be UTF-8)
        with open(lib_path / "metadata.json", "r", encoding="utf-8") as f:
            books = json.load(f)

        # Prepare books with normalized fields
        books = self._normalize_metadata(books)

        # Create directory structure
        content_dir = hugo_path / "content" / "library"
        static_dir = hugo_path / "static" / "ebooks"
        content_dir.mkdir(parents=True, exist_ok=True)
        static_dir.mkdir(parents=True, exist_ok=True)

        # Group books by organization method
        grouped_books = self._group_books(books, organize_by)

        # Export individual book pages; the flat layout uses the empty key
        # and writes straight into the content directory.
        for group_key, group_books in grouped_books.items():
            group_dir = content_dir / group_key if group_key else content_dir
            group_dir.mkdir(parents=True, exist_ok=True)

            for book in group_books:
                self._export_book(book, group_dir, static_dir, lib_path, copy_files)

        # Create index pages
        if create_indexes:
            self._create_indexes(grouped_books, content_dir, organize_by)

        # Create main library index
        self._create_main_index(books, content_dir, organize_by)

        logger.info(f"Exported {len(books)} books to Hugo site at '{hugo_dir}'")

    @staticmethod
    def _as_list(value: Any) -> List:
        """Coerce a metadata value (None, single string or sequence) to a list."""
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _group_key(value: Any, fallback: str) -> str:
        """Turn a raw field value into a single, safe directory name."""
        key = str(value or '').strip().replace('/', '-').replace('\\', '-')
        # Empty and dot-only names would resolve to the content directory or
        # its parent instead of a sub-directory.
        return fallback if key in ('', '.', '..') else key

    @staticmethod
    def _slug_keys(values: Any, fallback: str) -> List[str]:
        """Slugify each value, dropping empty and duplicate slugs."""
        if isinstance(values, str):
            values = [values]
        slugs = [slugify(str(value)) for value in (values or [])]
        unique = [slug for slug in dict.fromkeys(slugs) if slug]
        return unique or [fallback]

    def _normalize_metadata(self, books: List[Dict]) -> List[Dict]:
        """Normalize metadata fields for consistent access.

        Missing and ``None`` values are replaced by neutral defaults so the
        rest of the exporter and the templates can rely on every key.
        """
        normalized = []

        for book in books:
            unique_id = str(book.get('unique_id') or '')
            date = book.get('date') or ''

            # Create a normalized version with consistent field names
            norm = {
                'title': book.get('title') or 'Unknown Title',
                'creators': self._as_list(book.get('creators')),
                'subjects': self._as_list(book.get('subjects')),
                'description': book.get('description') or '',
                'language': book.get('language') or 'en',
                'date': date,
                'publisher': book.get('publisher') or '',
                'identifiers': book.get('identifiers') or {},
                'file_paths': self._as_list(book.get('file_paths')),
                'cover_path': book.get('cover_path') or '',
                'unique_id': unique_id,
                # Keep original data for backward compatibility
                '_original': book,
                # First four characters of the date; '' when there is no date
                'year': str(date)[:4],
            }

            # Generate slug
            norm['slug'] = slugify(f"{norm['title']}-{unique_id[:8]}")

            normalized.append(norm)

        return normalized

    def _group_books(self, books: List[Dict], organize_by: str) -> Dict[str, List[Dict]]:
        """Group books by specified organization method.

        Books lacking the grouping field are filed under a fallback group
        (``unknown-year``, ``unknown-language``, ``uncategorized``,
        ``unknown-creator``) rather than being dropped. Unknown methods fall
        back to the flat layout, which uses the empty string as its only key.
        """
        grouped = defaultdict(list)

        if organize_by == "year":
            for book in books:
                grouped[self._group_key(book.get('year'), 'unknown-year')].append(book)
        elif organize_by == "language":
            for book in books:
                grouped[self._group_key(book.get('language'), 'unknown-language')].append(book)
        elif organize_by == "subject":
            for book in books:
                for key in self._slug_keys(book.get('subjects'), 'uncategorized'):
                    grouped[key].append(book)
        elif organize_by == "creator":
            for book in books:
                for key in self._slug_keys(book.get('creators'), 'unknown-creator'):
                    grouped[key].append(book)
        else:
            # "flat" and anything unrecognized
            grouped[""] = books

        return dict(grouped)

    def _export_book(self, book: Dict, output_dir: Path, static_dir: Path,
                     lib_path: Path, copy_files: bool):
        """Export a single book.

        Copies the book's files and cover into ``static_dir`` when
        ``copy_files`` is set, then renders ``hugo/book.md`` to
        ``<output_dir>/<slug>.md``. URLs are only produced for files that
        were actually copied.
        """
        # Load book template
        template = self.env.get_template('hugo/book.md')

        # Prepare file paths for Hugo
        ebook_urls = []
        cover_url = ""
        if copy_files:
            for file_path in book.get('file_paths') or []:
                if not file_path:
                    continue
                src = lib_path / file_path
                if src.is_file():
                    shutil.copy2(src, static_dir / src.name)
                    ebook_urls.append(f"/ebooks/{src.name}")

            if book.get('cover_path'):
                src = lib_path / book['cover_path']
                if src.is_file():
                    shutil.copy2(src, static_dir / src.name)
                    cover_url = f"/ebooks/{src.name}"

        # Render template
        content = template.render(
            book=book,
            ebook_urls=ebook_urls,
            cover_url=cover_url
        )

        # Write file
        output_file = output_dir / f"{book['slug']}.md"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

    def _create_indexes(self, grouped_books: Dict[str, List[Dict]],
                        content_dir: Path, organize_by: str):
        """Create index pages for each group (no-op for the flat layout)."""
        if organize_by == "flat":
            return

        template = self.env.get_template('hugo/index.md')

        for group_key, books in grouped_books.items():
            if not group_key:  # Skip empty group
                continue

            group_dir = content_dir / group_key
            group_dir.mkdir(parents=True, exist_ok=True)
            index_file = group_dir / "_index.md"

            # Determine group title
            pretty_key = group_key.replace('-', ' ').title()
            if organize_by == "year":
                group_title = f"Books from {group_key}"
            elif organize_by == "language":
                group_title = f"Books in {group_key}"
            elif organize_by == "subject":
                group_title = f"Subject: {pretty_key}"
            elif organize_by == "creator":
                group_title = f"Books by {pretty_key}"
            else:
                group_title = pretty_key

            content = template.render(
                title=group_title,
                organize_by=organize_by,
                group_key=group_key,
                books=books,
                book_count=len(books)
            )

            with open(index_file, 'w', encoding='utf-8') as f:
                f.write(content)

    def _create_main_index(self, books: List[Dict], content_dir: Path, organize_by: str):
        """Create main library index page with summary statistics."""
        template = self.env.get_template('hugo/library.md')

        # Calculate statistics
        languages = defaultdict(int)
        years = defaultdict(int)
        creator_counts = defaultdict(int)
        subject_counts = defaultdict(int)

        for book in books:
            languages[book.get('language', 'unknown')] += 1

            # Books without a year are left out of the year statistics
            year = book.get('year', 'unknown')
            if year:
                years[year] += 1

            for creator in book.get('creators', []):
                creator_counts[creator] += 1

            for subject in book.get('subjects', []):
                subject_counts[subject] += 1

        def _top_ten(counts):
            return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]

        stats = {
            'total_books': len(books),
            'total_creators': len(creator_counts),
            'total_subjects': len(subject_counts),
            'languages': languages,
            'years': years,
            # Sort and limit top items
            'top_creators': _top_ten(creator_counts),
            'top_subjects': _top_ten(subject_counts)
        }

        content = template.render(
            title="Library",
            books=books,
            stats=stats,
            organize_by=organize_by
        )

        index_file = content_dir / "_index.md"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write(content)
@@ -0,0 +1,164 @@
1
+ """Multi-faceted export for ebk libraries with sidebar navigation."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Set, Optional
5
+ import json
6
+ import shutil
7
+ from collections import defaultdict
8
+ import re
9
+ from datetime import datetime
10
+ from jinja2 import Environment, FileSystemLoader
11
+ from .html_utils import sanitize_entries_for_javascript, sanitize_for_html, create_safe_filename
12
+ from .base_exporter import BaseExporter
13
+
14
+
15
class MultiFacetExporter(BaseExporter):
    """Export library with multiple faceted navigation (subjects, authors, etc.)."""

    def __init__(self, facets: Optional[Dict[str, str]] = None):
        """
        Initialize the multi-facet exporter.

        Args:
            facets: Dictionary mapping facet names to metadata fields
                e.g., {"Subjects": "subjects", "Authors": "creators", "Years": "date"}
        """
        super().__init__()
        self.facets = facets or {
            "Subjects": "subjects",
            "Authors": "creators",
            "Publishers": "publisher",
            "Languages": "language"
        }

    def export(self, library_path: Path, output_path: Path,
               include_files: bool = False,
               create_index: bool = True, **options):
        """Export the library with multi-faceted navigation.

        Args:
            library_path: Library directory to read metadata and files from.
            output_path: Directory the export is written to.
            include_files: Copy entry files when True, symlink them otherwise.
            create_index: Whether to write ``index.html``.
            **options: Accepted for interface compatibility; not used here.
        """
        # Use base class methods
        entries = self.load_metadata(library_path)
        self.prepare_output_directory(output_path)

        # Build facet data
        facet_data = self._build_facet_data(entries)

        # Create _books directory structure (tolerate an existing directory
        # so that re-exporting into the same location does not fail)
        books_dir = output_path / "_books"
        books_dir.mkdir(parents=True, exist_ok=True)

        # Process each entry
        for entry in entries:
            entry_id = entry.get("unique_id", "")
            if not entry_id:
                continue

            # Create entry directory
            entry_dir = books_dir / self._sanitize_filename(entry_id)
            entry_dir.mkdir(exist_ok=True)

            # Use base class file operations
            if include_files:
                self.copy_entry_files(entry, library_path, entry_dir)
            else:
                self.symlink_entry_files(entry, library_path, entry_dir)

            # Write entry metadata using base class method
            self.write_json(entry, entry_dir / "metadata.json")

            # Add computed fields for template; done after write_json so
            # they do not end up in the per-entry metadata.json
            entry["_entry_id"] = entry_id
            entry["_readable_name"] = self.get_readable_name(entry)

        # Create index.html if requested
        if create_index:
            self._create_index_file(output_path, entries, facet_data)

        # Create README using base class method
        stats = {
            'total_entries': len(entries),
            'export_date': datetime.now().isoformat(),
            'export_type': 'Multi-Faceted Export',
            'structure_description': f"Organized by {len(self.facets)} facets with {len(entries)} entries"
        }
        self.create_readme(output_path, stats)

    def _build_facet_data(self, entries: List[Dict]) -> Dict[str, Dict]:
        """Build facet data structure from entries.

        Returns:
            A dict keyed by metadata field name; each value holds the facet's
            ``display_name`` and ``items``, a mapping of value -> entry count.
        """
        facet_data = {}

        for facet_name, field_name in self.facets.items():
            items = defaultdict(int)

            for entry in entries:
                values = entry.get(field_name, [])
                if not isinstance(values, list):
                    values = [values] if values else []

                for value in values:
                    if not value:  # Skip empty values
                        continue
                    if field_name == "date":
                        # Special handling for dates - count by year and
                        # ignore values that do not start with four digits
                        year = str(value)[:4]
                        if year.isdigit():
                            items[year] += 1
                    else:
                        items[str(value)] += 1

            facet_data[field_name] = {
                "display_name": facet_name,
                "items": dict(items)
            }

        return facet_data

    def _create_index_file(self, output_path: Path, entries: List[Dict],
                           facet_data: Dict[str, Dict]):
        """Create the multi-faceted index.html file.

        Args:
            output_path: Export directory; ``index.html`` is written into it.
            entries: Library entries, passed to the template as ``entries``.
            facet_data: Result of ``_build_facet_data``.
        """
        # Prepare entries for JSON
        clean_entries = []
        for entry in entries:
            clean_entry = {}
            for key, value in entry.items():
                if value is None:
                    # str(None) would surface as the text "None" in the page
                    clean_entry[key] = ""
                elif isinstance(value, str):
                    if key == "description":
                        # Strip HTML and limit length
                        value = re.sub(r'<[^>]+>', '', value)
                        if len(value) > 500:
                            value = value[:500] + "..."
                    clean_entry[key] = value
                elif isinstance(value, list):
                    clean_entry[key] = [str(v) for v in value]
                else:
                    clean_entry[key] = str(value)
            clean_entries.append(clean_entry)

        # Use safe JSON encoding for JavaScript embedding
        entries_json = sanitize_entries_for_javascript(clean_entries)

        # Set up Jinja2
        # NOTE(review): this Environment has autoescape off while the raw
        # `entries` are handed to an HTML template - confirm the template
        # escapes them itself (or enable autoescape and mark entries_json safe).
        template_dir = Path(__file__).parent / "templates"
        env = Environment(loader=FileSystemLoader(str(template_dir)))
        template = env.get_template("multi_facet_index.html")

        # Render template with sanitized data
        html_content = template.render(
            title=sanitize_for_html("EBK Library"),
            entries=entries,
            entries_json=entries_json,  # Already sanitized
            facets=facet_data,
            is_subdir=False
        )

        # Write the file
        index_path = output_path / "index.html"
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(html_content)