ebk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ebk might be problematic. Click here for more details.
- ebk/__init__.py +0 -0
- ebk/cli.py +879 -0
- ebk/config.py +35 -0
- ebk/exports/__init__.py +0 -0
- ebk/exports/hugo.py +55 -0
- ebk/exports/zip.py +25 -0
- ebk/extract_metadata.py +273 -0
- ebk/ident.py +96 -0
- ebk/imports/__init__.py +0 -0
- ebk/imports/calibre.py +144 -0
- ebk/imports/ebooks.py +116 -0
- ebk/llm.py +58 -0
- ebk/manager.py +44 -0
- ebk/merge.py +308 -0
- ebk/streamlit/__init__.py +0 -0
- ebk/streamlit/__pycache__/__init__.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/display.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/filters.cpython-310.pyc +0 -0
- ebk/streamlit/__pycache__/utils.cpython-310.pyc +0 -0
- ebk/streamlit/app.py +185 -0
- ebk/streamlit/display.py +168 -0
- ebk/streamlit/filters.py +151 -0
- ebk/streamlit/utils.py +58 -0
- ebk/utils.py +311 -0
- ebk-0.1.0.dist-info/METADATA +457 -0
- ebk-0.1.0.dist-info/RECORD +29 -0
- ebk-0.1.0.dist-info/WHEEL +5 -0
- ebk-0.1.0.dist-info/entry_points.txt +2 -0
- ebk-0.1.0.dist-info/top_level.txt +1 -0
ebk/streamlit/app.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import streamlit as st
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import os
|
|
4
|
+
import logging
|
|
5
|
+
from utils import load_metadata, extract_zip
|
|
6
|
+
from filters import sanitize_dataframe, create_filters
|
|
7
|
+
from display import display_books_tab, display_statistics_tab
|
|
8
|
+
|
|
9
|
+
# Configure root logging once at module import time so every handler created
# below (and in sibling modules run in the same process) shares this format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)

# Commented-out footer kept from an earlier revision; not currently rendered.
#def display_footer():
#    st.markdown("---")
#    st.write("Developed with ❤️ using Streamlit.")
20
|
+
def display_dashboard(metadata_list: list, cover_images: dict, ebook_files: dict):
    """
    Displays the main dashboard with advanced filtering and a compact UI layout using tabs.

    Args:
        metadata_list: List of metadata dicts loaded from `metadata.json`.
        cover_images: Mapping of cover-image basename -> in-memory image data.
        ebook_files: Mapping of ebook basename -> in-memory file data.
    """
    # Convert metadata list to DataFrame
    df = pd.DataFrame(metadata_list)
    logger.debug("Converted metadata list to DataFrame.")

    # Sanitize DataFrame (normalize list/dict/string columns, coerce dates)
    df = sanitize_dataframe(df)
    logger.debug("Sanitized DataFrame.")

    # Apply sidebar filters
    filtered_df = create_filters(df)
    logger.debug("Applied filters to DataFrame.")

    # Create Tabs
    tabs = st.tabs(["📚 Books", "📊 Statistics", "Advanced Search", "📖 Table", "📝 Instructions"])

    with tabs[0]:
        # Display Books
        display_books_tab(filtered_df, cover_images, ebook_files)

    with tabs[1]:
        # Display Statistics
        display_statistics_tab(filtered_df)

    with tabs[2]:
        # Display Advanced Search (queries the raw, unfiltered metadata list)
        display_advanced_search_tab(metadata_list)

    with tabs[3]:
        # Display Table
        display_table_view_tab(filtered_df)

    with tabs[4]:
        # Display Instructions
        # Fixed typos in the user-facing text: "autoatically" -> "automatically",
        # mismatched quotes around `metadata.json`.
        st.header("📝 Instructions")
        st.markdown("""
        1. **Prepare a ZIP Archive** of an ebk library using the following process:
            - Go to the directory containing the desired ebk library (should have `metadata.json` and associated files).
            - Compress the directory into a ZIP archive.
            - The `ebk` CLI tool can also automatically output a ZIP archive,
              e.g., `ebk import calibre <calibre-library> --output.zip`.
        2. **Upload the ZIP Archive** using the uploader below.
        3. **Use the Sidebar** to apply filters and search your library.
        4. **Interact** with the dashboard to view details and download ebooks.
        """)

    # Display Footer
    # display_footer()
|
|
72
|
+
|
|
73
|
+
def main():
    """
    Streamlit entry point: configure the page, accept a ZIP upload of an ebk
    library, extract it in memory, and render the dashboard.
    """
    st.set_page_config(page_title="ebk Dashboard", layout="wide")
    st.title("📚 ebk Dashboard")  # fixed typo: was "Dashoard"
    st.write("""
    Upload a **ZIP archive** containing your `metadata.json`, all associated cover images, and ebook files.
    The app will automatically process and display your library with advanced search and filtering options.
    """)

    # File uploader for ZIP archive
    st.subheader("📁 Upload ZIP Archive")
    zip_file = st.file_uploader(
        label="Upload a ZIP file containing `metadata.json`, cover images, and ebook files",
        type=["zip"],
        key="zip_upload"
    )

    # The limit message below is derived from this constant so the two can
    # never drift apart again (the old code said "1 GB" while enforcing 8 GB).
    MAX_ZIP_SIZE = 8 * 1024 * 1024 * 1024  # 8 GB
    max_gb = MAX_ZIP_SIZE / (1024 ** 3)

    if zip_file:
        # Use the module logger instead of bare print() for consistency.
        logger.info("Uploaded ZIP file: %s (size: %d bytes)", zip_file.name, zip_file.size)
        if zip_file.size > MAX_ZIP_SIZE:
            st.error(f"❌ Uploaded ZIP file is {zip_file.size / 1024 / 1024 / 1024:.2f} GB, which exceeds the size limit of {max_gb:.0f} GB.")
            logger.error("Uploaded ZIP file exceeds the size limit.")
            st.stop()

        with st.spinner("🔄 Extracting and processing ZIP archive..."):
            extracted_files = extract_zip(zip_file)
            if not extracted_files:
                logger.error("No files extracted from the ZIP archive.")
                st.stop()  # Stop if extraction failed

            # Locate metadata.json (case-insensitive search anywhere in the archive)
            metadata_key = next((k for k in extracted_files if os.path.basename(k).lower() == "metadata.json"), None)
            if not metadata_key:
                st.error("❌ `metadata.json` not found in the uploaded ZIP archive.")
                logger.error("`metadata.json` not found in the uploaded ZIP archive.")
                st.stop()

            metadata_content = extracted_files[metadata_key]
            metadata_list = load_metadata(metadata_content)
            if not metadata_list:
                logger.error("Failed to load metadata from `metadata.json`.")
                st.stop()

            # Bucket extracted files into cover images and ebooks by extension.
            cover_images = {}
            ebook_files = {}
            for filename, file_bytes in extracted_files.items():
                lower_filename = filename.lower()
                basename = os.path.basename(filename)
                if lower_filename.endswith(('.jpg', '.jpeg', '.png')):
                    cover_images[basename] = file_bytes
                    logger.debug(f"Added cover image: {basename}")
                elif lower_filename.endswith(('.pdf', '.epub', '.mobi', '.azw3', '.txt')):
                    ebook_files[basename] = file_bytes
                    logger.debug(f"Added ebook file: {basename}")
                else:
                    # Ignore other file types (e.g. metadata.json itself)
                    logger.debug(f"Ignored unsupported file type: {basename}")

            # Inform user about cover images referenced in metadata but missing from the ZIP
            expected_covers = {os.path.basename(md.get("cover_path", "")) for md in metadata_list if md.get("cover_path")}
            uploaded_covers = set(cover_images.keys())
            missing_covers = expected_covers - uploaded_covers
            if missing_covers:
                st.warning(f"⚠️ The following cover images are referenced in `metadata.json` but were not uploaded: {', '.join(missing_covers)}")
                logger.warning(f"Missing cover images: {missing_covers}")

            # Inform user about ebook files referenced in metadata but missing from the ZIP
            expected_ebooks = {os.path.basename(path) for md in metadata_list for path in md.get("file_paths", [])}
            uploaded_ebooks = set(ebook_files.keys())
            missing_ebooks = expected_ebooks - uploaded_ebooks
            if missing_ebooks:
                st.warning(f"⚠️ The following ebook files are referenced in `metadata.json` but were not uploaded: {', '.join(missing_ebooks)}")
                logger.warning(f"Missing ebook files: {missing_ebooks}")

            # Display the dashboard with metadata and cover images
            display_dashboard(metadata_list, cover_images, ebook_files)
    else:
        st.info("📥 Please upload a ZIP archive to get started.")
        logger.debug("No ZIP archive uploaded yet.")
|
|
156
|
+
|
|
157
|
+
def display_table_view_tab(filtered_df: pd.DataFrame):
    """
    Render the Table tab: an interactive, scrollable dataframe view of the
    (already filtered) library metadata.
    """
    st.header("📖 Table")
    st.write("Explore the metadata of your library using the interactive table below.")
    st.dataframe(filtered_df)
|
|
164
|
+
|
|
165
|
+
def display_advanced_search_tab(metadata_list: list):
    """
    Displays the Advanced Search tab: runs a user-supplied JMESPath query
    against the raw metadata list and shows the result.

    Args:
        metadata_list: Unfiltered list of metadata dicts from `metadata.json`.
    """
    import jmespath

    st.header("Advanced Search")
    st.write("Use JMESPath queries to search the metadata list.")
    # Default query fixed: the old default "[].[?date > `2020-01-01`]" is not
    # valid JMESPath. A filter projection over the top-level list is written
    # "[?...]", and 'YYYY-MM-DD' is a raw string literal.
    query = st.text_input("Enter a JMESPath query", "[?date > '2020-01-01']")
    try:
        result = jmespath.search(query, metadata_list)
        st.write("Search Results:")
        st.write(result)
    except Exception as e:
        st.error(f"An error occurred: {e}")
        logger.error(f"JMESPath search error: {e}")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Entry point when the module is executed directly (e.g. via `streamlit run app.py`).
if __name__ == "__main__":
    main()
|
ebk/streamlit/display.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import streamlit as st
|
|
2
|
+
from PIL import Image
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import altair as alt
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
def display_books_tab(filtered_df: pd.DataFrame, cover_images: dict, ebook_files: dict):
    """
    Displays the Books tab with book entries and download/view links.

    Each book is shown in an expander containing the cover image (left column)
    and metadata plus per-file download buttons (right column).

    Args:
        filtered_df: Filtered metadata DataFrame (one row per book).
        cover_images: Mapping of cover-image basename -> BytesIO of image data.
        ebook_files: Mapping of ebook basename -> BytesIO of file data.
    """
    total_size = len(filtered_df)
    st.subheader(f"📚 Book Entries (Total: {total_size})")
    if not filtered_df.empty:
        for idx, row in filtered_df.iterrows():
            with st.expander(f"**{row.get('title', 'No Title')}**"):
                # Layout: Cover Image & Downloads | Metadata
                cols = st.columns([1.5, 3])

                # Left Column: Cover Image
                with cols[0]:
                    # Cover image is looked up by the basename of the
                    # metadata's cover_path (matching how main() keys the dict).
                    cover_path = row.get("cover_path", "")
                    cover_filename = os.path.basename(cover_path)
                    cover_data = cover_images.get(cover_filename)
                    if cover_data:
                        try:
                            # NOTE(review): cover_data is a BytesIO whose read
                            # position persists across Streamlit reruns — confirm
                            # the image still renders on subsequent reruns.
                            image = Image.open(cover_data)
                            st.image(image, use_container_width=True, caption="🖼️ Cover")
                            logger.debug(f"Displayed cover image: {cover_filename}")
                        except Exception as e:
                            st.error(f"🖼️ Error loading image: {e}")
                            logger.error(f"Error loading image {cover_filename}: {e}")
                    else:
                        st.info("🖼️ No cover image available.")
                        logger.debug(f"No cover image available for {cover_filename}.")

                # Right Column: Metadata Details and Ebook Links
                with cols[1]:

                    # show title in a header style
                    title = row.get("title", "No Title")
                    st.markdown(f"# 📖 {title}")

                    # Human-readable label -> formatted value, rendered in order below.
                    metadata_details = {
                        "👤 **Author(s)**": ", ".join(row.get("creators", ["N/A"])),
                        "📚 **Subjects**": ", ".join(row.get("subjects", ["N/A"])),
                        "📝 **Description**": row.get("description", "N/A"),
                        "🌐 **Language**": row.get("language", "N/A"),
                        "📅 **Publication Date**": row.get("date", "N/A") if pd.notna(row.get("date", None)) else "N/A",
                        "📖 **Publisher**": row.get("publisher", "N/A"),
                        "📏 **File Size**": row.get("file_size", "N/A"),
                        "📚 **Virtual Libraries**": ", ".join(row.get("virtual_libs", ["N/A"])),
                        "🔑 **Identifiers**": ", ".join([f"{k}: {v}" for k, v in row.get("identifiers", {}).items()]),
                        "🔑 **Unique ID**": row.get("unique_id", "NA"),
                    }

                    for key, value in metadata_details.items():
                        st.markdown(f"{key}: {value}")

                    # Ebook Download and View Links: one button per referenced file
                    ebook_paths = row.get("file_paths", [])
                    if ebook_paths:
                        st.markdown("### 📥 Ebook Links")
                        for ebook_path in ebook_paths:
                            ebook_filename = os.path.basename(ebook_path)
                            ebook_data = ebook_files.get(ebook_filename)
                            if ebook_data:
                                # Determine MIME type based on file extension
                                _, ext = os.path.splitext(ebook_filename.lower())
                                mime_types = {
                                    '.pdf': 'application/pdf',
                                    '.epub': 'application/epub+zip',
                                    '.mobi': 'application/x-mobipocket-ebook',
                                    '.azw3': 'application/vnd.amazon.ebook',
                                    '.txt': 'text/plain'
                                }
                                mime_type = mime_types.get(ext, 'application/octet-stream')

                                st.download_button(
                                    label=f"💾 Download {ebook_filename}",
                                    data=ebook_data.getvalue(),
                                    file_name=ebook_filename,
                                    mime=mime_type
                                )
                                logger.debug(f"Provided link for {ebook_filename}.")
                            else:
                                st.warning(f"Ebook file '{ebook_filename}' not found in the uploaded ZIP.")
                                logger.warning(f"Ebook file '{ebook_filename}' not found in the uploaded ZIP.")
                    else:
                        st.info("📄 No ebook files available for download.")
                        logger.debug("No ebook files available for download.")
    else:
        st.info("📚 No books match the current filter criteria.")
        logger.debug("No books match the current filter criteria.")
|
|
99
|
+
|
|
100
|
+
def _top10_bar_chart(df: pd.DataFrame, column: str, label: str):
    """Build a bar chart of the 10 most frequent items in a list-valued column."""
    counts = pd.Series([item for items in df[column] for item in items]).value_counts().nlargest(10).reset_index()
    counts.columns = [label, 'Number of Books']
    return alt.Chart(counts).mark_bar().encode(
        x=alt.X('Number of Books:Q', title='Number of Books'),
        y=alt.Y(f'{label}:N', sort='-x', title=label),
        tooltip=[label, 'Number of Books']
    ).properties(
        width=600,
        height=400
    )


def display_statistics_tab(filtered_df: pd.DataFrame):
    """
    Displays the Statistics tab with various visualizations:
    top-10 authors, top-10 subjects, and books published over time.

    The author and subject charts were previously duplicated copy-paste code;
    both now go through _top10_bar_chart().

    Args:
        filtered_df: Filtered metadata DataFrame (one row per book).
    """
    st.subheader("📊 Statistics")

    # Guard clause: nothing to chart for an empty frame.
    if filtered_df.empty:
        st.info("📊 No statistics to display as no books match the current filter criteria.")
        logger.debug("No statistics to display due to empty filtered DataFrame.")
        return

    # Visualization: Books per Author (Top 10)
    st.markdown("### 📈 Top 10 Authors by Number of Books")
    st.altair_chart(_top10_bar_chart(filtered_df, 'creators', 'Author'), use_container_width=True)
    logger.debug("Displayed Top 10 Authors chart.")

    # Visualization: Books per Subject (Top 10)
    st.markdown("### 📊 Top 10 Subjects by Number of Books")
    st.altair_chart(_top10_bar_chart(filtered_df, 'subjects', 'Subject'), use_container_width=True)
    logger.debug("Displayed Top 10 Subjects chart.")

    # Visualization: Books Published Over Time
    st.markdown("### 📈 Books Published Over Time")
    if 'date' in filtered_df.columns and pd.api.types.is_numeric_dtype(filtered_df['date']):
        publication_years = filtered_df['date'].dropna().astype(int)
        if not publication_years.empty:
            year_counts = publication_years.value_counts().sort_index().reset_index()
            year_counts.columns = ['Year', 'Number of Books']

            time_chart = alt.Chart(year_counts).mark_line(point=True).encode(
                x=alt.X('Year:O', title='Year'),
                y=alt.Y('Number of Books:Q', title='Number of Books'),
                tooltip=['Year', 'Number of Books']
            ).properties(
                width=800,
                height=400
            )

            st.altair_chart(time_chart, use_container_width=True)
            logger.debug("Displayed Books Published Over Time chart.")
        else:
            st.info("📅 No publication date data available.")
            logger.warning("Publication year data is empty after filtering.")
    else:
        st.info("📅 Publication date data is not available or not in a numeric format.")
        logger.warning("Publication date data is not available or not numeric.")
|
ebk/streamlit/filters.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import streamlit as st
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Sanitizes the DataFrame by ensuring correct data types and handling missing values.

    Guarantees after this call:
      - 'creators', 'subjects', 'file_paths', 'virtual_libs' exist and hold lists
      - 'identifiers' exists and holds dicts
      - 'language', 'cover_path', 'title', 'description' exist and hold strings
      - 'date' exists and is numeric (NaN where unparseable, None-filled if absent)

    Args:
        df: Raw metadata DataFrame (one row per book).

    Returns:
        The sanitized DataFrame (columns normalized in place).
    """
    log = logging.getLogger(__name__)

    # Columns that must contain lists. Bug fix: 'virtual_libs' is included
    # because create_filters() iterates df['virtual_libs'] and would raise
    # KeyError for metadata that lacks it.
    list_columns = ['creators', 'subjects', 'file_paths', 'virtual_libs']

    def ensure_list(column):
        """Ensure every entry in the column is a list; replace non-lists with []."""
        return column.apply(lambda x: x if isinstance(x, list) else [])

    for col in list_columns:
        if col in df.columns:
            df[col] = ensure_list(df[col])
            log.debug(f"Processed list column: {col}")
        else:
            df[col] = [[] for _ in range(len(df))]
            log.debug(f"Created empty list column: {col}")

    # 'identifiers' must be a dict per row
    if 'identifiers' in df.columns:
        df['identifiers'] = df['identifiers'].apply(lambda x: x if isinstance(x, dict) else {})
        log.debug("Sanitized 'identifiers' column.")
    else:
        df['identifiers'] = [{} for _ in range(len(df))]
        log.debug("Created empty 'identifiers' column.")

    # String columns: coerce non-strings / NaN to ''. This consolidates the
    # previously copy-pasted per-column blocks for language/cover_path/title/
    # description into one loop with identical behavior.
    string_fields = ['language', 'cover_path', 'title', 'description']
    for field in string_fields:
        if field in df.columns:
            df[field] = df[field].apply(lambda x: x if isinstance(x, str) else '').fillna('').astype(str)
            log.debug(f"Sanitized '{field}' column.")
        else:
            df[field] = ['' for _ in range(len(df))]
            log.debug(f"Created empty '{field}' column.")

    # 'date': coerce to numeric (publication year); unparseable values -> NaN
    if 'date' in df.columns:
        df['date'] = pd.to_numeric(df['date'], errors='coerce')
        log.debug("Sanitized 'date' column to ensure numeric types.")
    else:
        df['date'] = [None for _ in range(len(df))]
        log.debug("Created empty 'date' column.")

    return df
|
|
71
|
+
|
|
72
|
+
def create_filters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates and applies advanced filters to the DataFrame based on user inputs.
    Returns the filtered DataFrame.

    Renders title/author/subject/library/language/year/identifier widgets in
    the Streamlit sidebar, then applies every non-empty selection in sequence.

    Args:
        df: Sanitized metadata DataFrame. Assumes 'creators', 'subjects',
            'virtual_libs', 'language', 'title', 'identifiers' columns exist
            (sanitize_dataframe is expected to guarantee this — verify, since
            'virtual_libs' in particular would raise KeyError if missing).

    Returns:
        A filtered copy of df; the input frame is not modified.
    """
    # Sidebar for Filters
    st.sidebar.header("🔍 Filters")

    # Title Search (case-insensitive substring match, applied below)
    title_search = st.sidebar.text_input("🔎 Search by Title")

    # Author Filter (Multi-select over the union of all creators)
    all_creators = sorted(set(creator for creators in df['creators'] for creator in creators))
    selected_authors = st.sidebar.multiselect("👤 Filter by Author(s)", all_creators, default=[])

    # Subjects Filter (Multi-select)
    all_subjects = sorted(set(subject for subjects in df['subjects'] for subject in subjects))
    selected_subjects = st.sidebar.multiselect("📚 Filter by Subject(s)", all_subjects, default=[])

    # Search by Various Libraries
    all_libraries = sorted(set(lib for libs in df['virtual_libs'] for lib in libs))
    selected_libraries = st.sidebar.multiselect("📚 Filter by Virtual Library(s)", all_libraries, default=[])

    # Language Filter (Multi-select; empty-string languages are excluded)
    all_languages = sorted(set(lang for lang in df['language'] if lang))
    selected_languages = st.sidebar.multiselect("🌐 Filter by Language(s)", all_languages, default=[])

    # Publication Date Filter (Range Slider). Only shown when 'date' is a
    # numeric column with at least one non-NaN value; a year of 0 counts as
    # "no data" here because 0 is falsy.
    selected_years = None
    if 'date' in df.columns and pd.api.types.is_numeric_dtype(df['date']):
        min_year = int(df['date'].min()) if pd.notna(df['date'].min()) else 0
        max_year = int(df['date'].max()) if pd.notna(df['date'].max()) else 0
        if min_year and max_year:
            selected_years = st.sidebar.slider("📅 Publication Year Range", min_year, max_year, (min_year, max_year))
            logger.debug(f"Publication year range selected: {selected_years}")
        else:
            st.sidebar.info("📅 No valid publication year data available.")
            logger.warning("Publication year data is not available or entirely NaN.")
    else:
        st.sidebar.info("📅 Publication date data is not available or not in a numeric format.")
        logger.warning("Publication date data is not available or not numeric.")

    # Identifier Search (substring match over "key:value" pairs; case-sensitive)
    identifier_search = st.sidebar.text_input("🔑 Search by Identifier (e.g., ISBN)")

    # Apply Filters — each block narrows filtered_df only when its widget has a value.
    filtered_df = df.copy()

    if title_search:
        filtered_df = filtered_df[filtered_df['title'].str.contains(title_search, case=False, na=False)]
        logger.debug(f"Applied title search filter: '{title_search}'")

    if selected_authors:
        # Keep rows where ANY creator matches a selected author.
        filtered_df = filtered_df[filtered_df['creators'].apply(lambda x: any(creator in selected_authors for creator in x))]
        logger.debug(f"Applied author filter: {selected_authors}")

    if selected_subjects:
        filtered_df = filtered_df[filtered_df['subjects'].apply(lambda x: any(subject in selected_subjects for subject in x))]
        logger.debug(f"Applied subject filter: {selected_subjects}")

    if selected_libraries:
        filtered_df = filtered_df[filtered_df['virtual_libs'].apply(lambda x: any(lib in selected_libraries for lib in x))]
        logger.debug(f"Applied library filter: {selected_libraries}")

    if selected_languages:
        filtered_df = filtered_df[filtered_df['language'].isin(selected_languages)]
        logger.debug(f"Applied language filter: {selected_languages}")

    if selected_years:
        # Inclusive year range; rows with NaN dates drop out of the comparison.
        filtered_df = filtered_df[(filtered_df['date'] >= selected_years[0]) & (filtered_df['date'] <= selected_years[1])]
        logger.debug(f"Applied publication year range filter: {selected_years}")

    if identifier_search:
        # Flatten each identifiers dict to "k:v k:v ..." and substring-match.
        idents = filtered_df['identifiers']
        idents_stringified = idents.apply(
            lambda x: ' '.join(f"{k}:{v}" for k, v in x.items()) if isinstance(x, dict) else str(x)
        )
        filtered_df = filtered_df[idents_stringified.str.contains(identifier_search)]

    return filtered_df
|
ebk/streamlit/utils.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import json
import os
import zipfile
from io import BytesIO
import streamlit as st
import logging
import streamlit as st  # NOTE(review): duplicate import (already imported above) — harmless, but remove one.
from typing import List, Dict  # NOTE(review): appears unused in this module — verify before removing.
from collections import Counter  # NOTE(review): appears unused in this module — verify before removing.
from pathlib import Path  # NOTE(review): appears unused in this module — verify before removing.

# Module-level logger shared by the helpers below.
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
def load_metadata(metadata_content: BytesIO) -> list:
    """
    Parse the uploaded `metadata.json` content.

    Returns the decoded list of metadata dicts, or an empty list when the
    content cannot be parsed (the error is shown in the UI and logged).
    """
    log = logging.getLogger(__name__)
    try:
        records = json.load(metadata_content)
    except json.JSONDecodeError as e:
        st.error(f"JSON decoding error: {e}")
        log.error(f"JSONDecodeError: {e}")
        return []
    except Exception as e:
        st.error(f"Unexpected error loading metadata.json: {e}")
        log.error(f"Unexpected error: {e}")
        return []
    log.debug("Metadata loaded successfully.")
    return records
|
|
31
|
+
|
|
32
|
+
def extract_zip(zip_bytes: BytesIO) -> dict:
|
|
33
|
+
"""
|
|
34
|
+
Extracts a ZIP file in-memory and returns a dictionary of its contents.
|
|
35
|
+
Keys are file names, and values are BytesIO objects containing the file data.
|
|
36
|
+
"""
|
|
37
|
+
extracted_files = {}
|
|
38
|
+
try:
|
|
39
|
+
with zipfile.ZipFile(zip_bytes) as z:
|
|
40
|
+
for file_info in z.infolist():
|
|
41
|
+
if not file_info.is_dir():
|
|
42
|
+
with z.open(file_info) as f:
|
|
43
|
+
normalized_path = os.path.normpath(file_info.filename)
|
|
44
|
+
# Prevent path traversal
|
|
45
|
+
if os.path.commonprefix([normalized_path, os.path.basename(normalized_path)]) != "":
|
|
46
|
+
extracted_files[normalized_path] = BytesIO(f.read())
|
|
47
|
+
logger.debug(f"Extracted: {normalized_path}")
|
|
48
|
+
logger.debug("ZIP archive extracted successfully.")
|
|
49
|
+
return extracted_files
|
|
50
|
+
except zipfile.BadZipFile:
|
|
51
|
+
st.error("The uploaded file is not a valid ZIP archive.")
|
|
52
|
+
logger.error("BadZipFile encountered.")
|
|
53
|
+
return {}
|
|
54
|
+
except Exception as e:
|
|
55
|
+
st.error(f"Error extracting ZIP file: {e}")
|
|
56
|
+
logger.error(f"Exception during ZIP extraction: {e}")
|
|
57
|
+
return {}
|
|
58
|
+
|