deepdiver 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdiver/__init__.py +38 -0
- deepdiver/content_processor.py +343 -0
- deepdiver/deepdive.py +801 -0
- deepdiver/deepdiver.yaml +79 -0
- deepdiver/notebooklm_automator.py +1441 -0
- deepdiver/podcast_manager.py +402 -0
- deepdiver/session_tracker.py +723 -0
- deepdiver-0.1.0.dist-info/METADATA +455 -0
- deepdiver-0.1.0.dist-info/RECORD +13 -0
- deepdiver-0.1.0.dist-info/WHEEL +5 -0
- deepdiver-0.1.0.dist-info/entry_points.txt +2 -0
- deepdiver-0.1.0.dist-info/licenses/LICENSE +21 -0
- deepdiver-0.1.0.dist-info/top_level.txt +1 -0
deepdiver/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DeepDiver - NotebookLM Podcast Automation System
|
|
3
|
+
|
|
4
|
+
A Python-based automation tool for creating podcasts from documents
|
|
5
|
+
using NotebookLM's Audio Overview feature through browser automation.
|
|
6
|
+
|
|
7
|
+
Part of Jerry's G.Music Assembly ecosystem.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
__author__ = "gerico1007"
|
|
12
|
+
__email__ = "gerico@jgwill.com"
|
|
13
|
+
__description__ = "NotebookLM Podcast Automation System"
|
|
14
|
+
|
|
15
|
+
# Assembly Team
|
|
16
|
+
ASSEMBLY_TEAM = {
|
|
17
|
+
"leader": "Jerry ⚡",
|
|
18
|
+
"nyro": "♠️ Structural Architect",
|
|
19
|
+
"aureon": "🌿 Emotional Context",
|
|
20
|
+
"jamai": "🎸 Musical Harmony",
|
|
21
|
+
"synth": "🧵 Terminal Orchestration"
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
# Core modules
|
|
25
|
+
from .deepdive import main
|
|
26
|
+
from .notebooklm_automator import NotebookLMAutomator
|
|
27
|
+
from .content_processor import ContentProcessor
|
|
28
|
+
from .podcast_manager import PodcastManager
|
|
29
|
+
from .session_tracker import SessionTracker
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"main",
|
|
33
|
+
"NotebookLMAutomator",
|
|
34
|
+
"ContentProcessor",
|
|
35
|
+
"PodcastManager",
|
|
36
|
+
"SessionTracker",
|
|
37
|
+
"ASSEMBLY_TEAM"
|
|
38
|
+
]
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Content Processor Module
|
|
3
|
+
Part of DeepDiver - NotebookLM Podcast Automation System
|
|
4
|
+
|
|
5
|
+
This module handles content preparation, formatting, and validation
|
|
6
|
+
for documents before they are uploaded to NotebookLM.
|
|
7
|
+
|
|
8
|
+
Assembly Team: Jerry ⚡, Nyro ♠️, Aureon 🌿, JamAI 🎸, Synth 🧵
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import tempfile
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Dict, List, Optional, Any, Union
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ContentProcessor:
    """
    Handles content preparation and processing for NotebookLM.

    This class manages document validation, formatting, and preparation
    before upload to NotebookLM for podcast generation.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the content processor with configuration.

        Args:
            config: Optional configuration mapping. Content-related options
                are read from its 'CONTENT_SETTINGS' sub-mapping; sensible
                defaults are used when absent.
        """
        self.config = config or {}
        self.logger = self._setup_logging()

        # Content settings (read once; defaults match the shipped deepdiver.yaml)
        content_settings = self.config.get('CONTENT_SETTINGS', {})
        self.supported_formats = content_settings.get(
            'supported_formats', ['pdf', 'docx', 'txt', 'md', 'html']
        )
        self.max_file_size = content_settings.get('max_file_size', '50MB')
        self.temp_dir = content_settings.get('temp_dir', './temp')

        self.logger.info("♠️🌿🎸🧵 ContentProcessor initialized")

    def _setup_logging(self) -> logging.Logger:
        """Set up and return this component's logger (idempotent)."""
        logger = logging.getLogger('ContentProcessor')
        logger.setLevel(logging.INFO)

        # Only attach a handler once, even if several instances are created.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)

        return logger

    def validate_file(self, file_path: str) -> Dict[str, Any]:
        """
        Validate a file for NotebookLM upload.

        Checks existence, supported format (by extension), size limit, and
        basic readability.

        Args:
            file_path (str): Path to the file to validate

        Returns:
            Dict[str, Any]: Validation results with success status and details
        """
        result = {
            'success': False,
            'file_path': file_path,
            'file_size': 0,
            'file_format': None,
            'errors': [],
            'warnings': []
        }

        try:
            # Check if file exists
            if not os.path.exists(file_path):
                result['errors'].append(f"File not found: {file_path}")
                return result

            # Get file info
            file_path_obj = Path(file_path)
            result['file_size'] = file_path_obj.stat().st_size
            result['file_format'] = file_path_obj.suffix.lower().lstrip('.')

            # Check file format
            if result['file_format'] not in self.supported_formats:
                result['errors'].append(
                    f"Unsupported file format: {result['file_format']}. "
                    f"Supported formats: {', '.join(self.supported_formats)}"
                )
                return result

            # Check file size
            max_size_bytes = self._parse_file_size(self.max_file_size)
            if result['file_size'] > max_size_bytes:
                result['errors'].append(
                    f"File too large: {self._format_file_size(result['file_size'])}. "
                    f"Maximum allowed: {self.max_file_size}"
                )
                return result

            # Check if file is readable
            try:
                with open(file_path, 'rb') as f:
                    f.read(1024)  # Read first 1KB to test readability
            except Exception as e:
                result['errors'].append(f"File not readable: {e}")
                return result

            result['success'] = True
            self.logger.info(f"✅ File validation successful: {file_path}")

        except Exception as e:
            result['errors'].append(f"Validation error: {e}")
            self.logger.error(f"❌ File validation failed: {e}")

        return result

    def _parse_file_size(self, size_str: str) -> int:
        """Parse a human-readable size string (e.g. '50MB') to bytes.

        Falls back to 50MB when the string cannot be parsed.
        """
        size_str = size_str.upper().strip()

        multipliers = {
            'B': 1,
            'KB': 1024,
            'MB': 1024 * 1024,
            'GB': 1024 * 1024 * 1024
        }

        # BUG FIX: check longer suffixes first. 'KB'/'MB'/'GB' all end with
        # 'B', so testing 'B' first mis-parsed every sized string ('10MB' ->
        # float('10M') -> ValueError -> 50MB default), silently ignoring the
        # configured limit.
        for suffix in sorted(multipliers, key=len, reverse=True):
            if size_str.endswith(suffix):
                try:
                    number = float(size_str[:-len(suffix)])
                    return int(number * multipliers[suffix])
                except ValueError:
                    break

        # Default to 50MB if parsing fails
        return 50 * 1024 * 1024

    def _format_file_size(self, size_bytes: int) -> str:
        """Format file size in bytes to human readable format."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.1f}{unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.1f}TB"

    def prepare_content(self, file_path: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
        """
        Prepare content for NotebookLM upload.

        Text and binary documents pass through untouched; HTML is stripped
        to plain text (when BeautifulSoup is available).

        Args:
            file_path (str): Path to the source file
            output_dir (str, optional): Directory for processed files;
                defaults to the configured temp directory.

        Returns:
            Dict[str, Any]: Preparation results with processed file path
        """
        result = {
            'success': False,
            'original_path': file_path,
            'processed_path': None,
            'preparation_steps': [],
            'errors': []
        }

        try:
            # Validate file first
            validation = self.validate_file(file_path)
            if not validation['success']:
                result['errors'] = validation['errors']
                return result

            result['preparation_steps'].append("File validation completed")

            # Determine output directory
            if output_dir is None:
                output_dir = self.temp_dir

            os.makedirs(output_dir, exist_ok=True)

            # Process based on file type
            file_format = validation['file_format']

            if file_format in ['txt', 'md']:
                # Text files can be used directly
                result['processed_path'] = file_path
                result['preparation_steps'].append("Text file ready for upload")

            elif file_format == 'html':
                # HTML files might need cleaning
                processed_path = self._process_html_file(file_path, output_dir)
                result['processed_path'] = processed_path
                result['preparation_steps'].append("HTML file processed")

            elif file_format in ['pdf', 'docx']:
                # Binary files can be used directly
                result['processed_path'] = file_path
                result['preparation_steps'].append("Binary file ready for upload")

            else:
                result['errors'].append(f"Unsupported file format for processing: {file_format}")
                return result

            result['success'] = True
            self.logger.info(f"✅ Content preparation successful: {file_path}")

        except Exception as e:
            result['errors'].append(f"Preparation error: {e}")
            self.logger.error(f"❌ Content preparation failed: {e}")

        return result

    def _process_html_file(self, file_path: str, output_dir: str) -> str:
        """Strip an HTML file to plain text for NotebookLM compatibility.

        Returns the path to the processed .txt file, or the original path
        if BeautifulSoup is unavailable or processing fails (best-effort).
        """
        try:
            # Imported lazily: bs4 is an optional dependency.
            from bs4 import BeautifulSoup

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse HTML
            soup = BeautifulSoup(content, 'html.parser')

            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text content
            text = soup.get_text()

            # Clean up whitespace (collapse blank lines and double spaces)
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            # Save processed text
            output_path = os.path.join(output_dir, f"processed_{Path(file_path).stem}.txt")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)

            return output_path

        except ImportError:
            self.logger.warning("BeautifulSoup not available, using original HTML file")
            return file_path
        except Exception as e:
            self.logger.warning(f"HTML processing failed, using original file: {e}")
            return file_path

    def get_content_info(self, file_path: str) -> Dict[str, Any]:
        """
        Get detailed information about a content file.

        Args:
            file_path (str): Path to the file

        Returns:
            Dict[str, Any]: File information
        """
        info = {
            'file_path': file_path,
            'exists': False,
            'file_size': 0,
            'file_format': None,
            'readable': False,
            'validation': None
        }

        try:
            if os.path.exists(file_path):
                info['exists'] = True

                file_path_obj = Path(file_path)
                info['file_size'] = file_path_obj.stat().st_size
                info['file_format'] = file_path_obj.suffix.lower().lstrip('.')

                # Test readability. Narrowed from a bare `except:`, which
                # also swallowed KeyboardInterrupt/SystemExit.
                try:
                    with open(file_path, 'rb') as f:
                        f.read(1024)
                    info['readable'] = True
                except OSError:
                    info['readable'] = False

                # Run validation
                info['validation'] = self.validate_file(file_path)

        except Exception as e:
            self.logger.error(f"Error getting content info: {e}")

        return info

    def cleanup_temp_files(self, temp_dir: Optional[str] = None):
        """Clean up temporary files (top-level files only, best-effort)."""
        if temp_dir is None:
            temp_dir = self.temp_dir

        try:
            if os.path.exists(temp_dir):
                for file in os.listdir(temp_dir):
                    file_path = os.path.join(temp_dir, file)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                        self.logger.info(f"Cleaned up temp file: {file_path}")
        except Exception as e:
            self.logger.error(f"Error cleaning up temp files: {e}")
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
# Example usage and testing
|
|
318
|
+
def test_content_processor():
    """Smoke-test ContentProcessor against a local README, if one exists."""
    processor = ContentProcessor()

    test_file = "README.md"
    if not os.path.exists(test_file):
        print("No test file available")
        return

    print(f"Testing with file: {test_file}")

    # Run each processing stage in order and echo its raw result.
    stages = [
        ("Validation result", processor.validate_file),
        ("Content info", processor.get_content_info),
        ("Preparation result", processor.prepare_content),
    ]
    for label, stage in stages:
        print(f"{label}: {stage(test_file)}")


if __name__ == "__main__":
    test_content_processor()
|