deepdiver-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
deepdiver/__init__.py ADDED
@@ -0,0 +1,38 @@
+ """
+ DeepDiver - NotebookLM Podcast Automation System
+
+ A Python-based automation tool for creating podcasts from documents
+ using NotebookLM's Audio Overview feature through browser automation.
+
+ Part of Jerry's G.Music Assembly ecosystem.
+ """
+
+ __version__ = "0.1.0"
+ __author__ = "gerico1007"
+ __email__ = "gerico@jgwill.com"
+ __description__ = "NotebookLM Podcast Automation System"
+
+ # Assembly Team
+ ASSEMBLY_TEAM = {
+     "leader": "Jerry ⚡",
+     "nyro": "♠️ Structural Architect",
+     "aureon": "🌿 Emotional Context",
+     "jamai": "🎸 Musical Harmony",
+     "synth": "🧵 Terminal Orchestration"
+ }
+
+ # Core modules
+ from .deepdive import main
+ from .notebooklm_automator import NotebookLMAutomator
+ from .content_processor import ContentProcessor
+ from .podcast_manager import PodcastManager
+ from .session_tracker import SessionTracker
+
+ __all__ = [
+     "main",
+     "NotebookLMAutomator",
+     "ContentProcessor",
+     "PodcastManager",
+     "SessionTracker",
+     "ASSEMBLY_TEAM"
+ ]
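
The __all__ list above defines the package's public surface. As a minimal, hypothetical usage sketch (assuming deepdiver 0.1.0 is installed and the submodules imported in __init__.py resolve cleanly), a consumer script might look like this:

# Hypothetical usage sketch -- not part of the package; it only relies on the
# names exported by deepdiver/__init__.py above.
from deepdiver import ContentProcessor, ASSEMBLY_TEAM, __version__

print(f"DeepDiver {__version__}, led by {ASSEMBLY_TEAM['leader']}")

processor = ContentProcessor()                       # default settings, no config dict
report = processor.validate_file("notes/brief.md")   # illustrative path
if report["success"]:
    prepared = processor.prepare_content("notes/brief.md")
    print(prepared["processed_path"])
else:
    print(report["errors"])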
deepdiver/content_processor.py ADDED
@@ -0,0 +1,343 @@
+ """
+ Content Processor Module
+ Part of DeepDiver - NotebookLM Podcast Automation System
+
+ This module handles content preparation, formatting, and validation
+ for documents before they are uploaded to NotebookLM.
+
+ Assembly Team: Jerry ⚡, Nyro ♠️, Aureon 🌿, JamAI 🎸, Synth 🧵
+ """
+
+ import logging
+ import os
+ import tempfile
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any, Union
+
+ import yaml
+
+
+ class ContentProcessor:
+     """
+     Handles content preparation and processing for NotebookLM.
+
+     This class manages document validation, formatting, and preparation
+     before upload to NotebookLM for podcast generation.
+     """
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """Initialize the content processor with configuration."""
+         self.config = config or {}
+         self.logger = self._setup_logging()
+
+         # Content settings
+         self.supported_formats = self.config.get('CONTENT_SETTINGS', {}).get(
+             'supported_formats', ['pdf', 'docx', 'txt', 'md', 'html']
+         )
+         self.max_file_size = self.config.get('CONTENT_SETTINGS', {}).get(
+             'max_file_size', '50MB'
+         )
+         self.temp_dir = self.config.get('CONTENT_SETTINGS', {}).get(
+             'temp_dir', './temp'
+         )
+
+         self.logger.info("♠️🌿🎸🧵 ContentProcessor initialized")
+
+     def _setup_logging(self) -> logging.Logger:
+         """Set up logging configuration."""
+         logger = logging.getLogger('ContentProcessor')
+         logger.setLevel(logging.INFO)
+
+         if not logger.handlers:
+             handler = logging.StreamHandler()
+             formatter = logging.Formatter(
+                 '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+             )
+             handler.setFormatter(formatter)
+             logger.addHandler(handler)
+
+         return logger
+
+     def validate_file(self, file_path: str) -> Dict[str, Any]:
+         """
+         Validate a file for NotebookLM upload.
+
+         Args:
+             file_path (str): Path to the file to validate
+
+         Returns:
+             Dict[str, Any]: Validation results with success status and details
+         """
+         result = {
+             'success': False,
+             'file_path': file_path,
+             'file_size': 0,
+             'file_format': None,
+             'errors': [],
+             'warnings': []
+         }
+
+         try:
+             # Check if file exists
+             if not os.path.exists(file_path):
+                 result['errors'].append(f"File not found: {file_path}")
+                 return result
+
+             # Get file info
+             file_path_obj = Path(file_path)
+             result['file_size'] = file_path_obj.stat().st_size
+             result['file_format'] = file_path_obj.suffix.lower().lstrip('.')
+
+             # Check file format
+             if result['file_format'] not in self.supported_formats:
+                 result['errors'].append(
+                     f"Unsupported file format: {result['file_format']}. "
+                     f"Supported formats: {', '.join(self.supported_formats)}"
+                 )
+                 return result
+
+             # Check file size
+             max_size_bytes = self._parse_file_size(self.max_file_size)
+             if result['file_size'] > max_size_bytes:
+                 result['errors'].append(
+                     f"File too large: {self._format_file_size(result['file_size'])}. "
+                     f"Maximum allowed: {self.max_file_size}"
+                 )
+                 return result
+
+             # Check if file is readable
+             try:
+                 with open(file_path, 'rb') as f:
+                     f.read(1024)  # Read first 1KB to test readability
+             except Exception as e:
+                 result['errors'].append(f"File not readable: {e}")
+                 return result
+
+             result['success'] = True
+             self.logger.info(f"✅ File validation successful: {file_path}")
+
+         except Exception as e:
+             result['errors'].append(f"Validation error: {e}")
+             self.logger.error(f"❌ File validation failed: {e}")
+
+         return result
+
+     def _parse_file_size(self, size_str: str) -> int:
+         """Parse file size string to bytes."""
+         size_str = size_str.upper().strip()
+
+         # Check longer suffixes first so 'KB'/'MB'/'GB' are not caught by the bare 'B' rule.
+         multipliers = {
+             'GB': 1024 * 1024 * 1024,
+             'MB': 1024 * 1024,
+             'KB': 1024,
+             'B': 1
+         }
+
+         for suffix, multiplier in multipliers.items():
+             if size_str.endswith(suffix):
+                 try:
+                     number = float(size_str[:-len(suffix)])
+                     return int(number * multiplier)
+                 except ValueError:
+                     break
+
+         # Default to 50MB if parsing fails
+         return 50 * 1024 * 1024
+
+     def _format_file_size(self, size_bytes: int) -> str:
+         """Format file size in bytes to human readable format."""
+         for unit in ['B', 'KB', 'MB', 'GB']:
+             if size_bytes < 1024.0:
+                 return f"{size_bytes:.1f}{unit}"
+             size_bytes /= 1024.0
+         return f"{size_bytes:.1f}TB"
+
+     def prepare_content(self, file_path: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
+         """
+         Prepare content for NotebookLM upload.
+
+         Args:
+             file_path (str): Path to the source file
+             output_dir (str, optional): Directory for processed files
+
+         Returns:
+             Dict[str, Any]: Preparation results with processed file path
+         """
+         result = {
+             'success': False,
+             'original_path': file_path,
+             'processed_path': None,
+             'preparation_steps': [],
+             'errors': []
+         }
+
+         try:
+             # Validate file first
+             validation = self.validate_file(file_path)
+             if not validation['success']:
+                 result['errors'] = validation['errors']
+                 return result
+
+             result['preparation_steps'].append("File validation completed")
+
+             # Determine output directory
+             if output_dir is None:
+                 output_dir = self.temp_dir
+
+             os.makedirs(output_dir, exist_ok=True)
+
+             # Process based on file type
+             file_format = validation['file_format']
+
+             if file_format in ['txt', 'md']:
+                 # Text files can be used directly
+                 result['processed_path'] = file_path
+                 result['preparation_steps'].append("Text file ready for upload")
+
+             elif file_format == 'html':
+                 # HTML files might need cleaning
+                 processed_path = self._process_html_file(file_path, output_dir)
+                 result['processed_path'] = processed_path
+                 result['preparation_steps'].append("HTML file processed")
+
+             elif file_format in ['pdf', 'docx']:
+                 # Binary files can be used directly
+                 result['processed_path'] = file_path
+                 result['preparation_steps'].append("Binary file ready for upload")
+
+             else:
+                 result['errors'].append(f"Unsupported file format for processing: {file_format}")
+                 return result
+
+             result['success'] = True
+             self.logger.info(f"✅ Content preparation successful: {file_path}")
+
+         except Exception as e:
+             result['errors'].append(f"Preparation error: {e}")
+             self.logger.error(f"❌ Content preparation failed: {e}")
+
+         return result
+
+     def _process_html_file(self, file_path: str, output_dir: str) -> str:
+         """Process HTML file for better NotebookLM compatibility."""
+         try:
+             from bs4 import BeautifulSoup
+
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 content = f.read()
+
+             # Parse HTML
+             soup = BeautifulSoup(content, 'html.parser')
+
+             # Remove scripts and styles
+             for script in soup(["script", "style"]):
+                 script.decompose()
+
+             # Get text content
+             text = soup.get_text()
+
+             # Clean up whitespace
+             lines = (line.strip() for line in text.splitlines())
+             chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+             text = ' '.join(chunk for chunk in chunks if chunk)
+
+             # Save processed text
+             output_path = os.path.join(output_dir, f"processed_{Path(file_path).stem}.txt")
+             with open(output_path, 'w', encoding='utf-8') as f:
+                 f.write(text)
+
+             return output_path
+
+         except ImportError:
+             self.logger.warning("BeautifulSoup not available, using original HTML file")
+             return file_path
+         except Exception as e:
+             self.logger.warning(f"HTML processing failed, using original file: {e}")
+             return file_path
+
+     def get_content_info(self, file_path: str) -> Dict[str, Any]:
+         """
+         Get detailed information about a content file.
+
+         Args:
+             file_path (str): Path to the file
+
+         Returns:
+             Dict[str, Any]: File information
+         """
+         info = {
+             'file_path': file_path,
+             'exists': False,
+             'file_size': 0,
+             'file_format': None,
+             'readable': False,
+             'validation': None
+         }
+
+         try:
+             if os.path.exists(file_path):
+                 info['exists'] = True
+
+                 file_path_obj = Path(file_path)
+                 info['file_size'] = file_path_obj.stat().st_size
+                 info['file_format'] = file_path_obj.suffix.lower().lstrip('.')
+
+                 # Test readability
+                 try:
+                     with open(file_path, 'rb') as f:
+                         f.read(1024)
+                     info['readable'] = True
+                 except:
+                     info['readable'] = False
+
+                 # Run validation
+                 info['validation'] = self.validate_file(file_path)
+
+         except Exception as e:
+             self.logger.error(f"Error getting content info: {e}")
+
+         return info
+
+     def cleanup_temp_files(self, temp_dir: Optional[str] = None):
+         """Clean up temporary files."""
+         if temp_dir is None:
+             temp_dir = self.temp_dir
+
+         try:
+             if os.path.exists(temp_dir):
+                 for file in os.listdir(temp_dir):
+                     file_path = os.path.join(temp_dir, file)
+                     if os.path.isfile(file_path):
+                         os.remove(file_path)
+                         self.logger.info(f"Cleaned up temp file: {file_path}")
+         except Exception as e:
+             self.logger.error(f"Error cleaning up temp files: {e}")
+
+
+ # Example usage and testing
+ def test_content_processor():
+     """Test function for content processor."""
+     processor = ContentProcessor()
+
+     # Test with a sample file (if it exists)
+     test_file = "README.md"
+     if os.path.exists(test_file):
+         print(f"Testing with file: {test_file}")
+
+         # Test validation
+         validation = processor.validate_file(test_file)
+         print(f"Validation result: {validation}")
+
+         # Test content info
+         info = processor.get_content_info(test_file)
+         print(f"Content info: {info}")
+
+         # Test preparation
+         preparation = processor.prepare_content(test_file)
+         print(f"Preparation result: {preparation}")
+     else:
+         print("No test file available")
+
+
+ if __name__ == "__main__":
+     test_content_processor()
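
The constructor above reads its settings from a plain dict under the CONTENT_SETTINGS key, and the module-level "import yaml" suggests that this dict is loaded from YAML elsewhere in the package. A hedged sketch of wiring the two together (the inline YAML and the 25MB limit are invented for illustration; only the keys read in ContentProcessor.__init__ come from the code above):

# Illustrative only: the inline YAML and its values are assumptions; the
# CONTENT_SETTINGS keys mirror the defaults read in ContentProcessor.__init__.
import yaml

from deepdiver import ContentProcessor

CONFIG_YAML = """
CONTENT_SETTINGS:
  supported_formats: [pdf, docx, txt, md, html]
  max_file_size: 25MB
  temp_dir: ./temp
"""

config = yaml.safe_load(CONFIG_YAML)
processor = ContentProcessor(config)
print(processor.supported_formats)   # ['pdf', 'docx', 'txt', 'md', 'html']
print(processor.max_file_size)       # '25MB'
print(processor.temp_dir)            # './temp'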