drupal-news 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. drupal_news/__init__.py +1 -0
  2. drupal_news/ai_summarizer.py +337 -0
  3. drupal_news/cache_manager.py +174 -0
  4. drupal_news/cli.py +41 -0
  5. drupal_news/compile_scss.py +161 -0
  6. drupal_news/data_cleaner.py +167 -0
  7. drupal_news/email_sender.py +265 -0
  8. drupal_news/index.py +426 -0
  9. drupal_news/markdown_converter.py +187 -0
  10. drupal_news/metrics_collector.py +119 -0
  11. drupal_news/pdf_generator.py +259 -0
  12. drupal_news/pipeline_integrity.py +152 -0
  13. drupal_news/process_logger.py +82 -0
  14. drupal_news/rss_reader.py +134 -0
  15. drupal_news/scheduler.py +100 -0
  16. drupal_news/utils/__init__.py +1 -0
  17. drupal_news/utils/dedupe.py +70 -0
  18. drupal_news/utils/html_norm.py +96 -0
  19. drupal_news/utils/io_safe.py +107 -0
  20. drupal_news/utils/md_config_parser.py +116 -0
  21. drupal_news/utils/providers/__init__.py +1 -0
  22. drupal_news/utils/providers/anthropic_client.py +72 -0
  23. drupal_news/utils/providers/deepseek_client.py +78 -0
  24. drupal_news/utils/providers/gemini_client.py +78 -0
  25. drupal_news/utils/providers/generic_client.py +139 -0
  26. drupal_news/utils/providers/grok_client.py +80 -0
  27. drupal_news/utils/providers/lmstudio_client.py +70 -0
  28. drupal_news/utils/providers/ollama_client.py +67 -0
  29. drupal_news/utils/providers/openai_client.py +72 -0
  30. drupal_news/utils/providers/openrouter_client.py +88 -0
  31. drupal_news/utils/providers/qwen_client.py +80 -0
  32. drupal_news/utils/schema.py +87 -0
  33. drupal_news/utils/timebox.py +93 -0
  34. drupal_news/validator.py +153 -0
  35. drupal_news/viewer.py +219 -0
  36. drupal_news/webpage_reader.py +405 -0
  37. drupal_news-0.1.0.dist-info/METADATA +349 -0
  38. drupal_news-0.1.0.dist-info/RECORD +42 -0
  39. drupal_news-0.1.0.dist-info/WHEEL +5 -0
  40. drupal_news-0.1.0.dist-info/entry_points.txt +5 -0
  41. drupal_news-0.1.0.dist-info/licenses/LICENSE.txt +339 -0
  42. drupal_news-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
1
+ """Drupal News Aggregator - Source package."""
@@ -0,0 +1,337 @@
1
+ """AI summarizer for Drupal Newsletter."""
2
+ import importlib
3
+ from typing import List, Dict, Any, Optional
4
+ from pathlib import Path
5
+ import time
6
+ from drupal_news.markdown_converter import items_to_text
7
+
8
+
9
# Built-in fallback prompt: load_prompt_template() returns this text when no
# prompt file can be read. summarize() renders it with str.format(), filling
# the {timeframe_days}, {timezone} and {items_text} placeholders.
SUMMARIZER_PROMPT_TEMPLATE = """
You are a technical writer for the Drupal community. Generate a summary of Drupal news and updates.

**Requirements:**
1. Focus on AI module and news on AI
2. Each fact MUST include a [source](URL) link
3. Use clear, factual language - no hype
4. If no major updates: include "No significant core updates this week"
5. Present RSS/new modules as a table with columns: URL, Name, Description
6. Organize by sections: Core Updates, Modules, AI/Automation, Canvas/Admin UI, Planet, D.O. Blog

**Timeframe:** Last {timeframe_days} days ({timezone})

**Items to summarize:**

{items_text}

Generate the summary in Markdown format with proper sections and source links.
"""
28
+
29
+
30
def load_prompt_template(prompt_file: str = None) -> str:
    """
    Return the prompt template text, preferring a file over the built-in default.

    Args:
        prompt_file: Path to a prompt file. When omitted, ``prompt.md`` located
            two directory levels above this module is used.

    Returns:
        The file's contents when it exists and can be read as UTF-8,
        otherwise ``SUMMARIZER_PROMPT_TEMPLATE``.
    """
    source = prompt_file
    if source is None:
        # No explicit path given: look next to the package directory.
        source = Path(__file__).parent.parent / 'prompt.md'

    candidate = Path(source)
    if not candidate.exists():
        # Nothing on disk, so the hardcoded template is the only option.
        return SUMMARIZER_PROMPT_TEMPLATE

    try:
        contents = candidate.read_text(encoding='utf-8')
    except Exception as e:
        # Reading is best-effort: report the problem and fall back.
        print(f"Warning: Could not read {source}: {e}")
        print("Using default hardcoded prompt template")
        return SUMMARIZER_PROMPT_TEMPLATE

    return contents
55
+
56
+
57
def get_provider_client(provider_name: str, client_name: str = None):
    """
    Dynamically import the client module for an AI provider.

    The module name is ``client_name`` when given, otherwise
    ``<provider_name>_client``; a ``_client`` suffix is appended when the
    chosen name does not already end with it. The module is looked up inside
    the installed package (``drupal_news.utils.providers``) first and, for
    backward compatibility with running from a source checkout, under the
    legacy top-level ``utils.providers`` path second.

    Args:
        provider_name: Provider name (e.g., 'openai', 'anthropic')
        client_name: Optional client name from config (e.g., 'generic_client')

    Returns:
        The imported provider client module. Callers in this file invoke its
        ``generate_summary(**kwargs)`` function.

    Raises:
        ImportError: If the module cannot be imported from either location.
            The error from the package-qualified attempt is reported and
            chained as the cause.
    """
    module_name = client_name if client_name else f"{provider_name}_client"
    # Append the _client suffix when the configured name omits it.
    if not module_name.endswith('_client'):
        module_name = f"{module_name}_client"

    candidates = (
        f"drupal_news.utils.providers.{module_name}",
        f"utils.providers.{module_name}",  # legacy path
    )

    first_error = None
    for dotted_path in candidates:
        try:
            return importlib.import_module(dotted_path)
        except ImportError as e:
            # Keep the first failure: it refers to the canonical location and
            # also surfaces missing dependencies of the provider module itself.
            if first_error is None:
                first_error = e

    raise ImportError(
        f"Provider '{provider_name}' (client: {client_name or provider_name}) not found: {first_error}"
    ) from first_error
78
+
79
+
80
+
81
def calculate_cost(provider: str, provider_config: Dict[str, Any], summary_result: Dict[str, Any]) -> float:
    """
    Estimate the USD cost of a summary run from token usage and configured pricing.

    Args:
        provider: Provider name (not used in the calculation).
        provider_config: Provider configuration; its optional ``pricing``
            mapping may define ``input_cost_per_1k``, ``output_cost_per_1k``
            and ``per_1k_tokens``.
        summary_result: Result dict that may carry ``input_tokens``,
            ``output_tokens`` and ``tokens``.

    Returns:
        The estimated cost rounded to 6 decimals, or 0.0 when no
        configuration or pricing is available.
    """
    pricing = provider_config.get("pricing") if provider_config else None
    if not pricing:
        return 0.0

    cost = 0.0

    # Directional pricing: input first, then output, each only when both the
    # token count and the matching rate are known.
    directional = (
        ("input_tokens", "input_cost_per_1k"),
        ("output_tokens", "output_cost_per_1k"),
    )
    for tokens_key, rate_key in directional:
        used = summary_result.get(tokens_key)
        rate = pricing.get(rate_key)
        if used is not None and rate is not None:
            cost += (used / 1000.0) * float(rate)

    # Blended pricing applies only when the directional pass produced nothing.
    blended_rate = pricing.get("per_1k_tokens")
    if cost == 0.0 and blended_rate is not None:
        total_tokens = summary_result.get("tokens", 0) or 0
        cost += (total_tokens / 1000.0) * float(blended_rate)

    return round(cost, 6)
111
+
112
+
113
+
114
def summarize(
    items: List[Dict[str, Any]],
    provider: str,
    model: str,
    temperature: float,
    timeframe_days: int,
    timezone: str,
    max_items: int = 200,
    chunk_size: int = 200,
    provider_config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Generate AI summary of items.

    The provider client module is loaded dynamically, the prompt template is
    rendered with the timeframe, timezone and item text, and the client's
    ``generate_summary`` is called. When there are more items than
    ``chunk_size`` the work is delegated to ``_summarize_chunked``.

    Args:
        items: List of news items
        provider: AI provider name
        model: Model name
        temperature: Temperature setting
        timeframe_days: Number of days covered
        timezone: Timezone name
        max_items: Maximum items passed to ``items_to_text`` for a
            single-request summary
        chunk_size: Item count above which the input is summarized in chunks
        provider_config: Full provider configuration; ``client``, ``api_url``,
            ``headers`` and ``pricing`` are read from it when present

    Returns:
        The client's result dictionary with ``provider`` and ``model`` filled
        in when missing, plus ``duration`` (seconds) and ``cost``.

    Raises:
        ImportError: If the provider client module cannot be loaded.
        RuntimeError: If the single-request call to the provider fails; the
            original exception is chained as the cause.
    """
    start_time = time.time()

    # Load provider client, honouring an explicit client name from the config
    client_name = provider_config.get("client") if provider_config else None
    client = get_provider_client(provider, client_name)

    # Load prompt template from file or use default
    template = load_prompt_template()

    def render_prompt(items_text_value: str) -> str:
        return template.format(
            timeframe_days=timeframe_days,
            timezone=timezone,
            items_text=items_text_value
        )

    # Keyword arguments shared by every request (prompt injected per request)
    base_kwargs = {
        "model": model,
        "temperature": temperature
    }

    # Add optional provider-specific settings
    if provider_config:
        for optional_key in ("api_url", "headers"):
            if optional_key in provider_config:
                base_kwargs[optional_key] = provider_config[optional_key]

    if len(items) > chunk_size:
        # NOTE(review): the chunked path renders every chunk in full and does
        # not apply max_items - confirm that this is intended.
        summary_result = _summarize_chunked(
            client,
            items,
            render_prompt,
            chunk_size,
            base_kwargs
        )
    else:
        # Render the full prompt only on the path that actually sends it.
        request_kwargs = dict(base_kwargs)
        request_kwargs["prompt"] = render_prompt(items_to_text(items, max_items))
        try:
            summary_result = client.generate_summary(**request_kwargs)
        except Exception as e:
            # Chain the cause so the provider's traceback is not lost.
            raise RuntimeError(f"Summarization failed: {str(e)}") from e

    # Add metadata
    summary_result.setdefault("provider", provider)
    summary_result.setdefault("model", model)
    summary_result["duration"] = time.time() - start_time
    summary_result["cost"] = calculate_cost(provider, provider_config or {}, summary_result)

    return summary_result
201
+
202
+
203
def _summarize_chunked(
    client,
    items: List[Dict[str, Any]],
    render_prompt,
    chunk_size: int,
    base_kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Summarize items in chunks for large datasets.

    Each slice of ``chunk_size`` items is rendered into its own prompt and
    sent to ``client.generate_summary``. A failing chunk is reported with a
    warning and skipped; the texts of the remaining chunks are joined with
    blank lines and token counts are summed.

    Args:
        client: Provider client module exposing ``generate_summary``
        items: List of news items
        render_prompt: Callable turning item text into a full prompt
        chunk_size: Number of items per request
        base_kwargs: Keyword arguments shared by all requests

    Returns:
        Dictionary with ``text``, ``tokens``, ``model``, ``provider`` and
        ``chunked``; ``input_tokens`` / ``output_tokens`` are included only
        when non-zero.

    Raises:
        RuntimeError: If at least one chunk was attempted and every chunk
            failed. Returning an empty summary in that case would stop
            callers from falling back to another provider.
    """
    summaries = []
    total_tokens = 0
    total_input_tokens = 0
    total_output_tokens = 0
    attempted = 0
    succeeded = 0
    last_error = None

    for chunk_number, start in enumerate(range(0, len(items), chunk_size), start=1):
        chunk = items[start:start + chunk_size]
        chunk_kwargs = dict(base_kwargs)
        chunk_kwargs["prompt"] = render_prompt(items_to_text(chunk))
        attempted += 1

        try:
            result = client.generate_summary(**chunk_kwargs)
            summaries.append(result.get("text", ""))
            total_tokens += result.get("tokens", 0) or 0
            total_input_tokens += result.get("input_tokens") or 0
            total_output_tokens += result.get("output_tokens") or 0
        except Exception as e:
            # Best-effort: one bad chunk must not discard the others.
            last_error = e
            print(f"Warning: Chunk {chunk_number} failed: {e}")
        else:
            succeeded += 1

    if attempted and not succeeded:
        raise RuntimeError(
            f"Summarization failed: all {attempted} chunks failed. Last error: {last_error}"
        ) from last_error

    # Drop empty chunk texts so the join does not produce stray blank blocks.
    combined_text = "\n\n".join(filter(None, summaries))

    summary = {
        "text": combined_text,
        "tokens": total_tokens,
        "model": base_kwargs.get("model"),
        # Derived from the client module name, e.g. "...openai_client" -> "openai"
        "provider": client.__name__.split(".")[-1].replace("_client", ""),
        "chunked": True
    }

    if total_input_tokens:
        summary["input_tokens"] = total_input_tokens
    if total_output_tokens:
        summary["output_tokens"] = total_output_tokens

    return summary
246
+
247
+
248
def summarize_with_fallback(
    items: List[Dict[str, Any]],
    providers_config: Dict[str, Any],
    default_provider: str,
    timeframe_days: int,
    timezone: str,
    fallback_order: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Summarize with automatic fallback to other providers.

    The default provider is tried first, followed by the remaining entries of
    ``fallback_order``. Providers without an entry under
    ``providers_config["providers"]`` are skipped.

    Args:
        items: List of news items
        providers_config: Providers configuration (reads the ``providers`` mapping)
        default_provider: Default provider to try first
        timeframe_days: Days covered
        timezone: Timezone name
        fallback_order: Optional custom fallback order

    Returns:
        Summary result of the first provider that succeeds

    Raises:
        RuntimeError: If none of the candidate providers is configured, or if
            every configured provider failed (the last failure is chained as
            the cause).
    """
    if fallback_order is None:
        fallback_order = ["openai", "anthropic", "ollama", "qwen", "openrouter"]

    # Try default provider first
    provider_list = [default_provider] + [p for p in fallback_order if p != default_provider]

    # Tolerate a missing or null "providers" section.
    configured = providers_config.get("providers") or {}

    attempted = []
    last_error = None

    for provider_name in provider_list:
        provider_config = configured.get(provider_name)

        if not provider_config:
            continue

        attempted.append(provider_name)

        try:
            # A missing "model" key raises here and counts as a provider failure.
            return summarize(
                items=items,
                provider=provider_name,
                model=provider_config["model"],
                temperature=provider_config.get("temperature", 0.2),
                timeframe_days=timeframe_days,
                timezone=timezone,
                provider_config=provider_config
            )
        except Exception as e:
            last_error = e
            print(f"Provider {provider_name} failed: {e}")

    if not attempted:
        # Nothing was tried at all: say so instead of "Last error: None".
        looked_for = ", ".join(str(name) for name in provider_list)
        raise RuntimeError(f"No configured providers found (looked for: {looked_for})")

    # All providers failed
    raise RuntimeError(f"All providers failed. Last error: {last_error}") from last_error
304
+
305
+
306
def generate_placeholder_summary(items: List[Dict[str, Any]], timeframe_days: int) -> str:
    """
    Generate placeholder summary for dry-run mode.

    Args:
        items: List of items; entries whose ``source_type`` is ``'rss'`` are
            counted as new modules
        timeframe_days: Days covered

    Returns:
        Placeholder markdown summary
    """
    # Count without building a throwaway list.
    rss_count = sum(1 for item in items if item.get('source_type') == 'rss')

    return f"""# Drupal Newsletter Summary (Dry Run)

**Timeframe:** Last {timeframe_days} days

## Summary

This is a placeholder summary for dry-run mode.
Total items collected: {len(items)}

### Core Updates
No significant core updates this week.

### New Modules
{rss_count} new modules found.

### AI and Automation
No major AI updates this week.

---
*This is a dry-run summary. Enable AI provider for actual summaries.*
"""
@@ -0,0 +1,174 @@
1
+ """Cache manager for Drupal Newsletter using SQLite."""
2
+ import sqlite3
3
+ from pathlib import Path
4
+ from typing import Optional, Dict, Any
5
+ from datetime import datetime, timedelta
6
+ import json
7
+ import hashlib
8
+
9
+
10
class CacheManager:
    """
    Manages persistent caching with SQLite.

    Entries are keyed by the SHA-256 hex digest of a URL and hold a
    JSON-serialized value plus creation and expiry timestamps. Timestamps are
    naive ISO-8601 strings produced by ``datetime.now()``. Every operation
    opens a short-lived connection that is closed even when a statement fails.
    """

    def __init__(self, db_path: str = "./cache/cache.db", ttl_days: int = 21):
        """
        Initialize cache manager and make sure the schema exists.

        Args:
            db_path: Path to SQLite database
            ttl_days: Time-to-live for cache entries in days
        """
        self.db_path = Path(db_path)
        self.ttl_days = ttl_days
        self._init_db()

    def _run(self, sql: str, params: tuple = ()) -> tuple:
        """
        Execute one statement on its own connection and commit it.

        Args:
            sql: SQL statement with ``?`` placeholders
            params: Values bound to the placeholders

        Returns:
            ``(first_row, rowcount)``; ``first_row`` is None when the
            statement yields no rows.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute(sql, params)
            first_row = cursor.fetchone()
            affected = cursor.rowcount
            conn.commit()
            return first_row, affected
        finally:
            # Always release the file handle, including on SQL errors.
            conn.close()

    def _init_db(self):
        """Create the database directory, the cache table and its expiry index."""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        self._run("""
            CREATE TABLE IF NOT EXISTS cache (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                created_at TEXT NOT NULL,
                expires_at TEXT NOT NULL
            )
        """)

        self._run("""
            CREATE INDEX IF NOT EXISTS idx_expires_at ON cache(expires_at)
        """)

    def _compute_key(self, url: str) -> str:
        """Compute cache key (SHA-256 hex digest) from URL."""
        return hashlib.sha256(url.encode()).hexdigest()

    def get(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get cached value for URL.

        Expired entries, and entries whose expiry timestamp cannot be parsed,
        are deleted as a side effect.

        Returns:
            Cached value if found and not expired, None otherwise (also None
            when the stored value is not valid JSON)
        """
        row, _ = self._run(
            "SELECT value, expires_at FROM cache WHERE key = ?",
            (self._compute_key(url),),
        )

        if not row:
            return None

        value_json, expires_at = row

        try:
            expires_dt = datetime.fromisoformat(expires_at)
        except (TypeError, ValueError):
            # Unreadable expiry: treat the entry as corrupt, not as a crash.
            self.delete(url)
            return None

        # Check expiration
        if datetime.now() > expires_dt:
            self.delete(url)
            return None

        try:
            return json.loads(value_json)
        except json.JSONDecodeError:
            return None

    def set(self, url: str, value: Dict[str, Any]):
        """
        Set cached value for URL, replacing any existing entry.

        Args:
            url: URL to cache
            value: Dictionary value to cache (must be JSON-serializable)
        """
        # One clock reading so created_at and expires_at differ by exactly the TTL.
        now = datetime.now()
        created_at = now.isoformat()
        expires_at = (now + timedelta(days=self.ttl_days)).isoformat()

        value_json = json.dumps(value, ensure_ascii=False)

        self._run(
            """
            INSERT OR REPLACE INTO cache (key, value, created_at, expires_at)
            VALUES (?, ?, ?, ?)
            """,
            (self._compute_key(url), value_json, created_at, expires_at),
        )

    def delete(self, url: str):
        """Delete cached value for URL."""
        self._run("DELETE FROM cache WHERE key = ?", (self._compute_key(url),))

    def purge_expired(self) -> int:
        """
        Remove all expired cache entries.

        Returns:
            Number of entries removed
        """
        now = datetime.now().isoformat()
        _, deleted_count = self._run("DELETE FROM cache WHERE expires_at < ?", (now,))
        return deleted_count

    def clear_all(self):
        """Clear all cache entries."""
        self._run("DELETE FROM cache")

    def get_stats(self) -> Dict[str, int]:
        """
        Get cache statistics.

        Returns:
            Dictionary with total, expired, and valid counts
        """
        now = datetime.now().isoformat()
        # Single query so both counts come from the same table state; SUM over
        # an empty table is NULL, hence the COALESCE.
        row, _ = self._run(
            "SELECT COUNT(*), COALESCE(SUM(expires_at < ?), 0) FROM cache",
            (now,),
        )
        total, expired = row

        return {
            "total": total,
            "expired": expired,
            "valid": total - expired
        }
drupal_news/cli.py ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI entry points for drupal-news package.
4
+ """
5
+ import sys
6
+
7
+
8
def main():
    """Run the ``drupal-news`` command and exit with the code it returns."""
    # Imported at call time rather than when this module is loaded.
    from drupal_news.index import main as run_index

    exit_status = run_index()
    sys.exit(exit_status)
12
+
13
+
14
def scheduler_main():
    """Run the ``drupal-news-scheduler`` command and exit with the code it returns."""
    # Imported at call time rather than when this module is loaded.
    from drupal_news.scheduler import main as run_scheduler

    exit_status = run_scheduler()
    sys.exit(exit_status)
18
+
19
+
20
def email_main():
    """Run the ``drupal-news-email`` command and exit with the code it returns."""
    # Imported at call time rather than when this module is loaded.
    from drupal_news.email_sender import main as run_email

    exit_status = run_email()
    sys.exit(exit_status)
24
+
25
+
26
def viewer_main(host: str = '0.0.0.0', port: int = 5000):
    """
    Web viewer CLI entry point for drupal-news-viewer command.

    Prints a start-up banner and runs the ``app`` object imported from
    ``drupal_news.viewer`` with debugging disabled.

    Args:
        host: Interface to bind to. The default keeps the existing behaviour
            of listening on all interfaces.
        port: TCP port to listen on.
    """
    from drupal_news.viewer import app

    # NOTE(review): binding to 0.0.0.0 makes the viewer reachable from other
    # machines on the network although the banner advertises localhost -
    # confirm this exposure is intended, or pass host='127.0.0.1'.
    shown_host = "localhost" if host in ("0.0.0.0", "::") else host
    rule = "=" * 60

    print(rule)
    print("Drupal News Viewer")
    print(rule)
    print(f"Starting server on http://{shown_host}:{port}")
    print("Press Ctrl+C to stop")
    print(rule)

    app.run(debug=False, host=host, port=port)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()