agent_recipes-0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_recipes/__init__.py +27 -0
- agent_recipes/recipe_runtime/__init__.py +28 -0
- agent_recipes/recipe_runtime/core.py +385 -0
- agent_recipes/templates/ai-ab-hook-tester/recipe.yaml +45 -0
- agent_recipes/templates/ai-ab-hook-tester/tools.py +169 -0
- agent_recipes/templates/ai-angle-generator/recipe.yaml +49 -0
- agent_recipes/templates/ai-angle-generator/tools.py +182 -0
- agent_recipes/templates/ai-api-doc-generator/README.md +59 -0
- agent_recipes/templates/ai-api-doc-generator/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-api-tester/README.md +60 -0
- agent_recipes/templates/ai-api-tester/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-audio-enhancer/README.md +59 -0
- agent_recipes/templates/ai-audio-enhancer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-audio-normalizer/README.md +13 -0
- agent_recipes/templates/ai-audio-normalizer/TEMPLATE.yaml +44 -0
- agent_recipes/templates/ai-audio-splitter/README.md +14 -0
- agent_recipes/templates/ai-audio-splitter/TEMPLATE.yaml +47 -0
- agent_recipes/templates/ai-background-music-generator/README.md +59 -0
- agent_recipes/templates/ai-background-music-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-background-remover/README.md +60 -0
- agent_recipes/templates/ai-background-remover/TEMPLATE.yaml +27 -0
- agent_recipes/templates/ai-barcode-scanner/README.md +60 -0
- agent_recipes/templates/ai-barcode-scanner/TEMPLATE.yaml +26 -0
- agent_recipes/templates/ai-blog-generator/README.md +59 -0
- agent_recipes/templates/ai-blog-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-brief-generator/recipe.yaml +52 -0
- agent_recipes/templates/ai-brief-generator/tools.py +231 -0
- agent_recipes/templates/ai-broll-builder/recipe.yaml +47 -0
- agent_recipes/templates/ai-broll-builder/tools.py +204 -0
- agent_recipes/templates/ai-calendar-scheduler/README.md +60 -0
- agent_recipes/templates/ai-calendar-scheduler/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-changelog-generator/README.md +14 -0
- agent_recipes/templates/ai-changelog-generator/TEMPLATE.yaml +46 -0
- agent_recipes/templates/ai-chart-generator/README.md +61 -0
- agent_recipes/templates/ai-chart-generator/TEMPLATE.yaml +32 -0
- agent_recipes/templates/ai-code-documenter/README.md +12 -0
- agent_recipes/templates/ai-code-documenter/TEMPLATE.yaml +37 -0
- agent_recipes/templates/ai-code-refactorer/README.md +59 -0
- agent_recipes/templates/ai-code-refactorer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-code-reviewer/README.md +59 -0
- agent_recipes/templates/ai-code-reviewer/TEMPLATE.yaml +31 -0
- agent_recipes/templates/ai-color-palette-extractor/README.md +60 -0
- agent_recipes/templates/ai-color-palette-extractor/TEMPLATE.yaml +27 -0
- agent_recipes/templates/ai-comment-miner/recipe.yaml +40 -0
- agent_recipes/templates/ai-comment-miner/tools.py +141 -0
- agent_recipes/templates/ai-commit-message-generator/README.md +59 -0
- agent_recipes/templates/ai-commit-message-generator/TEMPLATE.yaml +31 -0
- agent_recipes/templates/ai-content-calendar/recipe.yaml +43 -0
- agent_recipes/templates/ai-content-calendar/tools.py +170 -0
- agent_recipes/templates/ai-context-enricher/recipe.yaml +48 -0
- agent_recipes/templates/ai-context-enricher/tools.py +258 -0
- agent_recipes/templates/ai-contract-analyzer/README.md +60 -0
- agent_recipes/templates/ai-contract-analyzer/TEMPLATE.yaml +34 -0
- agent_recipes/templates/ai-csv-cleaner/README.md +13 -0
- agent_recipes/templates/ai-csv-cleaner/TEMPLATE.yaml +45 -0
- agent_recipes/templates/ai-cta-generator/recipe.yaml +54 -0
- agent_recipes/templates/ai-cta-generator/tools.py +174 -0
- agent_recipes/templates/ai-daily-news-show/recipe.yaml +103 -0
- agent_recipes/templates/ai-daily-news-show/tools.py +308 -0
- agent_recipes/templates/ai-data-anonymizer/README.md +60 -0
- agent_recipes/templates/ai-data-anonymizer/TEMPLATE.yaml +31 -0
- agent_recipes/templates/ai-data-profiler/README.md +14 -0
- agent_recipes/templates/ai-data-profiler/TEMPLATE.yaml +42 -0
- agent_recipes/templates/ai-dependency-auditor/README.md +12 -0
- agent_recipes/templates/ai-dependency-auditor/TEMPLATE.yaml +37 -0
- agent_recipes/templates/ai-doc-translator/README.md +12 -0
- agent_recipes/templates/ai-doc-translator/TEMPLATE.yaml +41 -0
- agent_recipes/templates/ai-duplicate-finder/README.md +59 -0
- agent_recipes/templates/ai-duplicate-finder/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-ebook-converter/README.md +60 -0
- agent_recipes/templates/ai-ebook-converter/TEMPLATE.yaml +27 -0
- agent_recipes/templates/ai-email-parser/README.md +59 -0
- agent_recipes/templates/ai-email-parser/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-etl-pipeline/README.md +60 -0
- agent_recipes/templates/ai-etl-pipeline/TEMPLATE.yaml +30 -0
- agent_recipes/templates/ai-excel-formula-generator/README.md +59 -0
- agent_recipes/templates/ai-excel-formula-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-face-blur/README.md +60 -0
- agent_recipes/templates/ai-face-blur/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-fact-checker/recipe.yaml +52 -0
- agent_recipes/templates/ai-fact-checker/tools.py +279 -0
- agent_recipes/templates/ai-faq-generator/README.md +59 -0
- agent_recipes/templates/ai-faq-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-file-organizer/README.md +59 -0
- agent_recipes/templates/ai-file-organizer/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-folder-packager/README.md +15 -0
- agent_recipes/templates/ai-folder-packager/TEMPLATE.yaml +48 -0
- agent_recipes/templates/ai-form-filler/README.md +60 -0
- agent_recipes/templates/ai-form-filler/TEMPLATE.yaml +30 -0
- agent_recipes/templates/ai-hashtag-optimizer/recipe.yaml +45 -0
- agent_recipes/templates/ai-hashtag-optimizer/tools.py +134 -0
- agent_recipes/templates/ai-hook-generator/recipe.yaml +50 -0
- agent_recipes/templates/ai-hook-generator/tools.py +177 -0
- agent_recipes/templates/ai-image-captioner/README.md +59 -0
- agent_recipes/templates/ai-image-captioner/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-image-cataloger/README.md +13 -0
- agent_recipes/templates/ai-image-cataloger/TEMPLATE.yaml +39 -0
- agent_recipes/templates/ai-image-optimizer/README.md +13 -0
- agent_recipes/templates/ai-image-optimizer/TEMPLATE.yaml +43 -0
- agent_recipes/templates/ai-image-resizer/README.md +12 -0
- agent_recipes/templates/ai-image-resizer/TEMPLATE.yaml +39 -0
- agent_recipes/templates/ai-image-tagger/README.md +59 -0
- agent_recipes/templates/ai-image-tagger/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-image-upscaler/README.md +60 -0
- agent_recipes/templates/ai-image-upscaler/TEMPLATE.yaml +27 -0
- agent_recipes/templates/ai-invoice-processor/README.md +60 -0
- agent_recipes/templates/ai-invoice-processor/TEMPLATE.yaml +34 -0
- agent_recipes/templates/ai-json-to-csv/README.md +12 -0
- agent_recipes/templates/ai-json-to-csv/TEMPLATE.yaml +36 -0
- agent_recipes/templates/ai-log-analyzer/README.md +59 -0
- agent_recipes/templates/ai-log-analyzer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-markdown-to-pdf/README.md +12 -0
- agent_recipes/templates/ai-markdown-to-pdf/TEMPLATE.yaml +40 -0
- agent_recipes/templates/ai-meeting-summarizer/README.md +59 -0
- agent_recipes/templates/ai-meeting-summarizer/TEMPLATE.yaml +32 -0
- agent_recipes/templates/ai-meta-tag-generator/README.md +59 -0
- agent_recipes/templates/ai-meta-tag-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-news-capture-pack/recipe.yaml +42 -0
- agent_recipes/templates/ai-news-capture-pack/tools.py +150 -0
- agent_recipes/templates/ai-news-crawler/recipe.yaml +99 -0
- agent_recipes/templates/ai-news-crawler/tools.py +417 -0
- agent_recipes/templates/ai-news-deduper/recipe.yaml +47 -0
- agent_recipes/templates/ai-news-deduper/tools.py +235 -0
- agent_recipes/templates/ai-newsletter-generator/README.md +59 -0
- agent_recipes/templates/ai-newsletter-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-note-summarizer/README.md +59 -0
- agent_recipes/templates/ai-note-summarizer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-pdf-summarizer/README.md +12 -0
- agent_recipes/templates/ai-pdf-summarizer/TEMPLATE.yaml +40 -0
- agent_recipes/templates/ai-pdf-to-markdown/README.md +19 -0
- agent_recipes/templates/ai-pdf-to-markdown/TEMPLATE.yaml +63 -0
- agent_recipes/templates/ai-performance-analyzer/recipe.yaml +45 -0
- agent_recipes/templates/ai-performance-analyzer/tools.py +159 -0
- agent_recipes/templates/ai-podcast-cleaner/README.md +117 -0
- agent_recipes/templates/ai-podcast-cleaner/TEMPLATE.yaml +117 -0
- agent_recipes/templates/ai-podcast-cleaner/agents.yaml +59 -0
- agent_recipes/templates/ai-podcast-cleaner/workflow.yaml +77 -0
- agent_recipes/templates/ai-podcast-transcriber/README.md +59 -0
- agent_recipes/templates/ai-podcast-transcriber/TEMPLATE.yaml +32 -0
- agent_recipes/templates/ai-post-copy-generator/recipe.yaml +41 -0
- agent_recipes/templates/ai-post-copy-generator/tools.py +105 -0
- agent_recipes/templates/ai-product-description-generator/README.md +59 -0
- agent_recipes/templates/ai-product-description-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-publisher-pack/recipe.yaml +44 -0
- agent_recipes/templates/ai-publisher-pack/tools.py +252 -0
- agent_recipes/templates/ai-qr-code-generator/README.md +60 -0
- agent_recipes/templates/ai-qr-code-generator/TEMPLATE.yaml +26 -0
- agent_recipes/templates/ai-regex-generator/README.md +59 -0
- agent_recipes/templates/ai-regex-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-repo-readme/README.md +13 -0
- agent_recipes/templates/ai-repo-readme/TEMPLATE.yaml +42 -0
- agent_recipes/templates/ai-report-generator/README.md +61 -0
- agent_recipes/templates/ai-report-generator/TEMPLATE.yaml +32 -0
- agent_recipes/templates/ai-resume-parser/README.md +60 -0
- agent_recipes/templates/ai-resume-parser/TEMPLATE.yaml +33 -0
- agent_recipes/templates/ai-rss-aggregator/README.md +60 -0
- agent_recipes/templates/ai-rss-aggregator/TEMPLATE.yaml +30 -0
- agent_recipes/templates/ai-schema-generator/README.md +12 -0
- agent_recipes/templates/ai-schema-generator/TEMPLATE.yaml +34 -0
- agent_recipes/templates/ai-screen-recorder/recipe.yaml +43 -0
- agent_recipes/templates/ai-screen-recorder/tools.py +184 -0
- agent_recipes/templates/ai-screenshot-capture/recipe.yaml +45 -0
- agent_recipes/templates/ai-screenshot-capture/tools.py +231 -0
- agent_recipes/templates/ai-screenshot-ocr/README.md +12 -0
- agent_recipes/templates/ai-screenshot-ocr/TEMPLATE.yaml +37 -0
- agent_recipes/templates/ai-script-writer/recipe.yaml +58 -0
- agent_recipes/templates/ai-script-writer/tools.py +297 -0
- agent_recipes/templates/ai-sentiment-analyzer/README.md +59 -0
- agent_recipes/templates/ai-sentiment-analyzer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-seo-optimizer/README.md +59 -0
- agent_recipes/templates/ai-seo-optimizer/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-signal-ranker/recipe.yaml +54 -0
- agent_recipes/templates/ai-signal-ranker/tools.py +256 -0
- agent_recipes/templates/ai-sitemap-generator/README.md +59 -0
- agent_recipes/templates/ai-sitemap-generator/TEMPLATE.yaml +26 -0
- agent_recipes/templates/ai-sitemap-scraper/README.md +13 -0
- agent_recipes/templates/ai-sitemap-scraper/TEMPLATE.yaml +41 -0
- agent_recipes/templates/ai-slide-generator/README.md +60 -0
- agent_recipes/templates/ai-slide-generator/TEMPLATE.yaml +29 -0
- agent_recipes/templates/ai-slide-to-notes/README.md +12 -0
- agent_recipes/templates/ai-slide-to-notes/TEMPLATE.yaml +37 -0
- agent_recipes/templates/ai-social-media-generator/README.md +59 -0
- agent_recipes/templates/ai-social-media-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-sql-generator/README.md +59 -0
- agent_recipes/templates/ai-sql-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-subtitle-generator/README.md +59 -0
- agent_recipes/templates/ai-subtitle-generator/TEMPLATE.yaml +31 -0
- agent_recipes/templates/ai-test-generator/README.md +59 -0
- agent_recipes/templates/ai-test-generator/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-translation-batch/README.md +59 -0
- agent_recipes/templates/ai-translation-batch/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-url-to-markdown/README.md +14 -0
- agent_recipes/templates/ai-url-to-markdown/TEMPLATE.yaml +44 -0
- agent_recipes/templates/ai-video-chapter-generator/README.md +59 -0
- agent_recipes/templates/ai-video-chapter-generator/TEMPLATE.yaml +32 -0
- agent_recipes/templates/ai-video-compressor/README.md +59 -0
- agent_recipes/templates/ai-video-compressor/TEMPLATE.yaml +28 -0
- agent_recipes/templates/ai-video-editor/README.md +254 -0
- agent_recipes/templates/ai-video-editor/TEMPLATE.yaml +139 -0
- agent_recipes/templates/ai-video-editor/agents.yaml +36 -0
- agent_recipes/templates/ai-video-editor/requirements.txt +8 -0
- agent_recipes/templates/ai-video-editor/scripts/run.sh +10 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__init__.py +45 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__main__.py +8 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/__init__.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/cli.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/config.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/ffmpeg_probe.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/heuristics.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/llm_plan.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/models.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/pipeline.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/render.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/timeline.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/transcribe.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/__pycache__/utils.cpython-312.pyc +0 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/cli.py +343 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/config.py +102 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/ffmpeg_probe.py +92 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/heuristics.py +119 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/llm_plan.py +277 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/models.py +343 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/pipeline.py +287 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/render.py +274 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/timeline.py +278 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/transcribe.py +233 -0
- agent_recipes/templates/ai-video-editor/src/ai_video_editor/utils.py +222 -0
- agent_recipes/templates/ai-video-editor/src/input.mov +0 -0
- agent_recipes/templates/ai-video-editor/src/out.mp4 +0 -0
- agent_recipes/templates/ai-video-editor/tests/test_heuristics.py +130 -0
- agent_recipes/templates/ai-video-editor/tests/test_models.py +152 -0
- agent_recipes/templates/ai-video-editor/tests/test_timeline.py +105 -0
- agent_recipes/templates/ai-video-editor/workflow.yaml +51 -0
- agent_recipes/templates/ai-video-highlight-extractor/README.md +60 -0
- agent_recipes/templates/ai-video-highlight-extractor/TEMPLATE.yaml +33 -0
- agent_recipes/templates/ai-video-merger/recipe.yaml +40 -0
- agent_recipes/templates/ai-video-merger/tools.py +172 -0
- agent_recipes/templates/ai-video-thumbnails/README.md +16 -0
- agent_recipes/templates/ai-video-thumbnails/TEMPLATE.yaml +53 -0
- agent_recipes/templates/ai-video-to-gif/README.md +14 -0
- agent_recipes/templates/ai-video-to-gif/TEMPLATE.yaml +64 -0
- agent_recipes/templates/ai-voice-cloner/README.md +59 -0
- agent_recipes/templates/ai-voice-cloner/TEMPLATE.yaml +31 -0
- agent_recipes/templates/ai-voiceover-generator/recipe.yaml +41 -0
- agent_recipes/templates/ai-voiceover-generator/tools.py +194 -0
- agent_recipes/templates/ai-watermark-adder/README.md +59 -0
- agent_recipes/templates/ai-watermark-adder/TEMPLATE.yaml +26 -0
- agent_recipes/templates/ai-watermark-remover/README.md +60 -0
- agent_recipes/templates/ai-watermark-remover/TEMPLATE.yaml +32 -0
- agent_recipes/templates/data-transformer/README.md +75 -0
- agent_recipes/templates/data-transformer/TEMPLATE.yaml +63 -0
- agent_recipes/templates/data-transformer/agents.yaml +70 -0
- agent_recipes/templates/data-transformer/workflow.yaml +92 -0
- agent_recipes/templates/shorts-generator/README.md +61 -0
- agent_recipes/templates/shorts-generator/TEMPLATE.yaml +65 -0
- agent_recipes/templates/shorts-generator/agents.yaml +66 -0
- agent_recipes/templates/shorts-generator/workflow.yaml +86 -0
- agent_recipes/templates/transcript-generator/README.md +103 -0
- agent_recipes/templates/transcript-generator/TEMPLATE.yaml +57 -0
- agent_recipes/templates/transcript-generator/agents.yaml +62 -0
- agent_recipes/templates/transcript-generator/workflow.yaml +82 -0
- agent_recipes/templates/video-editor/README.md +70 -0
- agent_recipes/templates/video-editor/TEMPLATE.yaml +55 -0
- agent_recipes/templates/video-editor/agents.yaml +68 -0
- agent_recipes/templates/video-editor/workflow.yaml +92 -0
- agent_recipes-0.0.5.dist-info/METADATA +145 -0
- agent_recipes-0.0.5.dist-info/RECORD +269 -0
- agent_recipes-0.0.5.dist-info/WHEEL +5 -0
- agent_recipes-0.0.5.dist-info/top_level.txt +1 -0
agent_recipes/templates/ai-news-capture-pack/tools.py
@@ -0,0 +1,150 @@
"""
AI News Capture Pack Tools

Bundle assets per news story:
- Screenshots
- Metadata
- Source links
"""

import json
import logging
import os
import shutil
import zipfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


def capture_story_assets(
    urls: List[str],
    story_id: str,
    output_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Capture all assets for a news story.

    Args:
        urls: List of URLs to capture
        story_id: Unique story identifier
        output_dir: Output directory

    Returns:
        Dictionary with captured assets
    """
    output_dir = output_dir or f"./captures/{story_id}"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    assets = []

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page(viewport={"width": 1920, "height": 1080})

            for i, url in enumerate(urls):
                try:
                    page.goto(url, wait_until="networkidle", timeout=30000)

                    # Capture screenshot
                    screenshot_path = os.path.join(output_dir, f"capture_{i}.png")
                    page.screenshot(path=screenshot_path, full_page=True)

                    # Extract metadata
                    title = page.title()

                    assets.append({
                        "url": url,
                        "screenshot": screenshot_path,
                        "title": title,
                        "captured_at": datetime.now(timezone.utc).isoformat(),
                    })

                except Exception as e:
                    logger.warning(f"Error capturing {url}: {e}")
                    assets.append({
                        "url": url,
                        "error": str(e),
                    })

            browser.close()

    except ImportError:
        logger.error("Playwright not installed")
        return {"error": "Playwright not installed"}

    return {
        "story_id": story_id,
        "assets": assets,
        "output_dir": output_dir,
        "total_captured": len([a for a in assets if "screenshot" in a]),
    }


def create_bundle(
    story_id: str,
    assets: List[Dict[str, Any]],
    output_dir: Optional[str] = None,
    include_metadata: bool = True,
) -> Dict[str, Any]:
    """
    Create a bundled pack of story assets.

    Args:
        story_id: Story identifier
        assets: List of asset dictionaries
        output_dir: Output directory
        include_metadata: Include metadata JSON

    Returns:
        Bundle info
    """
    output_dir = output_dir or "./bundles"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    bundle_name = f"{story_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    bundle_dir = os.path.join(output_dir, bundle_name)
    Path(bundle_dir).mkdir(parents=True, exist_ok=True)

    bundled_files = []

    # Copy assets to bundle
    for asset in assets:
        if "screenshot" in asset and os.path.exists(asset["screenshot"]):
            filename = os.path.basename(asset["screenshot"])
            dest = os.path.join(bundle_dir, filename)
            shutil.copy2(asset["screenshot"], dest)
            bundled_files.append(filename)

    # Create metadata
    if include_metadata:
        metadata = {
            "story_id": story_id,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "assets": assets,
            "files": bundled_files,
        }
        metadata_path = os.path.join(bundle_dir, "metadata.json")
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

    # Create zip
    zip_path = f"{bundle_dir}.zip"
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(bundle_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, bundle_dir)
                zf.write(file_path, arcname)

    return {
        "bundle_path": zip_path,
        "bundle_dir": bundle_dir,
        "files": bundled_files,
        "size_bytes": os.path.getsize(zip_path),
    }
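For context, a minimal driver for the two functions above, not part of the package: it assumes it is run from inside the ai-news-capture-pack template directory (so tools.py is importable) and that Playwright with its Chromium browser is installed; the story ID and URLs are placeholders.

# Hypothetical driver; story_id and the URLs below are illustrative placeholders.
from tools import capture_story_assets, create_bundle

urls = [
    "https://example.com/some-ai-story",
    "https://example.com/another-source",
]

captured = capture_story_assets(urls, story_id="story-001")
print(f"Captured {captured.get('total_captured', 0)} of {len(urls)} pages")

# Bundle whatever was captured into a timestamped zip under ./bundles
if "assets" in captured:
    bundle = create_bundle(
        story_id="story-001",
        assets=captured["assets"],
        include_metadata=True,
    )
    print(f"Bundle written to {bundle['bundle_path']} ({bundle['size_bytes']} bytes)")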
agent_recipes/templates/ai-news-crawler/recipe.yaml
@@ -0,0 +1,99 @@
name: ai-news-crawler
version: 1.0.0
description: Crawl AI news from multiple sources including AI labs blogs, arXiv, GitHub trending, HackerNews, Reddit, and X lists
author: PraisonAI
tags:
  - news
  - crawler
  - ai
  - research
  - aggregation

config:
  sources:
    - hackernews
    - reddit
    - arxiv
    - github_trending
    - ai_blogs
  max_articles: 50
  time_window_hours: 24
  include_content: true
  output_format: json

input:
  type: object
  properties:
    sources:
      type: array
      description: List of sources to crawl
      items:
        type: string
        enum: [hackernews, reddit, arxiv, github_trending, ai_blogs, x_lists]
    max_articles:
      type: integer
      description: Maximum number of articles to fetch
      default: 50
    time_window_hours:
      type: integer
      description: Only fetch articles from the last N hours
      default: 24

output:
  type: object
  properties:
    articles:
      type: array
      items:
        type: object
        properties:
          title:
            type: string
          url:
            type: string
          source:
            type: string
          published:
            type: string
          content:
            type: string
          score:
            type: number
    crawl_metadata:
      type: object
      properties:
        total_fetched:
          type: integer
        sources_crawled:
          type: array
        crawl_time:
          type: string

requires:
  env:
    - OPENAI_API_KEY
  optional_env:
    - TAVILY_API_KEY
    - REDDIT_CLIENT_ID
    - REDDIT_CLIENT_SECRET
  packages:
    - requests
    - feedparser
    - beautifulsoup4

workflow:
  agents:
    - name: news_crawler
      role: AI News Aggregator
      goal: Crawl and collect AI-related news from multiple sources
      backstory: Expert at finding and aggregating AI news from various platforms
      tools:
        - crawl_hackernews
        - crawl_reddit
        - crawl_arxiv
        - crawl_github_trending
        - search_web
  tasks:
    - name: crawl_sources
      description: Crawl all configured sources for AI news
      expected_output: List of articles with metadata
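The diff does not show how the agent_recipes runtime feeds the config block above into the tools, so the sketch below wires it by hand as an approximation: it loads recipe.yaml with PyYAML (an assumed extra dependency, not in the packages list) and passes the values to crawl_ai_news from the crawler tools.py in the next hunk, run from inside the ai-news-crawler template directory.

# Hand-wired approximation of the recipe's config block; the real runtime wiring
# lives elsewhere in agent_recipes and is not shown in this diff.
import yaml  # PyYAML, assumed to be available

from tools import crawl_ai_news

with open("recipe.yaml") as f:
    recipe = yaml.safe_load(f)

cfg = recipe.get("config", {})
result = crawl_ai_news(
    sources=cfg.get("sources"),  # entries without a crawler (e.g. ai_blogs) are logged as unknown and skipped
    max_articles=cfg.get("max_articles", 50),
    time_window_hours=cfg.get("time_window_hours", 24),
)
print(result["crawl_metadata"])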
agent_recipes/templates/ai-news-crawler/tools.py
@@ -0,0 +1,417 @@
"""
AI News Crawler Tools

Tools for crawling AI news from multiple sources:
- HackerNews
- Reddit (r/MachineLearning, r/artificial, etc.)
- arXiv (cs.AI, cs.LG, cs.CL)
- GitHub Trending
- AI Labs Blogs
- Web Search (via Tavily)
"""

import json
import os
import logging
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

logger = logging.getLogger(__name__)


def crawl_hackernews(
    max_articles: int = 20,
    time_window_hours: int = 24,
    keywords: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Crawl HackerNews for AI-related stories.

    Args:
        max_articles: Maximum number of articles to fetch
        time_window_hours: Only fetch articles from the last N hours
        keywords: Filter by keywords (default: AI-related terms)

    Returns:
        List of article dictionaries
    """
    import requests

    keywords = keywords or ["ai", "gpt", "llm", "machine learning", "openai", "anthropic", "google ai", "neural", "transformer"]

    articles = []
    base_url = "https://hacker-news.firebaseio.com/v0"

    try:
        # Get top stories
        response = requests.get(f"{base_url}/topstories.json", timeout=10)
        response.raise_for_status()
        story_ids = response.json()[:100]  # Get top 100 to filter

        cutoff_time = datetime.now(timezone.utc) - timedelta(hours=time_window_hours)

        for story_id in story_ids:
            if len(articles) >= max_articles:
                break

            try:
                story_response = requests.get(f"{base_url}/item/{story_id}.json", timeout=5)
                story = story_response.json()

                if not story or story.get("type") != "story":
                    continue

                title = story.get("title", "").lower()

                # Check if AI-related
                if not any(kw in title for kw in keywords):
                    continue

                # Check time window
                story_time = datetime.fromtimestamp(story.get("time", 0), tz=timezone.utc)
                if story_time < cutoff_time:
                    continue

                articles.append({
                    "title": story.get("title", ""),
                    "url": story.get("url", f"https://news.ycombinator.com/item?id={story_id}"),
                    "source": "hackernews",
                    "published": story_time.isoformat(),
                    "score": story.get("score", 0),
                    "comments": story.get("descendants", 0),
                    "author": story.get("by", ""),
                    "content": "",  # HN doesn't provide content
                })
            except Exception as e:
                logger.warning(f"Error fetching story {story_id}: {e}")
                continue

    except Exception as e:
        logger.error(f"Error crawling HackerNews: {e}")

    return articles


def crawl_reddit(
    subreddits: Optional[List[str]] = None,
    max_articles: int = 20,
    time_window_hours: int = 24,
) -> List[Dict[str, Any]]:
    """
    Crawl Reddit for AI-related posts.

    Args:
        subreddits: List of subreddits to crawl
        max_articles: Maximum number of articles to fetch
        time_window_hours: Only fetch articles from the last N hours

    Returns:
        List of article dictionaries
    """
    import requests

    subreddits = subreddits or ["MachineLearning", "artificial", "LocalLLaMA", "OpenAI", "ClaudeAI"]
    articles = []

    headers = {"User-Agent": "PraisonAI News Crawler 1.0"}

    for subreddit in subreddits:
        if len(articles) >= max_articles:
            break

        try:
            url = f"https://www.reddit.com/r/{subreddit}/hot.json?limit=25"
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()

            cutoff_time = datetime.now(timezone.utc) - timedelta(hours=time_window_hours)

            for post in data.get("data", {}).get("children", []):
                if len(articles) >= max_articles:
                    break

                post_data = post.get("data", {})

                # Check time window
                created = datetime.fromtimestamp(post_data.get("created_utc", 0), tz=timezone.utc)
                if created < cutoff_time:
                    continue

                articles.append({
                    "title": post_data.get("title", ""),
                    "url": post_data.get("url", ""),
                    "source": f"reddit/r/{subreddit}",
                    "published": created.isoformat(),
                    "score": post_data.get("score", 0),
                    "comments": post_data.get("num_comments", 0),
                    "author": post_data.get("author", ""),
                    "content": post_data.get("selftext", "")[:500],
                })

        except Exception as e:
            logger.warning(f"Error crawling r/{subreddit}: {e}")
            continue

    return articles


def crawl_arxiv(
    categories: Optional[List[str]] = None,
    max_articles: int = 20,
    time_window_hours: int = 48,
) -> List[Dict[str, Any]]:
    """
    Crawl arXiv for AI research papers.

    Args:
        categories: arXiv categories to search
        max_articles: Maximum number of papers to fetch
        time_window_hours: Only fetch papers from the last N hours

    Returns:
        List of paper dictionaries
    """
    import requests
    import xml.etree.ElementTree as ET

    categories = categories or ["cs.AI", "cs.LG", "cs.CL", "cs.CV", "cs.NE"]
    articles = []

    try:
        # Build query
        cat_query = " OR ".join([f"cat:{cat}" for cat in categories])
        url = f"http://export.arxiv.org/api/query?search_query={cat_query}&sortBy=submittedDate&sortOrder=descending&max_results={max_articles}"

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse XML
        root = ET.fromstring(response.content)
        ns = {"atom": "http://www.w3.org/2005/Atom"}

        for entry in root.findall("atom:entry", ns):
            title = entry.find("atom:title", ns)
            summary = entry.find("atom:summary", ns)
            published = entry.find("atom:published", ns)
            link = entry.find("atom:id", ns)
            authors = entry.findall("atom:author/atom:name", ns)

            articles.append({
                "title": title.text.strip() if title is not None else "",
                "url": link.text if link is not None else "",
                "source": "arxiv",
                "published": published.text if published is not None else "",
                "content": summary.text.strip()[:500] if summary is not None else "",
                "authors": [a.text for a in authors],
                "score": 0,
            })

    except Exception as e:
        logger.error(f"Error crawling arXiv: {e}")

    return articles


def crawl_github_trending(
    language: Optional[str] = None,
    max_repos: int = 20,
) -> List[Dict[str, Any]]:
    """
    Crawl GitHub trending repositories for AI projects.

    Args:
        language: Filter by programming language
        max_repos: Maximum number of repos to fetch

    Returns:
        List of repository dictionaries
    """
    import requests
    from bs4 import BeautifulSoup

    articles = []

    try:
        url = "https://github.com/trending"
        if language:
            url += f"/{language}"
        url += "?since=daily"

        headers = {"User-Agent": "PraisonAI News Crawler 1.0"}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # AI-related keywords
        ai_keywords = ["ai", "llm", "gpt", "transformer", "neural", "ml", "machine-learning",
                       "deep-learning", "nlp", "vision", "agent", "rag", "embedding"]

        for article in soup.select("article.Box-row")[:max_repos * 2]:
            try:
                repo_link = article.select_one("h2 a")
                if not repo_link:
                    continue

                repo_name = repo_link.get_text(strip=True).replace("\n", "").replace(" ", "")
                repo_url = "https://github.com" + repo_link.get("href", "")

                description_elem = article.select_one("p")
                description = description_elem.get_text(strip=True) if description_elem else ""

                # Check if AI-related
                text_to_check = (repo_name + " " + description).lower()
                if not any(kw in text_to_check for kw in ai_keywords):
                    continue

                stars_elem = article.select_one("a[href*='/stargazers']")
                stars = stars_elem.get_text(strip=True).replace(",", "") if stars_elem else "0"

                articles.append({
                    "title": repo_name,
                    "url": repo_url,
                    "source": "github_trending",
                    "published": datetime.now(timezone.utc).isoformat(),
                    "content": description,
                    "score": int(stars) if stars.isdigit() else 0,
                })

                if len(articles) >= max_repos:
                    break

            except Exception as e:
                logger.warning(f"Error parsing repo: {e}")
                continue

    except Exception as e:
        logger.error(f"Error crawling GitHub trending: {e}")

    return articles


def search_web(
    query: str,
    max_results: int = 10,
) -> List[Dict[str, Any]]:
    """
    Search the web for AI news using Tavily API.

    Args:
        query: Search query
        max_results: Maximum number of results

    Returns:
        List of search result dictionaries
    """
    api_key = os.environ.get("TAVILY_API_KEY")
    if not api_key:
        logger.warning("TAVILY_API_KEY not set, skipping web search")
        return []

    import requests

    articles = []

    try:
        response = requests.post(
            "https://api.tavily.com/search",
            json={
                "api_key": api_key,
                "query": query,
                "search_depth": "advanced",
                "max_results": max_results,
                "include_answer": False,
            },
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        for result in data.get("results", []):
            articles.append({
                "title": result.get("title", ""),
                "url": result.get("url", ""),
                "source": "web_search",
                "published": datetime.now(timezone.utc).isoformat(),
                "content": result.get("content", "")[:500],
                "score": result.get("score", 0),
            })

    except Exception as e:
        logger.error(f"Error in web search: {e}")

    return articles


def crawl_ai_news(
    sources: Optional[List[str]] = None,
    max_articles: int = 50,
    time_window_hours: int = 24,
    output_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Main function to crawl AI news from all configured sources.

    Args:
        sources: List of sources to crawl
        max_articles: Maximum total articles
        time_window_hours: Time window for articles
        output_dir: Optional directory to save results

    Returns:
        Dictionary with articles and metadata
    """
    sources = sources or ["hackernews", "reddit", "arxiv", "github_trending"]
    all_articles = []
    sources_crawled = []

    per_source_limit = max(5, max_articles // len(sources))

    for source in sources:
        try:
            if source == "hackernews":
                articles = crawl_hackernews(max_articles=per_source_limit, time_window_hours=time_window_hours)
            elif source == "reddit":
                articles = crawl_reddit(max_articles=per_source_limit, time_window_hours=time_window_hours)
            elif source == "arxiv":
                articles = crawl_arxiv(max_articles=per_source_limit, time_window_hours=time_window_hours)
            elif source == "github_trending":
                articles = crawl_github_trending(max_repos=per_source_limit)
            elif source == "web_search":
                articles = search_web("AI news today", max_results=per_source_limit)
            else:
                logger.warning(f"Unknown source: {source}")
                continue

            all_articles.extend(articles)
            sources_crawled.append(source)
            logger.info(f"Crawled {len(articles)} articles from {source}")

        except Exception as e:
            logger.error(f"Error crawling {source}: {e}")

    # Sort by score and limit
    all_articles.sort(key=lambda x: x.get("score", 0), reverse=True)
    all_articles = all_articles[:max_articles]

    result = {
        "articles": all_articles,
        "crawl_metadata": {
            "total_fetched": len(all_articles),
            "sources_crawled": sources_crawled,
            "crawl_time": datetime.now(timezone.utc).isoformat(),
        }
    }

    # Save to file if output_dir specified
    if output_dir:
        import os
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "crawled_news.json")
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        logger.info(f"Saved results to {output_path}")

    return result