openartemis 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. openartemis-0.2.0/PKG-INFO +162 -0
  2. openartemis-0.2.0/README.md +130 -0
  3. openartemis-0.2.0/openartemis/__init__.py +3 -0
  4. openartemis-0.2.0/openartemis/__main__.py +6 -0
  5. openartemis-0.2.0/openartemis/agent/__init__.py +8 -0
  6. openartemis-0.2.0/openartemis/agent/detective.py +344 -0
  7. openartemis-0.2.0/openartemis/agent/orchestrator.py +61 -0
  8. openartemis-0.2.0/openartemis/agent/pipeline.py +109 -0
  9. openartemis-0.2.0/openartemis/agent/worker.py +96 -0
  10. openartemis-0.2.0/openartemis/auth/__init__.py +35 -0
  11. openartemis-0.2.0/openartemis/auth/db.py +302 -0
  12. openartemis-0.2.0/openartemis/auth/email_sender.py +46 -0
  13. openartemis-0.2.0/openartemis/auth/passwords.py +16 -0
  14. openartemis-0.2.0/openartemis/auth/sessions.py +87 -0
  15. openartemis-0.2.0/openartemis/chat.py +138 -0
  16. openartemis-0.2.0/openartemis/cli.py +804 -0
  17. openartemis-0.2.0/openartemis/harvesters/__init__.py +7 -0
  18. openartemis-0.2.0/openartemis/harvesters/base.py +23 -0
  19. openartemis-0.2.0/openartemis/harvesters/social.py +134 -0
  20. openartemis-0.2.0/openartemis/harvesters/youtube.py +421 -0
  21. openartemis-0.2.0/openartemis/output.py +65 -0
  22. openartemis-0.2.0/openartemis/tools/__init__.py +5 -0
  23. openartemis-0.2.0/openartemis/tools/web.py +104 -0
  24. openartemis-0.2.0/openartemis/transcription.py +35 -0
  25. openartemis-0.2.0/openartemis.egg-info/PKG-INFO +162 -0
  26. openartemis-0.2.0/openartemis.egg-info/SOURCES.txt +30 -0
  27. openartemis-0.2.0/openartemis.egg-info/dependency_links.txt +1 -0
  28. openartemis-0.2.0/openartemis.egg-info/entry_points.txt +2 -0
  29. openartemis-0.2.0/openartemis.egg-info/requires.txt +14 -0
  30. openartemis-0.2.0/openartemis.egg-info/top_level.txt +1 -0
  31. openartemis-0.2.0/pyproject.toml +46 -0
  32. openartemis-0.2.0/setup.cfg +4 -0
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: openartemis
3
+ Version: 0.2.0
4
+ Summary: CLI to harvest transcripts from YouTube, TikTok, Instagram, and X — chat with Artemis, an AI that researches for you
5
+ License: MIT
6
+ Keywords: transcripts,youtube,tiktok,whisper,cli,research
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Environment :: Console
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Multimedia :: Video
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: google-api-python-client>=2.100.0
19
+ Requires-Dist: youtube-transcript-api>=0.6.0
20
+ Requires-Dist: yt-dlp>=2024.1.0
21
+ Requires-Dist: ddgs>=9.0.0
22
+ Requires-Dist: openai-whisper>=20231117
23
+ Requires-Dist: requests>=2.28.0
24
+ Requires-Dist: typer[all]>=0.9.0
25
+ Requires-Dist: rich>=13.0.0
26
+ Requires-Dist: questionary>=2.0.0
27
+ Requires-Dist: openai>=1.0.0
28
+ Requires-Dist: python-dotenv>=1.0.0
29
+ Requires-Dist: bcrypt>=4.0.0
30
+ Requires-Dist: trafilatura>=2.0.0
31
+ Requires-Dist: playwright>=1.40.0
32
+
33
+ # OpenArtemis
34
+
35
+ CLI tool to harvest transcripts from YouTube, TikTok, Instagram, and X (Twitter). User-friendly design inspired by Claude CLI — no coding required.
36
+
37
+ **Login required.** First run creates an admin account. Other users register and wait for admin approval. Approval requests are emailed to the admin.
38
+
39
+ **Chat mode** (default): Run `openartemis` for a persistent chat with **Artemis**, an AI that researches for you. Say "research X" or "find out about Y" and Artemis uses tools (transcripts, web search, webpage fetch). Use `/research <query>` for multi-agent deep research. Type `/` for mode menu, `/logout` or `/exit` to quit.
40
+
41
+ **Harvest mode**: `openartemis harvest` for guided step-by-step transcript download.
42
+
43
+ **Admin**: `openartemis admin` — manage users, approve requests, view usage. Admins have unlimited API access.
44
+
45
+ ## Requirements
46
+
47
+ - **Python 3.10+**
48
+ - **ffmpeg** (required for yt-dlp and Whisper; must be on PATH)
49
+ - **YouTube Data API key** (for YouTube search; set `YOUTUBE_API_KEY` or use `--youtube-api-key`)
50
+ - **ScrapingDog API key** (optional, for faster YouTube transcripts; set `SCRAPINGDOG_API_KEY`)
51
+ - **OpenAI API key** (for Artemis chat; set `OPENAI_API_KEY` — never commit your key)
52
+ - **Brave Search API key** (optional, for web search; set `BRAVE_SEARCH_API_KEY` — search.brave.com)
53
+
54
+ ## Installation
55
+
56
+ **From PyPI** (run anywhere):
57
+
58
+ ```bash
59
+ pip install openartemis
60
+ ```
61
+
62
+ Then run:
63
+
64
+ ```bash
65
+ openartemis
66
+ ```
67
+
68
+ **From source** (development):
69
+
70
+ ```bash
71
+ pip install -e .
72
+ playwright install chromium # For browse_webpage (JS-heavy sites)
73
+ ```
74
+
75
+ ## Usage
76
+
77
+ ### Interactive mode (no coding required)
78
+
79
+ Run without arguments for chat (login required):
80
+
81
+ ```bash
82
+ openartemis
83
+ ```
84
+
85
+ ### Command-line mode
86
+
87
+ ```bash
88
+ openartemis harvest # Guided harvest (login required)
89
+ ```
90
+
91
+ ### Arguments
92
+
93
+ | Argument | Description | Default |
94
+ |----------|-------------|---------|
95
+ | `query` | Search query (omit for interactive mode) | - |
96
+ | `--platforms`, `-p` | Comma-separated: youtube,tiktok,instagram,x | youtube |
97
+ | `--max-results`, `-n` | Max results per platform | 10 |
98
+ | `--out-dir`, `-o` | Output directory | ./transcripts |
99
+ | `--youtube-api-key` | YouTube Data API key (or YOUTUBE_API_KEY env) | - |
100
+ | `--scrapingdog-api-key` | ScrapingDog API key (or SCRAPINGDOG_API_KEY env) | - |
101
+ | `--whisper-model`, `-m` | Whisper model: tiny,base,small,medium,large | base |
102
+ | `--interactive`, `-i` | Run in interactive mode | - |
103
+
104
+ ### Artemis Detective (multi-agent research)
105
+
106
+ Tell Artemis what to find — it uses a multi-agent pipeline: one orchestrator plans up to 5 tasks, five workers run in parallel (Brave search, transcript harvest, webpage fetch/browse), then results are synthesized into a report:
107
+
108
+ ```bash
109
+ openartemis investigate "Find out everything about what happened to John Smith"
110
+ # Or run without args for interactive prompt:
111
+ openartemis investigate
112
+ ```
113
+
114
+ Options: `--model gpt-4o-mini` (default, cheapest), `--out-dir ./detective_output`
115
+
116
+ Tools: `harvest_transcripts`, `brave_search`, `fetch_webpage`, `browse_webpage`, `list_transcripts`, `read_transcript`
117
+
118
+ ### Admin panel
119
+
120
+ ```bash
121
+ openartemis admin
122
+ ```
123
+
124
+ Approve users, view usage, revoke access, create users directly.
125
+
126
+ ### Other commands
127
+
128
+ ```bash
129
+ openartemis logout # Log out
130
+ openartemis config # Show configuration help
131
+ ```
132
+
133
+ `openartemis config` shows the required environment variables and where to get API keys.
134
+
135
+ ### Output
136
+
137
+ Per-post files in `out_dir`:
138
+ - `{platform}_{id}.json` - Metadata + segments + full text
139
+ - `{platform}_{id}.txt` - Plain text transcript
140
+ - `{platform}_{id}.srt` - SRT subtitles
141
+
142
+ ## Pipeline
143
+
144
+ - **YouTube**: Data API search → video IDs → ScrapingDog API (if key set) or youtube-transcript-api for captions; if none, download audio via yt-dlp → Whisper
145
+ - **TikTok/Instagram/X**: Web search (site: operator) → yt-dlp download + extract audio → Whisper
146
+ - **Research**: Multi-agent (orchestrator + 5 workers) — Brave Search, trafilatura (fetch_webpage), Playwright (browse_webpage), transcript harvest
147
+
148
+ ## Auth
149
+
150
+ - Session expiry: 7 days
151
+ - Rate limit: 5 failed logins per 15 min per username
152
+ - Password policy: minimum 8 characters
153
+
154
+ ## Publishing to PyPI
155
+
156
+ To publish a new version:
157
+
158
+ ```bash
159
+ pip install build twine
160
+ python -m build
161
+ twine upload dist/*
162
+ ```
@@ -0,0 +1,130 @@
1
+ # OpenArtemis
2
+
3
+ CLI tool to harvest transcripts from YouTube, TikTok, Instagram, and X (Twitter). User-friendly design inspired by Claude CLI — no coding required.
4
+
5
+ **Login required.** First run creates an admin account. Other users register and wait for admin approval. Approval requests are emailed to the admin.
6
+
7
+ **Chat mode** (default): Run `openartemis` for a persistent chat with **Artemis**, an AI that researches for you. Say "research X" or "find out about Y" and Artemis uses tools (transcripts, web search, webpage fetch). Use `/research <query>` for multi-agent deep research. Type `/` for mode menu, `/logout` or `/exit` to quit.
8
+
9
+ **Harvest mode**: `openartemis harvest` for guided step-by-step transcript download.
10
+
11
+ **Admin**: `openartemis admin` — manage users, approve requests, view usage. Admins have unlimited API access.
12
+
13
+ ## Requirements
14
+
15
+ - **Python 3.10+**
16
+ - **ffmpeg** (required for yt-dlp and Whisper; must be on PATH)
17
+ - **YouTube Data API key** (for YouTube search; set `YOUTUBE_API_KEY` or use `--youtube-api-key`)
18
+ - **ScrapingDog API key** (optional, for faster YouTube transcripts; set `SCRAPINGDOG_API_KEY`)
19
+ - **OpenAI API key** (for Artemis chat; set `OPENAI_API_KEY` — never commit your key)
20
+ - **Brave Search API key** (optional, for web search; set `BRAVE_SEARCH_API_KEY` — search.brave.com)
21
+
22
+ ## Installation
23
+
24
+ **From PyPI** (run anywhere):
25
+
26
+ ```bash
27
+ pip install openartemis
28
+ ```
29
+
30
+ Then run:
31
+
32
+ ```bash
33
+ openartemis
34
+ ```
35
+
36
+ **From source** (development):
37
+
38
+ ```bash
39
+ pip install -e .
40
+ playwright install chromium # For browse_webpage (JS-heavy sites)
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ### Interactive mode (no coding required)
46
+
47
+ Run without arguments for chat (login required):
48
+
49
+ ```bash
50
+ openartemis
51
+ ```
52
+
53
+ ### Command-line mode
54
+
55
+ ```bash
56
+ openartemis harvest # Guided harvest (login required)
57
+ ```
58
+
59
+ ### Arguments
60
+
61
+ | Argument | Description | Default |
62
+ |----------|-------------|---------|
63
+ | `query` | Search query (omit for interactive mode) | - |
64
+ | `--platforms`, `-p` | Comma-separated: youtube,tiktok,instagram,x | youtube |
65
+ | `--max-results`, `-n` | Max results per platform | 10 |
66
+ | `--out-dir`, `-o` | Output directory | ./transcripts |
67
+ | `--youtube-api-key` | YouTube Data API key (or YOUTUBE_API_KEY env) | - |
68
+ | `--scrapingdog-api-key` | ScrapingDog API key (or SCRAPINGDOG_API_KEY env) | - |
69
+ | `--whisper-model`, `-m` | Whisper model: tiny,base,small,medium,large | base |
70
+ | `--interactive`, `-i` | Run in interactive mode | - |
71
+
72
+ ### Artemis Detective (multi-agent research)
73
+
74
+ Tell Artemis what to find — it uses a multi-agent pipeline: one orchestrator plans up to 5 tasks, five workers run in parallel (Brave search, transcript harvest, webpage fetch/browse), then results are synthesized into a report:
75
+
76
+ ```bash
77
+ openartemis investigate "Find out everything about what happened to John Smith"
78
+ # Or run without args for interactive prompt:
79
+ openartemis investigate
80
+ ```
81
+
82
+ Options: `--model gpt-4o-mini` (default, cheapest), `--out-dir ./detective_output`
83
+
84
+ Tools: `harvest_transcripts`, `brave_search`, `fetch_webpage`, `browse_webpage`, `list_transcripts`, `read_transcript`
85
+
86
+ ### Admin panel
87
+
88
+ ```bash
89
+ openartemis admin
90
+ ```
91
+
92
+ Approve users, view usage, revoke access, create users directly.
93
+
94
+ ### Other commands
95
+
96
+ ```bash
97
+ openartemis logout # Log out
98
+ openartemis config # Show configuration help
99
+ ```
100
+
101
+ `openartemis config` shows the required environment variables and where to get API keys.
102
+
103
+ ### Output
104
+
105
+ Per-post files in `out_dir`:
106
+ - `{platform}_{id}.json` - Metadata + segments + full text
107
+ - `{platform}_{id}.txt` - Plain text transcript
108
+ - `{platform}_{id}.srt` - SRT subtitles
109
+
110
+ ## Pipeline
111
+
112
+ - **YouTube**: Data API search → video IDs → ScrapingDog API (if key set) or youtube-transcript-api for captions; if none, download audio via yt-dlp → Whisper
113
+ - **TikTok/Instagram/X**: Web search (site: operator) → yt-dlp download + extract audio → Whisper
114
+ - **Research**: Multi-agent (orchestrator + 5 workers) — Brave Search, trafilatura (fetch_webpage), Playwright (browse_webpage), transcript harvest
115
+
116
+ ## Auth
117
+
118
+ - Session expiry: 7 days
119
+ - Rate limit: 5 failed logins per 15 min per username
120
+ - Password policy: minimum 8 characters
121
+
122
+ ## Publishing to PyPI
123
+
124
+ To publish a new version:
125
+
126
+ ```bash
127
+ pip install build twine
128
+ python -m build
129
+ twine upload dist/*
130
+ ```
@@ -0,0 +1,3 @@
1
+ """OpenArtemis - Harvest transcripts from YouTube, TikTok, Instagram, and X."""
2
+
3
+ __version__ = "0.2.0"
@@ -0,0 +1,6 @@
1
+ """Entry point for python -m openartemis."""
2
+
3
+ from openartemis.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,8 @@
1
+ """Multi-agent research pipeline — orchestrator, workers, synthesis."""
2
+
3
+ from openartemis.agent.detective import DetectiveAgent, TOOLS
4
+ from openartemis.agent.orchestrator import create_plan
5
+ from openartemis.agent.worker import run_worker
6
+ from openartemis.agent.pipeline import ResearchPipeline
7
+
8
+ __all__ = ["DetectiveAgent", "TOOLS", "create_plan", "run_worker", "ResearchPipeline"]
@@ -0,0 +1,344 @@
1
+ """AI detective agent — uses transcriptharvest as tools to research and find answers."""
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any, Callable
7
+
8
+ from dotenv import load_dotenv
9
+ from openai import OpenAI
10
+
11
+ load_dotenv(Path(__file__).resolve().parents[2] / ".env")
12
+
13
+ from openartemis.harvesters.social import SocialHarvester
14
+ from openartemis.harvesters.youtube import (
15
+ TranscriptAPIInsufficientCreditsError,
16
+ YouTubeHarvester,
17
+ extract_youtube_video_id,
18
+ )
19
+ from openartemis.tools.web import brave_search, fetch_webpage, browse_webpage
20
+
21
def _function_tool(
    name: str,
    description: str,
    properties: dict[str, Any],
    required: list[str],
) -> dict[str, Any]:
    """Wrap a parameter schema in the function-tool envelope used by TOOLS."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schemas offered to the model (transcript harvesting + web access).
# Each entry's name is dispatched by DetectiveAgent._execute_tool.
TOOLS = [
    _function_tool(
        "harvest_transcripts",
        "Search and harvest transcripts from video/social platforms. Use this to find content about a person, event, or topic. Returns the output directory path and count of harvested posts.",
        {
            "query": {
                "type": "string",
                "description": "Search query (e.g. person name, event, topic)",
            },
            "platforms": {
                "type": "array",
                "items": {"type": "string", "enum": ["youtube", "tiktok", "instagram", "x"]},
                "description": "Platforms to search",
                "default": ["youtube"],
            },
            "max_results": {
                "type": "integer",
                "description": "Max results per platform",
                "default": 5,
            },
        },
        ["query"],
    ),
    _function_tool(
        "list_transcripts",
        "List all harvested transcript files in the output directory. Returns file paths and titles.",
        {
            "out_dir": {
                "type": "string",
                "description": "Path to the transcripts directory",
            },
        },
        ["out_dir"],
    ),
    _function_tool(
        "read_transcript",
        "Read the full text of a harvested transcript (JSON or TXT file).",
        {
            "file_path": {
                "type": "string",
                "description": "Full path to the .json or .txt transcript file",
            },
        },
        ["file_path"],
    ),
    _function_tool(
        "brave_search",
        "Search the web via Brave Search API. Returns results with title, url, description.",
        {
            "q": {"type": "string", "description": "Search query"},
            "count": {"type": "integer", "description": "Max results (default 10)", "default": 10},
            "freshness": {"type": "string", "enum": ["pd", "pw", "pm", "py"], "description": "pd=day, pw=week, pm=month, py=year"},
        },
        ["q"],
    ),
    _function_tool(
        "fetch_webpage",
        "Fetch URL and extract main text (trafilatura). Fast for static HTML.",
        {
            "url": {"type": "string", "description": "URL to fetch"},
        },
        ["url"],
    ),
    _function_tool(
        "browse_webpage",
        "Fetch URL with headless browser (Playwright). Use for JS-heavy sites.",
        {
            "url": {"type": "string", "description": "URL to browse"},
            "selector": {"type": "string", "description": "Optional CSS selector for specific element"},
        },
        ["url"],
    ),
]
131
+
132
# System message placed first in the conversation by DetectiveAgent.run.
# It names the six tools defined in TOOLS and describes the expected
# harvest -> read -> follow-up workflow and the final report.
SYSTEM_PROMPT = """You are Artemis, a research detective. Your job is to find out everything you can about what the user asks.

You have access to tools that let you:
1. **harvest_transcripts** — Search YouTube, TikTok, Instagram, X for videos and download their transcripts
2. **list_transcripts** — See what transcripts have been harvested
3. **read_transcript** — Read the full text of any transcript
4. **brave_search** — Web search via Brave API
5. **fetch_webpage** — Fast text extraction from static HTML (trafilatura)
6. **browse_webpage** — Full render with headless browser for JS-heavy sites

Work systematically:
- Start by harvesting transcripts with relevant search queries (person names, events, dates, related terms)
- Read the transcripts to extract facts, quotes, and leads
- If you find new names, places, or angles, harvest more with follow-up searches
- Keep going until you have a comprehensive picture or hit diminishing returns

Be thorough but efficient. Prioritize high-value sources. Summarize findings clearly.
When done, provide a complete report with sources (URLs from the transcripts)."""
150
+
151
+
152
+ class DetectiveAgent:
153
+ """AI agent that uses transcriptharvest to research."""
154
+
155
+ def __init__(
156
+ self,
157
+ openai_api_key: str | None = None,
158
+ youtube_api_key: str | None = None,
159
+ scrapingdog_api_key: str | None = None,
160
+ transcriptapi_api_key: str | None = None,
161
+ model: str = "gpt-4o-mini",
162
+ out_dir: Path | None = None,
163
+ ):
164
+ self.client = OpenAI(api_key=openai_api_key or os.environ.get("OPENAI_API_KEY"))
165
+ self.youtube_api_key = youtube_api_key or os.environ.get("YOUTUBE_API_KEY")
166
+ self.scrapingdog_api_key = scrapingdog_api_key or os.environ.get("SCRAPINGDOG_API_KEY")
167
+ self.transcriptapi_api_key = transcriptapi_api_key or os.environ.get("TRANSCRIPTAPI_API_KEY")
168
+ self.model = model
169
+ self.out_dir = Path(out_dir or "./detective_output").resolve()
170
+ self.out_dir.mkdir(parents=True, exist_ok=True)
171
+
172
+ def _harvest_transcripts(
173
+ self,
174
+ query: str,
175
+ platforms: list[str] | None = None,
176
+ max_results: int = 5,
177
+ ) -> dict[str, Any]:
178
+ """Tool: harvest transcripts."""
179
+ platforms = platforms or ["youtube"]
180
+ total = 0
181
+
182
+ try:
183
+ for platform in platforms:
184
+ if platform == "youtube":
185
+ if not self.youtube_api_key and not self.transcriptapi_api_key:
186
+ return {"error": "YouTube requires YOUTUBE_API_KEY (Google) or TRANSCRIPTAPI_API_KEY. Skipping YouTube."}
187
+ harvester = YouTubeHarvester(
188
+ api_key=self.youtube_api_key,
189
+ scrapingdog_api_key=self.scrapingdog_api_key,
190
+ transcriptapi_api_key=self.transcriptapi_api_key,
191
+ )
192
+ else:
193
+ harvester = SocialHarvester(platform=platform)
194
+
195
+ for _ in harvester.harvest(
196
+ query=query,
197
+ max_results=max_results,
198
+ out_dir=self.out_dir,
199
+ whisper_model="base",
200
+ ):
201
+ total += 1
202
+ except TranscriptAPIInsufficientCreditsError as e:
203
+ return {"error": str(e), "harvested_count": 0}
204
+
205
+ return {
206
+ "out_dir": str(self.out_dir),
207
+ "query": query,
208
+ "platforms": platforms,
209
+ "harvested_count": total,
210
+ }
211
+
212
+ def harvest_url(self, url: str) -> dict[str, Any]:
213
+ """Harvest transcript for a single YouTube URL. Returns result dict."""
214
+ if not extract_youtube_video_id(url):
215
+ return {"error": f"Invalid YouTube URL: {url}"}
216
+ if not self.youtube_api_key and not self.transcriptapi_api_key:
217
+ return {"error": "YouTube requires YOUTUBE_API_KEY or TRANSCRIPTAPI_API_KEY."}
218
+ harvester = YouTubeHarvester(
219
+ api_key=self.youtube_api_key,
220
+ scrapingdog_api_key=self.scrapingdog_api_key,
221
+ transcriptapi_api_key=self.transcriptapi_api_key,
222
+ )
223
+ try:
224
+ data = harvester.harvest_url(url, self.out_dir, whisper_model="base")
225
+ except TranscriptAPIInsufficientCreditsError as e:
226
+ return {"harvested_count": 0, "error": str(e)}
227
+ if data:
228
+ return {"harvested_count": 1, "out_dir": str(self.out_dir), "title": data.get("title", "")}
229
+ return {"harvested_count": 0, "error": "Failed to fetch transcript"}
230
+
231
+ def _list_transcripts(self, out_dir: str) -> dict[str, Any]:
232
+ """Tool: list transcript files."""
233
+ path = Path(out_dir)
234
+ if not path.exists():
235
+ return {"error": f"Directory not found: {out_dir}"}
236
+ files = []
237
+ for f in path.glob("*.json"):
238
+ try:
239
+ with open(f, encoding="utf-8") as fp:
240
+ data = json.load(fp)
241
+ files.append({
242
+ "path": str(f),
243
+ "title": data.get("title", ""),
244
+ "url": data.get("url", ""),
245
+ "platform": data.get("platform", ""),
246
+ })
247
+ except Exception:
248
+ files.append({"path": str(f), "title": "(parse error)"})
249
+ return {"files": files, "count": len(files)}
250
+
251
+ def _read_transcript(self, file_path: str) -> dict[str, Any]:
252
+ """Tool: read a transcript file."""
253
+ path = Path(file_path)
254
+ if not path.exists():
255
+ return {"error": f"File not found: {file_path}"}
256
+ try:
257
+ if path.suffix == ".json":
258
+ with open(path, encoding="utf-8") as f:
259
+ data = json.load(f)
260
+ return {
261
+ "title": data.get("title", ""),
262
+ "url": data.get("url", ""),
263
+ "platform": data.get("platform", ""),
264
+ "full_text": data.get("full_text", ""),
265
+ "segments_count": len(data.get("segments", [])),
266
+ }
267
+ if path.suffix == ".txt":
268
+ with open(path, encoding="utf-8") as f:
269
+ return {"full_text": f.read()}
270
+ return {"error": "Unsupported file type. Use .json or .txt"}
271
+ except Exception as e:
272
+ return {"error": str(e)}
273
+
274
+ def _execute_tool(self, name: str, args: dict[str, Any]) -> str:
275
+ """Execute a tool and return JSON string result."""
276
+ if name == "harvest_transcripts":
277
+ result = self._harvest_transcripts(
278
+ query=args["query"],
279
+ platforms=args.get("platforms", ["youtube"]),
280
+ max_results=args.get("max_results", 5),
281
+ )
282
+ elif name == "list_transcripts":
283
+ result = self._list_transcripts(args["out_dir"])
284
+ elif name == "read_transcript":
285
+ result = self._read_transcript(args["file_path"])
286
+ elif name == "brave_search":
287
+ result = brave_search(
288
+ q=args["q"],
289
+ count=args.get("count", 10),
290
+ freshness=args.get("freshness"),
291
+ )
292
+ elif name == "fetch_webpage":
293
+ result = fetch_webpage(url=args["url"])
294
+ elif name == "browse_webpage":
295
+ result = browse_webpage(url=args["url"], selector=args.get("selector"))
296
+ else:
297
+ result = {"error": f"Unknown tool: {name}"}
298
+ return json.dumps(result, indent=2)
299
+
300
+ def run(
301
+ self,
302
+ user_prompt: str,
303
+ max_turns: int = 20,
304
+ on_tool_call: Callable[[str, dict], None] | None = None,
305
+ ) -> str:
306
+ """Run the agent. Returns the final report."""
307
+ messages = [
308
+ {"role": "system", "content": SYSTEM_PROMPT},
309
+ {"role": "user", "content": user_prompt},
310
+ ]
311
+
312
+ for turn in range(max_turns):
313
+ response = self.client.chat.completions.create(
314
+ model=self.model,
315
+ messages=messages,
316
+ tools=TOOLS,
317
+ tool_choice="auto",
318
+ )
319
+ choice = response.choices[0]
320
+ msg = choice.message
321
+
322
+ # Convert to dict for API (handles model objects)
323
+ msg_dict = (
324
+ msg.model_dump() if hasattr(msg, "model_dump") else
325
+ {"role": msg.role, "content": msg.content or "", "tool_calls": getattr(msg, "tool_calls", None)}
326
+ )
327
+ messages.append(msg_dict)
328
+
329
+ if not msg.tool_calls:
330
+ return (msg.content or "").strip()
331
+
332
+ for tc in msg.tool_calls:
333
+ name = tc.function.name
334
+ args = json.loads(tc.function.arguments)
335
+ if on_tool_call:
336
+ on_tool_call(name, args)
337
+ result = self._execute_tool(name, args)
338
+ messages.append({
339
+ "role": "tool",
340
+ "tool_call_id": tc.id,
341
+ "content": result,
342
+ })
343
+
344
+ return "Max turns reached. Review harvested transcripts manually."