openartemis 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openartemis-0.2.0/PKG-INFO +162 -0
- openartemis-0.2.0/README.md +130 -0
- openartemis-0.2.0/openartemis/__init__.py +3 -0
- openartemis-0.2.0/openartemis/__main__.py +6 -0
- openartemis-0.2.0/openartemis/agent/__init__.py +8 -0
- openartemis-0.2.0/openartemis/agent/detective.py +344 -0
- openartemis-0.2.0/openartemis/agent/orchestrator.py +61 -0
- openartemis-0.2.0/openartemis/agent/pipeline.py +109 -0
- openartemis-0.2.0/openartemis/agent/worker.py +96 -0
- openartemis-0.2.0/openartemis/auth/__init__.py +35 -0
- openartemis-0.2.0/openartemis/auth/db.py +302 -0
- openartemis-0.2.0/openartemis/auth/email_sender.py +46 -0
- openartemis-0.2.0/openartemis/auth/passwords.py +16 -0
- openartemis-0.2.0/openartemis/auth/sessions.py +87 -0
- openartemis-0.2.0/openartemis/chat.py +138 -0
- openartemis-0.2.0/openartemis/cli.py +804 -0
- openartemis-0.2.0/openartemis/harvesters/__init__.py +7 -0
- openartemis-0.2.0/openartemis/harvesters/base.py +23 -0
- openartemis-0.2.0/openartemis/harvesters/social.py +134 -0
- openartemis-0.2.0/openartemis/harvesters/youtube.py +421 -0
- openartemis-0.2.0/openartemis/output.py +65 -0
- openartemis-0.2.0/openartemis/tools/__init__.py +5 -0
- openartemis-0.2.0/openartemis/tools/web.py +104 -0
- openartemis-0.2.0/openartemis/transcription.py +35 -0
- openartemis-0.2.0/openartemis.egg-info/PKG-INFO +162 -0
- openartemis-0.2.0/openartemis.egg-info/SOURCES.txt +30 -0
- openartemis-0.2.0/openartemis.egg-info/dependency_links.txt +1 -0
- openartemis-0.2.0/openartemis.egg-info/entry_points.txt +2 -0
- openartemis-0.2.0/openartemis.egg-info/requires.txt +14 -0
- openartemis-0.2.0/openartemis.egg-info/top_level.txt +1 -0
- openartemis-0.2.0/pyproject.toml +46 -0
- openartemis-0.2.0/setup.cfg +4 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: openartemis
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: CLI to harvest transcripts from YouTube, TikTok, Instagram, and X — chat with Artemis, an AI that researches for you
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: transcripts,youtube,tiktok,whisper,cli,research
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Environment :: Console
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Multimedia :: Video
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Requires-Dist: google-api-python-client>=2.100.0
|
|
19
|
+
Requires-Dist: youtube-transcript-api>=0.6.0
|
|
20
|
+
Requires-Dist: yt-dlp>=2024.1.0
|
|
21
|
+
Requires-Dist: ddgs>=9.0.0
|
|
22
|
+
Requires-Dist: openai-whisper>=20231117
|
|
23
|
+
Requires-Dist: requests>=2.28.0
|
|
24
|
+
Requires-Dist: typer[all]>=0.9.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Requires-Dist: questionary>=2.0.0
|
|
27
|
+
Requires-Dist: openai>=1.0.0
|
|
28
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
29
|
+
Requires-Dist: bcrypt>=4.0.0
|
|
30
|
+
Requires-Dist: trafilatura>=2.0.0
|
|
31
|
+
Requires-Dist: playwright>=1.40.0
|
|
32
|
+
|
|
33
|
+
# OpenArtemis
|
|
34
|
+
|
|
35
|
+
CLI tool to harvest transcripts from YouTube, TikTok, Instagram, and X (Twitter). User-friendly design inspired by Claude CLI — no coding required.
|
|
36
|
+
|
|
37
|
+
**Login required.** First run creates an admin account. Other users register and wait for admin approval. Approval requests are emailed to the admin.
|
|
38
|
+
|
|
39
|
+
**Chat mode** (default): Run `openartemis` for a persistent chat with **Artemis**, an AI that researches for you. Say "research X" or "find out about Y" and Artemis uses tools (transcripts, web search, webpage fetch). Use `/research <query>` for multi-agent deep research. Type `/` for mode menu, `/logout` or `/exit` to quit.
|
|
40
|
+
|
|
41
|
+
**Harvest mode**: `openartemis harvest` for guided step-by-step transcript download.
|
|
42
|
+
|
|
43
|
+
**Admin**: `openartemis admin` — manage users, approve requests, view usage. Admins have unlimited API access.
|
|
44
|
+
|
|
45
|
+
## Requirements
|
|
46
|
+
|
|
47
|
+
- **Python 3.10+**
|
|
48
|
+
- **ffmpeg** (required for yt-dlp and Whisper; must be on PATH)
|
|
49
|
+
- **YouTube Data API key** (for YouTube search; set `YOUTUBE_API_KEY` or use `--youtube-api-key`)
|
|
50
|
+
- **ScrapingDog API key** (optional, for faster YouTube transcripts; set `SCRAPINGDOG_API_KEY`)
|
|
51
|
+
- **OpenAI API key** (for Artemis chat; set `OPENAI_API_KEY` — never commit your key)
|
|
52
|
+
- **Brave Search API key** (optional, for web search; set `BRAVE_SEARCH_API_KEY` — search.brave.com)
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
**From PyPI** (run anywhere):
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install openartemis
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Then run:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
openartemis
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**From source** (development):
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install -e .
|
|
72
|
+
playwright install chromium # For browse_webpage (JS-heavy sites)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Interactive mode (no coding required)
|
|
78
|
+
|
|
79
|
+
Run without arguments for chat (login required):
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
openartemis
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Command-line mode
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
openartemis harvest # Guided harvest (login required)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Arguments
|
|
92
|
+
|
|
93
|
+
| Argument | Description | Default |
|
|
94
|
+
|----------|-------------|---------|
|
|
95
|
+
| `query` | Search query (omit for interactive mode) | - |
|
|
96
|
+
| `--platforms`, `-p` | Comma-separated: youtube,tiktok,instagram,x | youtube |
|
|
97
|
+
| `--max-results`, `-n` | Max results per platform | 10 |
|
|
98
|
+
| `--out-dir`, `-o` | Output directory | ./transcripts |
|
|
99
|
+
| `--youtube-api-key` | YouTube Data API key (or YOUTUBE_API_KEY env) | - |
|
|
100
|
+
| `--scrapingdog-api-key` | ScrapingDog API key (or SCRAPINGDOG_API_KEY env) | - |
|
|
101
|
+
| `--whisper-model`, `-m` | Whisper model: tiny,base,small,medium,large | base |
|
|
102
|
+
| `--interactive`, `-i` | Run in interactive mode | - |
|
|
103
|
+
|
|
104
|
+
### Artemis Detective (multi-agent research)
|
|
105
|
+
|
|
106
|
+
Tell Artemis what to find. It uses a multi-agent pipeline: an orchestrator plans up to five tasks, five workers run them in parallel (Brave search, transcript harvest, webpage fetch/browse), and the results are then synthesized into a report:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
openartemis investigate "Find out everything about what happened to John Smith"
|
|
110
|
+
# Or run without args for interactive prompt:
|
|
111
|
+
openartemis investigate
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Options: `--model gpt-4o-mini` (default, cheapest), `--out-dir ./detective_output`
|
|
115
|
+
|
|
116
|
+
Tools: `harvest_transcripts`, `brave_search`, `fetch_webpage`, `browse_webpage`, `list_transcripts`, `read_transcript`
|
|
117
|
+
|
|
118
|
+
### Admin panel
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
openartemis admin
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Approve users, view usage, revoke access, create users directly.
|
|
125
|
+
|
|
126
|
+
### Other commands
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
openartemis logout # Log out
|
|
130
|
+
openartemis config # Show configuration help
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
`openartemis config` shows the required environment variables and where to get API keys.
|
|
134
|
+
|
|
135
|
+
### Output
|
|
136
|
+
|
|
137
|
+
Per-post files in `out_dir`:
|
|
138
|
+
- `{platform}_{id}.json` - Metadata + segments + full text
|
|
139
|
+
- `{platform}_{id}.txt` - Plain text transcript
|
|
140
|
+
- `{platform}_{id}.srt` - SRT subtitles
|
|
141
|
+
|
|
142
|
+
## Pipeline
|
|
143
|
+
|
|
144
|
+
- **YouTube**: Data API search → video IDs → ScrapingDog API (if key set) or youtube-transcript-api for captions; if none, download audio via yt-dlp → Whisper
|
|
145
|
+
- **TikTok/Instagram/X**: Web search (site: operator) → yt-dlp download + extract audio → Whisper
|
|
146
|
+
- **Research**: Multi-agent (orchestrator + 5 workers) — Brave Search, trafilatura (fetch_webpage), Playwright (browse_webpage), transcript harvest
|
|
147
|
+
|
|
148
|
+
## Auth
|
|
149
|
+
|
|
150
|
+
- Session expiry: 7 days
|
|
151
|
+
- Rate limit: 5 failed logins per 15 min per username
|
|
152
|
+
- Password policy: minimum 8 characters
|
|
153
|
+
|
|
154
|
+
## Publishing to PyPI
|
|
155
|
+
|
|
156
|
+
To publish a new version:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
pip install build twine
|
|
160
|
+
python -m build
|
|
161
|
+
twine upload dist/*
|
|
162
|
+
```
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# OpenArtemis
|
|
2
|
+
|
|
3
|
+
CLI tool to harvest transcripts from YouTube, TikTok, Instagram, and X (Twitter). User-friendly design inspired by Claude CLI — no coding required.
|
|
4
|
+
|
|
5
|
+
**Login required.** First run creates an admin account. Other users register and wait for admin approval. Approval requests are emailed to the admin.
|
|
6
|
+
|
|
7
|
+
**Chat mode** (default): Run `openartemis` for a persistent chat with **Artemis**, an AI that researches for you. Say "research X" or "find out about Y" and Artemis uses tools (transcripts, web search, webpage fetch). Use `/research <query>` for multi-agent deep research. Type `/` for mode menu, `/logout` or `/exit` to quit.
|
|
8
|
+
|
|
9
|
+
**Harvest mode**: `openartemis harvest` for guided step-by-step transcript download.
|
|
10
|
+
|
|
11
|
+
**Admin**: `openartemis admin` — manage users, approve requests, view usage. Admins have unlimited API access.
|
|
12
|
+
|
|
13
|
+
## Requirements
|
|
14
|
+
|
|
15
|
+
- **Python 3.10+**
|
|
16
|
+
- **ffmpeg** (required for yt-dlp and Whisper; must be on PATH)
|
|
17
|
+
- **YouTube Data API key** (for YouTube search; set `YOUTUBE_API_KEY` or use `--youtube-api-key`)
|
|
18
|
+
- **ScrapingDog API key** (optional, for faster YouTube transcripts; set `SCRAPINGDOG_API_KEY`)
|
|
19
|
+
- **OpenAI API key** (for Artemis chat; set `OPENAI_API_KEY` — never commit your key)
|
|
20
|
+
- **Brave Search API key** (optional, for web search; set `BRAVE_SEARCH_API_KEY` — search.brave.com)
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
**From PyPI** (run anywhere):
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install openartemis
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Then run:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
openartemis
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**From source** (development):
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e .
|
|
40
|
+
playwright install chromium # For browse_webpage (JS-heavy sites)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Interactive mode (no coding required)
|
|
46
|
+
|
|
47
|
+
Run without arguments for chat (login required):
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
openartemis
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Command-line mode
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
openartemis harvest # Guided harvest (login required)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Arguments
|
|
60
|
+
|
|
61
|
+
| Argument | Description | Default |
|
|
62
|
+
|----------|-------------|---------|
|
|
63
|
+
| `query` | Search query (omit for interactive mode) | - |
|
|
64
|
+
| `--platforms`, `-p` | Comma-separated: youtube,tiktok,instagram,x | youtube |
|
|
65
|
+
| `--max-results`, `-n` | Max results per platform | 10 |
|
|
66
|
+
| `--out-dir`, `-o` | Output directory | ./transcripts |
|
|
67
|
+
| `--youtube-api-key` | YouTube Data API key (or YOUTUBE_API_KEY env) | - |
|
|
68
|
+
| `--scrapingdog-api-key` | ScrapingDog API key (or SCRAPINGDOG_API_KEY env) | - |
|
|
69
|
+
| `--whisper-model`, `-m` | Whisper model: tiny,base,small,medium,large | base |
|
|
70
|
+
| `--interactive`, `-i` | Run in interactive mode | - |
|
|
71
|
+
|
|
72
|
+
### Artemis Detective (multi-agent research)
|
|
73
|
+
|
|
74
|
+
Tell Artemis what to find. It uses a multi-agent pipeline: an orchestrator plans up to five tasks, five workers run them in parallel (Brave search, transcript harvest, webpage fetch/browse), and the results are then synthesized into a report:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
openartemis investigate "Find out everything about what happened to John Smith"
|
|
78
|
+
# Or run without args for interactive prompt:
|
|
79
|
+
openartemis investigate
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Options: `--model gpt-4o-mini` (default, cheapest), `--out-dir ./detective_output`
|
|
83
|
+
|
|
84
|
+
Tools: `harvest_transcripts`, `brave_search`, `fetch_webpage`, `browse_webpage`, `list_transcripts`, `read_transcript`
|
|
85
|
+
|
|
86
|
+
### Admin panel
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
openartemis admin
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Approve users, view usage, revoke access, create users directly.
|
|
93
|
+
|
|
94
|
+
### Other commands
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
openartemis logout # Log out
|
|
98
|
+
openartemis config # Show configuration help
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
`openartemis config` shows the required environment variables and where to get API keys.
|
|
102
|
+
|
|
103
|
+
### Output
|
|
104
|
+
|
|
105
|
+
Per-post files in `out_dir`:
|
|
106
|
+
- `{platform}_{id}.json` - Metadata + segments + full text
|
|
107
|
+
- `{platform}_{id}.txt` - Plain text transcript
|
|
108
|
+
- `{platform}_{id}.srt` - SRT subtitles
|
|
109
|
+
|
|
110
|
+
## Pipeline
|
|
111
|
+
|
|
112
|
+
- **YouTube**: Data API search → video IDs → ScrapingDog API (if key set) or youtube-transcript-api for captions; if none, download audio via yt-dlp → Whisper
|
|
113
|
+
- **TikTok/Instagram/X**: Web search (site: operator) → yt-dlp download + extract audio → Whisper
|
|
114
|
+
- **Research**: Multi-agent (orchestrator + 5 workers) — Brave Search, trafilatura (fetch_webpage), Playwright (browse_webpage), transcript harvest
|
|
115
|
+
|
|
116
|
+
## Auth
|
|
117
|
+
|
|
118
|
+
- Session expiry: 7 days
|
|
119
|
+
- Rate limit: 5 failed logins per 15 min per username
|
|
120
|
+
- Password policy: minimum 8 characters
|
|
121
|
+
|
|
122
|
+
## Publishing to PyPI
|
|
123
|
+
|
|
124
|
+
To publish a new version:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
pip install build twine
|
|
128
|
+
python -m build
|
|
129
|
+
twine upload dist/*
|
|
130
|
+
```
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Multi-agent research pipeline — orchestrator, workers, synthesis."""
|
|
2
|
+
|
|
3
|
+
from openartemis.agent.detective import DetectiveAgent, TOOLS
|
|
4
|
+
from openartemis.agent.orchestrator import create_plan
|
|
5
|
+
from openartemis.agent.worker import run_worker
|
|
6
|
+
from openartemis.agent.pipeline import ResearchPipeline
|
|
7
|
+
|
|
8
|
+
__all__ = ["DetectiveAgent", "TOOLS", "create_plan", "run_worker", "ResearchPipeline"]
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""AI detective agent — uses the OpenArtemis harvesters and web tools to research and find answers."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable
|
|
7
|
+
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
from openai import OpenAI
|
|
10
|
+
|
|
11
|
+
load_dotenv(Path(__file__).resolve().parents[2] / ".env")
|
|
12
|
+
|
|
13
|
+
from openartemis.harvesters.social import SocialHarvester
|
|
14
|
+
from openartemis.harvesters.youtube import (
|
|
15
|
+
TranscriptAPIInsufficientCreditsError,
|
|
16
|
+
YouTubeHarvester,
|
|
17
|
+
extract_youtube_video_id,
|
|
18
|
+
)
|
|
19
|
+
from openartemis.tools.web import brave_search, fetch_webpage, browse_webpage
|
|
20
|
+
|
|
21
|
+
# Tool schemas advertised to the model (transcript harvesting + web access).
def _function_tool(name, description, properties, required):
    """Wrap one tool definition in the chat-completions "function" envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


TOOLS = [
    _function_tool(
        "harvest_transcripts",
        "Search and harvest transcripts from video/social platforms. Use this to find content about a person, event, or topic. Returns the output directory path and count of harvested posts.",
        {
            "query": {
                "type": "string",
                "description": "Search query (e.g. person name, event, topic)",
            },
            "platforms": {
                "type": "array",
                "items": {"type": "string", "enum": ["youtube", "tiktok", "instagram", "x"]},
                "description": "Platforms to search",
                "default": ["youtube"],
            },
            "max_results": {
                "type": "integer",
                "description": "Max results per platform",
                "default": 5,
            },
        },
        ["query"],
    ),
    _function_tool(
        "list_transcripts",
        "List all harvested transcript files in the output directory. Returns file paths and titles.",
        {
            "out_dir": {
                "type": "string",
                "description": "Path to the transcripts directory",
            },
        },
        ["out_dir"],
    ),
    _function_tool(
        "read_transcript",
        "Read the full text of a harvested transcript (JSON or TXT file).",
        {
            "file_path": {
                "type": "string",
                "description": "Full path to the .json or .txt transcript file",
            },
        },
        ["file_path"],
    ),
    _function_tool(
        "brave_search",
        "Search the web via Brave Search API. Returns results with title, url, description.",
        {
            "q": {"type": "string", "description": "Search query"},
            "count": {"type": "integer", "description": "Max results (default 10)", "default": 10},
            "freshness": {"type": "string", "enum": ["pd", "pw", "pm", "py"], "description": "pd=day, pw=week, pm=month, py=year"},
        },
        ["q"],
    ),
    _function_tool(
        "fetch_webpage",
        "Fetch URL and extract main text (trafilatura). Fast for static HTML.",
        {
            "url": {"type": "string", "description": "URL to fetch"},
        },
        ["url"],
    ),
    _function_tool(
        "browse_webpage",
        "Fetch URL with headless browser (Playwright). Use for JS-heavy sites.",
        {
            "url": {"type": "string", "description": "URL to browse"},
            "selector": {"type": "string", "description": "Optional CSS selector for specific element"},
        },
        ["url"],
    ),
]
|
|
131
|
+
|
|
132
|
+
SYSTEM_PROMPT = """You are Artemis, a research detective. Your job is to find out everything you can about what the user asks.
|
|
133
|
+
|
|
134
|
+
You have access to tools that let you:
|
|
135
|
+
1. **harvest_transcripts** — Search YouTube, TikTok, Instagram, X for videos and download their transcripts
|
|
136
|
+
2. **list_transcripts** — See what transcripts have been harvested
|
|
137
|
+
3. **read_transcript** — Read the full text of any transcript
|
|
138
|
+
4. **brave_search** — Web search via Brave API
|
|
139
|
+
5. **fetch_webpage** — Fast text extraction from static HTML (trafilatura)
|
|
140
|
+
6. **browse_webpage** — Full render with headless browser for JS-heavy sites
|
|
141
|
+
|
|
142
|
+
Work systematically:
|
|
143
|
+
- Start by harvesting transcripts with relevant search queries (person names, events, dates, related terms)
|
|
144
|
+
- Read the transcripts to extract facts, quotes, and leads
|
|
145
|
+
- If you find new names, places, or angles, harvest more with follow-up searches
|
|
146
|
+
- Keep going until you have a comprehensive picture or hit diminishing returns
|
|
147
|
+
|
|
148
|
+
Be thorough but efficient. Prioritize high-value sources. Summarize findings clearly.
|
|
149
|
+
When done, provide a complete report with sources (URLs from the transcripts)."""
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class DetectiveAgent:
|
|
153
|
+
"""AI agent that uses transcriptharvest to research."""
|
|
154
|
+
|
|
155
|
+
def __init__(
|
|
156
|
+
self,
|
|
157
|
+
openai_api_key: str | None = None,
|
|
158
|
+
youtube_api_key: str | None = None,
|
|
159
|
+
scrapingdog_api_key: str | None = None,
|
|
160
|
+
transcriptapi_api_key: str | None = None,
|
|
161
|
+
model: str = "gpt-4o-mini",
|
|
162
|
+
out_dir: Path | None = None,
|
|
163
|
+
):
|
|
164
|
+
self.client = OpenAI(api_key=openai_api_key or os.environ.get("OPENAI_API_KEY"))
|
|
165
|
+
self.youtube_api_key = youtube_api_key or os.environ.get("YOUTUBE_API_KEY")
|
|
166
|
+
self.scrapingdog_api_key = scrapingdog_api_key or os.environ.get("SCRAPINGDOG_API_KEY")
|
|
167
|
+
self.transcriptapi_api_key = transcriptapi_api_key or os.environ.get("TRANSCRIPTAPI_API_KEY")
|
|
168
|
+
self.model = model
|
|
169
|
+
self.out_dir = Path(out_dir or "./detective_output").resolve()
|
|
170
|
+
self.out_dir.mkdir(parents=True, exist_ok=True)
|
|
171
|
+
|
|
172
|
+
def _harvest_transcripts(
|
|
173
|
+
self,
|
|
174
|
+
query: str,
|
|
175
|
+
platforms: list[str] | None = None,
|
|
176
|
+
max_results: int = 5,
|
|
177
|
+
) -> dict[str, Any]:
|
|
178
|
+
"""Tool: harvest transcripts."""
|
|
179
|
+
platforms = platforms or ["youtube"]
|
|
180
|
+
total = 0
|
|
181
|
+
|
|
182
|
+
try:
|
|
183
|
+
for platform in platforms:
|
|
184
|
+
if platform == "youtube":
|
|
185
|
+
if not self.youtube_api_key and not self.transcriptapi_api_key:
|
|
186
|
+
return {"error": "YouTube requires YOUTUBE_API_KEY (Google) or TRANSCRIPTAPI_API_KEY. Skipping YouTube."}
|
|
187
|
+
harvester = YouTubeHarvester(
|
|
188
|
+
api_key=self.youtube_api_key,
|
|
189
|
+
scrapingdog_api_key=self.scrapingdog_api_key,
|
|
190
|
+
transcriptapi_api_key=self.transcriptapi_api_key,
|
|
191
|
+
)
|
|
192
|
+
else:
|
|
193
|
+
harvester = SocialHarvester(platform=platform)
|
|
194
|
+
|
|
195
|
+
for _ in harvester.harvest(
|
|
196
|
+
query=query,
|
|
197
|
+
max_results=max_results,
|
|
198
|
+
out_dir=self.out_dir,
|
|
199
|
+
whisper_model="base",
|
|
200
|
+
):
|
|
201
|
+
total += 1
|
|
202
|
+
except TranscriptAPIInsufficientCreditsError as e:
|
|
203
|
+
return {"error": str(e), "harvested_count": 0}
|
|
204
|
+
|
|
205
|
+
return {
|
|
206
|
+
"out_dir": str(self.out_dir),
|
|
207
|
+
"query": query,
|
|
208
|
+
"platforms": platforms,
|
|
209
|
+
"harvested_count": total,
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
def harvest_url(self, url: str) -> dict[str, Any]:
|
|
213
|
+
"""Harvest transcript for a single YouTube URL. Returns result dict."""
|
|
214
|
+
if not extract_youtube_video_id(url):
|
|
215
|
+
return {"error": f"Invalid YouTube URL: {url}"}
|
|
216
|
+
if not self.youtube_api_key and not self.transcriptapi_api_key:
|
|
217
|
+
return {"error": "YouTube requires YOUTUBE_API_KEY or TRANSCRIPTAPI_API_KEY."}
|
|
218
|
+
harvester = YouTubeHarvester(
|
|
219
|
+
api_key=self.youtube_api_key,
|
|
220
|
+
scrapingdog_api_key=self.scrapingdog_api_key,
|
|
221
|
+
transcriptapi_api_key=self.transcriptapi_api_key,
|
|
222
|
+
)
|
|
223
|
+
try:
|
|
224
|
+
data = harvester.harvest_url(url, self.out_dir, whisper_model="base")
|
|
225
|
+
except TranscriptAPIInsufficientCreditsError as e:
|
|
226
|
+
return {"harvested_count": 0, "error": str(e)}
|
|
227
|
+
if data:
|
|
228
|
+
return {"harvested_count": 1, "out_dir": str(self.out_dir), "title": data.get("title", "")}
|
|
229
|
+
return {"harvested_count": 0, "error": "Failed to fetch transcript"}
|
|
230
|
+
|
|
231
|
+
def _list_transcripts(self, out_dir: str) -> dict[str, Any]:
|
|
232
|
+
"""Tool: list transcript files."""
|
|
233
|
+
path = Path(out_dir)
|
|
234
|
+
if not path.exists():
|
|
235
|
+
return {"error": f"Directory not found: {out_dir}"}
|
|
236
|
+
files = []
|
|
237
|
+
for f in path.glob("*.json"):
|
|
238
|
+
try:
|
|
239
|
+
with open(f, encoding="utf-8") as fp:
|
|
240
|
+
data = json.load(fp)
|
|
241
|
+
files.append({
|
|
242
|
+
"path": str(f),
|
|
243
|
+
"title": data.get("title", ""),
|
|
244
|
+
"url": data.get("url", ""),
|
|
245
|
+
"platform": data.get("platform", ""),
|
|
246
|
+
})
|
|
247
|
+
except Exception:
|
|
248
|
+
files.append({"path": str(f), "title": "(parse error)"})
|
|
249
|
+
return {"files": files, "count": len(files)}
|
|
250
|
+
|
|
251
|
+
def _read_transcript(self, file_path: str) -> dict[str, Any]:
|
|
252
|
+
"""Tool: read a transcript file."""
|
|
253
|
+
path = Path(file_path)
|
|
254
|
+
if not path.exists():
|
|
255
|
+
return {"error": f"File not found: {file_path}"}
|
|
256
|
+
try:
|
|
257
|
+
if path.suffix == ".json":
|
|
258
|
+
with open(path, encoding="utf-8") as f:
|
|
259
|
+
data = json.load(f)
|
|
260
|
+
return {
|
|
261
|
+
"title": data.get("title", ""),
|
|
262
|
+
"url": data.get("url", ""),
|
|
263
|
+
"platform": data.get("platform", ""),
|
|
264
|
+
"full_text": data.get("full_text", ""),
|
|
265
|
+
"segments_count": len(data.get("segments", [])),
|
|
266
|
+
}
|
|
267
|
+
if path.suffix == ".txt":
|
|
268
|
+
with open(path, encoding="utf-8") as f:
|
|
269
|
+
return {"full_text": f.read()}
|
|
270
|
+
return {"error": "Unsupported file type. Use .json or .txt"}
|
|
271
|
+
except Exception as e:
|
|
272
|
+
return {"error": str(e)}
|
|
273
|
+
|
|
274
|
+
def _execute_tool(self, name: str, args: dict[str, Any]) -> str:
|
|
275
|
+
"""Execute a tool and return JSON string result."""
|
|
276
|
+
if name == "harvest_transcripts":
|
|
277
|
+
result = self._harvest_transcripts(
|
|
278
|
+
query=args["query"],
|
|
279
|
+
platforms=args.get("platforms", ["youtube"]),
|
|
280
|
+
max_results=args.get("max_results", 5),
|
|
281
|
+
)
|
|
282
|
+
elif name == "list_transcripts":
|
|
283
|
+
result = self._list_transcripts(args["out_dir"])
|
|
284
|
+
elif name == "read_transcript":
|
|
285
|
+
result = self._read_transcript(args["file_path"])
|
|
286
|
+
elif name == "brave_search":
|
|
287
|
+
result = brave_search(
|
|
288
|
+
q=args["q"],
|
|
289
|
+
count=args.get("count", 10),
|
|
290
|
+
freshness=args.get("freshness"),
|
|
291
|
+
)
|
|
292
|
+
elif name == "fetch_webpage":
|
|
293
|
+
result = fetch_webpage(url=args["url"])
|
|
294
|
+
elif name == "browse_webpage":
|
|
295
|
+
result = browse_webpage(url=args["url"], selector=args.get("selector"))
|
|
296
|
+
else:
|
|
297
|
+
result = {"error": f"Unknown tool: {name}"}
|
|
298
|
+
return json.dumps(result, indent=2)
|
|
299
|
+
|
|
300
|
+
def run(
|
|
301
|
+
self,
|
|
302
|
+
user_prompt: str,
|
|
303
|
+
max_turns: int = 20,
|
|
304
|
+
on_tool_call: Callable[[str, dict], None] | None = None,
|
|
305
|
+
) -> str:
|
|
306
|
+
"""Run the agent. Returns the final report."""
|
|
307
|
+
messages = [
|
|
308
|
+
{"role": "system", "content": SYSTEM_PROMPT},
|
|
309
|
+
{"role": "user", "content": user_prompt},
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
for turn in range(max_turns):
|
|
313
|
+
response = self.client.chat.completions.create(
|
|
314
|
+
model=self.model,
|
|
315
|
+
messages=messages,
|
|
316
|
+
tools=TOOLS,
|
|
317
|
+
tool_choice="auto",
|
|
318
|
+
)
|
|
319
|
+
choice = response.choices[0]
|
|
320
|
+
msg = choice.message
|
|
321
|
+
|
|
322
|
+
# Convert to dict for API (handles model objects)
|
|
323
|
+
msg_dict = (
|
|
324
|
+
msg.model_dump() if hasattr(msg, "model_dump") else
|
|
325
|
+
{"role": msg.role, "content": msg.content or "", "tool_calls": getattr(msg, "tool_calls", None)}
|
|
326
|
+
)
|
|
327
|
+
messages.append(msg_dict)
|
|
328
|
+
|
|
329
|
+
if not msg.tool_calls:
|
|
330
|
+
return (msg.content or "").strip()
|
|
331
|
+
|
|
332
|
+
for tc in msg.tool_calls:
|
|
333
|
+
name = tc.function.name
|
|
334
|
+
args = json.loads(tc.function.arguments)
|
|
335
|
+
if on_tool_call:
|
|
336
|
+
on_tool_call(name, args)
|
|
337
|
+
result = self._execute_tool(name, args)
|
|
338
|
+
messages.append({
|
|
339
|
+
"role": "tool",
|
|
340
|
+
"tool_call_id": tc.id,
|
|
341
|
+
"content": result,
|
|
342
|
+
})
|
|
343
|
+
|
|
344
|
+
return "Max turns reached. Review harvested transcripts manually."
|