ddg-deep-research 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+
9
+ # OS
10
+ .DS_Store
11
+ Thumbs.db
12
+
13
+ # IDE
14
+ .vscode/
15
+ .idea/
16
+
17
+ # Logs
18
+ *.log
19
+ # Research outputs
20
+ outputs/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 crftsmnd
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: ddg-deep-research
3
+ Version: 0.2.0
4
+ Summary: 5-stage deep research pipeline using DuckDuckGo MCP — free, no API key, no rate limits
5
+ Project-URL: Homepage, https://github.com/crftsmnd/ddg-deep-research
6
+ Project-URL: Repository, https://github.com/crftsmnd/ddg-deep-research
7
+ Project-URL: Bug Tracker, https://github.com/crftsmnd/ddg-deep-research/issues
8
+ Author: crftsmnd
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: agent,deep-research,duckduckgo,llm,mcp,research,web-search
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: mcp>=1.0
26
+ Description-Content-Type: text/markdown
27
+
28
+ # 🧠 ddg-deep-research
29
+
30
+ **5-stage deep research pipeline** using DuckDuckGo MCP — **free, no API key, no rate limits**.
31
+
32
+ ```
33
+ pip install ddg-deep-research
34
+ ddg-deep-research ddg_search --query "your query" --output results.json
35
+ ```
36
+
37
+ ## Why this exists
38
+
39
+ Every other deep research agent requires OpenAI / Anthropic / Google API keys — or runs expensive local models. This one uses **DuckDuckGo MCP** for both search and content extraction. Completely free. Zero API keys. Zero rate limits.
40
+
41
+ ## Pipeline
42
+
43
+ ```
44
+ Stage 1: Decompose ──→ 3-6 sub-questions + search strategy
45
+ Stage 2: Gather ──→ DuckDuckGo MCP search (parallel, unlimited)
46
+ Stage 3: Deep Read ──→ DuckDuckGo MCP fetch_content + browser_use
47
+ Stage 4: Verify ──→ Cross-reference claims, flag contradictions
48
+ Stage 5: Synthesize ──→ Cited brief.md + .provenance.md sidecar
49
+ ```
50
+
51
+ ## Quick Start
52
+
53
+ ```bash
54
+ # Search the web (free, no API key!)
55
+ ddg-deep-research ddg_search --query "latest advances in AI reasoning" --output results.json
56
+
57
+ # Fetch a webpage
58
+ ddg-deep-research ddg_fetch --url "https://example.com/article" --output-dir extracted/
59
+ ```
60
+
61
+ ### Full Pipeline
62
+
63
+ ```bash
64
+ # Stage 1: Break question into sub-questions (template)
65
+ ddg-deep-research decompose --question "How is RAG evolving?" --output plan.json
66
+
67
+ # Stage 2: Search
68
+ ddg-deep-research ddg_search --query "RAG architectures 2026" --output results.json
69
+
70
+ # Stage 3: Fetch content
71
+ ddg-deep-research ddg_fetch --url "https://..." --output-dir extracted/
72
+
73
+ # Stage 4: Merge results
74
+ ddg-deep-research merge --input-dir raw/ --output merged.json
75
+
76
+ # Stage 5: Clean & verify
77
+ ddg-deep-research clean --input extracted/ --output cleaned/
78
+ ddg-deep-research verify --input cleaned/cleaned.json --output verified.json
79
+
80
+ # Stage 6: Generate final brief
81
+ ddg-deep-research synthesize --verified verified.json --question "..." --output-dir outputs/ --today $(date +%Y-%m-%d)
82
+ ```
83
+
84
+ ### Parallel DAG Execution
85
+
86
+ ```bash
87
+ ddg-deep-research dag --plan workflow.json --verbose
88
+ ```
89
+
90
+ Input JSON:
91
+ ```json
92
+ {
93
+ "tasks": [
94
+ {"id": "search_1", "depends_on": [], "command": "ddg_search --query ..."},
95
+ {"id": "fetch_1", "depends_on": ["search_1"], "command": "ddg_fetch --url ..."},
96
+ {"id": "synthesize", "depends_on": ["search_1", "fetch_1"], "command": "synthesize ..."}
97
+ ]
98
+ }
99
+ ```
100
+
101
+ ## Python API
102
+
103
+ ```python
104
+ import asyncio
105
+ from ddg_deep_research.ddg_mcp import search_web, fetch_content
106
+
107
+ async def main():
108
+ results = await search_web("your query", max_results=10)
109
+ for r in results:
110
+ print(f"{r['title']}: {r['url']}")
111
+
112
+ content = await fetch_content("https://example.com")
113
+ print(content[:500])
114
+
115
+ asyncio.run(main())
116
+ ```
117
+
118
+ ## Requirements
119
+
120
+ - Python 3.10+
121
+ - `uv` installed (for duckduckgo-mcp-server): `curl -LsSf https://astral.sh/uv/install.sh | sh`
122
+ - No API keys. No subscriptions. Nothing.
123
+
124
+ ## How it works
125
+
126
+ This package wraps [duckduckgo-mcp-server](https://github.com/nicholasgriffintn/duckduckgo-mcp-server) via Python's MCP stdio transport. All search and content extraction goes through DuckDuckGo's free anonymous API. The 5-stage pipeline is modeled after production research agents but without the API costs.
127
+
128
+ ## Comparison
129
+
130
+ | Feature | OpenAI Deep Research | LangChain Deep Research | **ddg-deep-research** |
131
+ |---|---|---|---|
132
+ | API key needed | ✅ $200/mo | ✅ OpenAI key | **❌ Free** |
133
+ | Search engine | Bing/Browser | Custom | **DuckDuckGo** |
134
+ | Content extraction | Built-in | Built-in | **DuckDuckGo MCP** |
135
+ | Provenance tracking | ✅ | ✅ | **✅ .provenance.md** |
136
+ | DAG orchestration | ❌ | ❌ | **✅ Built-in** |
137
+ | Open source | ❌ | ✅ | **✅ MIT** |
138
+ | `pip install` | ❌ | ❌ | **✅ pip install** |
139
+
140
+ ## License
141
+
142
+ MIT
@@ -0,0 +1,115 @@
1
+ # 🧠 ddg-deep-research
2
+
3
+ **5-stage deep research pipeline** using DuckDuckGo MCP — **free, no API key, no rate limits**.
4
+
5
+ ```
6
+ pip install ddg-deep-research
7
+ ddg-deep-research ddg_search --query "your query" --output results.json
8
+ ```
9
+
10
+ ## Why this exists
11
+
12
+ Every other deep research agent requires OpenAI / Anthropic / Google API keys — or runs expensive local models. This one uses **DuckDuckGo MCP** for both search and content extraction. Completely free. Zero API keys. Zero rate limits.
13
+
14
+ ## Pipeline
15
+
16
+ ```
17
+ Stage 1: Decompose ──→ 3-6 sub-questions + search strategy
18
+ Stage 2: Gather ──→ DuckDuckGo MCP search (parallel, unlimited)
19
+ Stage 3: Deep Read ──→ DuckDuckGo MCP fetch_content + browser_use
20
+ Stage 4: Verify ──→ Cross-reference claims, flag contradictions
21
+ Stage 5: Synthesize ──→ Cited brief.md + .provenance.md sidecar
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ ```bash
27
+ # Search the web (free, no API key!)
28
+ ddg-deep-research ddg_search --query "latest advances in AI reasoning" --output results.json
29
+
30
+ # Fetch a webpage
31
+ ddg-deep-research ddg_fetch --url "https://example.com/article" --output-dir extracted/
32
+ ```
33
+
34
+ ### Full Pipeline
35
+
36
+ ```bash
37
+ # Stage 1: Break question into sub-questions (template)
38
+ ddg-deep-research decompose --question "How is RAG evolving?" --output plan.json
39
+
40
+ # Stage 2: Search
41
+ ddg-deep-research ddg_search --query "RAG architectures 2026" --output results.json
42
+
43
+ # Stage 3: Fetch content
44
+ ddg-deep-research ddg_fetch --url "https://..." --output-dir extracted/
45
+
46
+ # Stage 4: Merge results
47
+ ddg-deep-research merge --input-dir raw/ --output merged.json
48
+
49
+ # Stage 5: Clean & verify
50
+ ddg-deep-research clean --input extracted/ --output cleaned/
51
+ ddg-deep-research verify --input cleaned/cleaned.json --output verified.json
52
+
53
+ # Stage 6: Generate final brief
54
+ ddg-deep-research synthesize --verified verified.json --question "..." --output-dir outputs/ --today $(date +%Y-%m-%d)
55
+ ```
56
+
57
+ ### Parallel DAG Execution
58
+
59
+ ```bash
60
+ ddg-deep-research dag --plan workflow.json --verbose
61
+ ```
62
+
63
+ Input JSON:
64
+ ```json
65
+ {
66
+ "tasks": [
67
+ {"id": "search_1", "depends_on": [], "command": "ddg_search --query ..."},
68
+ {"id": "fetch_1", "depends_on": ["search_1"], "command": "ddg_fetch --url ..."},
69
+ {"id": "synthesize", "depends_on": ["search_1", "fetch_1"], "command": "synthesize ..."}
70
+ ]
71
+ }
72
+ ```
73
+
74
+ ## Python API
75
+
76
+ ```python
77
+ import asyncio
78
+ from ddg_deep_research.ddg_mcp import search_web, fetch_content
79
+
80
+ async def main():
81
+ results = await search_web("your query", max_results=10)
82
+ for r in results:
83
+ print(f"{r['title']}: {r['url']}")
84
+
85
+ content = await fetch_content("https://example.com")
86
+ print(content[:500])
87
+
88
+ asyncio.run(main())
89
+ ```
90
+
91
+ ## Requirements
92
+
93
+ - Python 3.10+
94
+ - `uv` installed (for duckduckgo-mcp-server): `curl -LsSf https://astral.sh/uv/install.sh | sh`
95
+ - No API keys. No subscriptions. Nothing.
96
+
97
+ ## How it works
98
+
99
+ This package wraps [duckduckgo-mcp-server](https://github.com/nicholasgriffintn/duckduckgo-mcp-server) via Python's MCP stdio transport. All search and content extraction goes through DuckDuckGo's free anonymous API. The 5-stage pipeline is modeled after production research agents but without the API costs.
100
+
101
+ ## Comparison
102
+
103
+ | Feature | OpenAI Deep Research | LangChain Deep Research | **ddg-deep-research** |
104
+ |---|---|---|---|
105
+ | API key needed | ✅ $200/mo | ✅ OpenAI key | **❌ Free** |
106
+ | Search engine | Bing/Browser | Custom | **DuckDuckGo** |
107
+ | Content extraction | Built-in | Built-in | **DuckDuckGo MCP** |
108
+ | Provenance tracking | ✅ | ✅ | **✅ .provenance.md** |
109
+ | DAG orchestration | ❌ | ❌ | **✅ Built-in** |
110
+ | Open source | ❌ | ✅ | **✅ MIT** |
111
+ | `pip install` | ❌ | ❌ | **✅ pip install** |
112
+
113
+ ## License
114
+
115
+ MIT
@@ -0,0 +1,51 @@
1
+ [project]
2
+ name = "ddg-deep-research"
3
+ version = "0.2.0"
4
+ description = "5-stage deep research pipeline using DuckDuckGo MCP — free, no API key, no rate limits"
5
+ readme = "README.md"
6
+ license = {text = "MIT"}
7
+ requires-python = ">=3.10"
8
+ authors = [
9
+ {name = "crftsmnd"},
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 4 - Beta",
13
+ "Intended Audience :: Developers",
14
+ "Intended Audience :: Science/Research",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Topic :: Software Development :: Libraries :: Python Modules",
24
+ ]
25
+ keywords = ["deep-research", "duckduckgo", "research", "agent", "mcp", "web-search", "llm"]
26
+ dependencies = [
27
+ "mcp>=1.0",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/crftsmnd/ddg-deep-research"
32
+ Repository = "https://github.com/crftsmnd/ddg-deep-research"
33
+ "Bug Tracker" = "https://github.com/crftsmnd/ddg-deep-research/issues"
34
+
35
+ [project.scripts]
36
+ ddg-deep-research = "ddg_deep_research.research_pipeline:main"
37
+
38
+ [build-system]
39
+ requires = ["hatchling"]
40
+ build-backend = "hatchling.build"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["src/ddg_deep_research"]
44
+
45
+ [tool.hatch.build.targets.sdist]
46
+ include = [
47
+ "src/ddg_deep_research/*.py",
48
+ "README.md",
49
+ "LICENSE",
50
+ "pyproject.toml",
51
+ ]
@@ -0,0 +1,8 @@
1
+ """
2
+ ddg-deep-research — 5-stage deep research pipeline using DuckDuckGo MCP (free, no API key).
3
+
4
+ Zero API keys required. Zero rate limits. Zero paywalls.
5
+ """
6
+
7
# Package version string; the same value appears as `version` in pyproject.toml.
__version__ = "0.2.0"
# License identifier for the package (matches the bundled MIT LICENSE file).
__license__ = "MIT"
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DuckDuckGo MCP Bridge — free, no-API-key search & content extraction.
4
+
5
+ Connects to duckduckgo-mcp-server via stdio MCP transport and exposes
6
+ two commands: search (web search) and fetch (page content extraction).
7
+
8
+ Usage:
9
+ uv run ddg_mcp.py search "query" [--max-results 10] [--region us-en]
10
+ uv run ddg_mcp.py fetch <url> [--max-length 8000]
11
+ """
12
+
13
+ import asyncio
14
+ import json
15
+ import re
16
+ import sys
17
+
18
+ from mcp import ClientSession, StdioServerParameters
19
+ from mcp.client.stdio import stdio_client
20
+
21
+ # ── Regex to parse "N. Title\n URL: ...\n Summary: ..." format ────────
22
RESULT_BLOCK_RE = re.compile(
    r"^\d+\.\s+(?P<title>.+?)\n\s+URL:\s+(?P<url>\S+?)\s*\n\s+Summary:\s+(?P<snippet>.*?)$",
    flags=re.MULTILINE,
)


def parse_search_results(text: str) -> list[dict]:
    """Convert the numbered plain-text result listing into a list of dicts.

    Each entry is expected to look like::

        N. Title
           URL: https://...
           Summary: one line of text

    Args:
        text: Raw text to scan for result entries.

    Returns:
        One dict per matched entry with the keys ``title``, ``url``
        (trailing slashes removed), ``snippet`` and ``source`` (always
        ``"duckduckgo"``). Text that matches nothing yields an empty list.
    """
    return [
        {
            "title": hit["title"].strip(),
            "url": hit["url"].rstrip("/"),
            "snippet": hit["snippet"].strip(),
            "source": "duckduckgo",
        }
        for hit in RESULT_BLOCK_RE.finditer(text)
    ]
41
+
42
+
43
async def search_web(
    query: str, max_results: int = 10, region: str = "wt-wt"
) -> list[dict]:
    """Run a web search through the DuckDuckGo MCP server.

    Starts ``uvx duckduckgo-mcp-server`` over the MCP stdio transport,
    calls its ``search`` tool, and parses the returned text.

    Args:
        query: Search query string.
        max_results: Value forwarded as the tool's ``max_results`` argument.
        region: Value forwarded as the tool's ``region`` argument.

    Returns:
        Structured results as produced by ``parse_search_results``.
    """
    params = StdioServerParameters(command="uvx", args=["duckduckgo-mcp-server"])
    tool_args = {"query": query, "max_results": max_results, "region": region}

    async with stdio_client(params) as (reader, writer):
        async with ClientSession(reader, writer) as session:
            await session.initialize()
            response = await session.call_tool("search", tool_args)
            # The response content can mix item types; only items exposing
            # a ``text`` attribute contribute to the parsed output.
            body = "".join(
                part.text for part in response.content if hasattr(part, "text")
            )
            return parse_search_results(body)
64
+
65
+
66
async def fetch_content(url: str, max_length: int = 8000) -> str:
    """Fetch a webpage's text through the DuckDuckGo MCP server.

    Starts ``uvx duckduckgo-mcp-server`` over the MCP stdio transport and
    calls its ``fetch_content`` tool.

    Args:
        url: Address of the page to fetch.
        max_length: Value forwarded as the tool's ``max_length`` argument.

    Returns:
        The concatenated text of every response item that carries text.
    """
    params = StdioServerParameters(command="uvx", args=["duckduckgo-mcp-server"])
    tool_args = {"url": url, "max_length": max_length}

    async with stdio_client(params) as (reader, writer):
        async with ClientSession(reader, writer) as session:
            await session.initialize()
            response = await session.call_tool("fetch_content", tool_args)
            # Items without a ``text`` attribute are skipped.
            return "".join(
                part.text for part in response.content if hasattr(part, "text")
            )
84
+
85
+
86
+ # ── CLI ──────────────────────────────────────────────────────────────────
87
+
88
+
89
def _next_flag_value(args_iter, flag: str) -> str:
    """Return the argument following *flag*, exiting with status 1 if absent."""
    try:
        return next(args_iter)
    except StopIteration:
        print(f"Missing value for {flag}", file=sys.stderr)
        sys.exit(1)


def _next_int_flag_value(args_iter, flag: str) -> int:
    """Return the integer argument following *flag*, exiting with status 1 if invalid."""
    raw = _next_flag_value(args_iter, flag)
    try:
        return int(raw)
    except ValueError:
        print(f"{flag} expects an integer, got {raw!r}", file=sys.stderr)
        sys.exit(1)


def main():
    """Command-line entry point for the DuckDuckGo MCP bridge.

    Reads ``sys.argv`` and dispatches to one of two commands:

    * ``search <query> [--max-results N] [--region X]`` prints the results
      as JSON.
    * ``fetch <url> [--max-length N]`` prints the page text with the
      leading ``Content from ...`` header removed.

    Exits with status 1 on a missing or unknown command, a missing
    positional argument, or a flag given without a valid value.
    Unrecognised flags are ignored.
    """
    if len(sys.argv) < 2:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "search":
        if len(sys.argv) < 3:
            print("Usage: ddg_mcp.py search <query> [--max-results N] [--region X]", file=sys.stderr)
            sys.exit(1)
        query = sys.argv[2]
        max_results = 10
        region = "wt-wt"
        # Flags and their values share one iterator, so consuming a value
        # also advances the loop past it.
        args_iter = iter(sys.argv[3:])
        for arg in args_iter:
            if arg == "--max-results":
                max_results = _next_int_flag_value(args_iter, arg)
            elif arg == "--region":
                region = _next_flag_value(args_iter, arg)
        results = asyncio.run(search_web(query, max_results, region))
        print(json.dumps(results, indent=2, ensure_ascii=False))

    elif cmd == "fetch":
        if len(sys.argv) < 3:
            print("Usage: ddg_mcp.py fetch <url> [--max-length N]", file=sys.stderr)
            sys.exit(1)
        url = sys.argv[2]
        max_length = 8000
        args_iter = iter(sys.argv[3:])
        for arg in args_iter:
            if arg == "--max-length":
                max_length = _next_int_flag_value(args_iter, arg)
        content = asyncio.run(fetch_content(url, max_length))
        # Output has a leading prefix like "Content from https://...\n\n"
        # Strip it for cleaner output
        clean = re.sub(r"^Content from .+?\n\n", "", content, count=1)
        print(clean)

    else:
        print(f"Unknown command: {cmd}", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,431 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Deep Research Pipeline — orchestration helpers for the 5-stage deep research workflow.
4
+
5
+ Subcommands:
6
+ ddg_search — search web via DuckDuckGo MCP (free, no API key)
7
+ ddg_fetch — fetch page content via DuckDuckGo MCP
8
+ decompose — break a research question into sub-questions + search strategy
9
+ merge — merge & deduplicate search results from parallel subagents
10
+ clean — normalize extracted text from browser_use calls
11
+ verify — cross-reference claims across sources
12
+ synthesize — generate cited brief.md + .provenance.md
13
+
14
+ Usage: uv run research_pipeline.py <subcommand> [options]
15
+ """
16
+
17
+ import argparse
18
+ import asyncio
19
+ import json
20
+ import os
21
+ import re
22
+ import sys
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+ from urllib.parse import urlparse
26
+
27
+ # ── Helpers ──────────────────────────────────────────────────────────────
28
+
29
TOPIC_SLUG_RE = re.compile(r"[^a-z0-9-]+")


def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated slug for *text*.

    Every run of characters other than ``a-z``, ``0-9`` and ``-`` is
    replaced by a single hyphen, then leading and trailing hyphens are
    removed. Hyphens already present in the input are kept as they are.
    """
    lowered = text.lower()
    hyphenated = TOPIC_SLUG_RE.sub("-", lowered)
    return hyphenated.strip("-")
34
+
35
+
36
def ensure_dir(path: str) -> str:
    """Create directory *path* (and any missing parents) and return *path*.

    An already existing directory is not an error.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
    return path
39
+
40
+
41
def load_json(path: str):
    """Read the JSON document at *path* and return the decoded object.

    The file is decoded as UTF-8 instead of the platform default, so
    non-ASCII text loads the same way on every platform.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
44
+
45
+
46
def save_json(obj, path: str):
    """Write *obj* to *path* as indented JSON, keeping non-ASCII characters literal.

    ``ensure_ascii=False`` emits characters such as ``é`` or ``→``
    unescaped, so the file is opened as UTF-8 explicitly; the platform
    default encoding may be unable to represent them.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)
49
+
50
+
51
def domain(url: str) -> str:
    """Return the network-location part of *url*.

    If the URL cannot be parsed, the input is returned unchanged.
    """
    try:
        parsed = urlparse(url)
    except Exception:
        return url
    return parsed.netloc
56
+
57
+
58
+ # ── Subcommand: dag — Directed Acyclic Graph orchestration ────────────────
59
+
60
def cmd_dag(args):
    """
    Execute sub-tasks in dependency order.

    Input: JSON file (``args.plan``) with:
    {
      "tasks": [
        {"id": "search_1", "depends_on": [], "command": "ddg_search --query ... --output ..."},
        {"id": "fetch_1", "depends_on": ["search_1"], "command": "ddg_fetch ..."},
        {"id": "synthesize", "depends_on": ["search_1", "fetch_1"], "command": "synthesize ..."}
      ]
    }

    Each pass collects every task whose dependencies have all finished and
    runs those tasks one after another as subprocesses of this script.
    A task counts as finished whatever its outcome (success, error or
    timeout), so its dependents still run afterwards.

    Reads ``args.plan``, ``args.task_timeout``, ``args.verbose`` and
    ``args.output``. The execution report is saved to ``args.output`` when
    set, otherwise printed as JSON.

    Raises:
        RuntimeError: If unfinished tasks remain but none is runnable
            (a dependency cycle or a reference to an unknown task id).
        ValueError: If a task command has unbalanced quotes.
    """
    import shlex
    import subprocess

    plan = load_json(args.plan)
    tasks = {t["id"]: t for t in plan["tasks"]}
    completed = set()
    results = {}
    max_iterations = 100
    iteration = 0

    while len(completed) < len(tasks) and iteration < max_iterations:
        iteration += 1
        ready = [
            t for t in plan["tasks"]
            if t["id"] not in completed
            and all(dep in completed for dep in t.get("depends_on", []))
        ]

        if not ready:
            blocked = [t["id"] for t in plan["tasks"] if t["id"] not in completed]
            raise RuntimeError(f"Dependency cycle detected or unsatisfied: {blocked}")

        # Run this pass's ready tasks one at a time.
        for task in ready:
            # shlex keeps quoted arguments (e.g. a multi-word --query) intact.
            cmd_parts = shlex.split(task["command"])
            wrapped_cmd = ["uv", "run", "python3", __file__] + cmd_parts
            try:
                result = subprocess.run(wrapped_cmd, capture_output=True, text=True, timeout=args.task_timeout)
                results[task["id"]] = {
                    "status": "success" if result.returncode == 0 else "error",
                    # Only the tail of each stream is kept in the report.
                    "stdout": result.stdout[-500:],
                    "stderr": result.stderr[-500:],
                }
            except subprocess.TimeoutExpired:
                results[task["id"]] = {
                    "status": "timeout",
                    "stdout": "",
                    "stderr": f"Timed out after {args.task_timeout}s",
                }
            completed.add(task["id"])

        if args.verbose:
            print(f"[DAG] Iteration {iteration}: completed {len(completed)}/{len(tasks)} tasks")

    # Save execution report
    output = {
        "plan_file": args.plan,
        "tasks_total": len(tasks),
        "tasks_completed": len(completed),
        "results": results,
    }
    if args.output:
        save_json(output, args.output)
        print(f"DAG execution report → {args.output}")
    else:
        print(json.dumps(output, indent=2, ensure_ascii=False))
126
+
127
+
128
+ # ── Subcommand: ddg_search ────────────────────────────────────────────────
129
+
130
def cmd_ddg_search(args):
    """Run a DuckDuckGo MCP search and save the results as JSON.

    Reads ``args.query``, ``args.max_results`` and ``args.region``;
    writes the result list to ``args.output`` and prints a summary.
    """
    from ddg_deep_research.ddg_mcp import search_web

    search_coro = search_web(args.query, args.max_results, args.region)
    hits = asyncio.run(search_coro)
    save_json(hits, args.output)
    print(f"DuckDuckGo search returned {len(hits)} results → {args.output}")
137
+
138
+
139
+ # ── Subcommand: ddg_fetch ─────────────────────────────────────────────────
140
+
141
def cmd_ddg_fetch(args):
    """Fetch one page via the DuckDuckGo MCP bridge and save it as a text file.

    Reads ``args.url``, ``args.max_length`` and ``args.output_dir``. The
    output file is named after a slug of the URL (scheme removed, slashes
    turned into hyphens) with a ``.txt`` suffix, inside ``args.output_dir``,
    which is created if missing.
    """
    from ddg_deep_research.ddg_mcp import fetch_content

    content = asyncio.run(fetch_content(args.url, args.max_length))
    out_dir = ensure_dir(args.output_dir)
    slug = slugify(args.url.replace("https://", "").replace("http://", "").replace("/", "-"))
    out_path = os.path.join(out_dir, f"{slug}.txt")
    # Page text often contains non-ASCII characters. Write UTF-8 explicitly
    # so the write does not depend on the platform default encoding and
    # matches the UTF-8 read in cmd_clean.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Fetched {len(content)} chars → {out_path}")
152
+
153
+
154
+ # ── Subcommand: decompose ─────────────────────────────────────────────────
155
+
156
def cmd_decompose(args):
    """Write an empty research-plan template for ``args.question``.

    The plan is saved to ``args.output`` (its parent directory is created
    if missing) with an empty ``sub_questions`` list; the file's ``notes``
    field tells the agent how to fill it in.
    """
    ensure_dir(str(Path(args.output).parent))

    # This is called by the agent AFTER the agent has decomposed the question.
    # The agent writes the plan, and this script just saves it in a standard format.
    question = args.question
    template = {
        "original_question": question,
        "generated_at": datetime.now().isoformat(),
        "slug": slugify(question),
        "status": "plan_ready",
        "sub_questions": [],
        "notes": (
            "The agent should populate sub_questions with 3-6 items. "
            "Each item: {question, engine, priority, max_results, search_strategy}"
        ),
    }
    save_json(template, args.output)

    print(f"Plan template saved to {args.output}")
    print("Agent must now populate the sub_questions array and re-run.")
177
+
178
+
179
+ # ── Subcommand: merge ────────────────────────────────────────────────────
180
+
181
def cmd_merge(args):
    """Merge and deduplicate search results from multiple JSON files.

    Every ``*.json`` file directly inside ``args.input_dir`` is read in
    sorted order. A file may be a bare list of items or a dict holding
    the list under ``results`` or ``items``. Items without a URL (looked
    up under ``url``, ``link`` or ``href``) are skipped. Two URLs count
    as the same when they match after removing trailing slashes and
    lowercasing; the first occurrence wins.

    The report written to ``args.output`` contains:
        total_urls:    URL-bearing items seen before deduplication
        deduped_count: unique items kept
        results:       kept items sorted by URL
        generated_at:  local ISO timestamp
    """
    input_dir = Path(args.input_dir)
    results = []
    seen_urls = set()
    total_seen = 0

    for fpath in sorted(input_dir.glob("*.json")):
        data = load_json(str(fpath))
        items = data if isinstance(data, list) else data.get("results", data.get("items", []))
        for item in items:
            url = item.get("url", "") or item.get("link", "") or item.get("href", "")
            if not url:
                continue
            # Count before the duplicate check so total_urls reflects the
            # raw input size rather than repeating deduped_count.
            total_seen += 1
            norm_url = url.rstrip("/").lower()
            if norm_url in seen_urls:
                continue
            seen_urls.add(norm_url)
            results.append({
                "title": item.get("title", "") or item.get("name", ""),
                "url": url,
                "snippet": item.get("snippet", "") or item.get("description", "") or "",
                # Fall back to the file name when the item has no source.
                "source": item.get("source", fpath.stem),
                "domain": domain(url),
                "sub_question": item.get("sub_question", ""),
            })

    output = {
        "total_urls": total_seen,
        "deduped_count": len(results),
        "results": sorted(results, key=lambda r: r["url"]),
        "generated_at": datetime.now().isoformat(),
    }
    save_json(output, args.output)
    print(f"Merged {len(results)} unique results → {args.output}")
215
+
216
+
217
+ # ── Subcommand: clean ────────────────────────────────────────────────────
218
+
219
def cmd_clean(args):
    """Normalize extracted text files into structured JSON records.

    Every file under the input directory (searched recursively) is read
    as UTF-8 with undecodable bytes replaced. Each line is stripped and
    blank lines are dropped. One record per file is written to
    ``<args.output>/cleaned.json``; ``args.output`` is created if missing.

    The input directory is taken from ``args.input_dir`` when present,
    otherwise from ``args.input``, which is the attribute argparse sets
    for the ``--input`` option.

    Raises:
        SystemExit: If neither attribute provides an input directory.
    """
    source = getattr(args, "input_dir", None) or getattr(args, "input", None)
    if not source:
        raise SystemExit("clean: an input directory is required (--input)")
    input_dir = Path(source)
    ensure_dir(args.output)

    records = []
    for fpath in sorted(input_dir.rglob("*")):
        if fpath.is_dir():
            continue
        text = fpath.read_text(encoding="utf-8", errors="replace")
        # Strip excessive whitespace
        stripped = [line.strip() for line in text.split("\n")]
        text = "\n".join(line for line in stripped if line)
        records.append({
            "source_file": fpath.name,
            "url": fpath.stem,  # Agent should write files named by URL-slug
            "length_chars": len(text),
            "length_words": len(text.split()),
            "content": text,
            "cleaned_at": datetime.now().isoformat(),
        })

    outpath = os.path.join(args.output, "cleaned.json")
    save_json(records, outpath)
    print(f"Cleaned {len(records)} extracts → {outpath}")
244
+
245
+
246
+ # ── Subcommand: verify ───────────────────────────────────────────────────
247
+
248
def cmd_verify(args):
    """Build a verification skeleton from the cleaned extracts.

    Loads the record list from ``args.input`` and writes a report to
    ``args.output`` with one entry per record, each marked unverified
    with zero claims extracted.
    """
    records = load_json(args.input)

    # This prepares a verification skeleton — the agent (LLM) does the actual
    # claim extraction and cross-referencing, then writes back.
    skeleton_sources = [
        {
            "url": record.get("url", "unknown"),
            "length_chars": record.get("length_chars", 0),
            "verified": False,
            "claims_extracted": 0,
        }
        for record in records
    ]

    report = {
        "total_sources": len(skeleton_sources),
        "sources": skeleton_sources,
        "verified_at": datetime.now().isoformat(),
        "status": "ready_for_agent_verification",
        "note": "Agent should read each source, extract claims, and annotate verified_at/confidence per claim.",
    }
    save_json(report, args.output)
    print(f"Verification skeleton ({len(skeleton_sources)} sources) → {args.output}")
271
+
272
+
273
+ # ── Subcommand: synthesize ───────────────────────────────────────────────
274
+
275
def cmd_synthesize(args):
    """Generate the final brief.md + .provenance.md files.

    Reads the verification report at ``args.verified`` and writes two
    markdown files into ``args.output_dir`` (created if missing):

    * ``.provenance.md`` lists every source with its length, claim count
      and verification state, followed by a methodology section.
    * ``brief.md`` is a brief template whose sections the agent fills in.

    Also reads ``args.question`` and ``args.today``. Both files are
    written as UTF-8 because the generated text contains non-ASCII
    symbols that the platform default encoding may not represent.
    """
    verified = load_json(args.verified)
    output_dir = ensure_dir(args.output_dir)

    slug = slugify(args.question)
    sources = verified.get("sources", [])

    # ── .provenance.md ───────────────────────────────────────────────────
    prov_lines = [
        f"# Provenance: {args.question}",
        "",
        f"**Research date:** {args.today}",
        f"**Slug:** {slug}",
        f"**Sources consulted:** {len(sources)}",
        "",
        "## Sources",
        "",
    ]
    for i, src in enumerate(sources, 1):
        prov_lines.append(f"{i}. **{src.get('url', 'unknown')}**")
        prov_lines.append(f" - Length: {src.get('length_chars', 0)} chars")
        prov_lines.append(f" - Claims extracted: {src.get('claims_extracted', 0)}")
        if src.get("verified"):
            prov_lines.append(f" - ✓ Verified at: {src.get('verified_at', 'N/A')}")
        else:
            prov_lines.append(" - ○ Not yet verified")
        prov_lines.append("")

    prov_lines.extend([
        "",
        "## Methodology",
        "",
        "- **Stage 1 (Decompose):** Research question broken into focused sub-questions",
        "- **Stage 2 (Gather):** Parallel search via DuckDuckGo MCP (free, no API key)",
        "- **Stage 3 (Read):** Top URLs extracted via DuckDuckGo MCP fetch_content + browser_use",
        "- **Stage 4 (Verify):** Claims cross-referenced across independent sources",
        "- **Stage 5 (Synthesize):** This brief generated with full provenance tracking",
        "",
        f"*Generated by Omnibot Deep Research pipeline on {args.today}*",
    ])

    provenance_path = os.path.join(output_dir, ".provenance.md")
    with open(provenance_path, "w", encoding="utf-8") as f:
        f.write("\n".join(prov_lines))
    print(f"Provenance sidecar → {provenance_path}")

    # ── brief.md ─────────────────────────────────────────────────────────
    brief_lines = [
        f"# Research Brief: {args.question}",
        "",
        f"**Date:** {args.today} **Sources:** {len(sources)}",
        "",
        "## Executive Summary",
        "",
        "*Agent: populate executive summary from verified claims here.*",
        "",
        "## Key Findings",
        "",
        "*Agent: organize findings by sub-question, each with [source: url] citations.*",
        "",
        "## Contradictions & Open Questions",
        "",
        "*Agent: list any conflicting claims across sources here.*",
        "",
        "## Gaps",
        "",
        "*Agent: note what wasn't found or needs more investigation.*",
        "",
        "## Follow-up Questions",
        "",
        "*Agent: suggest 2-3 questions the user could explore next.*",
        "",
        "---",
        f"*Generated by Omnibot Deep Research pipeline on {args.today}*",
        "*See [.provenance.md](.provenance.md) for full source list*",
    ]

    brief_path = os.path.join(output_dir, "brief.md")
    with open(brief_path, "w", encoding="utf-8") as f:
        f.write("\n".join(brief_lines))
    print(f"Cited brief → {brief_path}")

    print(f"\nDone. Output in {output_dir}/")
    print(" brief.md — the cited research brief (agent to fill findings)")
    print(" .provenance.md — full source tracking")
360
+
361
+
362
+ # ── CLI ──────────────────────────────────────────────────────────────────
363
+
364
def main():
    """Command-line entry point: parse the subcommand and run its handler.

    Subcommands: ``decompose``, ``merge``, ``clean``, ``verify``,
    ``synthesize``, ``dag``, ``ddg_search`` and ``ddg_fetch``. A subcommand
    is required.
    """
    parser = argparse.ArgumentParser(description="Deep Research Pipeline")
    sub = parser.add_subparsers(dest="command", required=True)

    # decompose
    p = sub.add_parser("decompose", help="Create research plan template")
    p.add_argument("--question", required=True)
    p.add_argument("--output", required=True)

    # merge
    p = sub.add_parser("merge", help="Merge & deduplicate search results")
    p.add_argument("--input-dir", required=True)
    p.add_argument("--output", required=True)

    # clean
    p = sub.add_parser("clean", help="Normalize extracted text")
    # cmd_clean reads args.input_dir, so the value must be stored under
    # that name. --input stays the primary spelling; --input-dir is
    # accepted as an alias, matching the merge subcommand.
    p.add_argument("--input", "--input-dir", dest="input_dir", required=True)
    p.add_argument("--output", required=True)

    # verify
    p = sub.add_parser("verify", help="Prepare verification skeleton")
    p.add_argument("--input", required=True)
    p.add_argument("--output", required=True)

    # synthesize
    p = sub.add_parser("synthesize", help="Generate brief + provenance")
    p.add_argument("--verified", required=True)
    p.add_argument("--question", required=True)
    p.add_argument("--output-dir", required=True)
    p.add_argument("--today", default=datetime.now().strftime("%Y-%m-%d"))

    # dag — Directed Acyclic Graph orchestration
    p = sub.add_parser("dag", help="Execute sub-tasks with dependency graph")
    p.add_argument("--plan", required=True, help="JSON file with tasks + dependencies")
    p.add_argument("--output", default=None, help="Output execution report path")
    p.add_argument("--task-timeout", type=int, default=120, help="Per-task timeout in seconds")
    p.add_argument("--verbose", action="store_true", help="Print progress")

    # ddg_search — DuckDuckGo MCP (free, no API key)
    p = sub.add_parser("ddg_search", help="Search via DuckDuckGo MCP (free)")
    p.add_argument("--query", required=True)
    p.add_argument("--max-results", type=int, default=10)
    p.add_argument("--region", default="wt-wt")
    p.add_argument("--output", required=True)

    # ddg_fetch — fetch page content via DuckDuckGo MCP
    p = sub.add_parser("ddg_fetch", help="Fetch page content via DuckDuckGo MCP")
    p.add_argument("--url", required=True)
    p.add_argument("--max-length", type=int, default=8000)
    p.add_argument("--output-dir", required=True)

    args = parser.parse_args()

    # Map each subcommand name to its handler function.
    commands = {
        "decompose": cmd_decompose,
        "merge": cmd_merge,
        "clean": cmd_clean,
        "verify": cmd_verify,
        "synthesize": cmd_synthesize,
        "ddg_search": cmd_ddg_search,
        "ddg_fetch": cmd_ddg_fetch,
        "dag": cmd_dag,
    }
    commands[args.command](args)


if __name__ == "__main__":
    main()