alita-sdk 0.3.176__py3-none-any.whl → 0.3.177__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. alita_sdk/community/__init__.py +7 -17
  2. alita_sdk/tools/carrier/api_wrapper.py +6 -0
  3. alita_sdk/tools/carrier/backend_tests_tool.py +308 -7
  4. alita_sdk/tools/carrier/carrier_sdk.py +18 -0
  5. alita_sdk/tools/carrier/tools.py +2 -1
  6. {alita_sdk-0.3.176.dist-info → alita_sdk-0.3.177.dist-info}/METADATA +1 -2
  7. {alita_sdk-0.3.176.dist-info → alita_sdk-0.3.177.dist-info}/RECORD +10 -41
  8. alita_sdk/community/browseruse/__init__.py +0 -73
  9. alita_sdk/community/browseruse/api_wrapper.py +0 -288
  10. alita_sdk/community/deep_researcher/__init__.py +0 -70
  11. alita_sdk/community/deep_researcher/agents/__init__.py +0 -1
  12. alita_sdk/community/deep_researcher/agents/baseclass.py +0 -182
  13. alita_sdk/community/deep_researcher/agents/knowledge_gap_agent.py +0 -74
  14. alita_sdk/community/deep_researcher/agents/long_writer_agent.py +0 -251
  15. alita_sdk/community/deep_researcher/agents/planner_agent.py +0 -124
  16. alita_sdk/community/deep_researcher/agents/proofreader_agent.py +0 -80
  17. alita_sdk/community/deep_researcher/agents/thinking_agent.py +0 -64
  18. alita_sdk/community/deep_researcher/agents/tool_agents/__init__.py +0 -20
  19. alita_sdk/community/deep_researcher/agents/tool_agents/crawl_agent.py +0 -87
  20. alita_sdk/community/deep_researcher/agents/tool_agents/search_agent.py +0 -96
  21. alita_sdk/community/deep_researcher/agents/tool_selector_agent.py +0 -83
  22. alita_sdk/community/deep_researcher/agents/utils/__init__.py +0 -0
  23. alita_sdk/community/deep_researcher/agents/utils/parse_output.py +0 -148
  24. alita_sdk/community/deep_researcher/agents/writer_agent.py +0 -63
  25. alita_sdk/community/deep_researcher/api_wrapper.py +0 -116
  26. alita_sdk/community/deep_researcher/deep_research.py +0 -185
  27. alita_sdk/community/deep_researcher/examples/deep_example.py +0 -30
  28. alita_sdk/community/deep_researcher/examples/iterative_example.py +0 -34
  29. alita_sdk/community/deep_researcher/examples/report_plan_example.py +0 -27
  30. alita_sdk/community/deep_researcher/iterative_research.py +0 -419
  31. alita_sdk/community/deep_researcher/llm_config.py +0 -87
  32. alita_sdk/community/deep_researcher/main.py +0 -67
  33. alita_sdk/community/deep_researcher/tools/__init__.py +0 -2
  34. alita_sdk/community/deep_researcher/tools/crawl_website.py +0 -109
  35. alita_sdk/community/deep_researcher/tools/web_search.py +0 -294
  36. alita_sdk/community/deep_researcher/utils/__init__.py +0 -0
  37. alita_sdk/community/deep_researcher/utils/md_to_pdf.py +0 -8
  38. alita_sdk/community/deep_researcher/utils/os.py +0 -21
  39. {alita_sdk-0.3.176.dist-info → alita_sdk-0.3.177.dist-info}/WHEEL +0 -0
  40. {alita_sdk-0.3.176.dist-info → alita_sdk-0.3.177.dist-info}/licenses/LICENSE +0 -0
  41. {alita_sdk-0.3.176.dist-info → alita_sdk-0.3.177.dist-info}/top_level.txt +0 -0
@@ -1,419 +0,0 @@
1
- from __future__ import annotations
2
- import asyncio
3
- import time
4
- from typing import Dict, List, Optional, Any
5
- from agents import custom_span, gen_trace_id, trace
6
- from .agents.baseclass import ResearchRunner
7
- from .agents.writer_agent import init_writer_agent
8
- from .agents.knowledge_gap_agent import KnowledgeGapOutput, init_knowledge_gap_agent
9
- from .agents.tool_selector_agent import AgentTask, AgentSelectionPlan, init_tool_selector_agent
10
- from .agents.thinking_agent import init_thinking_agent
11
- from .agents.tool_agents import init_tool_agents, ToolAgentOutput
12
- from pydantic import BaseModel, Field
13
- from .llm_config import LLMConfig, create_default_config
14
-
15
-
16
class IterationData(BaseModel):
    """Data for a single iteration of the research loop.

    Every field defaults to an empty value, so a blank record can be created
    at the start of an iteration and filled in step by step.
    """
    # A single knowledge gap is stored as text, so the empty default must be a
    # string (the previous ``default_factory=list`` produced ``[]`` for a ``str`` field).
    gap: str = Field(description="The gap addressed in the iteration", default="")
    tool_calls: List[str] = Field(description="The tool calls made", default_factory=list)
    findings: List[str] = Field(description="The findings collected from tool calls", default_factory=list)
    # The thought is written and read as one string by Conversation
    # (set_latest_thought / get_latest_thought), so it is typed as ``str``.
    thought: str = Field(description="The thinking done to reflect on the success of the iteration and next steps", default="")
22
-
23
-
24
class Conversation(BaseModel):
    """Running record of the iterative researcher's work, one entry per loop iteration."""
    history: List[IterationData] = Field(description="The data for each iteration of the research loop", default_factory=list)

    def add_iteration(self, iteration_data: Optional[IterationData] = None):
        """Append an iteration record; a blank one is created when none is supplied."""
        record = IterationData() if iteration_data is None else iteration_data
        self.history.append(record)

    # --- setters for the most recent iteration -------------------------------

    def set_latest_gap(self, gap: str):
        """Store the knowledge gap on the most recent iteration."""
        latest = self.history[-1]
        latest.gap = gap

    def set_latest_tool_calls(self, tool_calls: List[str]):
        """Store the tool-call descriptions on the most recent iteration."""
        latest = self.history[-1]
        latest.tool_calls = tool_calls

    def set_latest_findings(self, findings: List[str]):
        """Store the findings on the most recent iteration."""
        latest = self.history[-1]
        latest.findings = findings

    def set_latest_thought(self, thought: str):
        """Store the thought on the most recent iteration."""
        latest = self.history[-1]
        latest.thought = thought

    # --- getters for the most recent iteration -------------------------------

    def get_latest_gap(self) -> str:
        """Return the knowledge gap of the most recent iteration."""
        latest = self.history[-1]
        return latest.gap

    def get_latest_tool_calls(self) -> List[str]:
        """Return the tool calls of the most recent iteration."""
        latest = self.history[-1]
        return latest.tool_calls

    def get_latest_findings(self) -> List[str]:
        """Return the findings of the most recent iteration."""
        latest = self.history[-1]
        return latest.findings

    def get_latest_thought(self) -> str:
        """Return the thought of the most recent iteration."""
        latest = self.history[-1]
        return latest.thought

    def get_all_findings(self) -> List[str]:
        """Return the findings of every iteration, flattened in iteration order."""
        collected = []
        for record in self.history:
            collected.extend(record.findings)
        return collected

    def compile_conversation_history(self) -> str:
        """Render the whole history as one string.

        Each iteration starts with an ``[ITERATION n]`` header, followed by the
        thought, task, action and findings sections that have content, in that
        order, each terminated by a blank line.
        """
        chunks = []
        for index, record in enumerate(self.history):
            chunks.append(f"[ITERATION {index + 1}]\n\n")
            sections = (
                (record.thought, self.get_thought_string),
                (record.gap, self.get_task_string),
                (record.tool_calls, self.get_action_string),
                (record.findings, self.get_findings_string),
            )
            for content, render in sections:
                if content:
                    chunks.append(f"{render(index)}\n\n")
        return "".join(chunks)

    # --- per-iteration section renderers -------------------------------------

    def get_task_string(self, iteration_num: int) -> str:
        """Return the ``<task>`` section for the given iteration, or "" when it has no gap."""
        gap = self.history[iteration_num].gap
        if not gap:
            return ""
        return f"<task>\nAddress this knowledge gap: {gap}\n</task>"

    def get_action_string(self, iteration_num: int) -> str:
        """Return the ``<action>`` section for the given iteration, or "" when it has no tool calls."""
        calls = self.history[iteration_num].tool_calls
        if not calls:
            return ""
        listing = '\n'.join(calls)
        return f"<action>\nCalling the following tools to address the knowledge gap:\n{listing}\n</action>"

    def get_findings_string(self, iteration_num: int) -> str:
        """Return the ``<findings>`` section for the given iteration, or "" when it has no findings."""
        findings = self.history[iteration_num].findings
        if not findings:
            return ""
        body = '\n\n'.join(findings)
        return f"<findings>\n{body}\n</findings>"

    def get_thought_string(self, iteration_num: int) -> str:
        """Return the ``<thought>`` section for the given iteration, or "" when it has no thought."""
        thought = self.history[iteration_num].thought
        if not thought:
            return ""
        return f"<thought>\n{thought}\n</thought>"

    # --- section renderers for the most recent iteration ---------------------

    def latest_task_string(self) -> str:
        """Return the task section of the most recent iteration."""
        last_index = len(self.history) - 1
        return self.get_task_string(last_index)

    def latest_action_string(self) -> str:
        """Return the action section of the most recent iteration."""
        last_index = len(self.history) - 1
        return self.get_action_string(last_index)

    def latest_findings_string(self) -> str:
        """Return the findings section of the most recent iteration."""
        last_index = len(self.history) - 1
        return self.get_findings_string(last_index)

    def latest_thought_string(self) -> str:
        """Return the thought section of the most recent iteration."""
        last_index = len(self.history) - 1
        return self.get_thought_string(last_index)
120
-
121
-
122
class IterativeResearcher:
    """Manager for the iterative research workflow that conducts research on a topic or subtopic by running a continuous research loop.

    Each iteration asks the thinking agent for observations, asks the
    knowledge-gap agent whether research is complete, and otherwise has the
    tool-selector agent plan tool-agent tasks, which are then run concurrently.
    When the loop ends, the writer agent turns the collected findings into the
    final report.
    """

    def __init__(
        self,
        max_iterations: int = 5,
        max_time_minutes: int = 10,
        verbose: bool = True,
        tracing: bool = False,
        config: Optional[LLMConfig] = None,
        llm: Optional[Any] = None,
        alita: Optional[Any] = None
    ):
        """Set up loop limits, conversation state and all agents.

        Args:
            max_iterations: Upper bound on research loop iterations.
            max_time_minutes: Upper bound on loop wall-clock time, in minutes.
            verbose: When True, progress messages are printed.
            tracing: When True, ``run`` wraps the workflow in a trace.
            config: Ready-made LLM configuration; used only when ``llm`` is None.
            llm: Langchain LLM used to build a default configuration; takes
                precedence over ``config``.
            alita: Stored on the instance; not used by this class itself.

        Raises:
            TypeError: If neither ``llm`` nor ``config`` is provided.
        """
        self.max_iterations: int = max_iterations
        self.max_time_minutes: int = max_time_minutes
        self.start_time: Optional[float] = None  # set when run() starts
        self.iteration: int = 0
        self.conversation: Conversation = Conversation()
        self.should_continue: bool = True
        self.verbose: bool = verbose
        self.tracing: bool = tracing
        self.alita = alita

        # Initialize config with langchain LLM if provided
        if llm is not None:
            self.config = create_default_config(langchain_llm=llm)
        elif config is not None:
            self.config = config
        else:
            # create_default_config() has a required langchain_llm parameter, so
            # calling it without one always failed with an opaque TypeError.
            # Keep the exception type but say what is actually missing.
            raise TypeError(
                "IterativeResearcher requires either `llm` or `config`: "
                "a default config cannot be created without a Langchain LLM"
            )

        # Initialize all the agents
        self.knowledge_gap_agent = init_knowledge_gap_agent(self.config)
        self.tool_selector_agent = init_tool_selector_agent(self.config)
        self.thinking_agent = init_thinking_agent(self.config)
        self.writer_agent = init_writer_agent(self.config)
        self.tool_agents = init_tool_agents(self.config)

    async def run(
        self,
        query: str,
        output_length: str = "",  # A text description of the desired output length, can be left blank
        output_instructions: str = "",  # Instructions for the final report (e.g. don't include any headings, just a couple of paragraphs of text)
        background_context: str = "",
    ) -> str:
        """Run the deep research workflow for a given query.

        Args:
            query: The research question.
            output_length: Free-text description of the desired report length.
            output_instructions: Extra instructions for the final report.
            background_context: Optional context included in the agent prompts.

        Returns:
            The final report produced by the writer agent.
        """
        self.start_time = time.time()

        workflow_trace = None
        if self.tracing:
            trace_id = gen_trace_id()
            workflow_trace = trace("iterative_researcher", trace_id=trace_id)
            print(f"View trace: https://platform.openai.com/traces/trace?trace_id={trace_id}")
            workflow_trace.start(mark_as_current=True)

        try:
            self._log_message("=== Starting Iterative Research Workflow ===")

            # Iterative research loop
            while self.should_continue and self._check_constraints():
                self.iteration += 1
                self._log_message(f"\n=== Starting Iteration {self.iteration} ===")

                # Set up blank IterationData for this iteration
                self.conversation.add_iteration()

                # 1. Generate observations (recorded on the conversation as a side effect)
                await self._generate_observations(query, background_context=background_context)

                # 2. Evaluate current gaps in the research
                evaluation: KnowledgeGapOutput = await self._evaluate_gaps(query, background_context=background_context)

                # Stop when research is complete. Also stop when the agent reports
                # it as incomplete but lists no gap: there is nothing to act on,
                # and indexing the empty list would raise IndexError.
                if evaluation.research_complete or not evaluation.outstanding_gaps:
                    self.should_continue = False
                    self._log_message("=== IterativeResearcher Marked As Complete - Finalizing Output ===")
                    continue

                next_gap = evaluation.outstanding_gaps[0]

                # 3. Select agents to address knowledge gap
                selection_plan: AgentSelectionPlan = await self._select_agents(next_gap, query, background_context=background_context)

                # 4. Run the selected agents to gather information (findings are
                # recorded on the conversation as a side effect)
                await self._execute_tools(selection_plan.tasks)

            # Create final report
            report = await self._create_final_report(query, length=output_length, instructions=output_instructions)

            elapsed_time = time.time() - self.start_time
            self._log_message(f"IterativeResearcher completed in {int(elapsed_time // 60)} minutes and {int(elapsed_time % 60)} seconds after {self.iteration} iterations.")

            return report
        finally:
            # Close the trace even when an agent call raises.
            if workflow_trace is not None:
                workflow_trace.finish(reset_current=True)

    def _check_constraints(self) -> bool:
        """Return False (and log why) once the iteration or time limit is reached, True otherwise."""
        if self.iteration >= self.max_iterations:
            self._log_message("\n=== Ending Research Loop ===")
            self._log_message(f"Reached maximum iterations ({self.max_iterations})")
            return False

        elapsed_minutes = (time.time() - self.start_time) / 60
        if elapsed_minutes >= self.max_time_minutes:
            self._log_message("\n=== Ending Research Loop ===")
            self._log_message(f"Reached maximum time ({self.max_time_minutes} minutes)")
            return False

        return True

    async def _evaluate_gaps(
        self,
        query: str,
        background_context: str = ""
    ) -> KnowledgeGapOutput:
        """Evaluate the current state of research and identify knowledge gaps.

        When research is not complete and at least one gap is reported, the
        first gap is recorded on the latest iteration and logged.
        """

        background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""

        input_str = f"""
        Current Iteration Number: {self.iteration}
        Time Elapsed: {(time.time() - self.start_time) / 60:.2f} minutes of maximum {self.max_time_minutes} minutes

        ORIGINAL QUERY:
        {query}

        {background}

        HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
        {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}
        """

        result = await ResearchRunner.run(
            self.knowledge_gap_agent,
            input_str,
        )

        evaluation = result.final_output_as(KnowledgeGapOutput)

        # Guard against an empty gap list so an incomplete-but-gapless answer
        # does not raise IndexError.
        if not evaluation.research_complete and evaluation.outstanding_gaps:
            next_gap = evaluation.outstanding_gaps[0]
            self.conversation.set_latest_gap(next_gap)
            self._log_message(self.conversation.latest_task_string())

        return evaluation

    async def _select_agents(
        self,
        gap: str,
        query: str,
        background_context: str = ""
    ) -> AgentSelectionPlan:
        """Select agents to address the identified knowledge gap.

        The planned tasks are recorded on the latest iteration as tool-call
        descriptions and logged.
        """

        background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""

        input_str = f"""
        ORIGINAL QUERY:
        {query}

        KNOWLEDGE GAP TO ADDRESS:
        {gap}

        {background}

        HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
        {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}
        """

        result = await ResearchRunner.run(
            self.tool_selector_agent,
            input_str,
        )

        selection_plan = result.final_output_as(AgentSelectionPlan)

        # Add the tool calls to the conversation
        self.conversation.set_latest_tool_calls([
            f"[Agent] {task.agent} [Query] {task.query} [Entity] {task.entity_website if task.entity_website else 'null'}" for task in selection_plan.tasks
        ])
        self._log_message(self.conversation.latest_action_string())

        return selection_plan

    async def _execute_tools(self, tasks: List[AgentTask]) -> Dict[str, ToolAgentOutput]:
        """Execute the selected tools concurrently to gather information.

        Returns:
            Mapping of ``"<agent>_<gap>"`` to that task's output. Tasks sharing
            agent and gap share a key, so the mapping keeps only the last one to
            finish; the findings stored on the conversation include every task.
        """
        with custom_span("Execute Tool Agents"):
            # Create a coroutine for each agent task
            async_tasks = [self._run_agent_task(task) for task in tasks]

            # Run all tasks concurrently, handling each as it completes
            results: Dict[str, ToolAgentOutput] = {}
            findings: List[str] = []
            for num_completed, future in enumerate(asyncio.as_completed(async_tasks), start=1):
                gap, agent_name, result = await future
                results[f"{agent_name}_{gap}"] = result
                # Collect findings here rather than from results.values(), so
                # outputs of tasks with a colliding key are not dropped.
                findings.append(result.output)
                self._log_message(f"<processing>\nTool execution progress: {num_completed}/{len(async_tasks)}\n</processing>")

            # Add findings from the tool outputs to the conversation
            self.conversation.set_latest_findings(findings)

            return results

    async def _run_agent_task(self, task: AgentTask) -> tuple[str, str, ToolAgentOutput]:
        """Run a single agent task and return ``(gap, agent name, output)``.

        Never raises for ordinary errors: an unknown agent name or an exception
        during execution is reported as a ToolAgentOutput carrying a message, so
        one failing task does not abort the others.
        """
        try:
            agent_name = task.agent
            agent = self.tool_agents.get(agent_name)
            if agent:
                result = await ResearchRunner.run(
                    agent,
                    task.model_dump_json(),
                )
                # Extract ToolAgentOutput from RunResult
                output = result.final_output_as(ToolAgentOutput)
            else:
                output = ToolAgentOutput(
                    output=f"No implementation found for agent {agent_name}",
                    sources=[]
                )

            return task.gap, agent_name, output
        except Exception as e:
            # Deliberate best-effort: surface the error text as the finding.
            error_output = ToolAgentOutput(
                output=f"Error executing {task.agent} for gap '{task.gap}': {str(e)}",
                sources=[]
            )
            return task.gap, task.agent, error_output

    async def _generate_observations(self, query: str, background_context: str = "") -> str:
        """Generate observations from the current state of the research.

        The thinking agent's output is recorded as the latest iteration's
        thought, logged, and returned.
        """

        background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""

        input_str = f"""
        You are starting iteration {self.iteration} of your research process.

        ORIGINAL QUERY:
        {query}

        {background}

        HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
        {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}
        """
        result = await ResearchRunner.run(
            self.thinking_agent,
            input_str,
        )

        # Add the observations to the conversation
        observations = result.final_output
        self.conversation.set_latest_thought(observations)
        self._log_message(self.conversation.latest_thought_string())
        return observations

    async def _create_final_report(
        self,
        query: str,
        length: str = "",
        instructions: str = ""
    ) -> str:
        """Create the final response from all findings gathered so far.

        Args:
            query: The original research question.
            length: Optional free-text length guideline.
            instructions: Optional extra guideline for the writer agent.

        Returns:
            The writer agent's final output.
        """
        self._log_message("=== Drafting Final Response ===")

        length_str = f"* The full response should be approximately {length}.\n" if length else ""
        instructions_str = f"* {instructions}" if instructions else ""
        guidelines_str = ("\n\nGUIDELINES:\n" + length_str + instructions_str).strip('\n') if length or instructions else ""

        all_findings = '\n\n'.join(self.conversation.get_all_findings()) or "No findings available yet."

        input_str = f"""
        Provide a response based on the query and findings below with as much detail as possible. {guidelines_str}

        QUERY: {query}

        FINDINGS:
        {all_findings}
        """

        result = await ResearchRunner.run(
            self.writer_agent,
            input_str,
        )

        self._log_message("Final response from IterativeResearcher created successfully")

        return result.final_output

    def _log_message(self, message: str) -> None:
        """Log a message if verbose is True"""
        if self.verbose:
            print(message)
@@ -1,87 +0,0 @@
1
- from typing import Optional, Any
2
- from dotenv import load_dotenv
3
- from .utils.os import get_env_with_prefix
4
-
5
- load_dotenv(override=True)
6
-
7
- # Only keeping the necessary environment variable for search provider
8
- SEARCH_PROVIDER = get_env_with_prefix("SEARCH_PROVIDER", "serper")
9
-
10
class LLMConfig:
    """Configuration bundle: the search provider name plus three model adapters."""

    def __init__(
        self,
        search_provider: str,
        langchain_llm: Any,
    ):
        """Store the search provider and wrap ``langchain_llm`` for each model role.

        Args:
            search_provider: Name of the search provider to use.
            langchain_llm: Langchain LLM backing all three model roles.
        """
        self.search_provider = search_provider
        # Every role gets its own adapter instance around the same underlying LLM.
        self.reasoning_model, self.main_model, self.fast_model = (
            LangchainModelAdapter(langchain_llm) for _ in range(3)
        )
20
-
21
-
22
def create_default_config(langchain_llm: Any) -> LLMConfig:
    """Build an LLMConfig from a Langchain LLM and the module-level SEARCH_PROVIDER.

    Args:
        langchain_llm: Langchain LLM to wrap for all model roles.

    Returns:
        The new LLMConfig.
    """
    settings = {
        "search_provider": SEARCH_PROVIDER,
        "langchain_llm": langchain_llm,
    }
    return LLMConfig(**settings)
28
-
29
-
30
class LangchainModelAdapter:
    """Adapter exposing a Langchain LLM through the interface the DeepResearcher framework calls."""

    def __init__(self, langchain_llm):
        """Keep the wrapped LLM and attach a placeholder client.

        The placeholder only carries a ``_base_url`` attribute set to
        ``'langchain'``.
        """
        self.langchain_llm = langchain_llm
        dummy_client_cls = type('DummyClient', (), {'_base_url': 'langchain'})
        self._client = dummy_client_cls()

    async def agenerate_response(self, messages, **kwargs):
        """Send role/content message dicts to the LLM and return a chat-style response.

        Messages whose role is not ``system``, ``user`` or ``assistant`` are
        skipped. ``kwargs`` are accepted but not used.

        Returns:
            A dynamically created class (not an instance) whose ``choices``
            attribute is a one-element list; ``choices[0].message`` exposes
            ``content`` and ``role``.
        """
        from langchain_core.messages import HumanMessage, AIMessage, SystemMessage

        # Map each supported role to the Langchain message type that carries it
        wrappers = {
            'system': SystemMessage,
            'user': HumanMessage,
            'assistant': AIMessage,
        }
        lc_messages = []
        for entry in messages:
            wrapper = wrappers.get(entry.get('role', ''))
            text = entry.get('content', '')
            if wrapper is not None:
                lc_messages.append(wrapper(content=text))

        # Use langchain LLM to generate response
        response = await self.langchain_llm.ainvoke(lc_messages)

        # Assemble the nested Response -> Choice -> Message shape the callers read
        message_cls = type('Message', (), {
            'content': response.content,
            'role': 'assistant'
        })
        choice_cls = type('Choice', (), {'message': message_cls})
        return type('Response', (), {'choices': [choice_cls]})

    async def agenerate_text(self, prompt, **kwargs):
        """Send a single prompt to the LLM and return a completion-style response.

        ``kwargs`` are accepted but not used.

        Returns:
            A dynamically created class (not an instance) whose ``choices``
            attribute is a one-element list; ``choices[0].text`` is the reply.
        """
        from langchain_core.messages import HumanMessage

        request = [HumanMessage(content=prompt)]
        response = await self.langchain_llm.ainvoke(request)

        # Assemble the Response -> Choice shape the callers read
        choice_cls = type('Choice', (), {'text': response.content})
        return type('Response', (), {'choices': [choice_cls]})

    def supports_json_mode(self):
        """Report JSON-mode support; always True for this adapter."""
        # Most Langchain LLMs support structured output, so True is the default answer
        json_mode_supported = True
        return json_mode_supported
@@ -1,67 +0,0 @@
1
- import asyncio
2
- import argparse
3
- from .iterative_research import IterativeResearcher
4
- from .deep_research import DeepResearcher
5
- from typing import Literal
6
- from dotenv import load_dotenv
7
-
8
- load_dotenv(override=True)
9
-
10
-
11
async def main() -> None:
    """Parse command-line options, run the selected researcher and print its report.

    When ``--query`` is not given, the query is read interactively. The
    ``--model`` option selects DeepResearcher (``deep``) or IterativeResearcher
    (``simple``); the output length and instructions options are passed on only
    in the ``simple`` case.
    """
    cli_options = [
        (("--query",), dict(type=str, help="Research query")),
        (("--model",), dict(type=str, choices=["deep", "simple"],
                            help="Mode of research (deep or simple)", default="deep")),
        (("--max-iterations",), dict(type=int, default=5,
                                     help="Maximum number of iterations for deep research")),
        (("--max-time",), dict(type=int, default=10,
                               help="Maximum time in minutes for deep research")),
        (("--output-length",), dict(type=str, default="5 pages",
                                    help="Desired output length for the report")),
        (("--output-instructions",), dict(type=str, default="",
                                          help="Additional instructions for the report")),
        (("--verbose",), dict(action="store_true",
                              help="Print status updates to the console")),
        (("--tracing",), dict(action="store_true",
                              help="Enable tracing for the research (only valid for OpenAI models)")),
    ]
    parser = argparse.ArgumentParser(description="Deep Research Assistant")
    for flags, settings in cli_options:
        parser.add_argument(*flags, **settings)

    args = parser.parse_args()

    # Fall back to an interactive prompt when no query was given on the command line
    query = args.query or input("What would you like to research? ")

    print(f"Starting deep research on: {query}")
    print(f"Max iterations: {args.max_iterations}, Max time: {args.max_time} minutes")

    # Both researcher types take the same constructor options
    researcher_options = dict(
        max_iterations=args.max_iterations,
        max_time_minutes=args.max_time,
        verbose=args.verbose,
        tracing=args.tracing,
    )

    if args.model != "deep":
        manager = IterativeResearcher(**researcher_options)
        report = await manager.run(
            query,
            output_length=args.output_length,
            output_instructions=args.output_instructions,
        )
    else:
        manager = DeepResearcher(**researcher_options)
        report = await manager.run(query)

    print("\n=== Final Report ===")
    print(report)
60
-
61
- # Command line entry point
62
def cli_entry():
    """Synchronous command-line entry point: run the async ``main()`` to completion."""
    coroutine = main()
    asyncio.run(coroutine)
65
-
66
- if __name__ == "__main__":
67
- cli_entry()
@@ -1,2 +0,0 @@
1
- from .web_search import create_web_search_tool
2
- from .crawl_website import crawl_website
@@ -1,109 +0,0 @@
1
- from typing import List, Set, Union
2
- from urllib.parse import urlparse, urljoin
3
- from bs4 import BeautifulSoup
4
- import aiohttp
5
- from .web_search import scrape_urls, ssl_context, ScrapeResult, WebpageSnippet
6
- from agents import function_tool
7
-
8
-
9
@function_tool
async def crawl_website(starting_url: str) -> Union[List[ScrapeResult], str]:
    """Crawls the pages of a website starting with the starting_url and then descending into the pages linked from there.
    Prioritizes links found in headers/navigation, then body links, then subsequent pages.

    Args:
        starting_url: Starting URL to scrape

    Returns:
        List of ScrapeResult objects which have the following fields:
            - url: The URL of the web page
            - title: The title of the web page
            - description: The description of the web page
            - text: The text content of the web page
    """
    # NOTE(review): the docstring above is kept verbatim because the
    # function_tool decorator may expose it as the tool description - confirm
    # before rewording it.
    if not starting_url:
        return "Empty URL provided"

    # Ensure URL has a protocol
    if not starting_url.startswith(('http://', 'https://')):
        starting_url = 'http://' + starting_url

    max_pages = 10
    base_domain = urlparse(starting_url).netloc

    def extract_links(html: str, current_url: str) -> tuple[List[str], List[str]]:
        """Return same-domain (navigation/header links, remaining body links) found in the HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        nav_links = set()
        body_links = set()

        # Find navigation/header links
        for nav_element in soup.find_all(['nav', 'header']):
            for a in nav_element.find_all('a', href=True):
                link = urljoin(current_url, a['href'])
                if urlparse(link).netloc == base_domain:
                    nav_links.add(link)

        # Find remaining body links
        for a in soup.find_all('a', href=True):
            link = urljoin(current_url, a['href'])
            if urlparse(link).netloc == base_domain and link not in nav_links:
                body_links.add(link)

        return list(nav_links), list(body_links)

    async def fetch_page(session: aiohttp.ClientSession, url: str) -> str:
        """Return the page HTML, or "" when the request fails or the status is not 200."""
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
        except Exception as e:
            # Best-effort crawl: report the failure and carry on with other pages
            print(f"Error fetching {url}: {str(e)}")
        # An empty string is falsy, so the caller skips link extraction. The
        # previous truthy sentinel text was parsed as if it were HTML.
        return ""

    # Initialize with starting URL
    queue: List[str] = [starting_url]
    next_level_queue: List[str] = []
    # ordered_pages keeps discovery (priority) order for the final scrape;
    # seen is only for O(1) duplicate checks. Discovered links are compared
    # without a trailing slash, so register that form of the start URL as well.
    ordered_pages: List[str] = [starting_url]
    seen: Set[str] = {starting_url, starting_url.rstrip('/')}

    def enqueue(links: List[str], target_queue: List[str]) -> None:
        """Add unseen links to target_queue until the page budget is used up."""
        for link in links:
            if len(ordered_pages) >= max_pages:
                return
            link = link.rstrip('/')
            if link not in seen:
                seen.add(link)
                ordered_pages.append(link)
                target_queue.append(link)

    # One session (and connection pool) is shared by every fetch; the timeout
    # applies to each request.
    connector = aiohttp.TCPConnector(ssl=ssl_context)
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        # Breadth-first crawl
        while queue and len(ordered_pages) < max_pages:
            current_url = queue.pop(0)

            # Fetch and process the page
            html_content = await fetch_page(session, current_url)
            if html_content:
                nav_links, body_links = extract_links(html_content, current_url)

                # Unvisited nav links join the current queue (higher priority)
                enqueue(nav_links, queue)

                # Unvisited body links wait in the next level queue (lower priority)
                enqueue(body_links, next_level_queue)

            # If current queue is empty, move on to the next level links
            if not queue:
                queue, next_level_queue = next_level_queue, []

    # Use scrape_urls to get the content for all discovered pages, in discovery order
    pages_to_scrape = [WebpageSnippet(url=page, title="", description="") for page in ordered_pages]
    result = await scrape_urls(pages_to_scrape)
    return result