camel-ai 0.2.15a0__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +18 -4
- camel/agents/multi_hop_generator_agent.py +85 -0
- camel/agents/programmed_agent_instruction.py +148 -0
- camel/benchmarks/__init__.py +13 -1
- camel/benchmarks/apibank.py +565 -0
- camel/benchmarks/apibench.py +500 -0
- camel/benchmarks/gaia.py +4 -4
- camel/benchmarks/nexus.py +518 -0
- camel/benchmarks/ragbench.py +333 -0
- camel/bots/__init__.py +1 -1
- camel/bots/discord/__init__.py +26 -0
- camel/bots/discord/discord_app.py +384 -0
- camel/bots/discord/discord_installation.py +64 -0
- camel/bots/discord/discord_store.py +160 -0
- camel/configs/__init__.py +3 -0
- camel/configs/anthropic_config.py +17 -15
- camel/configs/internlm_config.py +60 -0
- camel/data_collector/base.py +5 -5
- camel/data_collector/sharegpt_collector.py +2 -2
- camel/datagen/__init__.py +6 -2
- camel/datagen/{o1datagen.py → cotdatagen.py} +19 -6
- camel/datagen/self_instruct/__init__.py +36 -0
- camel/datagen/self_instruct/filter/__init__.py +34 -0
- camel/datagen/self_instruct/filter/filter_function.py +216 -0
- camel/datagen/self_instruct/filter/filter_registry.py +56 -0
- camel/datagen/self_instruct/filter/instruction_filter.py +81 -0
- camel/datagen/self_instruct/self_instruct.py +393 -0
- camel/datagen/self_instruct/templates.py +382 -0
- camel/datahubs/huggingface.py +12 -2
- camel/datahubs/models.py +2 -3
- camel/embeddings/mistral_embedding.py +5 -1
- camel/embeddings/openai_compatible_embedding.py +6 -1
- camel/embeddings/openai_embedding.py +5 -1
- camel/interpreters/e2b_interpreter.py +5 -1
- camel/loaders/__init__.py +2 -0
- camel/loaders/apify_reader.py +5 -1
- camel/loaders/chunkr_reader.py +5 -1
- camel/loaders/firecrawl_reader.py +0 -30
- camel/loaders/panda_reader.py +337 -0
- camel/logger.py +11 -5
- camel/messages/__init__.py +10 -4
- camel/messages/conversion/conversation_models.py +5 -0
- camel/messages/func_message.py +30 -22
- camel/models/__init__.py +2 -0
- camel/models/anthropic_model.py +6 -23
- camel/models/azure_openai_model.py +1 -2
- camel/models/cohere_model.py +13 -1
- camel/models/deepseek_model.py +5 -1
- camel/models/gemini_model.py +15 -2
- camel/models/groq_model.py +5 -1
- camel/models/internlm_model.py +143 -0
- camel/models/mistral_model.py +19 -8
- camel/models/model_factory.py +3 -0
- camel/models/nemotron_model.py +5 -1
- camel/models/nvidia_model.py +5 -1
- camel/models/openai_model.py +5 -1
- camel/models/qwen_model.py +5 -1
- camel/models/reka_model.py +5 -1
- camel/models/reward/__init__.py +2 -0
- camel/models/reward/nemotron_model.py +5 -1
- camel/models/reward/skywork_model.py +88 -0
- camel/models/samba_model.py +5 -1
- camel/models/togetherai_model.py +5 -1
- camel/models/yi_model.py +5 -1
- camel/models/zhipuai_model.py +5 -1
- camel/schemas/openai_converter.py +5 -1
- camel/storages/graph_storages/nebula_graph.py +89 -20
- camel/storages/graph_storages/neo4j_graph.py +138 -0
- camel/synthetic_datagen/source2synth/data_processor.py +373 -0
- camel/synthetic_datagen/source2synth/models.py +68 -0
- camel/synthetic_datagen/source2synth/user_data_processor_config.py +73 -0
- camel/toolkits/__init__.py +4 -0
- camel/toolkits/arxiv_toolkit.py +20 -3
- camel/toolkits/dappier_toolkit.py +196 -0
- camel/toolkits/function_tool.py +61 -61
- camel/toolkits/google_scholar_toolkit.py +9 -0
- camel/toolkits/meshy_toolkit.py +5 -1
- camel/toolkits/notion_toolkit.py +1 -1
- camel/toolkits/openbb_toolkit.py +869 -0
- camel/toolkits/search_toolkit.py +91 -5
- camel/toolkits/stripe_toolkit.py +5 -1
- camel/toolkits/twitter_toolkit.py +24 -16
- camel/types/__init__.py +4 -2
- camel/types/enums.py +34 -1
- camel/types/openai_types.py +6 -4
- camel/types/unified_model_type.py +5 -0
- camel/utils/__init__.py +2 -0
- camel/utils/commons.py +104 -19
- camel/utils/token_counting.py +3 -3
- {camel_ai-0.2.15a0.dist-info → camel_ai-0.2.17.dist-info}/METADATA +160 -177
- {camel_ai-0.2.15a0.dist-info → camel_ai-0.2.17.dist-info}/RECORD +94 -69
- {camel_ai-0.2.15a0.dist-info → camel_ai-0.2.17.dist-info}/WHEEL +1 -1
- camel/bots/discord_app.py +0 -138
- {camel_ai-0.2.15a0.dist-info → camel_ai-0.2.17.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
|
|
15
|
+
import random
|
|
16
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
|
|
21
|
+
from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
|
|
22
|
+
from camel.logger import get_logger
|
|
23
|
+
from camel.synthetic_datagen.source2synth.user_data_processor_config import (
|
|
24
|
+
ProcessorConfig,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
logger = get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class UserDataProcessor:
    r"""Pipeline that turns raw user text into a curated QA dataset.

    Orchestrates three stages: wrapping raw text into records, example
    construction (``ExampleConstructor``) and dataset curation
    (``DataCurator``).

    Args:
        config (Optional[ProcessorConfig]): Processing configuration. A
            default-constructed ``ProcessorConfig`` is used when omitted.
    """

    def __init__(self, config: Optional[ProcessorConfig] = None):
        self.config = config or ProcessorConfig()
        # Seed both RNGs so sampling in DataCurator is reproducible.
        random.seed(self.config.seed)
        np.random.seed(self.config.seed)
        # The agent is optional; without it no QA pairs are generated.
        self.multi_hop_agent = (
            MultiHopGeneratorAgent() if self.config.use_ai_model else None
        )

    def process_text(
        self, text: str, source: str = "user_input"
    ) -> List[Dict[str, Any]]:
        r"""Process a single text into a curated dataset.

        Args:
            text (str): The raw input text.
            source (str): Source label recorded in each example's metadata.
                (default: :obj:`"user_input"`)

        Returns:
            List[Dict[str, Any]]: The curated dataset built from this text.
        """
        # A single text is a batch of one; reuse the batch pipeline instead
        # of duplicating the construct/curate steps (previously the two
        # methods were copy-pasted duplicates).
        return self.process_batch([text], [source])

    def process_batch(
        self, texts: List[str], sources: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        r"""Process multiple texts in batch.

        Args:
            texts (List[str]): The raw input texts.
            sources (Optional[List[str]]): Per-text source labels; must match
                ``texts`` in length. Defaults to ``"user_input"`` for all.

        Returns:
            List[Dict[str, Any]]: The curated dataset built from the batch.

        Raises:
            ValueError: If ``sources`` is given and its length differs from
                that of ``texts``.
        """
        if sources is None:
            sources = ["user_input"] * len(texts)
        elif len(sources) != len(texts):
            raise ValueError("Length of sources must match length of texts")

        # Convert texts to the standard raw-record format
        raw_data = [
            {
                'text': text,
                'source': source,
            }
            for text, source in zip(texts, sources)
        ]

        # Construct examples
        constructor = ExampleConstructor(self.config, self.multi_hop_agent)
        examples = constructor.construct_examples(raw_data)

        # Curate (filter, deduplicate, sample) the dataset
        curator = DataCurator(self.config)
        return curator.curate_dataset(examples)
+
class ExampleConstructor:
    r"""Constructs training examples (text plus multi-hop QA pairs).

    Args:
        config (ProcessorConfig): Processing configuration providing the
            ``min_length``/``max_length`` bounds for input text.
        multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent used to
            generate multi-hop QA pairs. When ``None``, no QA pairs are
            generated. (default: :obj:`None`)
    """

    def __init__(
        self,
        config: ProcessorConfig,
        multi_hop_agent: Optional[MultiHopGeneratorAgent] = None,
    ):
        self.config = config
        self.multi_hop_agent = multi_hop_agent

    def construct_examples(
        self, raw_data: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Construct training examples from raw input records.

        Args:
            raw_data (List[Dict[str, Any]]): Records with a ``'text'`` key
                and optional ``'source'``/``'timestamp'`` keys.

        Returns:
            List[Dict[str, Any]]: One example per record that survives
                preprocessing, each with ``text``, ``qa_pairs`` and
                ``metadata`` keys.
        """
        logger.info("Starting to construct training examples...")
        examples = []

        for data in tqdm(raw_data, desc="Constructing examples"):
            # 1. Text preprocessing (returns '' for rejected input)
            processed_text = self._preprocess_text(data.get('text', ''))
            if not processed_text:
                continue

            # 2. Generate key information pairs
            info_pairs = self._extract_info_pairs(processed_text)

            # 3. Construct question-answer pairs
            qa_pairs = self._generate_qa_pairs(info_pairs)

            # 4. Add metadata
            example = {
                'text': processed_text,
                'qa_pairs': qa_pairs,
                'metadata': {
                    'source': data.get('source', 'unknown'),
                    'timestamp': data.get('timestamp', ''),
                    'complexity': self._calculate_complexity(qa_pairs),
                },
            }

            examples.append(example)

        logger.info(f"Successfully constructed {len(examples)} examples")
        return examples

    def _preprocess_text(self, text: str) -> str:
        r"""Clean and validate input text.

        Returns:
            str: The stripped text, or ``''`` when the input is not a
                string, falls outside the configured length bounds, or
                fails the quality check.
        """
        if not isinstance(text, str):
            return ''

        # 1. Basic cleaning
        text = text.strip()

        # 2. Length check
        if (
            len(text) < self.config.min_length
            or len(text) > self.config.max_length
        ):
            return ''

        # 3. Quality check
        if not self._check_text_quality(text):
            return ''

        return text

    def _check_text_quality(self, text: str) -> bool:
        r"""Heuristic text-quality gate.

        Returns:
            bool: ``False`` when the text has fewer than two sentence
                terminators or more than 30% special characters.
        """
        # 1. Basic quality check: must have at least 2 sentences
        if text.count('.') < 2:
            return False

        # 2. Special character ratio check. Division is safe: the sentence
        # check above guarantees the text is non-empty here.
        special_char_ratio = len(
            [c for c in text if not c.isalnum() and not c.isspace()]
        ) / len(text)
        if special_char_ratio > 0.3:  # No more than 30% special characters
            return False

        return True

    def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
        r"""Extract premise/intermediate/conclusion sentence triples.

        Splits the text on ``'.'`` and slides over consecutive sentence
        triples whose first two members are longer than 10 characters.

        Returns:
            List[Dict[str, Sequence[str]]]: Triples plus up to two
                additional related-context sentences each.
        """
        # Split into sentences
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        info_pairs = []

        # Extract combinations of consecutive related sentences
        for i in range(len(sentences) - 2):
            if len(sentences[i]) > 10 and len(sentences[i + 1]) > 10:
                info_pairs.append(
                    {
                        'premise': sentences[i],
                        'intermediate': sentences[i + 1],
                        # i + 2 is always in range because i stops at
                        # len(sentences) - 3; the previous bounds check
                        # here was dead code.
                        'conclusion': sentences[i + 2],
                        'related_contexts': [
                            s
                            for j, s in enumerate(sentences)
                            if j != i and j != i + 1 and len(s) > 10
                        ][:2],
                        # Limit to 2 additional related contexts
                    }
                )

        return info_pairs

    def _generate_qa_pairs(
        self, info_pairs: List[Dict[str, Sequence[str]]]
    ) -> List[Dict[str, str]]:
        r"""Generate multi-hop question-answer pairs via the agent.

        Returns:
            List[Dict[str, str]]: One QA dict per info pair the agent
                answered; empty when no agent is configured.
        """
        qa_pairs = []

        for pair in info_pairs:
            # Generate a multi-hop QA pair using the AI agent, if present
            if self.multi_hop_agent:
                # Construct full context
                context = (
                    f"{pair['premise']}. {pair['intermediate']}."
                    f" {pair['conclusion']}"
                )
                response = self.multi_hop_agent.generate_multi_hop_qa(context)
                if response:
                    # NOTE(review): ``.dict()`` is the pydantic v1 API;
                    # presumably ``response.value`` is a pydantic model —
                    # consider ``model_dump()`` once confirmed.
                    qa_pairs.append(response.value.dict())
                    continue

        return qa_pairs

    def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
        r"""Compute the average complexity score of QA pairs.

        The score is a weighted blend of reasoning-step count, supporting
        fact count, question length and answer length, each factor capped
        at 1.0.

        Returns:
            float: Average complexity in [0.0, 1.0]; 0.0 for an empty list.
        """
        if not qa_pairs:
            return 0.0

        # Calculate complexity based on multiple factors
        complexities = []
        for qa in qa_pairs:
            # 1. Number of reasoning steps
            reasoning_steps_count = len(qa.get('reasoning_steps', []))

            # 2. Number of supporting facts
            supporting_facts_count = len(qa.get('supporting_facts', []))

            # 3/4. Question and answer lengths in words. Use .get so a
            # malformed QA dict cannot raise KeyError (the fields above
            # are already read defensively).
            question_length = len(qa.get('question', '').split())
            answer_length = len(qa.get('answer', '').split())

            # Weighted, capped blend of the four factors
            qa_complexity = (
                min(reasoning_steps_count / 3, 1.0)
                * 0.4  # Weight for reasoning steps
                + min(supporting_facts_count / 3, 1.0)
                * 0.3  # Weight for supporting facts
                + min(question_length / 20, 1.0)
                * 0.15  # Weight for question length
                + min(answer_length / 50, 1.0) * 0.15
                # Weight for answer length
            )

            complexities.append(qa_complexity)

        return sum(complexities) / len(complexities)
+
class DataCurator:
    r"""Filters, deduplicates and samples constructed examples.

    Args:
        config (ProcessorConfig): Provides ``complexity_threshold`` and
            ``dataset_size``.
    """

    def __init__(self, config: ProcessorConfig):
        self.config = config

    def curate_dataset(
        self, examples: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Run the full curation pipeline.

        Stages: quality filter -> complexity filter -> deduplication ->
        random sampling down to ``config.dataset_size``.

        Args:
            examples (List[Dict[str, Any]]): Constructed examples with
                ``text``, ``qa_pairs`` and ``metadata`` keys.

        Returns:
            List[Dict[str, Any]]: The curated dataset.
        """
        logger.info("Starting dataset management...")

        # 1. Quality filtering
        quality_filtered = self._quality_filter(examples)
        logger.info(
            f"Remaining examples after quality filtering:"
            f" {len(quality_filtered)}"
        )

        # 2. Complexity filtering
        complexity_filtered = self._complexity_filter(quality_filtered)
        logger.info(
            f"Remaining examples after complexity filtering:"
            f" {len(complexity_filtered)}"
        )

        # 3. Deduplication
        deduplicated = self._remove_duplicates(complexity_filtered)
        logger.info(
            f"Remaining examples after deduplication: {len(deduplicated)}"
        )

        # 4. Sample to target size
        final_dataset = self._sample_dataset(deduplicated)
        logger.info(f"Final dataset size: {len(final_dataset)}")

        return final_dataset

    def _quality_filter(
        self, examples: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Keep examples whose QA pairs and text pass quality checks.

        Returns:
            List[Dict[str, Any]]: Examples with valid QA pairs and at
                least 20 words of text.
        """
        filtered = []

        for example in examples:
            # 1. Check QA pair quality
            qa_quality = self._check_qa_quality(example.get('qa_pairs', []))

            # 2. Check text quality (at least 20 words)
            text_quality = (
                len(example.get('text', '').split()) >= 20
            )

            if qa_quality and text_quality:
                filtered.append(example)

        return filtered

    def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
        r"""Check quality of QA pairs.

        Returns:
            bool: ``False`` when the list is empty, any question is
                shorter than 10 characters, any answer is shorter than 5
                characters, or a question equals its answer.
        """
        if not qa_pairs:
            return False

        for qa in qa_pairs:
            # 1. Length check
            if (
                len(qa.get('question', '')) < 10
                or len(qa.get('answer', '')) < 5
            ):
                return False

            # 2. QA pair duplication check
            if qa.get('question', '') == qa.get('answer', ''):
                return False

        return True

    def _complexity_filter(
        self, examples: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Keep examples whose metadata complexity meets the threshold."""
        return [
            example
            for example in examples
            if example.get('metadata', {}).get('complexity', 0)
            >= self.config.complexity_threshold
        ]

    def _remove_duplicates(
        self, examples: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Remove duplicate examples, keeping the first occurrence."""
        seen = set()
        unique_examples = []

        for example in examples:
            # Use the (text, serialized QA pairs) pair itself as the key.
            # The previous version hashed the concatenation, so a hash
            # collision could silently drop a distinct example.
            text = example.get('text', '')
            qa_str = str(example.get('qa_pairs', []))
            identifier = (text, qa_str)

            if identifier not in seen:
                seen.add(identifier)
                unique_examples.append(example)

        return unique_examples

    def _sample_dataset(
        self, examples: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        r"""Randomly sample down to ``config.dataset_size`` examples.

        Returns the input unchanged when it is already small enough.
        """
        if len(examples) <= self.config.dataset_size:
            return examples

        return random.sample(examples, self.config.dataset_size)
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
from typing import Any, ClassVar, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ReasoningStep(BaseModel):
    r"""A single step within a multi-hop reasoning chain.

    Element type of ``MultiHopQA.reasoning_steps``.
    """

    step: str = Field(
        ..., description="A single step in the reasoning process."
    )
+
class MultiHopQA(BaseModel):
    r"""A multi-hop question-answer pair with its reasoning trace.

    Carries the question, the intermediate reasoning steps, the final
    answer and the facts supporting it.
    """

    question: str = Field(
        ..., description="The question that requires multi-hop reasoning."
    )
    reasoning_steps: List[ReasoningStep] = Field(
        ...,
        description="The steps involved in reasoning to answer the question.",
    )
    answer: str = Field(
        ..., description="The answer to the multi-hop question."
    )
    supporting_facts: List[str] = Field(
        ..., description="Facts that support the reasoning and answer."
    )
    type: str = Field(description="The type of question-answer pair.")

    # Pydantic v2 configuration (replaces the deprecated v1-style nested
    # ``class Config``), consistent with ProcessorConfig's v2 usage in
    # this package. The example is surfaced in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {
                "question": "What is the capital of France?",
                "reasoning_steps": [
                    {"step": "Identify the country France."},
                    {"step": "Find the capital city of France."},
                ],
                "answer": "Paris",
                "supporting_facts": [
                    "France is a country in Europe.",
                    "Paris is the capital city of France.",
                ],
                "type": "multi_hop_qa",
            }
        }
    }
+
class ContextPrompt(BaseModel):
    r"""Context bundle used when generating a multi-hop QA pair.

    Holds one main context and, optionally, additional related contexts.
    """

    main_context: str = Field(
        ...,
        description="The main context for generating"
        " the question-answer pair.",
    )
    related_contexts: Optional[List[str]] = Field(
        default=None,
        description="Additional contexts related to the main context.",
    )
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
|
|
15
|
+
import random
|
|
16
|
+
|
|
17
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
18
|
+
|
|
19
|
+
from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ProcessorConfig(BaseModel):
    r"""Configuration for the source2synth data processing pipeline.

    Controls input-length bounds, quality/complexity thresholds, the
    target dataset size, and whether an AI agent is used for multi-hop
    QA generation.
    """

    model_config = ConfigDict(
        validate_assignment=True,
        frozen=False,
        protected_namespaces=(),
        arbitrary_types_allowed=True,
    )

    def __repr__(self) -> str:
        # Bug fix: previously returned "MultiHopGeneratorAgent()", which
        # misreported this object's type.
        return "ProcessorConfig()"

    seed: int = Field(  # Generate a random seed for reproducibility
        default_factory=lambda: random.randint(0, 1000),
        description="Random seed for reproducibility",
    )

    min_length: int = Field(
        default=50, description="Minimum text length", ge=0
    )

    max_length: int = Field(
        default=512, description="Maximum text length", gt=0
    )

    quality_threshold: float = Field(
        default=0.7,
        description="Quality threshold for processing",
        ge=0.0,
        le=1.0,
    )

    complexity_threshold: float = Field(
        default=0.5,
        description="Complexity threshold for processing",
        ge=0.0,
        le=1.0,
    )

    dataset_size: int = Field(
        default=1000, description="Target size of the dataset", gt=0
    )

    use_ai_model: bool = Field(
        default=True, description="Whether to use AI model in processing"
    )

    # NOTE(review): the agent is constructed eagerly by default_factory
    # even when use_ai_model is False — confirm whether lazy construction
    # is intended.
    hop_generating_agent: MultiHopGeneratorAgent = Field(
        default_factory=lambda: MultiHopGeneratorAgent(),
        description="Agent for generating multi-hop text",
    )
|
camel/toolkits/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ from .ask_news_toolkit import AskNewsToolkit, AsyncAskNewsToolkit
|
|
|
28
28
|
from .linkedin_toolkit import LinkedInToolkit
|
|
29
29
|
from .reddit_toolkit import RedditToolkit
|
|
30
30
|
from .meshy_toolkit import MeshyToolkit
|
|
31
|
+
from .openbb_toolkit import OpenBBToolkit
|
|
31
32
|
|
|
32
33
|
from .base import BaseToolkit
|
|
33
34
|
from .google_maps_toolkit import GoogleMapsToolkit
|
|
@@ -43,6 +44,7 @@ from .notion_toolkit import NotionToolkit
|
|
|
43
44
|
from .human_toolkit import HumanToolkit
|
|
44
45
|
from .stripe_toolkit import StripeToolkit
|
|
45
46
|
from .video_toolkit import VideoDownloaderToolkit
|
|
47
|
+
from .dappier_toolkit import DappierToolkit
|
|
46
48
|
|
|
47
49
|
__all__ = [
|
|
48
50
|
'BaseToolkit',
|
|
@@ -73,4 +75,6 @@ __all__ = [
|
|
|
73
75
|
'VideoDownloaderToolkit',
|
|
74
76
|
'StripeToolkit',
|
|
75
77
|
'MeshyToolkit',
|
|
78
|
+
'OpenBBToolkit',
|
|
79
|
+
'DappierToolkit',
|
|
76
80
|
]
|
camel/toolkits/arxiv_toolkit.py
CHANGED
|
@@ -14,10 +14,13 @@
|
|
|
14
14
|
|
|
15
15
|
from typing import Dict, Generator, List, Optional
|
|
16
16
|
|
|
17
|
+
from camel.logger import get_logger
|
|
17
18
|
from camel.toolkits.base import BaseToolkit
|
|
18
19
|
from camel.toolkits.function_tool import FunctionTool
|
|
19
20
|
from camel.utils import dependencies_required
|
|
20
21
|
|
|
22
|
+
logger = get_logger(__name__)
|
|
23
|
+
|
|
21
24
|
|
|
22
25
|
class ArxivToolkit(BaseToolkit):
|
|
23
26
|
r"""A toolkit for interacting with the arXiv API to search and download
|
|
@@ -98,10 +101,24 @@ class ArxivToolkit(BaseToolkit):
|
|
|
98
101
|
"authors": [author.name for author in paper.authors],
|
|
99
102
|
"entry_id": paper.entry_id,
|
|
100
103
|
"summary": paper.summary,
|
|
101
|
-
|
|
102
|
-
# performance
|
|
103
|
-
"paper_text": arxiv_to_text(paper.pdf_url),
|
|
104
|
+
"pdf_url": paper.pdf_url,
|
|
104
105
|
}
|
|
106
|
+
|
|
107
|
+
# Extract text from the paper
|
|
108
|
+
try:
|
|
109
|
+
# TODO: Use chunkr instead of arxiv_to_text for better
|
|
110
|
+
# performance and reliability
|
|
111
|
+
text = arxiv_to_text(paper_info["pdf_url"])
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.error(
|
|
114
|
+
"Failed to extract text content from the PDF at "
|
|
115
|
+
"the specified URL. "
|
|
116
|
+
f"URL: {paper_info.get('pdf_url', 'Unknown')} | Error: {e}"
|
|
117
|
+
)
|
|
118
|
+
text = ""
|
|
119
|
+
|
|
120
|
+
paper_info['paper_text'] = text
|
|
121
|
+
|
|
105
122
|
papers_data.append(paper_info)
|
|
106
123
|
|
|
107
124
|
return papers_data
|