camel-ai 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +29 -30
- camel/agents/knowledge_graph_agent.py +1 -5
- camel/agents/multi_hop_generator_agent.py +35 -3
- camel/agents/programmed_agent_instruction.py +73 -18
- camel/benchmarks/apibench.py +1 -5
- camel/benchmarks/nexus.py +1 -5
- camel/benchmarks/ragbench.py +2 -2
- camel/bots/telegram_bot.py +1 -5
- camel/configs/__init__.py +9 -0
- camel/configs/aiml_config.py +80 -0
- camel/configs/gemini_config.py +1 -1
- camel/configs/moonshot_config.py +63 -0
- camel/configs/sglang_config.py +4 -0
- camel/configs/siliconflow_config.py +91 -0
- camel/datagen/__init__.py +3 -1
- camel/datagen/self_improving_cot.py +821 -0
- camel/datagen/source2synth/__init__.py +31 -0
- camel/{synthetic_datagen → datagen}/source2synth/data_processor.py +194 -29
- camel/{synthetic_datagen → datagen}/source2synth/models.py +25 -0
- camel/{synthetic_datagen → datagen}/source2synth/user_data_processor_config.py +9 -8
- camel/datahubs/huggingface.py +3 -3
- camel/embeddings/__init__.py +2 -0
- camel/embeddings/jina_embedding.py +161 -0
- camel/messages/func_message.py +1 -1
- camel/models/__init__.py +4 -0
- camel/models/aiml_model.py +147 -0
- camel/models/deepseek_model.py +29 -11
- camel/models/groq_model.py +0 -2
- camel/models/model_factory.py +9 -0
- camel/models/moonshot_model.py +138 -0
- camel/models/openai_model.py +1 -9
- camel/models/siliconflow_model.py +142 -0
- camel/societies/workforce/role_playing_worker.py +2 -4
- camel/societies/workforce/single_agent_worker.py +1 -6
- camel/societies/workforce/workforce.py +3 -9
- camel/toolkits/__init__.py +4 -0
- camel/toolkits/reddit_toolkit.py +8 -38
- camel/toolkits/search_toolkit.py +17 -6
- camel/toolkits/semantic_scholar_toolkit.py +308 -0
- camel/toolkits/sympy_toolkit.py +778 -0
- camel/toolkits/whatsapp_toolkit.py +11 -32
- camel/types/enums.py +205 -16
- camel/types/unified_model_type.py +5 -0
- camel/utils/__init__.py +7 -2
- camel/utils/commons.py +198 -21
- camel/utils/deduplication.py +199 -0
- camel/utils/token_counting.py +1 -39
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.20.dist-info}/METADATA +17 -12
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.20.dist-info}/RECORD +53 -41
- /camel/datagen/{cotdatagen.py → cot_datagen.py} +0 -0
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.20.dist-info}/LICENSE +0 -0
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.20.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
from .data_processor import (
|
|
15
|
+
DataCurator,
|
|
16
|
+
ExampleConstructor,
|
|
17
|
+
UserDataProcessor,
|
|
18
|
+
)
|
|
19
|
+
from .models import MultiHopQA, ReasoningStep
|
|
20
|
+
from .user_data_processor_config import (
|
|
21
|
+
ProcessorConfig,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"DataCurator",
|
|
26
|
+
"ExampleConstructor",
|
|
27
|
+
"ProcessorConfig",
|
|
28
|
+
"UserDataProcessor",
|
|
29
|
+
"ReasoningStep",
|
|
30
|
+
"MultiHopQA",
|
|
31
|
+
]
|
|
@@ -15,33 +15,61 @@
|
|
|
15
15
|
import random
|
|
16
16
|
from typing import Any, Dict, List, Optional, Sequence
|
|
17
17
|
|
|
18
|
-
import numpy as np
|
|
19
18
|
from tqdm import tqdm
|
|
20
19
|
|
|
21
20
|
from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
|
|
22
|
-
from camel.
|
|
23
|
-
from camel.synthetic_datagen.source2synth.user_data_processor_config import (
|
|
21
|
+
from camel.datagen.source2synth.user_data_processor_config import (
|
|
24
22
|
ProcessorConfig,
|
|
25
23
|
)
|
|
24
|
+
from camel.logger import get_logger
|
|
26
25
|
|
|
27
26
|
logger = get_logger(__name__)
|
|
28
27
|
|
|
29
28
|
|
|
30
29
|
class UserDataProcessor:
|
|
31
|
-
r"""
|
|
30
|
+
r"""A processor for generating multi-hop question-answer pairs from user
|
|
31
|
+
data.
|
|
32
|
+
|
|
33
|
+
This class handles the processing of text data to generate multi-hop
|
|
34
|
+
question-answer pairs using either an AI model or rule-based approaches.
|
|
35
|
+
It manages the entire pipeline from text preprocessing to dataset curation.
|
|
36
|
+
|
|
37
|
+
Attributes:
|
|
38
|
+
config (ProcessorConfig): Configuration for data processing parameters.
|
|
39
|
+
rng (random.Random): Random number generator for reproducibility.
|
|
40
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for
|
|
41
|
+
generating QA pairs.
|
|
42
|
+
"""
|
|
32
43
|
|
|
33
44
|
def __init__(self, config: Optional[ProcessorConfig] = None):
|
|
45
|
+
r"""Initialize the UserDataProcessor.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
config (Optional[ProcessorConfig], optional): Configuration for
|
|
49
|
+
data processing. (default: :obj:`None`)
|
|
50
|
+
"""
|
|
34
51
|
self.config = config or ProcessorConfig()
|
|
35
|
-
random.
|
|
36
|
-
np.random.seed(self.config.seed)
|
|
52
|
+
self.rng = random.Random(self.config.seed)
|
|
37
53
|
self.multi_hop_agent = (
|
|
38
|
-
|
|
54
|
+
self.config.hop_generating_agent
|
|
55
|
+
if self.config.use_ai_model
|
|
56
|
+
else None
|
|
39
57
|
)
|
|
40
58
|
|
|
41
59
|
def process_text(
|
|
42
60
|
self, text: str, source: str = "user_input"
|
|
43
61
|
) -> List[Dict[str, Any]]:
|
|
44
|
-
r"""Process a single text.
|
|
62
|
+
r"""Process a single text to generate multi-hop QA pairs.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
text (str): The input text to process.
|
|
66
|
+
source (str, optional): Source identifier for the text.
|
|
67
|
+
(default: :obj:`"user_input"`)
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
List[Dict[str, Any]]: List of processed examples with QA pairs and
|
|
71
|
+
metadata.
|
|
72
|
+
"""
|
|
45
73
|
# Convert text to standard format
|
|
46
74
|
raw_data = [
|
|
47
75
|
{
|
|
@@ -55,7 +83,7 @@ class UserDataProcessor:
|
|
|
55
83
|
examples = constructor.construct_examples(raw_data)
|
|
56
84
|
|
|
57
85
|
# Manage data
|
|
58
|
-
curator = DataCurator(self.config)
|
|
86
|
+
curator = DataCurator(self.config, self.rng)
|
|
59
87
|
final_dataset = curator.curate_dataset(examples)
|
|
60
88
|
|
|
61
89
|
return final_dataset
|
|
@@ -63,7 +91,20 @@ class UserDataProcessor:
|
|
|
63
91
|
def process_batch(
|
|
64
92
|
self, texts: List[str], sources: Optional[List[str]] = None
|
|
65
93
|
) -> List[Dict[str, Any]]:
|
|
66
|
-
r"""Process multiple texts in batch.
|
|
94
|
+
r"""Process multiple texts in batch to generate multi-hop QA pairs.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
texts (List[str]): List of input texts to process.
|
|
98
|
+
sources (Optional[List[str]], optional): List of source
|
|
99
|
+
identifiers. (default: :obj:`None`)
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List[Dict[str, Any]]: List of processed examples with QA pairs and
|
|
103
|
+
metadata.
|
|
104
|
+
|
|
105
|
+
Raises:
|
|
106
|
+
ValueError: If length of sources doesn't match length of texts.
|
|
107
|
+
"""
|
|
67
108
|
if sources is None:
|
|
68
109
|
sources = ["user_input"] * len(texts)
|
|
69
110
|
elif len(sources) != len(texts):
|
|
@@ -82,27 +123,52 @@ class UserDataProcessor:
|
|
|
82
123
|
examples = constructor.construct_examples(raw_data)
|
|
83
124
|
|
|
84
125
|
# Manage data
|
|
85
|
-
curator = DataCurator(self.config)
|
|
126
|
+
curator = DataCurator(self.config, self.rng)
|
|
86
127
|
final_dataset = curator.curate_dataset(examples)
|
|
87
128
|
|
|
88
129
|
return final_dataset
|
|
89
130
|
|
|
90
131
|
|
|
91
132
|
class ExampleConstructor:
|
|
92
|
-
r"""
|
|
133
|
+
r"""Constructs training examples from raw text data.
|
|
134
|
+
|
|
135
|
+
This class handles the construction of training examples by preprocessing
|
|
136
|
+
text, extracting information pairs, and generating question-answer pairs.
|
|
137
|
+
|
|
138
|
+
Attributes:
|
|
139
|
+
config (ProcessorConfig): Configuration for example construction.
|
|
140
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for QA
|
|
141
|
+
generation.
|
|
142
|
+
"""
|
|
93
143
|
|
|
94
144
|
def __init__(
|
|
95
145
|
self,
|
|
96
146
|
config: ProcessorConfig,
|
|
97
147
|
multi_hop_agent: Optional[MultiHopGeneratorAgent] = None,
|
|
98
148
|
):
|
|
149
|
+
r"""Initialize the ExampleConstructor.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
config (ProcessorConfig): Configuration for example construction.
|
|
153
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent], optional):
|
|
154
|
+
Agent for generating multi-hop QA pairs. (default: :obj:`None`)
|
|
155
|
+
"""
|
|
99
156
|
self.config = config
|
|
100
157
|
self.multi_hop_agent = multi_hop_agent
|
|
101
158
|
|
|
102
159
|
def construct_examples(
|
|
103
160
|
self, raw_data: List[Dict[str, Any]]
|
|
104
161
|
) -> List[Dict[str, Any]]:
|
|
105
|
-
r"""Construct training examples.
|
|
162
|
+
r"""Construct training examples from raw data.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
raw_data (List[Dict[str, Any]]): List of raw data dictionaries
|
|
166
|
+
containing text and metadata.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
List[Dict[str, Any]]: List of constructed examples with QA pairs
|
|
170
|
+
and metadata.
|
|
171
|
+
"""
|
|
106
172
|
logger.info("Starting to construct training examples...")
|
|
107
173
|
examples = []
|
|
108
174
|
|
|
@@ -135,7 +201,15 @@ class ExampleConstructor:
|
|
|
135
201
|
return examples
|
|
136
202
|
|
|
137
203
|
def _preprocess_text(self, text: str) -> str:
|
|
138
|
-
r"""
|
|
204
|
+
r"""Preprocess input text for example construction.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
text (str): Input text to preprocess.
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
str: Preprocessed text, or empty string if text fails quality
|
|
211
|
+
checks.
|
|
212
|
+
"""
|
|
139
213
|
if not isinstance(text, str):
|
|
140
214
|
return ''
|
|
141
215
|
|
|
@@ -156,7 +230,14 @@ class ExampleConstructor:
|
|
|
156
230
|
return text
|
|
157
231
|
|
|
158
232
|
def _check_text_quality(self, text: str) -> bool:
|
|
159
|
-
r"""Check
|
|
233
|
+
r"""Check the quality of input text.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
text (str): Text to check quality for.
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
bool: True if text passes quality checks, False otherwise.
|
|
240
|
+
"""
|
|
160
241
|
# 1. Basic quality check
|
|
161
242
|
if text.count('.') < 2: # Must have at least 2 sentences
|
|
162
243
|
return False
|
|
@@ -171,7 +252,15 @@ class ExampleConstructor:
|
|
|
171
252
|
return True
|
|
172
253
|
|
|
173
254
|
def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
|
|
174
|
-
r"""Extract information pairs and relationships.
|
|
255
|
+
r"""Extract information pairs and relationships from text.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
text (str): Input text to extract information from.
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
List[Dict[str, Sequence[str]]]: List of dictionaries containing
|
|
262
|
+
premise, intermediate, conclusion, and related contexts.
|
|
263
|
+
"""
|
|
175
264
|
# Split into sentences
|
|
176
265
|
sentences = [s.strip() for s in text.split('.') if s.strip()]
|
|
177
266
|
info_pairs = []
|
|
@@ -200,7 +289,15 @@ class ExampleConstructor:
|
|
|
200
289
|
def _generate_qa_pairs(
|
|
201
290
|
self, info_pairs: List[Dict[str, Sequence[str]]]
|
|
202
291
|
) -> List[Dict[str, str]]:
|
|
203
|
-
r"""Generate multi-hop question-answer pairs.
|
|
292
|
+
r"""Generate multi-hop question-answer pairs from information pairs.
|
|
293
|
+
|
|
294
|
+
Args:
|
|
295
|
+
info_pairs (List[Dict[str, Sequence[str]]]): List of information
|
|
296
|
+
pairs extracted from text.
|
|
297
|
+
|
|
298
|
+
Returns:
|
|
299
|
+
List[Dict[str, str]]: List of generated QA pairs.
|
|
300
|
+
"""
|
|
204
301
|
qa_pairs = []
|
|
205
302
|
|
|
206
303
|
for pair in info_pairs:
|
|
@@ -219,7 +316,15 @@ class ExampleConstructor:
|
|
|
219
316
|
return qa_pairs
|
|
220
317
|
|
|
221
318
|
def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
|
|
222
|
-
r"""Calculate complexity of QA pairs.
|
|
319
|
+
r"""Calculate the complexity score for a set of QA pairs.
|
|
320
|
+
|
|
321
|
+
Args:
|
|
322
|
+
qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
|
|
323
|
+
complexity for.
|
|
324
|
+
|
|
325
|
+
Returns:
|
|
326
|
+
float: Complexity score between 0.0 and 1.0.
|
|
327
|
+
"""
|
|
223
328
|
if not qa_pairs:
|
|
224
329
|
return 0.0
|
|
225
330
|
|
|
@@ -233,10 +338,10 @@ class ExampleConstructor:
|
|
|
233
338
|
supporting_facts_count = len(qa.get('supporting_facts', []))
|
|
234
339
|
|
|
235
340
|
# 3. Question length
|
|
236
|
-
question_length = len(qa
|
|
341
|
+
question_length = len(qa.get('question', '').split())
|
|
237
342
|
|
|
238
343
|
# 4. Answer length
|
|
239
|
-
answer_length = len(qa
|
|
344
|
+
answer_length = len(qa.get('answer', '').split())
|
|
240
345
|
|
|
241
346
|
# Calculate complexity of a single QA pair
|
|
242
347
|
qa_complexity = (
|
|
@@ -256,15 +361,37 @@ class ExampleConstructor:
|
|
|
256
361
|
|
|
257
362
|
|
|
258
363
|
class DataCurator:
|
|
259
|
-
r"""
|
|
364
|
+
r"""Manages and curates datasets of multi-hop question-answer pairs.
|
|
365
|
+
|
|
366
|
+
This class handles dataset management tasks including quality filtering,
|
|
367
|
+
complexity filtering, deduplication, and dataset sampling.
|
|
260
368
|
|
|
261
|
-
|
|
369
|
+
Attributes:
|
|
370
|
+
config (ProcessorConfig): Configuration for data curation parameters.
|
|
371
|
+
rng (random.Random): Random number generator for reproducible sampling.
|
|
372
|
+
"""
|
|
373
|
+
|
|
374
|
+
def __init__(self, config: ProcessorConfig, rng: random.Random):
|
|
375
|
+
r"""Initialize the DataCurator.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
config (ProcessorConfig): Configuration for data curation.
|
|
379
|
+
rng (random.Random): Random number generator for reproducibility.
|
|
380
|
+
"""
|
|
262
381
|
self.config = config
|
|
382
|
+
self.rng = rng
|
|
263
383
|
|
|
264
384
|
def curate_dataset(
|
|
265
385
|
self, examples: List[Dict[str, Any]]
|
|
266
386
|
) -> List[Dict[str, Any]]:
|
|
267
|
-
r"""
|
|
387
|
+
r"""Manage and curate a dataset through multiple filtering stages.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
examples (List[Dict[str, Any]]): List of examples to curate.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
List[Dict[str, Any]]: Curated dataset meeting quality criteria.
|
|
394
|
+
"""
|
|
268
395
|
logger.info("Starting dataset management...")
|
|
269
396
|
|
|
270
397
|
# 1. Quality filtering
|
|
@@ -296,7 +423,14 @@ class DataCurator:
|
|
|
296
423
|
def _quality_filter(
|
|
297
424
|
self, examples: List[Dict[str, Any]]
|
|
298
425
|
) -> List[Dict[str, Any]]:
|
|
299
|
-
r"""
|
|
426
|
+
r"""Filter examples based on quality criteria.
|
|
427
|
+
|
|
428
|
+
Args:
|
|
429
|
+
examples (List[Dict[str, Any]]): List of examples to filter.
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
List[Dict[str, Any]]: Examples that pass quality checks.
|
|
433
|
+
"""
|
|
300
434
|
filtered = []
|
|
301
435
|
|
|
302
436
|
for example in examples:
|
|
@@ -314,7 +448,14 @@ class DataCurator:
|
|
|
314
448
|
return filtered
|
|
315
449
|
|
|
316
450
|
def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
|
|
317
|
-
r"""Check quality of
|
|
451
|
+
r"""Check the quality of question-answer pairs.
|
|
452
|
+
|
|
453
|
+
Args:
|
|
454
|
+
qa_pairs (List[Dict[str, str]]): List of QA pairs to check.
|
|
455
|
+
|
|
456
|
+
Returns:
|
|
457
|
+
bool: True if QA pairs meet quality criteria, False otherwise.
|
|
458
|
+
"""
|
|
318
459
|
if not qa_pairs:
|
|
319
460
|
return False
|
|
320
461
|
|
|
@@ -335,7 +476,17 @@ class DataCurator:
|
|
|
335
476
|
def _complexity_filter(
|
|
336
477
|
self, examples: List[Dict[str, Any]]
|
|
337
478
|
) -> List[Dict[str, Any]]:
|
|
338
|
-
|
|
479
|
+
"""
|
|
480
|
+
Filter examples based on complexity threshold.
|
|
481
|
+
|
|
482
|
+
Removes examples with complexity scores below the configured threshold.
|
|
483
|
+
|
|
484
|
+
Args:
|
|
485
|
+
examples (List[Dict[str, Any]]): List of examples to filter.
|
|
486
|
+
|
|
487
|
+
Returns:
|
|
488
|
+
List[Dict[str, Any]]: Examples meeting complexity threshold.
|
|
489
|
+
"""
|
|
339
490
|
return [
|
|
340
491
|
example
|
|
341
492
|
for example in examples
|
|
@@ -346,7 +497,14 @@ class DataCurator:
|
|
|
346
497
|
def _remove_duplicates(
|
|
347
498
|
self, examples: List[Dict[str, Any]]
|
|
348
499
|
) -> List[Dict[str, Any]]:
|
|
349
|
-
r"""Remove
|
|
500
|
+
r"""Remove duplicate examples from the dataset.
|
|
501
|
+
|
|
502
|
+
Args:
|
|
503
|
+
examples (List[Dict[str, Any]]): List of examples to deduplicate.
|
|
504
|
+
|
|
505
|
+
Returns:
|
|
506
|
+
List[Dict[str, Any]]: Deduplicated examples.
|
|
507
|
+
"""
|
|
350
508
|
seen = set()
|
|
351
509
|
unique_examples = []
|
|
352
510
|
|
|
@@ -366,8 +524,15 @@ class DataCurator:
|
|
|
366
524
|
def _sample_dataset(
|
|
367
525
|
self, examples: List[Dict[str, Any]]
|
|
368
526
|
) -> List[Dict[str, Any]]:
|
|
369
|
-
r"""Sample to target dataset size.
|
|
527
|
+
r"""Sample examples to match target dataset size.
|
|
528
|
+
|
|
529
|
+
Args:
|
|
530
|
+
examples (List[Dict[str, Any]]): List of examples to sample from.
|
|
531
|
+
|
|
532
|
+
Returns:
|
|
533
|
+
List[Dict[str, Any]]: Sampled dataset of target size or smaller.
|
|
534
|
+
"""
|
|
370
535
|
if len(examples) <= self.config.dataset_size:
|
|
371
536
|
return examples
|
|
372
537
|
|
|
373
|
-
return
|
|
538
|
+
return self.rng.sample(examples, self.config.dataset_size)
|
|
@@ -17,12 +17,30 @@ from pydantic import BaseModel, Field
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class ReasoningStep(BaseModel):
|
|
20
|
+
r"""A single step in a multi-hop reasoning process.
|
|
21
|
+
|
|
22
|
+
Attributes:
|
|
23
|
+
step (str): The textual description of the reasoning step.
|
|
24
|
+
"""
|
|
25
|
+
|
|
20
26
|
step: str = Field(
|
|
21
27
|
..., description="A single step in the reasoning process."
|
|
22
28
|
)
|
|
23
29
|
|
|
24
30
|
|
|
25
31
|
class MultiHopQA(BaseModel):
|
|
32
|
+
r"""A multi-hop question-answer pair with reasoning steps and supporting
|
|
33
|
+
facts.
|
|
34
|
+
|
|
35
|
+
Attributes:
|
|
36
|
+
question (str): The question requiring multi-hop reasoning.
|
|
37
|
+
reasoning_steps (List[ReasoningStep]): List of reasoning steps to
|
|
38
|
+
answer.
|
|
39
|
+
answer (str): The final answer to the question.
|
|
40
|
+
supporting_facts (List[str]): List of facts supporting the reasoning.
|
|
41
|
+
type (str): The type of question-answer pair.
|
|
42
|
+
"""
|
|
43
|
+
|
|
26
44
|
question: str = Field(
|
|
27
45
|
..., description="The question that requires multi-hop reasoning."
|
|
28
46
|
)
|
|
@@ -57,6 +75,13 @@ class MultiHopQA(BaseModel):
|
|
|
57
75
|
|
|
58
76
|
|
|
59
77
|
class ContextPrompt(BaseModel):
|
|
78
|
+
r"""A context prompt for generating multi-hop question-answer pairs.
|
|
79
|
+
|
|
80
|
+
Attributes:
|
|
81
|
+
main_context (str): The primary context for generating QA pairs.
|
|
82
|
+
related_contexts (Optional[List[str]]): Additional related contexts.
|
|
83
|
+
"""
|
|
84
|
+
|
|
60
85
|
main_context: str = Field(
|
|
61
86
|
...,
|
|
62
87
|
description="The main context for generating"
|
|
@@ -23,7 +23,15 @@ class ProcessorConfig(BaseModel):
|
|
|
23
23
|
r"""Data processing configuration class"""
|
|
24
24
|
|
|
25
25
|
def __repr__(self):
|
|
26
|
-
return
|
|
26
|
+
return (
|
|
27
|
+
f"ProcessorConfig("
|
|
28
|
+
f"seed={self.seed}, min_length={self.min_length}, "
|
|
29
|
+
f"max_length={self.max_length}, "
|
|
30
|
+
f"complexity_threshold={self.complexity_threshold}, "
|
|
31
|
+
f"dataset_size={self.dataset_size}, "
|
|
32
|
+
f"use_ai_model={self.use_ai_model}"
|
|
33
|
+
f")"
|
|
34
|
+
)
|
|
27
35
|
|
|
28
36
|
model_config = ConfigDict(
|
|
29
37
|
validate_assignment=True,
|
|
@@ -45,13 +53,6 @@ class ProcessorConfig(BaseModel):
|
|
|
45
53
|
default=512, description="Maximum text length", gt=0
|
|
46
54
|
)
|
|
47
55
|
|
|
48
|
-
quality_threshold: float = Field(
|
|
49
|
-
default=0.7,
|
|
50
|
-
description="Quality threshold for processing",
|
|
51
|
-
ge=0.0,
|
|
52
|
-
le=1.0,
|
|
53
|
-
)
|
|
54
|
-
|
|
55
56
|
complexity_threshold: float = Field(
|
|
56
57
|
default=0.5,
|
|
57
58
|
description="Complexity threshold for processing",
|
camel/datahubs/huggingface.py
CHANGED
|
@@ -32,19 +32,19 @@ class HuggingFaceDatasetManager(BaseDatasetManager):
|
|
|
32
32
|
|
|
33
33
|
Args:
|
|
34
34
|
token (str): The Hugging Face API token. If not provided, the token
|
|
35
|
-
will be read from the environment variable `
|
|
35
|
+
will be read from the environment variable `HF_TOKEN`.
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
@api_keys_required(
|
|
39
39
|
[
|
|
40
|
-
("token", "
|
|
40
|
+
("token", "HF_TOKEN"),
|
|
41
41
|
]
|
|
42
42
|
)
|
|
43
43
|
@dependencies_required('huggingface_hub')
|
|
44
44
|
def __init__(self, token: Optional[str] = None):
|
|
45
45
|
from huggingface_hub import HfApi
|
|
46
46
|
|
|
47
|
-
self._api_key = token or os.getenv("
|
|
47
|
+
self._api_key = token or os.getenv("HF_TOKEN")
|
|
48
48
|
self.api = HfApi(token=self._api_key)
|
|
49
49
|
|
|
50
50
|
def create_dataset_card(
|
camel/embeddings/__init__.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
14
|
from .base import BaseEmbedding
|
|
15
|
+
from .jina_embedding import JinaEmbedding
|
|
15
16
|
from .mistral_embedding import MistralEmbedding
|
|
16
17
|
from .openai_compatible_embedding import OpenAICompatibleEmbedding
|
|
17
18
|
from .openai_embedding import OpenAIEmbedding
|
|
@@ -25,4 +26,5 @@ __all__ = [
|
|
|
25
26
|
"VisionLanguageEmbedding",
|
|
26
27
|
"MistralEmbedding",
|
|
27
28
|
"OpenAICompatibleEmbedding",
|
|
29
|
+
"JinaEmbedding",
|
|
28
30
|
]
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import io
|
|
17
|
+
import os
|
|
18
|
+
from typing import Any, Optional, Union
|
|
19
|
+
|
|
20
|
+
import requests
|
|
21
|
+
from PIL import Image
|
|
22
|
+
|
|
23
|
+
from camel.embeddings import BaseEmbedding
|
|
24
|
+
from camel.types.enums import EmbeddingModelType
|
|
25
|
+
from camel.utils import api_keys_required
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class JinaEmbedding(BaseEmbedding[Union[str, Image.Image]]):
|
|
29
|
+
r"""Provides text and image embedding functionalities using Jina AI's API.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
model_type (EmbeddingModelType, optional): The model to use for
|
|
33
|
+
embeddings. (default: :obj:`JINA_EMBEDDINGS_V3`)
|
|
34
|
+
api_key (Optional[str], optional): The API key for authenticating with
|
|
35
|
+
Jina AI. (default: :obj:`None`)
|
|
36
|
+
dimensions (Optional[int], optional): The dimension of the output
|
|
37
|
+
embeddings. (default: :obj:`None`)
|
|
38
|
+
embedding_type (Optional[str], optional): The type of embedding format
|
|
39
|
+
to generate. Options: 'int8' (binary encoding with higher storage
|
|
40
|
+
and transfer efficiency), 'uint8' (unsigned binary encoding with
|
|
41
|
+
higher storage and transfer efficiency), 'base64' (base64 string
|
|
42
|
+
encoding with higher transfer efficiency). (default: :obj:`None`)
|
|
43
|
+
task (Optional[str], optional): The type of task for text embeddings.
|
|
44
|
+
Options: retrieval.query, retrieval.passage, text-matching,
|
|
45
|
+
classification, separation. (default: :obj:`None`)
|
|
46
|
+
late_chunking (bool, optional): If true, concatenates all sentences in
|
|
47
|
+
input and treats as a single input. (default: :obj:`False`)
|
|
48
|
+
normalized (bool, optional): If true, embeddings are normalized to unit
|
|
49
|
+
L2 norm. (default: :obj:`False`)
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
@api_keys_required([("api_key", 'JINA_API_KEY')])
|
|
53
|
+
def __init__(
|
|
54
|
+
self,
|
|
55
|
+
model_type: EmbeddingModelType = EmbeddingModelType.JINA_EMBEDDINGS_V3,
|
|
56
|
+
api_key: Optional[str] = None,
|
|
57
|
+
dimensions: Optional[int] = None,
|
|
58
|
+
embedding_type: Optional[str] = None,
|
|
59
|
+
task: Optional[str] = None,
|
|
60
|
+
late_chunking: bool = False,
|
|
61
|
+
normalized: bool = False,
|
|
62
|
+
) -> None:
|
|
63
|
+
if not model_type.is_jina:
|
|
64
|
+
raise ValueError(
|
|
65
|
+
f"Model type {model_type} is not a Jina model. "
|
|
66
|
+
"Please use a valid Jina model type."
|
|
67
|
+
)
|
|
68
|
+
self.model_type = model_type
|
|
69
|
+
if dimensions is None:
|
|
70
|
+
self.output_dim = model_type.output_dim
|
|
71
|
+
else:
|
|
72
|
+
self.output_dim = dimensions
|
|
73
|
+
self._api_key = api_key or os.environ.get("JINA_API_KEY")
|
|
74
|
+
|
|
75
|
+
self.embedding_type = embedding_type
|
|
76
|
+
self.task = task
|
|
77
|
+
self.late_chunking = late_chunking
|
|
78
|
+
self.normalized = normalized
|
|
79
|
+
self.url = 'https://api.jina.ai/v1/embeddings'
|
|
80
|
+
self.headers = {
|
|
81
|
+
'Content-Type': 'application/json',
|
|
82
|
+
'Accept': 'application/json',
|
|
83
|
+
'Authorization': f'Bearer {self._api_key}',
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
def embed_list(
|
|
87
|
+
self,
|
|
88
|
+
objs: list[Union[str, Image.Image]],
|
|
89
|
+
**kwargs: Any,
|
|
90
|
+
) -> list[list[float]]:
|
|
91
|
+
r"""Generates embeddings for the given texts or images.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
objs (list[Union[str, Image.Image]]): The texts or images for which
|
|
95
|
+
to generate the embeddings.
|
|
96
|
+
**kwargs (Any): Extra kwargs passed to the embedding API. Not used
|
|
97
|
+
in this implementation.
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
list[list[float]]: A list that represents the generated embedding
|
|
101
|
+
as a list of floating-point numbers.
|
|
102
|
+
|
|
103
|
+
Raises:
|
|
104
|
+
ValueError: If the input type is not supported.
|
|
105
|
+
RuntimeError: If the API request fails.
|
|
106
|
+
"""
|
|
107
|
+
input_data = []
|
|
108
|
+
for obj in objs:
|
|
109
|
+
if isinstance(obj, str):
|
|
110
|
+
if self.model_type == EmbeddingModelType.JINA_CLIP_V2:
|
|
111
|
+
input_data.append({"text": obj})
|
|
112
|
+
else:
|
|
113
|
+
input_data.append(obj) # type: ignore[arg-type]
|
|
114
|
+
elif isinstance(obj, Image.Image):
|
|
115
|
+
if self.model_type != EmbeddingModelType.JINA_CLIP_V2:
|
|
116
|
+
raise ValueError(
|
|
117
|
+
f"Model {self.model_type} does not support "
|
|
118
|
+
"image input. Use JINA_CLIP_V2 for image embeddings."
|
|
119
|
+
)
|
|
120
|
+
# Convert PIL Image to base64 string
|
|
121
|
+
buffered = io.BytesIO()
|
|
122
|
+
obj.save(buffered, format="PNG")
|
|
123
|
+
img_str = base64.b64encode(buffered.getvalue()).decode()
|
|
124
|
+
input_data.append({"image": img_str})
|
|
125
|
+
else:
|
|
126
|
+
raise ValueError(
|
|
127
|
+
f"Input type {type(obj)} is not supported. "
|
|
128
|
+
"Must be either str or PIL.Image."
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
data = {
|
|
132
|
+
"model": self.model_type.value,
|
|
133
|
+
"input": input_data,
|
|
134
|
+
"embedding_type": "float",
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
if self.embedding_type is not None:
|
|
138
|
+
data["embedding_type"] = self.embedding_type
|
|
139
|
+
if self.task is not None:
|
|
140
|
+
data["task"] = self.task
|
|
141
|
+
if self.late_chunking:
|
|
142
|
+
data["late_chunking"] = self.late_chunking # type: ignore[assignment]
|
|
143
|
+
if self.normalized:
|
|
144
|
+
data["normalized"] = self.normalized # type: ignore[assignment]
|
|
145
|
+
try:
|
|
146
|
+
response = requests.post(
|
|
147
|
+
self.url, headers=self.headers, json=data, timeout=180
|
|
148
|
+
)
|
|
149
|
+
response.raise_for_status()
|
|
150
|
+
result = response.json()
|
|
151
|
+
return [data["embedding"] for data in result["data"]]
|
|
152
|
+
except requests.exceptions.RequestException as e:
|
|
153
|
+
raise RuntimeError(f"Failed to get embeddings from Jina AI: {e}")
|
|
154
|
+
|
|
155
|
+
def get_output_dim(self) -> int:
|
|
156
|
+
r"""Returns the output dimension of the embeddings.
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
int: The dimensionality of the embedding for the current model.
|
|
160
|
+
"""
|
|
161
|
+
return self.output_dim
|