camel-ai 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/multi_hop_generator_agent.py +35 -3
- camel/agents/programmed_agent_instruction.py +73 -18
- camel/configs/gemini_config.py +1 -1
- camel/configs/sglang_config.py +4 -0
- camel/datagen/source2synth/__init__.py +31 -0
- camel/{synthetic_datagen → datagen}/source2synth/data_processor.py +194 -29
- camel/{synthetic_datagen → datagen}/source2synth/models.py +25 -0
- camel/{synthetic_datagen → datagen}/source2synth/user_data_processor_config.py +9 -8
- camel/embeddings/__init__.py +2 -0
- camel/embeddings/jina_embedding.py +156 -0
- camel/messages/func_message.py +1 -1
- camel/models/deepseek_model.py +29 -11
- camel/models/groq_model.py +0 -2
- camel/models/openai_model.py +1 -9
- camel/toolkits/search_toolkit.py +5 -6
- camel/types/enums.py +68 -10
- camel/utils/token_counting.py +1 -1
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.19.dist-info}/METADATA +5 -2
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.19.dist-info}/RECORD +22 -20
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.19.dist-info}/LICENSE +0 -0
- {camel_ai-0.2.18.dist-info → camel_ai-0.2.19.dist-info}/WHEEL +0 -0
camel/__init__.py
CHANGED
(hunk not shown in this extract)
camel/agents/multi_hop_generator_agent.py
CHANGED
|
@@ -22,17 +22,36 @@ from camel.agents.programmed_agent_instruction import (
|
|
|
22
22
|
ProgrammedAgentInstructionResult,
|
|
23
23
|
programmable_capability,
|
|
24
24
|
)
|
|
25
|
-
from camel.
|
|
26
|
-
from camel.synthetic_datagen.source2synth.models import (
|
|
25
|
+
from camel.datagen.source2synth.models import (
|
|
27
26
|
ContextPrompt,
|
|
28
27
|
MultiHopQA,
|
|
29
28
|
)
|
|
29
|
+
from camel.messages import BaseMessage
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class MultiHopGeneratorAgent(ProgrammableChatAgent):
|
|
33
|
+
r"""An agent specialized in generating multi-hop question-answer pairs.
|
|
34
|
+
|
|
35
|
+
This agent is designed to create complex questions that require multiple
|
|
36
|
+
steps of reasoning to answer. It analyzes context to identify related
|
|
37
|
+
facts and generates questions that require connecting these facts
|
|
38
|
+
logically.
|
|
39
|
+
|
|
40
|
+
Attributes:
|
|
41
|
+
model_config (ConfigDict): Configuration for model behavior.
|
|
42
|
+
system_message (BaseMessage): System message defining agent's role and
|
|
43
|
+
instructions.
|
|
44
|
+
"""
|
|
45
|
+
|
|
33
46
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
34
47
|
|
|
35
|
-
def __init__(self, **kwargs: Any):
|
|
48
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
49
|
+
r"""Initialize the MultiHopGeneratorAgent.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
**kwargs (Any): Additional keyword arguments to pass to parent
|
|
53
|
+
class.
|
|
54
|
+
"""
|
|
36
55
|
super().__init__(**kwargs)
|
|
37
56
|
|
|
38
57
|
system_text: str = textwrap.dedent(
|
|
@@ -64,6 +83,19 @@ class MultiHopGeneratorAgent(ProgrammableChatAgent):
|
|
|
64
83
|
def generate_multi_hop_qa(
|
|
65
84
|
self, context: str
|
|
66
85
|
) -> ProgrammedAgentInstructionResult[MultiHopQA]:
|
|
86
|
+
r"""Generate a multi-hop question-answer pair from given context.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
context (str): The input text context to generate QA from.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
ProgrammedAgentInstructionResult[MultiHopQA]: Result containing the
|
|
93
|
+
generated question, reasoning steps, answer, and supporting
|
|
94
|
+
facts.
|
|
95
|
+
|
|
96
|
+
Raises:
|
|
97
|
+
RuntimeError: If the agent fails to generate a response.
|
|
98
|
+
"""
|
|
67
99
|
context_prompt = ContextPrompt(
|
|
68
100
|
main_context=context, related_contexts=None
|
|
69
101
|
)
|
|
camel/agents/programmed_agent_instruction.py
CHANGED
|
@@ -26,6 +26,16 @@ T = TypeVar('T')
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
class ProgrammableAgentRequirement(Enum):
|
|
29
|
+
r"""Requirements for programmable agent state.
|
|
30
|
+
|
|
31
|
+
Defines the possible requirements that can be used to repair the state
|
|
32
|
+
of a programmable agent.
|
|
33
|
+
|
|
34
|
+
Attributes:
|
|
35
|
+
LAST_MESSAGE_NOT_USER (str): Requires that the last message in the
|
|
36
|
+
conversation was not from the user.
|
|
37
|
+
"""
|
|
38
|
+
|
|
29
39
|
LAST_MESSAGE_NOT_USER = "LAST_MESSAGE_NOT_USER"
|
|
30
40
|
|
|
31
41
|
|
|
@@ -34,6 +44,11 @@ class ProgrammedAgentInstructionResult(BaseModel, Generic[T]):
|
|
|
34
44
|
|
|
35
45
|
Contains the messages exchanged during execution and the computed value.
|
|
36
46
|
The value type is specified by the generic type parameter T.
|
|
47
|
+
|
|
48
|
+
Attributes:
|
|
49
|
+
user_message (BaseMessage): The message sent by the user.
|
|
50
|
+
agent_message (BaseMessage): The message sent by the agent.
|
|
51
|
+
value (T): The computed result value of type T.
|
|
37
52
|
"""
|
|
38
53
|
|
|
39
54
|
user_message: BaseMessage
|
|
@@ -48,8 +63,7 @@ class AbstractProgrammableAgent(abc.ABC):
|
|
|
48
63
|
|
|
49
64
|
A programmable agent is an agent that can be programmed to perform a
|
|
50
65
|
specific function or task. This class defines the interface for a
|
|
51
|
-
programmable
|
|
52
|
-
agent.
|
|
66
|
+
programmable agent.
|
|
53
67
|
|
|
54
68
|
These methods should be implemented in order to ensure the agent supports
|
|
55
69
|
the necessary guarantees to enable a programming interface while
|
|
@@ -68,16 +82,15 @@ class AbstractProgrammableAgent(abc.ABC):
|
|
|
68
82
|
An atomic operation is an operation that is guaranteed to
|
|
69
83
|
be executed without interruption by any other operation.
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
|
|
85
|
+
Args:
|
|
86
|
+
callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
|
|
87
|
+
operation to execute atomically.
|
|
73
88
|
|
|
74
|
-
|
|
75
|
-
|
|
89
|
+
Returns:
|
|
90
|
+
ProgrammedAgentInstructionResult[T]: The result of the operation.
|
|
76
91
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
operation again. Though if state changes in successful operation
|
|
80
|
-
improve its ability to perform the operation, it should keep them.
|
|
92
|
+
Raises:
|
|
93
|
+
RuntimeError: If an operation is already in progress.
|
|
81
94
|
"""
|
|
82
95
|
raise NotImplementedError
|
|
83
96
|
|
|
@@ -86,10 +99,13 @@ class AbstractProgrammableAgent(abc.ABC):
|
|
|
86
99
|
r"""Repair the state of the agent.
|
|
87
100
|
|
|
88
101
|
Agents may have other non-atomic interfaces, such as a user interface,
|
|
89
|
-
or chat between other agents.
|
|
102
|
+
or chat between other agents. This method should restore the agent to
|
|
103
|
+
a state where it can perform operations according to the specified
|
|
104
|
+
requirement.
|
|
90
105
|
|
|
91
|
-
|
|
92
|
-
|
|
106
|
+
Args:
|
|
107
|
+
requirement (ProgrammableAgentRequirement): The requirement to
|
|
108
|
+
repair the state for.
|
|
93
109
|
"""
|
|
94
110
|
raise NotImplementedError
|
|
95
111
|
|
|
@@ -99,10 +115,16 @@ def programmable_capability(
|
|
|
99
115
|
) -> Callable[..., ProgrammedAgentInstructionResult[T]]:
|
|
100
116
|
r"""Decorator for programmable agent capabilities.
|
|
101
117
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
118
|
+
This decorator ensures that the decorated method is executed atomically
|
|
119
|
+
and maintains the agent's state guarantees.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
func (Callable[..., ProgrammedAgentInstructionResult[T]]): The method
|
|
123
|
+
to decorate.
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
Callable[..., ProgrammedAgentInstructionResult[T]]: The decorated
|
|
127
|
+
method that ensures atomic execution.
|
|
106
128
|
"""
|
|
107
129
|
|
|
108
130
|
@wraps(func)
|
|
@@ -120,9 +142,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
|
|
|
120
142
|
Provides a default implementation of atomic execution using threading locks
|
|
121
143
|
and basic state tracking for message roles. Implementing classes need to
|
|
122
144
|
provide specific repair logic for their use cases.
|
|
145
|
+
|
|
146
|
+
Attributes:
|
|
147
|
+
_operation_lock (threading.Lock): Lock for ensuring atomic operations.
|
|
148
|
+
_last_message_role (Optional[str]): Role of the last message in the
|
|
149
|
+
conversation.
|
|
123
150
|
"""
|
|
124
151
|
|
|
125
|
-
def __init__(self, **kwargs: Any):
|
|
152
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
153
|
+
r"""Initialize the ProgrammableChatAgent.
|
|
154
|
+
|
|
155
|
+
Args:
|
|
156
|
+
**kwargs (Any): Additional keyword arguments to pass to parent
|
|
157
|
+
class.
|
|
158
|
+
"""
|
|
126
159
|
super().__init__(**kwargs)
|
|
127
160
|
self._operation_lock = threading.Lock()
|
|
128
161
|
self._last_message_role: Optional[str] = None
|
|
@@ -130,6 +163,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
|
|
|
130
163
|
def run_atomic(
|
|
131
164
|
self, callback: Callable[[], ProgrammedAgentInstructionResult[T]]
|
|
132
165
|
) -> ProgrammedAgentInstructionResult[T]:
|
|
166
|
+
r"""Run an atomic operation on the agent.
|
|
167
|
+
|
|
168
|
+
Ensures thread-safe execution of the callback function by using a lock.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
|
|
172
|
+
operation to execute atomically.
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
ProgrammedAgentInstructionResult[T]: The result of the operation.
|
|
176
|
+
|
|
177
|
+
Raises:
|
|
178
|
+
RuntimeError: If an operation is already in progress.
|
|
179
|
+
"""
|
|
133
180
|
if not self._operation_lock.acquire(blocking=False):
|
|
134
181
|
raise RuntimeError("Operation already in progress")
|
|
135
182
|
|
|
@@ -141,6 +188,14 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
|
|
|
141
188
|
self._operation_lock.release()
|
|
142
189
|
|
|
143
190
|
def repair_state(self, requirement: ProgrammableAgentRequirement) -> None:
|
|
191
|
+
r"""Repair the state of the agent.
|
|
192
|
+
|
|
193
|
+
Implements basic state repair for message role requirements.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
requirement (ProgrammableAgentRequirement): The requirement to
|
|
197
|
+
repair the state for.
|
|
198
|
+
"""
|
|
144
199
|
if requirement == ProgrammableAgentRequirement.LAST_MESSAGE_NOT_USER:
|
|
145
200
|
if self._last_message_role == "user":
|
|
146
201
|
raise NotImplementedError(
|
camel/configs/gemini_config.py
CHANGED
|
@@ -83,7 +83,7 @@ class GeminiConfig(BaseConfig):
|
|
|
83
83
|
stop: Union[str, Sequence[str], NotGiven] = NOT_GIVEN
|
|
84
84
|
max_tokens: Union[int, NotGiven] = NOT_GIVEN
|
|
85
85
|
response_format: Union[Type[BaseModel], dict, NotGiven] = NOT_GIVEN
|
|
86
|
-
tool_choice: Optional[Union[dict[str, str], str]] =
|
|
86
|
+
tool_choice: Optional[Union[dict[str, str], str, NotGiven]] = NOT_GIVEN
|
|
87
87
|
|
|
88
88
|
def as_dict(self) -> dict[str, Any]:
|
|
89
89
|
r"""Convert the current configuration to a dictionary.
|
camel/configs/sglang_config.py
CHANGED
|
@@ -56,6 +56,10 @@ class SGLangConfig(BaseConfig):
|
|
|
56
56
|
in the chat completion. The total length of input tokens and
|
|
57
57
|
generated tokens is limited by the model's context length.
|
|
58
58
|
(default: :obj:`None`)
|
|
59
|
+
tools (list[FunctionTool], optional): A list of tools the model may
|
|
60
|
+
call. Currently, only functions are supported as a tool. Use this
|
|
61
|
+
to provide a list of functions the model may generate JSON inputs
|
|
62
|
+
for. A max of 128 functions are supported.
|
|
59
63
|
"""
|
|
60
64
|
|
|
61
65
|
stop: Union[str, Sequence[str], NotGiven] = NOT_GIVEN
|
|
camel/datagen/source2synth/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
from .data_processor import (
|
|
15
|
+
DataCurator,
|
|
16
|
+
ExampleConstructor,
|
|
17
|
+
UserDataProcessor,
|
|
18
|
+
)
|
|
19
|
+
from .models import MultiHopQA, ReasoningStep
|
|
20
|
+
from .user_data_processor_config import (
|
|
21
|
+
ProcessorConfig,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"DataCurator",
|
|
26
|
+
"ExampleConstructor",
|
|
27
|
+
"ProcessorConfig",
|
|
28
|
+
"UserDataProcessor",
|
|
29
|
+
"ReasoningStep",
|
|
30
|
+
"MultiHopQA",
|
|
31
|
+
]
|
|
camel/{synthetic_datagen → datagen}/source2synth/data_processor.py
CHANGED
|
@@ -15,33 +15,61 @@
|
|
|
15
15
|
import random
|
|
16
16
|
from typing import Any, Dict, List, Optional, Sequence
|
|
17
17
|
|
|
18
|
-
import numpy as np
|
|
19
18
|
from tqdm import tqdm
|
|
20
19
|
|
|
21
20
|
from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
|
|
22
|
-
from camel.
|
|
23
|
-
from camel.synthetic_datagen.source2synth.user_data_processor_config import (
|
|
21
|
+
from camel.datagen.source2synth.user_data_processor_config import (
|
|
24
22
|
ProcessorConfig,
|
|
25
23
|
)
|
|
24
|
+
from camel.logger import get_logger
|
|
26
25
|
|
|
27
26
|
logger = get_logger(__name__)
|
|
28
27
|
|
|
29
28
|
|
|
30
29
|
class UserDataProcessor:
|
|
31
|
-
r"""
|
|
30
|
+
r"""A processor for generating multi-hop question-answer pairs from user
|
|
31
|
+
data.
|
|
32
|
+
|
|
33
|
+
This class handles the processing of text data to generate multi-hop
|
|
34
|
+
question-answer pairs using either an AI model or rule-based approaches.
|
|
35
|
+
It manages the entire pipeline from text preprocessing to dataset curation.
|
|
36
|
+
|
|
37
|
+
Attributes:
|
|
38
|
+
config (ProcessorConfig): Configuration for data processing parameters.
|
|
39
|
+
rng (random.Random): Random number generator for reproducibility.
|
|
40
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for
|
|
41
|
+
generating QA pairs.
|
|
42
|
+
"""
|
|
32
43
|
|
|
33
44
|
def __init__(self, config: Optional[ProcessorConfig] = None):
|
|
45
|
+
r"""Initialize the UserDataProcessor.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
config (Optional[ProcessorConfig], optional): Configuration for
|
|
49
|
+
data processing. (default: :obj:`None`)
|
|
50
|
+
"""
|
|
34
51
|
self.config = config or ProcessorConfig()
|
|
35
|
-
random.
|
|
36
|
-
np.random.seed(self.config.seed)
|
|
52
|
+
self.rng = random.Random(self.config.seed)
|
|
37
53
|
self.multi_hop_agent = (
|
|
38
|
-
|
|
54
|
+
self.config.hop_generating_agent
|
|
55
|
+
if self.config.use_ai_model
|
|
56
|
+
else None
|
|
39
57
|
)
|
|
40
58
|
|
|
41
59
|
def process_text(
|
|
42
60
|
self, text: str, source: str = "user_input"
|
|
43
61
|
) -> List[Dict[str, Any]]:
|
|
44
|
-
r"""Process a single text.
|
|
62
|
+
r"""Process a single text to generate multi-hop QA pairs.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
text (str): The input text to process.
|
|
66
|
+
source (str, optional): Source identifier for the text.
|
|
67
|
+
(default: :obj:`"user_input"`)
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
List[Dict[str, Any]]: List of processed examples with QA pairs and
|
|
71
|
+
metadata.
|
|
72
|
+
"""
|
|
45
73
|
# Convert text to standard format
|
|
46
74
|
raw_data = [
|
|
47
75
|
{
|
|
@@ -55,7 +83,7 @@ class UserDataProcessor:
|
|
|
55
83
|
examples = constructor.construct_examples(raw_data)
|
|
56
84
|
|
|
57
85
|
# Manage data
|
|
58
|
-
curator = DataCurator(self.config)
|
|
86
|
+
curator = DataCurator(self.config, self.rng)
|
|
59
87
|
final_dataset = curator.curate_dataset(examples)
|
|
60
88
|
|
|
61
89
|
return final_dataset
|
|
@@ -63,7 +91,20 @@ class UserDataProcessor:
|
|
|
63
91
|
def process_batch(
|
|
64
92
|
self, texts: List[str], sources: Optional[List[str]] = None
|
|
65
93
|
) -> List[Dict[str, Any]]:
|
|
66
|
-
r"""Process multiple texts in batch.
|
|
94
|
+
r"""Process multiple texts in batch to generate multi-hop QA pairs.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
texts (List[str]): List of input texts to process.
|
|
98
|
+
sources (Optional[List[str]], optional): List of source
|
|
99
|
+
identifiers. (default: :obj:`None`)
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List[Dict[str, Any]]: List of processed examples with QA pairs and
|
|
103
|
+
metadata.
|
|
104
|
+
|
|
105
|
+
Raises:
|
|
106
|
+
ValueError: If length of sources doesn't match length of texts.
|
|
107
|
+
"""
|
|
67
108
|
if sources is None:
|
|
68
109
|
sources = ["user_input"] * len(texts)
|
|
69
110
|
elif len(sources) != len(texts):
|
|
@@ -82,27 +123,52 @@ class UserDataProcessor:
|
|
|
82
123
|
examples = constructor.construct_examples(raw_data)
|
|
83
124
|
|
|
84
125
|
# Manage data
|
|
85
|
-
curator = DataCurator(self.config)
|
|
126
|
+
curator = DataCurator(self.config, self.rng)
|
|
86
127
|
final_dataset = curator.curate_dataset(examples)
|
|
87
128
|
|
|
88
129
|
return final_dataset
|
|
89
130
|
|
|
90
131
|
|
|
91
132
|
class ExampleConstructor:
|
|
92
|
-
r"""
|
|
133
|
+
r"""Constructs training examples from raw text data.
|
|
134
|
+
|
|
135
|
+
This class handles the construction of training examples by preprocessing
|
|
136
|
+
text, extracting information pairs, and generating question-answer pairs.
|
|
137
|
+
|
|
138
|
+
Attributes:
|
|
139
|
+
config (ProcessorConfig): Configuration for example construction.
|
|
140
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for QA
|
|
141
|
+
generation.
|
|
142
|
+
"""
|
|
93
143
|
|
|
94
144
|
def __init__(
|
|
95
145
|
self,
|
|
96
146
|
config: ProcessorConfig,
|
|
97
147
|
multi_hop_agent: Optional[MultiHopGeneratorAgent] = None,
|
|
98
148
|
):
|
|
149
|
+
r"""Initialize the ExampleConstructor.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
config (ProcessorConfig): Configuration for example construction.
|
|
153
|
+
multi_hop_agent (Optional[MultiHopGeneratorAgent], optional):
|
|
154
|
+
Agent for generating multi-hop QA pairs. (default: :obj:`None`)
|
|
155
|
+
"""
|
|
99
156
|
self.config = config
|
|
100
157
|
self.multi_hop_agent = multi_hop_agent
|
|
101
158
|
|
|
102
159
|
def construct_examples(
|
|
103
160
|
self, raw_data: List[Dict[str, Any]]
|
|
104
161
|
) -> List[Dict[str, Any]]:
|
|
105
|
-
r"""Construct training examples.
|
|
162
|
+
r"""Construct training examples from raw data.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
raw_data (List[Dict[str, Any]]): List of raw data dictionaries
|
|
166
|
+
containing text and metadata.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
List[Dict[str, Any]]: List of constructed examples with QA pairs
|
|
170
|
+
and metadata.
|
|
171
|
+
"""
|
|
106
172
|
logger.info("Starting to construct training examples...")
|
|
107
173
|
examples = []
|
|
108
174
|
|
|
@@ -135,7 +201,15 @@ class ExampleConstructor:
|
|
|
135
201
|
return examples
|
|
136
202
|
|
|
137
203
|
def _preprocess_text(self, text: str) -> str:
|
|
138
|
-
r"""
|
|
204
|
+
r"""Preprocess input text for example construction.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
text (str): Input text to preprocess.
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
str: Preprocessed text, or empty string if text fails quality
|
|
211
|
+
checks.
|
|
212
|
+
"""
|
|
139
213
|
if not isinstance(text, str):
|
|
140
214
|
return ''
|
|
141
215
|
|
|
@@ -156,7 +230,14 @@ class ExampleConstructor:
|
|
|
156
230
|
return text
|
|
157
231
|
|
|
158
232
|
def _check_text_quality(self, text: str) -> bool:
|
|
159
|
-
r"""Check
|
|
233
|
+
r"""Check the quality of input text.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
text (str): Text to check quality for.
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
bool: True if text passes quality checks, False otherwise.
|
|
240
|
+
"""
|
|
160
241
|
# 1. Basic quality check
|
|
161
242
|
if text.count('.') < 2: # Must have at least 2 sentences
|
|
162
243
|
return False
|
|
@@ -171,7 +252,15 @@ class ExampleConstructor:
|
|
|
171
252
|
return True
|
|
172
253
|
|
|
173
254
|
def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
|
|
174
|
-
r"""Extract information pairs and relationships.
|
|
255
|
+
r"""Extract information pairs and relationships from text.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
text (str): Input text to extract information from.
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
List[Dict[str, Sequence[str]]]: List of dictionaries containing
|
|
262
|
+
premise, intermediate, conclusion, and related contexts.
|
|
263
|
+
"""
|
|
175
264
|
# Split into sentences
|
|
176
265
|
sentences = [s.strip() for s in text.split('.') if s.strip()]
|
|
177
266
|
info_pairs = []
|
|
@@ -200,7 +289,15 @@ class ExampleConstructor:
|
|
|
200
289
|
def _generate_qa_pairs(
|
|
201
290
|
self, info_pairs: List[Dict[str, Sequence[str]]]
|
|
202
291
|
) -> List[Dict[str, str]]:
|
|
203
|
-
r"""Generate multi-hop question-answer pairs.
|
|
292
|
+
r"""Generate multi-hop question-answer pairs from information pairs.
|
|
293
|
+
|
|
294
|
+
Args:
|
|
295
|
+
info_pairs (List[Dict[str, Sequence[str]]]): List of information
|
|
296
|
+
pairs extracted from text.
|
|
297
|
+
|
|
298
|
+
Returns:
|
|
299
|
+
List[Dict[str, str]]: List of generated QA pairs.
|
|
300
|
+
"""
|
|
204
301
|
qa_pairs = []
|
|
205
302
|
|
|
206
303
|
for pair in info_pairs:
|
|
@@ -219,7 +316,15 @@ class ExampleConstructor:
|
|
|
219
316
|
return qa_pairs
|
|
220
317
|
|
|
221
318
|
def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
|
|
222
|
-
r"""Calculate complexity of QA pairs.
|
|
319
|
+
r"""Calculate the complexity score for a set of QA pairs.
|
|
320
|
+
|
|
321
|
+
Args:
|
|
322
|
+
qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
|
|
323
|
+
complexity for.
|
|
324
|
+
|
|
325
|
+
Returns:
|
|
326
|
+
float: Complexity score between 0.0 and 1.0.
|
|
327
|
+
"""
|
|
223
328
|
if not qa_pairs:
|
|
224
329
|
return 0.0
|
|
225
330
|
|
|
@@ -233,10 +338,10 @@ class ExampleConstructor:
|
|
|
233
338
|
supporting_facts_count = len(qa.get('supporting_facts', []))
|
|
234
339
|
|
|
235
340
|
# 3. Question length
|
|
236
|
-
question_length = len(qa
|
|
341
|
+
question_length = len(qa.get('question', '').split())
|
|
237
342
|
|
|
238
343
|
# 4. Answer length
|
|
239
|
-
answer_length = len(qa
|
|
344
|
+
answer_length = len(qa.get('answer', '').split())
|
|
240
345
|
|
|
241
346
|
# Calculate complexity of a single QA pair
|
|
242
347
|
qa_complexity = (
|
|
@@ -256,15 +361,37 @@ class ExampleConstructor:
|
|
|
256
361
|
|
|
257
362
|
|
|
258
363
|
class DataCurator:
|
|
259
|
-
r"""
|
|
364
|
+
r"""Manages and curates datasets of multi-hop question-answer pairs.
|
|
365
|
+
|
|
366
|
+
This class handles dataset management tasks including quality filtering,
|
|
367
|
+
complexity filtering, deduplication, and dataset sampling.
|
|
260
368
|
|
|
261
|
-
|
|
369
|
+
Attributes:
|
|
370
|
+
config (ProcessorConfig): Configuration for data curation parameters.
|
|
371
|
+
rng (random.Random): Random number generator for reproducible sampling.
|
|
372
|
+
"""
|
|
373
|
+
|
|
374
|
+
def __init__(self, config: ProcessorConfig, rng: random.Random):
|
|
375
|
+
r"""Initialize the DataCurator.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
config (ProcessorConfig): Configuration for data curation.
|
|
379
|
+
rng (random.Random): Random number generator for reproducibility.
|
|
380
|
+
"""
|
|
262
381
|
self.config = config
|
|
382
|
+
self.rng = rng
|
|
263
383
|
|
|
264
384
|
def curate_dataset(
|
|
265
385
|
self, examples: List[Dict[str, Any]]
|
|
266
386
|
) -> List[Dict[str, Any]]:
|
|
267
|
-
r"""
|
|
387
|
+
r"""Manage and curate a dataset through multiple filtering stages.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
examples (List[Dict[str, Any]]): List of examples to curate.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
List[Dict[str, Any]]: Curated dataset meeting quality criteria.
|
|
394
|
+
"""
|
|
268
395
|
logger.info("Starting dataset management...")
|
|
269
396
|
|
|
270
397
|
# 1. Quality filtering
|
|
@@ -296,7 +423,14 @@ class DataCurator:
|
|
|
296
423
|
def _quality_filter(
|
|
297
424
|
self, examples: List[Dict[str, Any]]
|
|
298
425
|
) -> List[Dict[str, Any]]:
|
|
299
|
-
r"""
|
|
426
|
+
r"""Filter examples based on quality criteria.
|
|
427
|
+
|
|
428
|
+
Args:
|
|
429
|
+
examples (List[Dict[str, Any]]): List of examples to filter.
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
List[Dict[str, Any]]: Examples that pass quality checks.
|
|
433
|
+
"""
|
|
300
434
|
filtered = []
|
|
301
435
|
|
|
302
436
|
for example in examples:
|
|
@@ -314,7 +448,14 @@ class DataCurator:
|
|
|
314
448
|
return filtered
|
|
315
449
|
|
|
316
450
|
def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
|
|
317
|
-
r"""Check quality of
|
|
451
|
+
r"""Check the quality of question-answer pairs.
|
|
452
|
+
|
|
453
|
+
Args:
|
|
454
|
+
qa_pairs (List[Dict[str, str]]): List of QA pairs to check.
|
|
455
|
+
|
|
456
|
+
Returns:
|
|
457
|
+
bool: True if QA pairs meet quality criteria, False otherwise.
|
|
458
|
+
"""
|
|
318
459
|
if not qa_pairs:
|
|
319
460
|
return False
|
|
320
461
|
|
|
@@ -335,7 +476,17 @@ class DataCurator:
|
|
|
335
476
|
def _complexity_filter(
|
|
336
477
|
self, examples: List[Dict[str, Any]]
|
|
337
478
|
) -> List[Dict[str, Any]]:
|
|
338
|
-
|
|
479
|
+
"""
|
|
480
|
+
Filter examples based on complexity threshold.
|
|
481
|
+
|
|
482
|
+
Removes examples with complexity scores below the configured threshold.
|
|
483
|
+
|
|
484
|
+
Args:
|
|
485
|
+
examples (List[Dict[str, Any]]): List of examples to filter.
|
|
486
|
+
|
|
487
|
+
Returns:
|
|
488
|
+
List[Dict[str, Any]]: Examples meeting complexity threshold.
|
|
489
|
+
"""
|
|
339
490
|
return [
|
|
340
491
|
example
|
|
341
492
|
for example in examples
|
|
@@ -346,7 +497,14 @@ class DataCurator:
|
|
|
346
497
|
def _remove_duplicates(
|
|
347
498
|
self, examples: List[Dict[str, Any]]
|
|
348
499
|
) -> List[Dict[str, Any]]:
|
|
349
|
-
r"""Remove
|
|
500
|
+
r"""Remove duplicate examples from the dataset.
|
|
501
|
+
|
|
502
|
+
Args:
|
|
503
|
+
examples (List[Dict[str, Any]]): List of examples to deduplicate.
|
|
504
|
+
|
|
505
|
+
Returns:
|
|
506
|
+
List[Dict[str, Any]]: Deduplicated examples.
|
|
507
|
+
"""
|
|
350
508
|
seen = set()
|
|
351
509
|
unique_examples = []
|
|
352
510
|
|
|
@@ -366,8 +524,15 @@ class DataCurator:
|
|
|
366
524
|
def _sample_dataset(
|
|
367
525
|
self, examples: List[Dict[str, Any]]
|
|
368
526
|
) -> List[Dict[str, Any]]:
|
|
369
|
-
r"""Sample to target dataset size.
|
|
527
|
+
r"""Sample examples to match target dataset size.
|
|
528
|
+
|
|
529
|
+
Args:
|
|
530
|
+
examples (List[Dict[str, Any]]): List of examples to sample from.
|
|
531
|
+
|
|
532
|
+
Returns:
|
|
533
|
+
List[Dict[str, Any]]: Sampled dataset of target size or smaller.
|
|
534
|
+
"""
|
|
370
535
|
if len(examples) <= self.config.dataset_size:
|
|
371
536
|
return examples
|
|
372
537
|
|
|
373
|
-
return
|
|
538
|
+
return self.rng.sample(examples, self.config.dataset_size)
|