camel-ai 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. More details are available via the link on the original registry page (the link target is not preserved in this text).

camel/__init__.py CHANGED
@@ -14,7 +14,7 @@
14
14
 
15
15
  from camel.logger import disable_logging, enable_logging, set_log_level
16
16
 
17
- __version__ = '0.2.18'
17
+ __version__ = '0.2.19'
18
18
 
19
19
  __all__ = [
20
20
  '__version__',
@@ -22,17 +22,36 @@ from camel.agents.programmed_agent_instruction import (
22
22
  ProgrammedAgentInstructionResult,
23
23
  programmable_capability,
24
24
  )
25
- from camel.messages import BaseMessage
26
- from camel.synthetic_datagen.source2synth.models import (
25
+ from camel.datagen.source2synth.models import (
27
26
  ContextPrompt,
28
27
  MultiHopQA,
29
28
  )
29
+ from camel.messages import BaseMessage
30
30
 
31
31
 
32
32
  class MultiHopGeneratorAgent(ProgrammableChatAgent):
33
+ r"""An agent specialized in generating multi-hop question-answer pairs.
34
+
35
+ This agent is designed to create complex questions that require multiple
36
+ steps of reasoning to answer. It analyzes context to identify related
37
+ facts and generates questions that require connecting these facts
38
+ logically.
39
+
40
+ Attributes:
41
+ model_config (ConfigDict): Configuration for model behavior.
42
+ system_message (BaseMessage): System message defining agent's role and
43
+ instructions.
44
+ """
45
+
33
46
  model_config = ConfigDict(arbitrary_types_allowed=True)
34
47
 
35
- def __init__(self, **kwargs: Any):
48
+ def __init__(self, **kwargs: Any) -> None:
49
+ r"""Initialize the MultiHopGeneratorAgent.
50
+
51
+ Args:
52
+ **kwargs (Any): Additional keyword arguments to pass to parent
53
+ class.
54
+ """
36
55
  super().__init__(**kwargs)
37
56
 
38
57
  system_text: str = textwrap.dedent(
@@ -64,6 +83,19 @@ class MultiHopGeneratorAgent(ProgrammableChatAgent):
64
83
  def generate_multi_hop_qa(
65
84
  self, context: str
66
85
  ) -> ProgrammedAgentInstructionResult[MultiHopQA]:
86
+ r"""Generate a multi-hop question-answer pair from given context.
87
+
88
+ Args:
89
+ context (str): The input text context to generate QA from.
90
+
91
+ Returns:
92
+ ProgrammedAgentInstructionResult[MultiHopQA]: Result containing the
93
+ generated question, reasoning steps, answer, and supporting
94
+ facts.
95
+
96
+ Raises:
97
+ RuntimeError: If the agent fails to generate a response.
98
+ """
67
99
  context_prompt = ContextPrompt(
68
100
  main_context=context, related_contexts=None
69
101
  )
@@ -26,6 +26,16 @@ T = TypeVar('T')
26
26
 
27
27
 
28
28
  class ProgrammableAgentRequirement(Enum):
29
+ r"""Requirements for programmable agent state.
30
+
31
+ Defines the possible requirements that can be used to repair the state
32
+ of a programmable agent.
33
+
34
+ Attributes:
35
+ LAST_MESSAGE_NOT_USER (str): Requires that the last message in the
36
+ conversation was not from the user.
37
+ """
38
+
29
39
  LAST_MESSAGE_NOT_USER = "LAST_MESSAGE_NOT_USER"
30
40
 
31
41
 
@@ -34,6 +44,11 @@ class ProgrammedAgentInstructionResult(BaseModel, Generic[T]):
34
44
 
35
45
  Contains the messages exchanged during execution and the computed value.
36
46
  The value type is specified by the generic type parameter T.
47
+
48
+ Attributes:
49
+ user_message (BaseMessage): The message sent by the user.
50
+ agent_message (BaseMessage): The message sent by the agent.
51
+ value (T): The computed result value of type T.
37
52
  """
38
53
 
39
54
  user_message: BaseMessage
@@ -48,8 +63,7 @@ class AbstractProgrammableAgent(abc.ABC):
48
63
 
49
64
  A programmable agent is an agent that can be programmed to perform a
50
65
  specific function or task. This class defines the interface for a
51
- programmable
52
- agent.
66
+ programmable agent.
53
67
 
54
68
  These methods should be implemented in order to ensure the agent supports
55
69
  the necessary guarantees to enable a programming interface while
@@ -68,16 +82,15 @@ class AbstractProgrammableAgent(abc.ABC):
68
82
  An atomic operation is an operation that is guaranteed to
69
83
  be executed without interruption by any other operation.
70
84
 
71
- If the operation fails or times out the agents state should be
72
- unchanged.
85
+ Args:
86
+ callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
87
+ operation to execute atomically.
73
88
 
74
- If an operation is already in progress, this method should throw an
75
- exception. (It is up to the caller to do any queuing)
89
+ Returns:
90
+ ProgrammedAgentInstructionResult[T]: The result of the operation.
76
91
 
77
- If the agent is in a state where it can perform the operation,
78
- it must leave the agent in a state where it can perform the
79
- operation again. Though if state changes in successful operation
80
- improve its ability to perform the operation, it should keep them.
92
+ Raises:
93
+ RuntimeError: If an operation is already in progress.
81
94
  """
82
95
  raise NotImplementedError
83
96
 
@@ -86,10 +99,13 @@ class AbstractProgrammableAgent(abc.ABC):
86
99
  r"""Repair the state of the agent.
87
100
 
88
101
  Agents may have other non-atomic interfaces, such as a user interface,
89
- or chat between other agents.
102
+ or chat between other agents. This method should restore the agent to
103
+ a state where it can perform operations according to the specified
104
+ requirement.
90
105
 
91
- This method should restore the agent to a state where it can perform
92
- operations according to the specified requirement.
106
+ Args:
107
+ requirement (ProgrammableAgentRequirement): The requirement to
108
+ repair the state for.
93
109
  """
94
110
  raise NotImplementedError
95
111
 
@@ -99,10 +115,16 @@ def programmable_capability(
99
115
  ) -> Callable[..., ProgrammedAgentInstructionResult[T]]:
100
116
  r"""Decorator for programmable agent capabilities.
101
117
 
102
- Wraps a method to ensure it is executed atomically via the agent's
103
- run_atomic interface.
104
- The decorated method must return a ProgrammedAgentInstructionResult with
105
- appropriate type parameter.
118
+ This decorator ensures that the decorated method is executed atomically
119
+ and maintains the agent's state guarantees.
120
+
121
+ Args:
122
+ func (Callable[..., ProgrammedAgentInstructionResult[T]]): The method
123
+ to decorate.
124
+
125
+ Returns:
126
+ Callable[..., ProgrammedAgentInstructionResult[T]]: The decorated
127
+ method that ensures atomic execution.
106
128
  """
107
129
 
108
130
  @wraps(func)
@@ -120,9 +142,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
120
142
  Provides a default implementation of atomic execution using threading locks
121
143
  and basic state tracking for message roles. Implementing classes need to
122
144
  provide specific repair logic for their use cases.
145
+
146
+ Attributes:
147
+ _operation_lock (threading.Lock): Lock for ensuring atomic operations.
148
+ _last_message_role (Optional[str]): Role of the last message in the
149
+ conversation.
123
150
  """
124
151
 
125
- def __init__(self, **kwargs: Any):
152
+ def __init__(self, **kwargs: Any) -> None:
153
+ r"""Initialize the ProgrammableChatAgent.
154
+
155
+ Args:
156
+ **kwargs (Any): Additional keyword arguments to pass to parent
157
+ class.
158
+ """
126
159
  super().__init__(**kwargs)
127
160
  self._operation_lock = threading.Lock()
128
161
  self._last_message_role: Optional[str] = None
@@ -130,6 +163,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
130
163
  def run_atomic(
131
164
  self, callback: Callable[[], ProgrammedAgentInstructionResult[T]]
132
165
  ) -> ProgrammedAgentInstructionResult[T]:
166
+ r"""Run an atomic operation on the agent.
167
+
168
+ Ensures thread-safe execution of the callback function by using a lock.
169
+
170
+ Args:
171
+ callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
172
+ operation to execute atomically.
173
+
174
+ Returns:
175
+ ProgrammedAgentInstructionResult[T]: The result of the operation.
176
+
177
+ Raises:
178
+ RuntimeError: If an operation is already in progress.
179
+ """
133
180
  if not self._operation_lock.acquire(blocking=False):
134
181
  raise RuntimeError("Operation already in progress")
135
182
 
@@ -141,6 +188,14 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
141
188
  self._operation_lock.release()
142
189
 
143
190
  def repair_state(self, requirement: ProgrammableAgentRequirement) -> None:
191
+ r"""Repair the state of the agent.
192
+
193
+ Implements basic state repair for message role requirements.
194
+
195
+ Args:
196
+ requirement (ProgrammableAgentRequirement): The requirement to
197
+ repair the state for.
198
+ """
144
199
  if requirement == ProgrammableAgentRequirement.LAST_MESSAGE_NOT_USER:
145
200
  if self._last_message_role == "user":
146
201
  raise NotImplementedError(
@@ -83,7 +83,7 @@ class GeminiConfig(BaseConfig):
83
83
  stop: Union[str, Sequence[str], NotGiven] = NOT_GIVEN
84
84
  max_tokens: Union[int, NotGiven] = NOT_GIVEN
85
85
  response_format: Union[Type[BaseModel], dict, NotGiven] = NOT_GIVEN
86
- tool_choice: Optional[Union[dict[str, str], str]] = None
86
+ tool_choice: Optional[Union[dict[str, str], str, NotGiven]] = NOT_GIVEN
87
87
 
88
88
  def as_dict(self) -> dict[str, Any]:
89
89
  r"""Convert the current configuration to a dictionary.
@@ -56,6 +56,10 @@ class SGLangConfig(BaseConfig):
56
56
  in the chat completion. The total length of input tokens and
57
57
  generated tokens is limited by the model's context length.
58
58
  (default: :obj:`None`)
59
+ tools (list[FunctionTool], optional): A list of tools the model may
60
+ call. Currently, only functions are supported as a tool. Use this
61
+ to provide a list of functions the model may generate JSON inputs
62
+ for. A max of 128 functions are supported.
59
63
  """
60
64
 
61
65
  stop: Union[str, Sequence[str], NotGiven] = NOT_GIVEN
@@ -0,0 +1,31 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+ from .data_processor import (
15
+ DataCurator,
16
+ ExampleConstructor,
17
+ UserDataProcessor,
18
+ )
19
+ from .models import MultiHopQA, ReasoningStep
20
+ from .user_data_processor_config import (
21
+ ProcessorConfig,
22
+ )
23
+
24
+ __all__ = [
25
+ "DataCurator",
26
+ "ExampleConstructor",
27
+ "ProcessorConfig",
28
+ "UserDataProcessor",
29
+ "ReasoningStep",
30
+ "MultiHopQA",
31
+ ]
@@ -15,33 +15,61 @@
15
15
  import random
16
16
  from typing import Any, Dict, List, Optional, Sequence
17
17
 
18
- import numpy as np
19
18
  from tqdm import tqdm
20
19
 
21
20
  from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
22
- from camel.logger import get_logger
23
- from camel.synthetic_datagen.source2synth.user_data_processor_config import (
21
+ from camel.datagen.source2synth.user_data_processor_config import (
24
22
  ProcessorConfig,
25
23
  )
24
+ from camel.logger import get_logger
26
25
 
27
26
  logger = get_logger(__name__)
28
27
 
29
28
 
30
29
  class UserDataProcessor:
31
- r"""User Data Processor."""
30
+ r"""A processor for generating multi-hop question-answer pairs from user
31
+ data.
32
+
33
+ This class handles the processing of text data to generate multi-hop
34
+ question-answer pairs using either an AI model or rule-based approaches.
35
+ It manages the entire pipeline from text preprocessing to dataset curation.
36
+
37
+ Attributes:
38
+ config (ProcessorConfig): Configuration for data processing parameters.
39
+ rng (random.Random): Random number generator for reproducibility.
40
+ multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for
41
+ generating QA pairs.
42
+ """
32
43
 
33
44
  def __init__(self, config: Optional[ProcessorConfig] = None):
45
+ r"""Initialize the UserDataProcessor.
46
+
47
+ Args:
48
+ config (Optional[ProcessorConfig], optional): Configuration for
49
+ data processing. (default: :obj:`None`)
50
+ """
34
51
  self.config = config or ProcessorConfig()
35
- random.seed(self.config.seed)
36
- np.random.seed(self.config.seed)
52
+ self.rng = random.Random(self.config.seed)
37
53
  self.multi_hop_agent = (
38
- MultiHopGeneratorAgent() if self.config.use_ai_model else None
54
+ self.config.hop_generating_agent
55
+ if self.config.use_ai_model
56
+ else None
39
57
  )
40
58
 
41
59
  def process_text(
42
60
  self, text: str, source: str = "user_input"
43
61
  ) -> List[Dict[str, Any]]:
44
- r"""Process a single text."""
62
+ r"""Process a single text to generate multi-hop QA pairs.
63
+
64
+ Args:
65
+ text (str): The input text to process.
66
+ source (str, optional): Source identifier for the text.
67
+ (default: :obj:`"user_input"`)
68
+
69
+ Returns:
70
+ List[Dict[str, Any]]: List of processed examples with QA pairs and
71
+ metadata.
72
+ """
45
73
  # Convert text to standard format
46
74
  raw_data = [
47
75
  {
@@ -55,7 +83,7 @@ class UserDataProcessor:
55
83
  examples = constructor.construct_examples(raw_data)
56
84
 
57
85
  # Manage data
58
- curator = DataCurator(self.config)
86
+ curator = DataCurator(self.config, self.rng)
59
87
  final_dataset = curator.curate_dataset(examples)
60
88
 
61
89
  return final_dataset
@@ -63,7 +91,20 @@ class UserDataProcessor:
63
91
  def process_batch(
64
92
  self, texts: List[str], sources: Optional[List[str]] = None
65
93
  ) -> List[Dict[str, Any]]:
66
- r"""Process multiple texts in batch."""
94
+ r"""Process multiple texts in batch to generate multi-hop QA pairs.
95
+
96
+ Args:
97
+ texts (List[str]): List of input texts to process.
98
+ sources (Optional[List[str]], optional): List of source
99
+ identifiers. (default: :obj:`None`)
100
+
101
+ Returns:
102
+ List[Dict[str, Any]]: List of processed examples with QA pairs and
103
+ metadata.
104
+
105
+ Raises:
106
+ ValueError: If length of sources doesn't match length of texts.
107
+ """
67
108
  if sources is None:
68
109
  sources = ["user_input"] * len(texts)
69
110
  elif len(sources) != len(texts):
@@ -82,27 +123,52 @@ class UserDataProcessor:
82
123
  examples = constructor.construct_examples(raw_data)
83
124
 
84
125
  # Manage data
85
- curator = DataCurator(self.config)
126
+ curator = DataCurator(self.config, self.rng)
86
127
  final_dataset = curator.curate_dataset(examples)
87
128
 
88
129
  return final_dataset
89
130
 
90
131
 
91
132
  class ExampleConstructor:
92
- r"""Example Constructor."""
133
+ r"""Constructs training examples from raw text data.
134
+
135
+ This class handles the construction of training examples by preprocessing
136
+ text, extracting information pairs, and generating question-answer pairs.
137
+
138
+ Attributes:
139
+ config (ProcessorConfig): Configuration for example construction.
140
+ multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for QA
141
+ generation.
142
+ """
93
143
 
94
144
  def __init__(
95
145
  self,
96
146
  config: ProcessorConfig,
97
147
  multi_hop_agent: Optional[MultiHopGeneratorAgent] = None,
98
148
  ):
149
+ r"""Initialize the ExampleConstructor.
150
+
151
+ Args:
152
+ config (ProcessorConfig): Configuration for example construction.
153
+ multi_hop_agent (Optional[MultiHopGeneratorAgent], optional):
154
+ Agent for generating multi-hop QA pairs. (default: :obj:`None`)
155
+ """
99
156
  self.config = config
100
157
  self.multi_hop_agent = multi_hop_agent
101
158
 
102
159
  def construct_examples(
103
160
  self, raw_data: List[Dict[str, Any]]
104
161
  ) -> List[Dict[str, Any]]:
105
- r"""Construct training examples."""
162
+ r"""Construct training examples from raw data.
163
+
164
+ Args:
165
+ raw_data (List[Dict[str, Any]]): List of raw data dictionaries
166
+ containing text and metadata.
167
+
168
+ Returns:
169
+ List[Dict[str, Any]]: List of constructed examples with QA pairs
170
+ and metadata.
171
+ """
106
172
  logger.info("Starting to construct training examples...")
107
173
  examples = []
108
174
 
@@ -135,7 +201,15 @@ class ExampleConstructor:
135
201
  return examples
136
202
 
137
203
  def _preprocess_text(self, text: str) -> str:
138
- r"""Text preprocessing."""
204
+ r"""Preprocess input text for example construction.
205
+
206
+ Args:
207
+ text (str): Input text to preprocess.
208
+
209
+ Returns:
210
+ str: Preprocessed text, or empty string if text fails quality
211
+ checks.
212
+ """
139
213
  if not isinstance(text, str):
140
214
  return ''
141
215
 
@@ -156,7 +230,14 @@ class ExampleConstructor:
156
230
  return text
157
231
 
158
232
  def _check_text_quality(self, text: str) -> bool:
159
- r"""Check text quality."""
233
+ r"""Check the quality of input text.
234
+
235
+ Args:
236
+ text (str): Text to check quality for.
237
+
238
+ Returns:
239
+ bool: True if text passes quality checks, False otherwise.
240
+ """
160
241
  # 1. Basic quality check
161
242
  if text.count('.') < 2: # Must have at least 2 sentences
162
243
  return False
@@ -171,7 +252,15 @@ class ExampleConstructor:
171
252
  return True
172
253
 
173
254
  def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
174
- r"""Extract information pairs and relationships."""
255
+ r"""Extract information pairs and relationships from text.
256
+
257
+ Args:
258
+ text (str): Input text to extract information from.
259
+
260
+ Returns:
261
+ List[Dict[str, Sequence[str]]]: List of dictionaries containing
262
+ premise, intermediate, conclusion, and related contexts.
263
+ """
175
264
  # Split into sentences
176
265
  sentences = [s.strip() for s in text.split('.') if s.strip()]
177
266
  info_pairs = []
@@ -200,7 +289,15 @@ class ExampleConstructor:
200
289
  def _generate_qa_pairs(
201
290
  self, info_pairs: List[Dict[str, Sequence[str]]]
202
291
  ) -> List[Dict[str, str]]:
203
- r"""Generate multi-hop question-answer pairs."""
292
+ r"""Generate multi-hop question-answer pairs from information pairs.
293
+
294
+ Args:
295
+ info_pairs (List[Dict[str, Sequence[str]]]): List of information
296
+ pairs extracted from text.
297
+
298
+ Returns:
299
+ List[Dict[str, str]]: List of generated QA pairs.
300
+ """
204
301
  qa_pairs = []
205
302
 
206
303
  for pair in info_pairs:
@@ -219,7 +316,15 @@ class ExampleConstructor:
219
316
  return qa_pairs
220
317
 
221
318
  def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
222
- r"""Calculate complexity of QA pairs."""
319
+ r"""Calculate the complexity score for a set of QA pairs.
320
+
321
+ Args:
322
+ qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
323
+ complexity for.
324
+
325
+ Returns:
326
+ float: Complexity score between 0.0 and 1.0.
327
+ """
223
328
  if not qa_pairs:
224
329
  return 0.0
225
330
 
@@ -233,10 +338,10 @@ class ExampleConstructor:
233
338
  supporting_facts_count = len(qa.get('supporting_facts', []))
234
339
 
235
340
  # 3. Question length
236
- question_length = len(qa['question'].split())
341
+ question_length = len(qa.get('question', '').split())
237
342
 
238
343
  # 4. Answer length
239
- answer_length = len(qa['answer'].split())
344
+ answer_length = len(qa.get('answer', '').split())
240
345
 
241
346
  # Calculate complexity of a single QA pair
242
347
  qa_complexity = (
@@ -256,15 +361,37 @@ class ExampleConstructor:
256
361
 
257
362
 
258
363
  class DataCurator:
259
- r"""Data Manager."""
364
+ r"""Manages and curates datasets of multi-hop question-answer pairs.
365
+
366
+ This class handles dataset management tasks including quality filtering,
367
+ complexity filtering, deduplication, and dataset sampling.
260
368
 
261
- def __init__(self, config: ProcessorConfig):
369
+ Attributes:
370
+ config (ProcessorConfig): Configuration for data curation parameters.
371
+ rng (random.Random): Random number generator for reproducible sampling.
372
+ """
373
+
374
+ def __init__(self, config: ProcessorConfig, rng: random.Random):
375
+ r"""Initialize the DataCurator.
376
+
377
+ Args:
378
+ config (ProcessorConfig): Configuration for data curation.
379
+ rng (random.Random): Random number generator for reproducibility.
380
+ """
262
381
  self.config = config
382
+ self.rng = rng
263
383
 
264
384
  def curate_dataset(
265
385
  self, examples: List[Dict[str, Any]]
266
386
  ) -> List[Dict[str, Any]]:
267
- r"""Dataset management."""
387
+ r"""Manage and curate a dataset through multiple filtering stages.
388
+
389
+ Args:
390
+ examples (List[Dict[str, Any]]): List of examples to curate.
391
+
392
+ Returns:
393
+ List[Dict[str, Any]]: Curated dataset meeting quality criteria.
394
+ """
268
395
  logger.info("Starting dataset management...")
269
396
 
270
397
  # 1. Quality filtering
@@ -296,7 +423,14 @@ class DataCurator:
296
423
  def _quality_filter(
297
424
  self, examples: List[Dict[str, Any]]
298
425
  ) -> List[Dict[str, Any]]:
299
- r"""Quality filtering."""
426
+ r"""Filter examples based on quality criteria.
427
+
428
+ Args:
429
+ examples (List[Dict[str, Any]]): List of examples to filter.
430
+
431
+ Returns:
432
+ List[Dict[str, Any]]: Examples that pass quality checks.
433
+ """
300
434
  filtered = []
301
435
 
302
436
  for example in examples:
@@ -314,7 +448,14 @@ class DataCurator:
314
448
  return filtered
315
449
 
316
450
  def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
317
- r"""Check quality of QA pairs."""
451
+ r"""Check the quality of question-answer pairs.
452
+
453
+ Args:
454
+ qa_pairs (List[Dict[str, str]]): List of QA pairs to check.
455
+
456
+ Returns:
457
+ bool: True if QA pairs meet quality criteria, False otherwise.
458
+ """
318
459
  if not qa_pairs:
319
460
  return False
320
461
 
@@ -335,7 +476,17 @@ class DataCurator:
335
476
  def _complexity_filter(
336
477
  self, examples: List[Dict[str, Any]]
337
478
  ) -> List[Dict[str, Any]]:
338
- r"""Complexity filtering."""
479
+ """
480
+ Filter examples based on complexity threshold.
481
+
482
+ Removes examples with complexity scores below the configured threshold.
483
+
484
+ Args:
485
+ examples (List[Dict[str, Any]]): List of examples to filter.
486
+
487
+ Returns:
488
+ List[Dict[str, Any]]: Examples meeting complexity threshold.
489
+ """
339
490
  return [
340
491
  example
341
492
  for example in examples
@@ -346,7 +497,14 @@ class DataCurator:
346
497
  def _remove_duplicates(
347
498
  self, examples: List[Dict[str, Any]]
348
499
  ) -> List[Dict[str, Any]]:
349
- r"""Remove duplicates."""
500
+ r"""Remove duplicate examples from the dataset.
501
+
502
+ Args:
503
+ examples (List[Dict[str, Any]]): List of examples to deduplicate.
504
+
505
+ Returns:
506
+ List[Dict[str, Any]]: Deduplicated examples.
507
+ """
350
508
  seen = set()
351
509
  unique_examples = []
352
510
 
@@ -366,8 +524,15 @@ class DataCurator:
366
524
  def _sample_dataset(
367
525
  self, examples: List[Dict[str, Any]]
368
526
  ) -> List[Dict[str, Any]]:
369
- r"""Sample to target dataset size."""
527
+ r"""Sample examples to match target dataset size.
528
+
529
+ Args:
530
+ examples (List[Dict[str, Any]]): List of examples to sample from.
531
+
532
+ Returns:
533
+ List[Dict[str, Any]]: Sampled dataset of target size or smaller.
534
+ """
370
535
  if len(examples) <= self.config.dataset_size:
371
536
  return examples
372
537
 
373
- return random.sample(examples, self.config.dataset_size)
538
+ return self.rng.sample(examples, self.config.dataset_size)