camel-ai 0.2.21__py3-none-any.whl → 0.2.23a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic.
- camel/__init__.py +1 -1
- camel/agents/_types.py +41 -0
- camel/agents/_utils.py +188 -0
- camel/agents/chat_agent.py +556 -965
- camel/agents/knowledge_graph_agent.py +7 -1
- camel/agents/multi_hop_generator_agent.py +1 -1
- camel/configs/base_config.py +10 -13
- camel/configs/deepseek_config.py +4 -30
- camel/configs/gemini_config.py +5 -31
- camel/configs/openai_config.py +14 -32
- camel/configs/qwen_config.py +36 -36
- camel/datagen/self_improving_cot.py +79 -1
- camel/datagen/self_instruct/filter/instruction_filter.py +19 -3
- camel/datagen/self_instruct/self_instruct.py +7 -2
- camel/datasets/__init__.py +28 -0
- camel/datasets/base.py +969 -0
- camel/embeddings/openai_embedding.py +10 -1
- camel/environments/__init__.py +16 -0
- camel/environments/base.py +503 -0
- camel/extractors/__init__.py +16 -0
- camel/extractors/base.py +263 -0
- camel/interpreters/docker/Dockerfile +12 -0
- camel/interpreters/docker_interpreter.py +19 -1
- camel/interpreters/subprocess_interpreter.py +42 -17
- camel/loaders/__init__.py +2 -0
- camel/loaders/mineru_extractor.py +250 -0
- camel/memories/agent_memories.py +16 -1
- camel/memories/blocks/chat_history_block.py +10 -2
- camel/memories/blocks/vectordb_block.py +1 -0
- camel/memories/context_creators/score_based.py +20 -3
- camel/memories/records.py +10 -0
- camel/messages/base.py +8 -8
- camel/models/_utils.py +57 -0
- camel/models/aiml_model.py +48 -17
- camel/models/anthropic_model.py +41 -3
- camel/models/azure_openai_model.py +39 -3
- camel/models/base_model.py +132 -4
- camel/models/cohere_model.py +88 -11
- camel/models/deepseek_model.py +107 -63
- camel/models/gemini_model.py +133 -15
- camel/models/groq_model.py +72 -10
- camel/models/internlm_model.py +14 -3
- camel/models/litellm_model.py +9 -2
- camel/models/mistral_model.py +42 -5
- camel/models/model_manager.py +48 -3
- camel/models/moonshot_model.py +33 -4
- camel/models/nemotron_model.py +32 -3
- camel/models/nvidia_model.py +43 -3
- camel/models/ollama_model.py +139 -17
- camel/models/openai_audio_models.py +7 -1
- camel/models/openai_compatible_model.py +37 -3
- camel/models/openai_model.py +158 -46
- camel/models/qwen_model.py +61 -4
- camel/models/reka_model.py +53 -3
- camel/models/samba_model.py +209 -4
- camel/models/sglang_model.py +153 -14
- camel/models/siliconflow_model.py +16 -3
- camel/models/stub_model.py +46 -4
- camel/models/togetherai_model.py +38 -3
- camel/models/vllm_model.py +37 -3
- camel/models/yi_model.py +36 -3
- camel/models/zhipuai_model.py +38 -3
- camel/retrievers/__init__.py +3 -0
- camel/retrievers/hybrid_retrival.py +237 -0
- camel/toolkits/__init__.py +9 -2
- camel/toolkits/arxiv_toolkit.py +2 -1
- camel/toolkits/ask_news_toolkit.py +4 -2
- camel/toolkits/base.py +22 -3
- camel/toolkits/code_execution.py +2 -0
- camel/toolkits/dappier_toolkit.py +2 -1
- camel/toolkits/data_commons_toolkit.py +38 -12
- camel/toolkits/function_tool.py +13 -0
- camel/toolkits/github_toolkit.py +5 -1
- camel/toolkits/google_maps_toolkit.py +2 -1
- camel/toolkits/google_scholar_toolkit.py +2 -0
- camel/toolkits/human_toolkit.py +0 -3
- camel/toolkits/linkedin_toolkit.py +3 -2
- camel/toolkits/meshy_toolkit.py +3 -2
- camel/toolkits/mineru_toolkit.py +178 -0
- camel/toolkits/networkx_toolkit.py +240 -0
- camel/toolkits/notion_toolkit.py +2 -0
- camel/toolkits/openbb_toolkit.py +3 -2
- camel/toolkits/reddit_toolkit.py +11 -3
- camel/toolkits/retrieval_toolkit.py +6 -1
- camel/toolkits/semantic_scholar_toolkit.py +2 -1
- camel/toolkits/stripe_toolkit.py +8 -2
- camel/toolkits/sympy_toolkit.py +44 -1
- camel/toolkits/video_toolkit.py +2 -0
- camel/toolkits/whatsapp_toolkit.py +3 -2
- camel/toolkits/zapier_toolkit.py +191 -0
- camel/types/__init__.py +2 -2
- camel/types/agents/__init__.py +16 -0
- camel/types/agents/tool_calling_record.py +52 -0
- camel/types/enums.py +3 -0
- camel/types/openai_types.py +16 -14
- camel/utils/__init__.py +2 -1
- camel/utils/async_func.py +2 -2
- camel/utils/commons.py +114 -1
- camel/verifiers/__init__.py +23 -0
- camel/verifiers/base.py +340 -0
- camel/verifiers/models.py +82 -0
- camel/verifiers/python_verifier.py +202 -0
- {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/METADATA +273 -256
- {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/RECORD +106 -85
- {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/WHEEL +1 -1
- {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/LICENSE +0 -0
camel/extractors/base.py
ADDED
@@ -0,0 +1,263 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from abc import ABC, abstractmethod
+from types import TracebackType
+from typing import Any, Dict, Optional, Type
+
+from typing_extensions import Self
+
+from camel.logger import get_logger
+from camel.utils import BatchProcessor
+
+logger = get_logger(__name__)
+
+
+class BaseExtractor(ABC):
+    r"""Base class for all response extractors.
+
+    An extractor takes the response and extracts the relevant parts,
+    converting them into a format that the verifier can handle.
+    Implements async context manager protocol for proper resource management.
+    """
+
+    def __init__(
+        self,
+        cache_templates: bool = True,
+        max_cache_size: int = 1000,
+        extraction_timeout: float = 30.0,
+        batch_size: int = 10,
+        monitoring_interval: float = 5.0,
+        cpu_threshold: float = 80.0,
+        memory_threshold: float = 85.0,
+        **kwargs,
+    ):
+        r"""Initialize the extractor.
+
+        Args:
+            cache_templates (bool): Whether to cache extraction templates.
+                (default: :obj:`True`)
+            max_cache_size (int): Maximum number of templates to cache.
+                (default: :obj:`1000`)
+            extraction_timeout (float): Maximum time for extraction in seconds.
+                (default: :obj:`30.0`)
+            batch_size (int): Size of batches for parallel extraction.
+                (default: :obj:`10`)
+            monitoring_interval (float): Interval in seconds between resource
+                checks. (default: :obj:`5.0`)
+            cpu_threshold (float): CPU usage percentage threshold for scaling
+                down. (default: :obj:`80.0`)
+            memory_threshold (float): Memory usage percentage threshold for
+                scaling down. (default: :obj:`85.0`)
+            **kwargs: Additional extractor parameters.
+
+        Raises:
+            ValueError: If invalid parameter values are provided
+        """
+        # Store all parameters in metadata dict for compatibility
+        self._metadata = {
+            'cache_templates': cache_templates,
+            'max_cache_size': max_cache_size,
+            'extraction_timeout': extraction_timeout,
+            'batch_size': batch_size,
+            'monitoring_interval': monitoring_interval,
+            'cpu_threshold': cpu_threshold,
+            'memory_threshold': memory_threshold,
+            **kwargs,
+        }
+
+        self._is_setup = False
+        self._cache: Dict[str, Any] = {}
+        self._batch_processor: Optional[BatchProcessor] = None
+
+        # Store configuration parameters
+        self._cache_templates = cache_templates
+        self._max_cache_size = max_cache_size
+        self._extraction_timeout = extraction_timeout
+        self._batch_size = batch_size
+        self._monitoring_interval = monitoring_interval
+        self._cpu_threshold = cpu_threshold
+        self._memory_threshold = memory_threshold
+
+    async def setup(self) -> None:
+        r"""Set up the extractor with necessary resources.
+
+        This method:
+        1. Initializes template cache if enabled
+        2. Sets up any parallel processing resources
+        3. Validates extraction patterns
+
+        Raises:
+            RuntimeError: If initialization fails
+        """
+        if self._is_setup:
+            logger.debug(f"{self.__class__.__name__} already initialized")
+            return
+
+        try:
+            # Initialize template cache if enabled
+            if self._cache_templates:
+                self._template_cache: Dict[str, Any] = {}
+
+            # Set up batch processing if needed
+            if self._batch_size > 1:
+                self._batch_processor = BatchProcessor(
+                    initial_batch_size=self._batch_size,
+                    monitoring_interval=self._monitoring_interval,
+                    cpu_threshold=self._cpu_threshold,
+                    memory_threshold=self._memory_threshold,
+                )
+
+            self._is_setup = True
+            logger.info(f"{self.__class__.__name__} initialized successfully")
+
+        except Exception as e:
+            error_msg = f"Error during {self.__class__.__name__} setup: {e}"
+            logger.error(error_msg)
+            await self.cleanup()
+            raise RuntimeError(error_msg) from e
+
+    async def cleanup(self) -> None:
+        r"""Clean up extractor resources.
+
+        This method handles cleanup of resources and resets the extractor
+        state. It ensures:
+        1. All resources are properly released
+        2. Template cache is cleared
+        3. Parallel processing resources are shutdown
+        4. State is reset to initial
+        5. Cleanup happens even if errors occur
+
+        Raises:
+            RuntimeError: If cleanup fails (after resetting initialization
+                state).
+        """
+        if not self._is_setup:
+            logger.debug(
+                f"{self.__class__.__name__} not initialized, skipping cleanup"
+            )
+            return
+
+        errors = []
+        try:
+            # Clear template cache
+            if hasattr(self, '_template_cache'):
+                try:
+                    self._template_cache.clear()
+                except Exception as e:
+                    errors.append(f"Failed to clear template cache: {e}")
+
+            # Shutdown parallel processing
+            if self._batch_processor is not None:
+                try:
+                    # Get final performance metrics before cleanup
+                    metrics = self._batch_processor.get_performance_metrics()
+                    logger.info(f"Batch processor final metrics: {metrics}")
+                except Exception as e:
+                    errors.append(
+                        f"Failed to get batch processor metrics: {e}"
+                    )
+
+            # Preserve init config in metadata
+            self._metadata = {
+                'cache_templates': self._cache_templates,
+                'max_cache_size': self._max_cache_size,
+                'extraction_timeout': self._extraction_timeout,
+                'batch_size': self._batch_size,
+            }
+
+            if not errors:
+                logger.info(
+                    f"{self.__class__.__name__} cleaned up successfully"
+                )
+
+        except Exception as e:
+            errors.append(f"Unexpected error during cleanup: {e}")
+
+        finally:
+            # Always mark as uninitialized, even if cleanup fails
+            self._is_setup = False
+            self._batch_processor = None
+
+            if errors:
+                error_msg = (
+                    f"Errors during {self.__class__.__name__} cleanup: "
+                    f"{'; '.join(errors)}"
+                )
+                logger.error(error_msg)
+                raise RuntimeError(error_msg)
+
+    async def __aenter__(self) -> Self:
+        r"""Async context manager entry.
+
+        Returns:
+            Self reference for context manager usage.
+        """
+        await self.setup()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        r"""Async context manager exit.
+
+        Args:
+            exc_type (Optional[Type[BaseException]]): Exception type if an
+                error occurred.
+            exc_val (Optional[BaseException]): Exception value if an error
+                occurred.
+            exc_tb (Optional[TracebackType]): Exception traceback if an error
+                occurred.
+        """
+        await self.cleanup()
+
+    @abstractmethod
+    async def extract(
+        self, response: str, context: Optional[Dict[str, Any]] = None
+    ) -> str:
+        r"""Extract relevant parts from a response.
+
+        Extracts:
+        1. Final answer or output
+        2. Chain of thought reasoning steps
+        3. Difficulty assessment
+
+        Args:
+            response (str): Raw response from agent generation.
+            context (Optional[Dict[str, Any]]): Optional context for
+                extraction like:
+                - final_answer
+                - rationale
+                - complexity
+
+        Returns:
+            str: Extracted content string.
+
+        Raises:
+            ValueError: If response is empty or invalid.
+            NotImplementedError: If no implementation is provided.
+            RuntimeError: If extractor is not initialized.
+        """
+        if not self._is_setup:
+            raise RuntimeError(
+                f"{self.__class__.__name__} must be initialized "
+                "before extraction"
+            )
+        if not response or not response.strip():
+            raise ValueError("Empty or whitespace-only response")
+        raise NotImplementedError("Subclasses must implement extract()")
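The new BaseExtractor is abstract: subclasses implement extract(), while setup() and cleanup() are driven by the async context manager. A minimal sketch of a concrete subclass, assuming only the code added above; BoxedExtractor and its regex are illustrative, not camel-ai code:

import asyncio
import re
from typing import Any, Dict, Optional

from camel.extractors.base import BaseExtractor


class BoxedExtractor(BaseExtractor):
    r"""Hypothetical extractor pulling a \boxed{...} final answer."""

    async def extract(
        self, response: str, context: Optional[Dict[str, Any]] = None
    ) -> str:
        # Mirror the base class guards; the base extract() ends in
        # NotImplementedError, so super().extract() is not called here
        if not self._is_setup:
            raise RuntimeError("BoxedExtractor must be initialized")
        if not response or not response.strip():
            raise ValueError("Empty or whitespace-only response")
        match = re.search(r"\\boxed\{(.*?)\}", response)
        return match.group(1) if match else response.strip()


async def main() -> None:
    # __aenter__/__aexit__ call setup() and cleanup() automatically;
    # batch_size=1 skips BatchProcessor creation in setup()
    async with BoxedExtractor(batch_size=1) as extractor:
        print(await extractor.extract(r"The answer is \boxed{42}."))


asyncio.run(main())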
camel/interpreters/docker/Dockerfile
ADDED
@@ -0,0 +1,12 @@
+FROM python:3.9-slim
+
+# Install R and required dependencies
+RUN apt-get update && apt-get install -y \
+    r-base \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /workspace
+
+# Keep container running
+CMD ["tail", "-f", "/dev/null"]
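The DockerInterpreter change below builds this image on demand. A quick sanity check that R is actually available inside it, assuming the docker Python SDK is installed, a daemon is running, and the image has already been built under the tag the interpreter uses:

import docker

client = docker.from_env()
# Run `Rscript --version` in a throwaway container; with detach=False
# (the default) this returns the container's output as bytes.
# stderr=True because some R versions print the version banner to stderr.
output = client.containers.run(
    "camel-interpreter:latest",
    command=["Rscript", "--version"],
    remove=True,
    stderr=True,
)
print(output.decode())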
camel/interpreters/docker_interpreter.py
CHANGED
@@ -52,11 +52,13 @@ class DockerInterpreter(BaseInterpreter):
     _CODE_EXECUTE_CMD_MAPPING: ClassVar[Dict[str, str]] = {
         "python": "python {file_name}",
         "bash": "bash {file_name}",
+        "r": "Rscript {file_name}",
     }

     _CODE_EXTENSION_MAPPING: ClassVar[Dict[str, str]] = {
         "python": "py",
         "bash": "sh",
+        "r": "R",
     }

     _CODE_TYPE_MAPPING: ClassVar[Dict[str, str]] = {
@@ -67,6 +69,8 @@ class DockerInterpreter(BaseInterpreter):
         "shell": "bash",
         "bash": "bash",
         "sh": "bash",
+        "r": "r",
+        "R": "r",
     }

     def __init__(
@@ -104,8 +108,22 @@ class DockerInterpreter(BaseInterpreter):
         import docker

         client = docker.from_env()
+
+        # Build custom image with Python and R
+        dockerfile_path = Path(__file__).parent / "docker"
+        image_tag = "camel-interpreter:latest"
+        try:
+            client.images.get(image_tag)
+        except docker.errors.ImageNotFound:
+            logger.info("Building custom interpreter image...")
+            client.images.build(
+                path=str(dockerfile_path),
+                tag=image_tag,
+                rm=True,
+            )
+
         self._container = client.containers.run(
-
+            image_tag,
             detach=True,
             name=f"camel-interpreter-{uuid.uuid4()}",
             command="tail -f /dev/null",
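With the mappings above, "r" and "R" become valid code types for the Docker interpreter. A minimal usage sketch, assuming a running Docker daemon; the require_confirm keyword is taken from the interpreter's existing constructor and should be treated as an assumption:

from camel.interpreters import DockerInterpreter

interpreter = DockerInterpreter(require_confirm=False)
# "r" resolves through _CODE_TYPE_MAPPING and runs `Rscript {file_name}`
# inside the camel-interpreter:latest container built above
print(interpreter.run('cat("mean:", mean(c(1, 2, 3)), "\\n")', "r"))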
camel/interpreters/subprocess_interpreter.py
CHANGED
@@ -48,11 +48,13 @@ class SubprocessInterpreter(BaseInterpreter):
     _CODE_EXECUTE_CMD_MAPPING: ClassVar[Dict[str, str]] = {
         "python": "python {file_name}",
         "bash": "bash {file_name}",
+        "r": "Rscript {file_name}",
     }

     _CODE_EXTENSION_MAPPING: ClassVar[Dict[str, str]] = {
         "python": "py",
         "bash": "sh",
+        "r": "R",
     }

     _CODE_TYPE_MAPPING: ClassVar[Dict[str, str]] = {
@@ -63,6 +65,8 @@ class SubprocessInterpreter(BaseInterpreter):
         "shell": "bash",
         "bash": "bash",
         "sh": "bash",
+        "r": "r",
+        "R": "r",
     }

     def __init__(
@@ -98,7 +102,7 @@ class SubprocessInterpreter(BaseInterpreter):
         if not file.is_file():
             raise RuntimeError(f"{file} is not a file.")
         code_type = self._check_code_type(code_type)
-        if code_type == "python":
+        if self._CODE_TYPE_MAPPING[code_type] == "python":
             # For Python code, use ast to analyze and modify the code
             import ast

@@ -113,23 +117,41 @@ class SubprocessInterpreter(BaseInterpreter):
             # Get the last node
             if tree.body:
                 last_node = tree.body[-1]
-                #
+                # Handle expressions that would normally not produce output
+                # For example: In a REPL, typing '1 + 2' should show '3'
+
                 if isinstance(last_node, ast.Expr):
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    # Only wrap in print(repr()) if it's not already a
+                    # print call
+                    if not (
+                        isinstance(last_node.value, ast.Call)
+                        and isinstance(last_node.value.func, ast.Name)
+                        and last_node.value.func.id == 'print'
+                    ):
+                        # Transform the AST to wrap the expression in print
+                        # (repr())
+                        # Example transformation:
+                        # Before: x + y
+                        # After: print(repr(x + y))
+                        tree.body[-1] = ast.Expr(
+                            value=ast.Call(
+                                # Create print() function call
+                                func=ast.Name(id='print', ctx=ast.Load()),
+                                args=[
+                                    ast.Call(
+                                        # Create repr() function call
+                                        func=ast.Name(
+                                            id='repr', ctx=ast.Load()
+                                        ),
+                                        # Pass the original expression as
+                                        # argument to repr()
+                                        args=[last_node.value],
+                                        keywords=[],
+                                    )
+                                ],
+                                keywords=[],
+                            )
                         )
-                        )
             # Fix missing source locations
             ast.fix_missing_locations(tree)
             # Convert back to source
@@ -159,7 +181,10 @@ class SubprocessInterpreter(BaseInterpreter):
         return_code = proc.returncode

         # Clean up temporary file if it was created
-        if
+        if (
+            self._CODE_TYPE_MAPPING[code_type] == "python"
+            and 'temp_file' in locals()
+        ):
             temp_file.unlink()

         if self.print_stdout and stdout:
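The effect of the print(repr()) rewrite is easiest to see end to end. A minimal sketch; the require_confirm keyword is assumed from the interpreter constructors in this release:

from camel.interpreters import SubprocessInterpreter

interpreter = SubprocessInterpreter(require_confirm=False)
# Trailing bare expression: rewritten to print(repr(x + y)), so the
# captured stdout contains "3", REPL-style
print(interpreter.run("x = 1\ny = 2\nx + y", "python"))
# Already a print call: left unwrapped, so output is not doubled
print(interpreter.run("print('hello')", "python"))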
camel/loaders/__init__.py
CHANGED
@@ -17,6 +17,7 @@ from .base_io import File, create_file, create_file_from_raw_bytes
 from .chunkr_reader import ChunkrReader
 from .firecrawl_reader import Firecrawl
 from .jina_url_reader import JinaURLReader
+from .mineru_extractor import MinerU
 from .panda_reader import PandaReader
 from .unstructured_io import UnstructuredIO

@@ -30,4 +31,5 @@ __all__ = [
     'Apify',
     'ChunkrReader',
     'PandaReader',
+    'MinerU',
 ]
camel/loaders/mineru_extractor.py
ADDED
@@ -0,0 +1,250 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+import os
+import time
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from camel.utils import api_keys_required
+
+
+class MinerU:
+    r"""Document extraction service supporting OCR, formula recognition
+    and tables.
+
+    Args:
+        api_key (str, optional): Authentication key for MinerU API service.
+            If not provided, will use MINERU_API_KEY environment variable.
+            (default: :obj:`None`)
+        api_url (str, optional): Base URL endpoint for the MinerU API service.
+            (default: :obj:`"https://mineru.net/api/v4"`)
+
+    Note:
+        - Single file size limit: 200MB
+        - Page limit per file: 600 pages
+        - Daily high-priority parsing quota: 2000 pages
+        - Some URLs (GitHub, AWS) may timeout due to network restrictions
+    """
+
+    @api_keys_required(
+        [
+            ("api_key", "MINERU_API_KEY"),
+        ]
+    )
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = "https://mineru.net/api/v4",
+        is_ocr: bool = False,
+        enable_formula: bool = False,
+        enable_table: bool = True,
+        layout_model: str = "doclayout_yolo",
+        language: str = "en",
+    ) -> None:
+        r"""Initialize MinerU extractor.
+
+        Args:
+            api_key (str, optional): Authentication key for MinerU API service.
+                If not provided, will use MINERU_API_KEY environment variable.
+            api_url (str, optional): Base URL endpoint for MinerU API service.
+                (default: "https://mineru.net/api/v4")
+            is_ocr (bool, optional): Enable optical character recognition.
+                (default: :obj:`False`)
+            enable_formula (bool, optional): Enable formula recognition.
+                (default: :obj:`False`)
+            enable_table (bool, optional): Enable table detection, extraction.
+                (default: :obj:`True`)
+            layout_model (str, optional): Model for document layout detection.
+                Options are 'doclayout_yolo' or 'layoutlmv3'.
+                (default: :obj:`"doclayout_yolo"`)
+            language (str, optional): Primary language of the document.
+                (default: :obj:`"en"`)
+        """
+        self._api_key = api_key or os.environ.get("MINERU_API_KEY")
+        self._api_url = api_url
+        self._headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "*/*",
+        }
+        self.is_ocr = is_ocr
+        self.enable_formula = enable_formula
+        self.enable_table = enable_table
+        self.layout_model = layout_model
+        self.language = language
+
+    def extract_url(self, url: str) -> Dict:
+        r"""Extract content from a URL document.
+
+        Args:
+            url (str): Document URL to extract content from.
+
+        Returns:
+            Dict: Task identifier for tracking extraction progress.
+        """
+        endpoint = f"{self._api_url}/extract/task"
+        payload = {"url": url}
+
+        try:
+            response = requests.post(
+                endpoint,
+                headers=self._headers,
+                json=payload,
+            )
+            response.raise_for_status()
+            return response.json()["data"]
+        except Exception as e:
+            raise RuntimeError(f"Failed to extract URL: {e}")
+
+    def batch_extract_urls(
+        self,
+        files: List[Dict[str, Union[str, bool]]],
+    ) -> str:
+        r"""Extract content from multiple document URLs in batch.
+
+        Args:
+            files (List[Dict[str, Union[str, bool]]]): List of document
+                configurations. Each document requires 'url' and optionally
+                'is_ocr' and 'data_id' parameters.
+
+        Returns:
+            str: Batch identifier for tracking extraction progress.
+        """
+        endpoint = f"{self._api_url}/extract/task/batch"
+        payload = {"files": files}
+
+        try:
+            response = requests.post(
+                endpoint,
+                headers=self._headers,
+                json=payload,
+            )
+            response.raise_for_status()
+            return response.json()["data"]["batch_id"]
+        except Exception as e:
+            raise RuntimeError(f"Failed to batch extract URLs: {e}")
+
+    def get_task_status(self, task_id: str) -> Dict:
+        r"""Retrieve status of a single extraction task.
+
+        Args:
+            task_id (str): Unique identifier of the extraction task.
+
+        Returns:
+            Dict: Current task status and results if completed.
+        """
+        endpoint = f"{self._api_url}/extract/task/{task_id}"
+
+        try:
+            response = requests.get(endpoint, headers=self._headers)
+            response.raise_for_status()
+            return response.json()["data"]
+        except Exception as e:
+            raise RuntimeError(f"Failed to get task status: {e}")
+
+    def get_batch_status(self, batch_id: str) -> Dict:
+        r"""Retrieve status of a batch extraction task.
+
+        Args:
+            batch_id (str): Unique identifier of the batch extraction task.
+
+        Returns:
+            Dict: Current status and results for all documents in the batch.
+        """
+        endpoint = f"{self._api_url}/extract-results/batch/{batch_id}"
+
+        try:
+            response = requests.get(endpoint, headers=self._headers)
+            response.raise_for_status()
+            return response.json()["data"]
+        except Exception as e:
+            raise RuntimeError(f"Failed to get batch status: {e}")
+
+    def wait_for_completion(
+        self,
+        task_id: str,
+        is_batch: bool = False,
+        timeout: float = 100,
+        check_interval: float = 5,
+    ) -> Dict:
+        r"""Monitor task until completion or timeout.
+
+        Args:
+            task_id (str): Unique identifier of the task or batch.
+            is_batch (bool, optional): Indicates if task is a batch operation.
+                (default: :obj:`False`)
+            timeout (float, optional): Maximum wait time in seconds.
+                (default: :obj:`100`)
+            check_interval (float, optional): Time between status checks in
+                seconds. (default: :obj:`5`)
+
+        Returns:
+            Dict: Final task status and extraction results.
+
+        Raises:
+            TimeoutError: If task exceeds specified timeout duration.
+            RuntimeError: If task fails or encounters processing error.
+        """
+        start_time = time.time()
+        while True:
+            if time.time() - start_time > timeout:
+                raise TimeoutError(
+                    f"Task {task_id} timed out after {timeout}s"
+                )
+
+            try:
+                status = (
+                    self.get_batch_status(task_id)
+                    if is_batch
+                    else self.get_task_status(task_id)
+                )
+
+                if is_batch:
+                    # Check batch status
+                    all_done = True
+                    failed_tasks = []
+                    for result in status.get('extract_result', []):
+                        if result.get('state') == 'failed':
+                            failed_tasks.append(
+                                f"{result.get('data_id')}:"
+                                f" {result.get('err_msg')}"
+                            )
+                        elif result.get('state') != 'done':
+                            all_done = False
+                            break
+
+                    if failed_tasks:
+                        raise RuntimeError(
+                            f"Batch tasks failed: {'; '.join(failed_tasks)}"
+                        )
+                    if all_done:
+                        return status
+                else:
+                    # Check single task status
+                    state = status.get('state')
+                    if state == 'failed':
+                        raise RuntimeError(
+                            f"Task failed: {status.get('err_msg')}"
+                        )
+                    elif state == 'done':
+                        return status
+
+            except Exception as e:
+                if not isinstance(e, RuntimeError):
+                    raise RuntimeError(f"Error checking status: {e}")
+                raise
+
+            time.sleep(check_interval)
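A minimal end-to-end sketch of the single-task flow above, assuming MINERU_API_KEY is set and the URL is reachable by the MinerU service; the task_id and full_zip_url fields are assumptions about the MinerU response payload, not guarantees of this wrapper:

from camel.loaders import MinerU

extractor = MinerU()  # falls back to the MINERU_API_KEY env variable
data = extractor.extract_url("https://arxiv.org/pdf/2303.17760.pdf")
# extract_url returns the response "data" payload; the task id field
# name below is an assumption about the MinerU API
result = extractor.wait_for_completion(data["task_id"], timeout=300)
print(result.get("full_zip_url"))  # assumed field: extracted archive URL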