camel-ai 0.2.21__py3-none-any.whl → 0.2.23a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai has been flagged as potentially problematic; consult the registry's advisory page for details.

Files changed (106):
  1. camel/__init__.py +1 -1
  2. camel/agents/_types.py +41 -0
  3. camel/agents/_utils.py +188 -0
  4. camel/agents/chat_agent.py +556 -965
  5. camel/agents/knowledge_graph_agent.py +7 -1
  6. camel/agents/multi_hop_generator_agent.py +1 -1
  7. camel/configs/base_config.py +10 -13
  8. camel/configs/deepseek_config.py +4 -30
  9. camel/configs/gemini_config.py +5 -31
  10. camel/configs/openai_config.py +14 -32
  11. camel/configs/qwen_config.py +36 -36
  12. camel/datagen/self_improving_cot.py +79 -1
  13. camel/datagen/self_instruct/filter/instruction_filter.py +19 -3
  14. camel/datagen/self_instruct/self_instruct.py +7 -2
  15. camel/datasets/__init__.py +28 -0
  16. camel/datasets/base.py +969 -0
  17. camel/embeddings/openai_embedding.py +10 -1
  18. camel/environments/__init__.py +16 -0
  19. camel/environments/base.py +503 -0
  20. camel/extractors/__init__.py +16 -0
  21. camel/extractors/base.py +263 -0
  22. camel/interpreters/docker/Dockerfile +12 -0
  23. camel/interpreters/docker_interpreter.py +19 -1
  24. camel/interpreters/subprocess_interpreter.py +42 -17
  25. camel/loaders/__init__.py +2 -0
  26. camel/loaders/mineru_extractor.py +250 -0
  27. camel/memories/agent_memories.py +16 -1
  28. camel/memories/blocks/chat_history_block.py +10 -2
  29. camel/memories/blocks/vectordb_block.py +1 -0
  30. camel/memories/context_creators/score_based.py +20 -3
  31. camel/memories/records.py +10 -0
  32. camel/messages/base.py +8 -8
  33. camel/models/_utils.py +57 -0
  34. camel/models/aiml_model.py +48 -17
  35. camel/models/anthropic_model.py +41 -3
  36. camel/models/azure_openai_model.py +39 -3
  37. camel/models/base_model.py +132 -4
  38. camel/models/cohere_model.py +88 -11
  39. camel/models/deepseek_model.py +107 -63
  40. camel/models/gemini_model.py +133 -15
  41. camel/models/groq_model.py +72 -10
  42. camel/models/internlm_model.py +14 -3
  43. camel/models/litellm_model.py +9 -2
  44. camel/models/mistral_model.py +42 -5
  45. camel/models/model_manager.py +48 -3
  46. camel/models/moonshot_model.py +33 -4
  47. camel/models/nemotron_model.py +32 -3
  48. camel/models/nvidia_model.py +43 -3
  49. camel/models/ollama_model.py +139 -17
  50. camel/models/openai_audio_models.py +7 -1
  51. camel/models/openai_compatible_model.py +37 -3
  52. camel/models/openai_model.py +158 -46
  53. camel/models/qwen_model.py +61 -4
  54. camel/models/reka_model.py +53 -3
  55. camel/models/samba_model.py +209 -4
  56. camel/models/sglang_model.py +153 -14
  57. camel/models/siliconflow_model.py +16 -3
  58. camel/models/stub_model.py +46 -4
  59. camel/models/togetherai_model.py +38 -3
  60. camel/models/vllm_model.py +37 -3
  61. camel/models/yi_model.py +36 -3
  62. camel/models/zhipuai_model.py +38 -3
  63. camel/retrievers/__init__.py +3 -0
  64. camel/retrievers/hybrid_retrival.py +237 -0
  65. camel/toolkits/__init__.py +9 -2
  66. camel/toolkits/arxiv_toolkit.py +2 -1
  67. camel/toolkits/ask_news_toolkit.py +4 -2
  68. camel/toolkits/base.py +22 -3
  69. camel/toolkits/code_execution.py +2 -0
  70. camel/toolkits/dappier_toolkit.py +2 -1
  71. camel/toolkits/data_commons_toolkit.py +38 -12
  72. camel/toolkits/function_tool.py +13 -0
  73. camel/toolkits/github_toolkit.py +5 -1
  74. camel/toolkits/google_maps_toolkit.py +2 -1
  75. camel/toolkits/google_scholar_toolkit.py +2 -0
  76. camel/toolkits/human_toolkit.py +0 -3
  77. camel/toolkits/linkedin_toolkit.py +3 -2
  78. camel/toolkits/meshy_toolkit.py +3 -2
  79. camel/toolkits/mineru_toolkit.py +178 -0
  80. camel/toolkits/networkx_toolkit.py +240 -0
  81. camel/toolkits/notion_toolkit.py +2 -0
  82. camel/toolkits/openbb_toolkit.py +3 -2
  83. camel/toolkits/reddit_toolkit.py +11 -3
  84. camel/toolkits/retrieval_toolkit.py +6 -1
  85. camel/toolkits/semantic_scholar_toolkit.py +2 -1
  86. camel/toolkits/stripe_toolkit.py +8 -2
  87. camel/toolkits/sympy_toolkit.py +44 -1
  88. camel/toolkits/video_toolkit.py +2 -0
  89. camel/toolkits/whatsapp_toolkit.py +3 -2
  90. camel/toolkits/zapier_toolkit.py +191 -0
  91. camel/types/__init__.py +2 -2
  92. camel/types/agents/__init__.py +16 -0
  93. camel/types/agents/tool_calling_record.py +52 -0
  94. camel/types/enums.py +3 -0
  95. camel/types/openai_types.py +16 -14
  96. camel/utils/__init__.py +2 -1
  97. camel/utils/async_func.py +2 -2
  98. camel/utils/commons.py +114 -1
  99. camel/verifiers/__init__.py +23 -0
  100. camel/verifiers/base.py +340 -0
  101. camel/verifiers/models.py +82 -0
  102. camel/verifiers/python_verifier.py +202 -0
  103. {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/METADATA +273 -256
  104. {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/RECORD +106 -85
  105. {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/WHEEL +1 -1
  106. {camel_ai-0.2.21.dist-info → camel_ai-0.2.23a0.dist-info}/LICENSE +0 -0
@@ -0,0 +1,263 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ from abc import ABC, abstractmethod
16
+ from types import TracebackType
17
+ from typing import Any, Dict, Optional, Type
18
+
19
+ from typing_extensions import Self
20
+
21
+ from camel.logger import get_logger
22
+ from camel.utils import BatchProcessor
23
+
24
+ logger = get_logger(__name__)
25
+
26
+
27
class BaseExtractor(ABC):
    r"""Base class for all response extractors.

    An extractor takes the response and extracts the relevant parts,
    converting them into a format that the verifier can handle.
    Implements async context manager protocol for proper resource management.
    """

    def __init__(
        self,
        cache_templates: bool = True,
        max_cache_size: int = 1000,
        extraction_timeout: float = 30.0,
        batch_size: int = 10,
        monitoring_interval: float = 5.0,
        cpu_threshold: float = 80.0,
        memory_threshold: float = 85.0,
        **kwargs,
    ):
        r"""Initialize the extractor.

        Args:
            cache_templates (bool): Whether to cache extraction templates.
                (default: :obj:`True`)
            max_cache_size (int): Maximum number of templates to cache.
                (default: :obj:`1000`)
            extraction_timeout (float): Maximum time for extraction in seconds.
                (default: :obj:`30.0`)
            batch_size (int): Size of batches for parallel extraction.
                (default: :obj:`10`)
            monitoring_interval (float): Interval in seconds between resource
                checks. (default: :obj:`5.0`)
            cpu_threshold (float): CPU usage percentage threshold for scaling
                down. (default: :obj:`80.0`)
            memory_threshold (float): Memory usage percentage threshold for
                scaling down. (default: :obj:`85.0`)
            **kwargs: Additional extractor parameters.

        Raises:
            ValueError: If invalid parameter values are provided
        """
        # Store all parameters in metadata dict for compatibility
        self._metadata = {
            'cache_templates': cache_templates,
            'max_cache_size': max_cache_size,
            'extraction_timeout': extraction_timeout,
            'batch_size': batch_size,
            'monitoring_interval': monitoring_interval,
            'cpu_threshold': cpu_threshold,
            'memory_threshold': memory_threshold,
            **kwargs,
        }

        self._is_setup = False
        self._cache: Dict[str, Any] = {}
        self._batch_processor: Optional[BatchProcessor] = None

        # Store configuration parameters
        self._cache_templates = cache_templates
        self._max_cache_size = max_cache_size
        self._extraction_timeout = extraction_timeout
        self._batch_size = batch_size
        self._monitoring_interval = monitoring_interval
        self._cpu_threshold = cpu_threshold
        self._memory_threshold = memory_threshold

    async def setup(self) -> None:
        r"""Set up the extractor with necessary resources.

        This method:
        1. Initializes template cache if enabled
        2. Sets up any parallel processing resources
        3. Validates extraction patterns

        Raises:
            RuntimeError: If initialization fails
        """
        # Idempotent: calling setup twice is a no-op, not an error.
        if self._is_setup:
            logger.debug(f"{self.__class__.__name__} already initialized")
            return

        try:
            # Initialize template cache if enabled
            if self._cache_templates:
                self._template_cache: Dict[str, Any] = {}

            # Batch processing only pays off for batch_size > 1
            if self._batch_size > 1:
                self._batch_processor = BatchProcessor(
                    initial_batch_size=self._batch_size,
                    monitoring_interval=self._monitoring_interval,
                    cpu_threshold=self._cpu_threshold,
                    memory_threshold=self._memory_threshold,
                )

            self._is_setup = True
            logger.info(f"{self.__class__.__name__} initialized successfully")

        except Exception as e:
            error_msg = f"Error during {self.__class__.__name__} setup: {e}"
            logger.error(error_msg)
            # Best-effort rollback of any partially-acquired resources
            await self.cleanup()
            raise RuntimeError(error_msg) from e

    async def cleanup(self) -> None:
        r"""Clean up extractor resources.

        This method handles cleanup of resources and resets the extractor
        state.
        It ensures:
        1. All resources are properly released
        2. Template cache is cleared
        3. Parallel processing resources are shutdown
        4. State is reset to initial
        5. Cleanup happens even if errors occur

        Raises:
            RuntimeError: If cleanup fails (after resetting initialization
                state).
        """
        if not self._is_setup:
            logger.debug(
                f"{self.__class__.__name__} not initialized, skipping cleanup"
            )
            return

        # Collect errors instead of failing fast so every cleanup step runs.
        errors = []
        try:
            # Clear template cache
            if hasattr(self, '_template_cache'):
                try:
                    self._template_cache.clear()
                except Exception as e:
                    errors.append(f"Failed to clear template cache: {e}")

            # Shutdown parallel processing
            if self._batch_processor is not None:
                try:
                    # Get final performance metrics before cleanup
                    metrics = self._batch_processor.get_performance_metrics()
                    logger.info(f"Batch processor final metrics: {metrics}")
                except Exception as e:
                    errors.append(
                        f"Failed to get batch processor metrics: {e}"
                    )

            # Preserve the full init config in metadata. Fix: previously only
            # a subset of the keys stored by __init__ was kept here, silently
            # dropping monitoring_interval, cpu_threshold and memory_threshold
            # after a setup/cleanup cycle.
            self._metadata = {
                'cache_templates': self._cache_templates,
                'max_cache_size': self._max_cache_size,
                'extraction_timeout': self._extraction_timeout,
                'batch_size': self._batch_size,
                'monitoring_interval': self._monitoring_interval,
                'cpu_threshold': self._cpu_threshold,
                'memory_threshold': self._memory_threshold,
            }

            if not errors:
                logger.info(
                    f"{self.__class__.__name__} cleaned up successfully"
                )

        except Exception as e:
            errors.append(f"Unexpected error during cleanup: {e}")

        finally:
            # Always mark as uninitialized, even if cleanup fails
            self._is_setup = False
            self._batch_processor = None

            if errors:
                error_msg = (
                    f"Errors during {self.__class__.__name__} cleanup: "
                    f"{'; '.join(errors)}"
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg)

    async def __aenter__(self) -> Self:
        r"""Async context manager entry.

        Returns:
            Self reference for context manager usage.
        """
        await self.setup()
        return self

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        r"""Async context manager exit.

        Args:
            exc_type (Optional[Type[BaseException]]): Exception type if an
                error occurred.
            exc_val (Optional[BaseException]): Exception value if an error
                occurred.
            exc_tb (Optional[TracebackType]): Exception traceback if an error
                occurred.
        """
        await self.cleanup()

    @abstractmethod
    async def extract(
        self, response: str, context: Optional[Dict[str, Any]] = None
    ) -> str:
        r"""Extract relevant parts from a response.

        Extracts:
        1. Final answer or output
        2. Chain of thought reasoning steps
        3. Difficulty assessment

        Args:
            response (str): Raw response from agent generation.
            context (Optional[Dict[str, Any]]): Optional context for
                extraction like:
                - final_answer
                - rationale
                - complexity

        Returns:
            str: Extracted content string.

        Raises:
            ValueError: If response is empty or invalid.
            NotImplementedError: If no implementation is provided.
            RuntimeError: If extractor is not initialized.
        """
        # Shared validation subclasses can reuse via super().extract(...)
        if not self._is_setup:
            raise RuntimeError(
                f"{self.__class__.__name__} must be initialized "
                "before extraction"
            )
        if not response or not response.strip():
            raise ValueError("Empty or whitespace-only response")
        raise NotImplementedError("Subclasses must implement extract()")
@@ -0,0 +1,12 @@
1
# Minimal Python base image; R is layered on top so the interpreter
# container can execute both Python and R code files.
FROM python:3.9-slim

# Install R. --no-install-recommends keeps the image small, and removing
# the apt lists afterwards avoids shipping stale package indexes in the
# layer (both in the same RUN so the cache is never committed).
RUN apt-get update && apt-get install -y --no-install-recommends \
    r-base \
    && rm -rf /var/lib/apt/lists/*

# Set working directory where code files will be copied for execution
WORKDIR /workspace

# Keep container running so the interpreter can exec commands in it later
CMD ["tail", "-f", "/dev/null"]
@@ -52,11 +52,13 @@ class DockerInterpreter(BaseInterpreter):
52
52
  _CODE_EXECUTE_CMD_MAPPING: ClassVar[Dict[str, str]] = {
53
53
  "python": "python {file_name}",
54
54
  "bash": "bash {file_name}",
55
+ "r": "Rscript {file_name}",
55
56
  }
56
57
 
57
58
  _CODE_EXTENSION_MAPPING: ClassVar[Dict[str, str]] = {
58
59
  "python": "py",
59
60
  "bash": "sh",
61
+ "r": "R",
60
62
  }
61
63
 
62
64
  _CODE_TYPE_MAPPING: ClassVar[Dict[str, str]] = {
@@ -67,6 +69,8 @@ class DockerInterpreter(BaseInterpreter):
67
69
  "shell": "bash",
68
70
  "bash": "bash",
69
71
  "sh": "bash",
72
+ "r": "r",
73
+ "R": "r",
70
74
  }
71
75
 
72
76
  def __init__(
@@ -104,8 +108,22 @@ class DockerInterpreter(BaseInterpreter):
104
108
  import docker
105
109
 
106
110
  client = docker.from_env()
111
+
112
+ # Build custom image with Python and R
113
+ dockerfile_path = Path(__file__).parent / "docker"
114
+ image_tag = "camel-interpreter:latest"
115
+ try:
116
+ client.images.get(image_tag)
117
+ except docker.errors.ImageNotFound:
118
+ logger.info("Building custom interpreter image...")
119
+ client.images.build(
120
+ path=str(dockerfile_path),
121
+ tag=image_tag,
122
+ rm=True,
123
+ )
124
+
107
125
  self._container = client.containers.run(
108
- "python:3.10",
126
+ image_tag,
109
127
  detach=True,
110
128
  name=f"camel-interpreter-{uuid.uuid4()}",
111
129
  command="tail -f /dev/null",
@@ -48,11 +48,13 @@ class SubprocessInterpreter(BaseInterpreter):
48
48
  _CODE_EXECUTE_CMD_MAPPING: ClassVar[Dict[str, str]] = {
49
49
  "python": "python {file_name}",
50
50
  "bash": "bash {file_name}",
51
+ "r": "Rscript {file_name}",
51
52
  }
52
53
 
53
54
  _CODE_EXTENSION_MAPPING: ClassVar[Dict[str, str]] = {
54
55
  "python": "py",
55
56
  "bash": "sh",
57
+ "r": "R",
56
58
  }
57
59
 
58
60
  _CODE_TYPE_MAPPING: ClassVar[Dict[str, str]] = {
@@ -63,6 +65,8 @@ class SubprocessInterpreter(BaseInterpreter):
63
65
  "shell": "bash",
64
66
  "bash": "bash",
65
67
  "sh": "bash",
68
+ "r": "r",
69
+ "R": "r",
66
70
  }
67
71
 
68
72
  def __init__(
@@ -98,7 +102,7 @@ class SubprocessInterpreter(BaseInterpreter):
98
102
  if not file.is_file():
99
103
  raise RuntimeError(f"{file} is not a file.")
100
104
  code_type = self._check_code_type(code_type)
101
- if code_type == "python":
105
+ if self._CODE_TYPE_MAPPING[code_type] == "python":
102
106
  # For Python code, use ast to analyze and modify the code
103
107
  import ast
104
108
 
@@ -113,23 +117,41 @@ class SubprocessInterpreter(BaseInterpreter):
113
117
  # Get the last node
114
118
  if tree.body:
115
119
  last_node = tree.body[-1]
116
- # If it's an expression, wrap it in a print
120
+ # Handle expressions that would normally not produce output
121
+ # For example: In a REPL, typing '1 + 2' should show '3'
122
+
117
123
  if isinstance(last_node, ast.Expr):
118
- tree.body[-1] = ast.Expr(
119
- value=ast.Call(
120
- func=ast.Name(id='print', ctx=ast.Load()),
121
- args=[
122
- ast.Call(
123
- func=ast.Name(
124
- id='repr', ctx=ast.Load()
125
- ),
126
- args=[last_node.value],
127
- keywords=[],
128
- )
129
- ],
130
- keywords=[],
124
+ # Only wrap in print(repr()) if it's not already a
125
+ # print call
126
+ if not (
127
+ isinstance(last_node.value, ast.Call)
128
+ and isinstance(last_node.value.func, ast.Name)
129
+ and last_node.value.func.id == 'print'
130
+ ):
131
+ # Transform the AST to wrap the expression in print
132
+ # (repr())
133
+ # Example transformation:
134
+ # Before: x + y
135
+ # After: print(repr(x + y))
136
+ tree.body[-1] = ast.Expr(
137
+ value=ast.Call(
138
+ # Create print() function call
139
+ func=ast.Name(id='print', ctx=ast.Load()),
140
+ args=[
141
+ ast.Call(
142
+ # Create repr() function call
143
+ func=ast.Name(
144
+ id='repr', ctx=ast.Load()
145
+ ),
146
+ # Pass the original expression as
147
+ # argument to repr()
148
+ args=[last_node.value],
149
+ keywords=[],
150
+ )
151
+ ],
152
+ keywords=[],
153
+ )
131
154
  )
132
- )
133
155
  # Fix missing source locations
134
156
  ast.fix_missing_locations(tree)
135
157
  # Convert back to source
@@ -159,7 +181,10 @@ class SubprocessInterpreter(BaseInterpreter):
159
181
  return_code = proc.returncode
160
182
 
161
183
  # Clean up temporary file if it was created
162
- if code_type == "python" and 'temp_file' in locals():
184
+ if (
185
+ self._CODE_TYPE_MAPPING[code_type] == "python"
186
+ and 'temp_file' in locals()
187
+ ):
163
188
  temp_file.unlink()
164
189
 
165
190
  if self.print_stdout and stdout:
camel/loaders/__init__.py CHANGED
@@ -17,6 +17,7 @@ from .base_io import File, create_file, create_file_from_raw_bytes
17
17
  from .chunkr_reader import ChunkrReader
18
18
  from .firecrawl_reader import Firecrawl
19
19
  from .jina_url_reader import JinaURLReader
20
+ from .mineru_extractor import MinerU
20
21
  from .panda_reader import PandaReader
21
22
  from .unstructured_io import UnstructuredIO
22
23
 
@@ -30,4 +31,5 @@ __all__ = [
30
31
  'Apify',
31
32
  'ChunkrReader',
32
33
  'PandaReader',
34
+ 'MinerU',
33
35
  ]
@@ -0,0 +1,250 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ import os
16
+ import time
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ import requests
20
+
21
+ from camel.utils import api_keys_required
22
+
23
+
24
class MinerU:
    r"""Document extraction service supporting OCR, formula recognition
    and tables.

    Args:
        api_key (str, optional): Authentication key for MinerU API service.
            If not provided, will use MINERU_API_KEY environment variable.
            (default: :obj:`None`)
        api_url (str, optional): Base URL endpoint for the MinerU API service.
            (default: :obj:`"https://mineru.net/api/v4"`)

    Note:
        - Single file size limit: 200MB
        - Page limit per file: 600 pages
        - Daily high-priority parsing quota: 2000 pages
        - Some URLs (GitHub, AWS) may timeout due to network restrictions
    """

    # Per-request timeout in seconds. Fix: the HTTP calls previously had no
    # timeout at all, so a stalled connection to the MinerU API would hang
    # the caller indefinitely (requests has no default timeout).
    _REQUEST_TIMEOUT: float = 60.0

    @api_keys_required(
        [
            ("api_key", "MINERU_API_KEY"),
        ]
    )
    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = "https://mineru.net/api/v4",
        is_ocr: bool = False,
        enable_formula: bool = False,
        enable_table: bool = True,
        layout_model: str = "doclayout_yolo",
        language: str = "en",
    ) -> None:
        r"""Initialize MinerU extractor.

        Args:
            api_key (str, optional): Authentication key for MinerU API service.
                If not provided, will use MINERU_API_KEY environment variable.
            api_url (str, optional): Base URL endpoint for MinerU API service.
                (default: "https://mineru.net/api/v4")
            is_ocr (bool, optional): Enable optical character recognition.
                (default: :obj:`False`)
            enable_formula (bool, optional): Enable formula recognition.
                (default: :obj:`False`)
            enable_table (bool, optional): Enable table detection, extraction.
                (default: :obj:`True`)
            layout_model (str, optional): Model for document layout detection.
                Options are 'doclayout_yolo' or 'layoutlmv3'.
                (default: :obj:`"doclayout_yolo"`)
            language (str, optional): Primary language of the document.
                (default: :obj:`"en"`)
        """
        self._api_key = api_key or os.environ.get("MINERU_API_KEY")
        self._api_url = api_url
        self._headers = {
            "Authorization": f"Bearer {self._api_key}",
            "Content-Type": "application/json",
            "Accept": "*/*",
        }
        self.is_ocr = is_ocr
        self.enable_formula = enable_formula
        self.enable_table = enable_table
        self.layout_model = layout_model
        self.language = language

    def extract_url(self, url: str) -> Dict:
        r"""Extract content from a URL document.

        Args:
            url (str): Document URL to extract content from.

        Returns:
            Dict: Task identifier for tracking extraction progress.

        Raises:
            RuntimeError: If the API request fails or returns an
                unexpected payload.
        """
        endpoint = f"{self._api_url}/extract/task"
        payload = {"url": url}

        try:
            response = requests.post(
                endpoint,
                headers=self._headers,
                json=payload,
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()["data"]
        except Exception as e:
            # Chain the original exception so callers can inspect the cause.
            raise RuntimeError(f"Failed to extract URL: {e}") from e

    def batch_extract_urls(
        self,
        files: List[Dict[str, Union[str, bool]]],
    ) -> str:
        r"""Extract content from multiple document URLs in batch.

        Args:
            files (List[Dict[str, Union[str, bool]]]): List of document
                configurations. Each document requires 'url' and optionally
                'is_ocr' and 'data_id' parameters.

        Returns:
            str: Batch identifier for tracking extraction progress.

        Raises:
            RuntimeError: If the API request fails or returns an
                unexpected payload.
        """
        endpoint = f"{self._api_url}/extract/task/batch"
        payload = {"files": files}

        try:
            response = requests.post(
                endpoint,
                headers=self._headers,
                json=payload,
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()["data"]["batch_id"]
        except Exception as e:
            raise RuntimeError(f"Failed to batch extract URLs: {e}") from e

    def get_task_status(self, task_id: str) -> Dict:
        r"""Retrieve status of a single extraction task.

        Args:
            task_id (str): Unique identifier of the extraction task.

        Returns:
            Dict: Current task status and results if completed.

        Raises:
            RuntimeError: If the API request fails.
        """
        endpoint = f"{self._api_url}/extract/task/{task_id}"

        try:
            response = requests.get(
                endpoint,
                headers=self._headers,
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()["data"]
        except Exception as e:
            raise RuntimeError(f"Failed to get task status: {e}") from e

    def get_batch_status(self, batch_id: str) -> Dict:
        r"""Retrieve status of a batch extraction task.

        Args:
            batch_id (str): Unique identifier of the batch extraction task.

        Returns:
            Dict: Current status and results for all documents in the batch.

        Raises:
            RuntimeError: If the API request fails.
        """
        endpoint = f"{self._api_url}/extract-results/batch/{batch_id}"

        try:
            response = requests.get(
                endpoint,
                headers=self._headers,
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()["data"]
        except Exception as e:
            raise RuntimeError(f"Failed to get batch status: {e}") from e

    def wait_for_completion(
        self,
        task_id: str,
        is_batch: bool = False,
        timeout: float = 100,
        check_interval: float = 5,
    ) -> Dict:
        r"""Monitor task until completion or timeout.

        Args:
            task_id (str): Unique identifier of the task or batch.
            is_batch (bool, optional): Indicates if task is a batch operation.
                (default: :obj:`False`)
            timeout (float, optional): Maximum wait time in seconds.
                (default: :obj:`100`)
            check_interval (float, optional): Time between status checks in
                seconds. (default: :obj:`5`)

        Returns:
            Dict: Final task status and extraction results.

        Raises:
            TimeoutError: If task exceeds specified timeout duration.
            RuntimeError: If task fails or encounters processing error.
        """
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(
                    f"Task {task_id} timed out after {timeout}s"
                )

            try:
                status = (
                    self.get_batch_status(task_id)
                    if is_batch
                    else self.get_task_status(task_id)
                )

                if is_batch:
                    # A batch is done only when every document reached 'done';
                    # any 'failed' document aborts the whole wait. Documents
                    # after the first pending one are re-checked next poll.
                    all_done = True
                    failed_tasks = []
                    for result in status.get('extract_result', []):
                        if result.get('state') == 'failed':
                            failed_tasks.append(
                                f"{result.get('data_id')}:"
                                f" {result.get('err_msg')}"
                            )
                        elif result.get('state') != 'done':
                            all_done = False
                            break

                    if failed_tasks:
                        raise RuntimeError(
                            f"Batch tasks failed: {'; '.join(failed_tasks)}"
                        )
                    if all_done:
                        return status
                else:
                    # Check single task status
                    state = status.get('state')
                    if state == 'failed':
                        raise RuntimeError(
                            f"Task failed: {status.get('err_msg')}"
                        )
                    elif state == 'done':
                        return status

            except Exception as e:
                # RuntimeErrors (task failure / API failure) propagate as-is;
                # anything else is wrapped, preserving the original cause.
                if not isinstance(e, RuntimeError):
                    raise RuntimeError(f"Error checking status: {e}") from e
                raise

            time.sleep(check_interval)