camel-ai 0.2.68__py3-none-any.whl → 0.2.69a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of camel-ai might be problematic.

Files changed (38)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +170 -11
  3. camel/configs/vllm_config.py +2 -0
  4. camel/datagen/self_improving_cot.py +1 -1
  5. camel/memories/context_creators/score_based.py +129 -87
  6. camel/runtimes/configs.py +11 -11
  7. camel/runtimes/daytona_runtime.py +4 -4
  8. camel/runtimes/docker_runtime.py +6 -6
  9. camel/runtimes/remote_http_runtime.py +5 -5
  10. camel/societies/workforce/prompts.py +13 -12
  11. camel/societies/workforce/single_agent_worker.py +263 -26
  12. camel/societies/workforce/utils.py +10 -2
  13. camel/societies/workforce/worker.py +21 -45
  14. camel/societies/workforce/workforce.py +43 -17
  15. camel/tasks/task.py +18 -12
  16. camel/toolkits/__init__.py +2 -0
  17. camel/toolkits/aci_toolkit.py +19 -19
  18. camel/toolkits/arxiv_toolkit.py +6 -6
  19. camel/toolkits/dappier_toolkit.py +5 -5
  20. camel/toolkits/file_write_toolkit.py +10 -10
  21. camel/toolkits/function_tool.py +4 -3
  22. camel/toolkits/github_toolkit.py +3 -3
  23. camel/toolkits/non_visual_browser_toolkit/__init__.py +18 -0
  24. camel/toolkits/non_visual_browser_toolkit/actions.py +196 -0
  25. camel/toolkits/non_visual_browser_toolkit/agent.py +278 -0
  26. camel/toolkits/non_visual_browser_toolkit/browser_non_visual_toolkit.py +363 -0
  27. camel/toolkits/non_visual_browser_toolkit/nv_browser_session.py +175 -0
  28. camel/toolkits/non_visual_browser_toolkit/snapshot.js +188 -0
  29. camel/toolkits/non_visual_browser_toolkit/snapshot.py +164 -0
  30. camel/toolkits/pptx_toolkit.py +4 -4
  31. camel/toolkits/sympy_toolkit.py +1 -1
  32. camel/toolkits/task_planning_toolkit.py +3 -3
  33. camel/toolkits/thinking_toolkit.py +1 -1
  34. camel/toolkits/video_analysis_toolkit.py +77 -3
  35. {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/METADATA +5 -1
  36. {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/RECORD +38 -31
  37. {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/WHEEL +0 -0
  38. {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/licenses/LICENSE +0 -0
camel/toolkits/non_visual_browser_toolkit/snapshot.js (new file)
@@ -0,0 +1,188 @@
+ (() => {
+   // Store each element as {text, priority, depth}
+   const elements = [];
+
+   // Maximum lines allowed before we start dropping lower-priority nodes
+   const MAX_LINES = 400;
+
+   // Priority helper – lower number = higher priority
+   function getPriority(tag, role, text) {
+     // 1. Interactive elements
+     if (["input", "button", "a", "select", "textarea"].includes(tag)) return 1;
+     if (["checkbox", "radio"].includes(role)) return 1;
+
+     // 2. Labels / descriptive adjacent text (label elements)
+     if (tag === "label") return 2;
+
+     // 3. General visible text
+     if (text) return 3;
+
+     // 4. Low-value structural nodes
+     return 4;
+   }
+
+   function isVisible(node) {
+     const rect = node.getBoundingClientRect();
+     if (rect.width === 0 || rect.height === 0) return false;
+
+     const style = window.getComputedStyle(node);
+     if (style.display === 'none' || style.visibility === 'hidden') return false;
+
+     return true;
+   }
+
+   function getRole(node) {
+     const tag = node.tagName.toLowerCase();
+     const type = node.getAttribute('type');
+
+     if (node.getAttribute('role')) return node.getAttribute('role');
+
+     if (tag === 'input') {
+       if (type === 'checkbox') return 'checkbox';
+       if (type === 'radio') return 'radio';
+       return 'input';
+     }
+
+     if (tag === 'button') return 'button';
+     if (tag === 'a') return 'link';
+     if (tag === 'select') return 'select';
+     if (tag === 'textarea') return 'textarea';
+     if (tag === 'p') return 'paragraph';
+     if (tag === 'span') return 'text';
+
+     return 'generic';
+   }
+
+   function getAccessibleName(node) {
+     if (node.hasAttribute('aria-label')) {
+       return node.getAttribute('aria-label');
+     }
+     if (node.hasAttribute('aria-labelledby')) {
+       const id = node.getAttribute('aria-labelledby');
+       const labelEl = document.getElementById(id);
+       if (labelEl) return labelEl.textContent.trim();
+     }
+     if (node.hasAttribute('title')) {
+       return node.getAttribute('title');
+     }
+
+     const tagName = node.tagName?.toLowerCase();
+     if (['style', 'script', 'meta', 'noscript', 'svg'].includes(tagName)) {
+       return '';
+     }
+
+     const text = node.textContent?.trim() || '';
+
+     // Ignore styles, tokens, or long CSS-like expressions
+     if (/^[.#]?[a-zA-Z0-9\-_]+\s*\{[^}]*\}/.test(text)) return '';
+     if ((text.match(/[;:{}]/g)?.length || 0) > 2) return '';
+
+     return text.replace(/[^\w\u4e00-\u9fa5\s\-.,?!'"()()]/g, '').trim();
+   }
+
+   let refCounter = 1;
+
+   function traverse(node, depth) {
+     if (node.nodeType !== Node.ELEMENT_NODE) return;
+     if (!isVisible(node)) return;
+
+     const tagName = node.tagName.toLowerCase();
+     const text = getAccessibleName(node).slice(0, 50);
+
+     // Skip unlabeled links (anchors without any accessible name)
+     if (tagName === 'a' && !text) {
+       // Skip unlabeled links; process children if any
+       for (const child of node.children) {
+         traverse(child, depth + 1);
+       }
+       return;
+     }
+
+     const hasRoleOrText = ['button', 'a', 'input', 'select', 'textarea', 'p', 'span'].includes(tagName) ||
+       node.getAttribute('role') || text;
+
+     if (hasRoleOrText) {
+       const role = getRole(node);
+       const ref = `e${refCounter++}`;
+       const label = text ? `"${text}"` : '';
+
+       // Raw line (without indent) – we will apply indentation later once we know
+       // which ancestor lines survive filtering so that indentation always reflects
+       // the visible hierarchy.
+       const lineText = `- ${role}${label ? ` ${label}` : ''} [ref=${ref}]`;
+       const priority = getPriority(tagName, role, text);
+
+       elements.push({ text: lineText, priority, depth });
+
+       // Always inject ref so Playwright can still locate the element even if line is later filtered out.
+       node.setAttribute('aria-ref', ref);
+     }
+
+     for (const child of node.children) {
+       traverse(child, depth + 1);
+     }
+   }
+
+   function processDocument(doc, depth = 0) {
+     try {
+       traverse(doc.body, depth);
+     } catch (e) {
+       // Handle docs without body (e.g., about:blank)
+     }
+
+     const frames = doc.querySelectorAll('iframe');
+     for (const frame of frames) {
+       try {
+         if (frame.contentDocument) {
+           processDocument(frame.contentDocument, depth + 1);
+         }
+       } catch (e) {
+         // Skip cross-origin iframes
+       }
+     }
+   }
+
+   processDocument(document);
+
+   // Always drop priority-4 nodes (low-value structural or invisible)
+   let finalElements = elements.filter(el => el.priority <= 3);
+
+   // Additional size condensation when still exceeding MAX_LINES
+   if (finalElements.length > MAX_LINES) {
+     const filterBy = (maxPriority) => finalElements.filter(el => el.priority <= maxPriority);
+
+     // Progressively tighten: keep 1-3, then 1-2, finally only 1
+     for (const limit of [3, 2, 1]) {
+       const candidate = filterBy(limit);
+       if (candidate.length <= MAX_LINES || limit === 1) {
+         finalElements = candidate;
+         break;
+       }
+     }
+   }
+
+   // ------------------------------------------------------------------
+   // Re-apply indentation so that it matches the *visible* hierarchy only.
+   // Whenever an ancestor element is removed due to priority rules, its
+   // children will be re-indented one level up so the structure remains
+   // intuitive.
+   // ------------------------------------------------------------------
+   const outputLines = [];
+   const depthStack = []; // keeps track of kept original depths
+
+   for (const el of finalElements) {
+     // Pop depths that are not ancestors of current element
+     while (depthStack.length && depthStack[depthStack.length - 1] >= el.depth) {
+       depthStack.pop();
+     }
+
+     // Push the current depth so future descendants know their ancestor chain
+     depthStack.push(el.depth);
+
+     const compressedDepth = depthStack.length - 1; // root level has zero indent
+     const indent = '\t'.repeat(compressedDepth);
+     outputLines.push(indent + el.text);
+   }
+
+   return outputLines.join('\n');
+ })();
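For orientation: each surviving element becomes a line of the form `- role "label" [ref=eN]`, tab-indented to the compressed depth of its kept ancestors, and the same `eN` ref is mirrored into an `aria-ref` attribute so Playwright can still locate the node. A hypothetical login page might therefore snapshot as:

```yaml
- link "Home" [ref=e1]
- input "Email" [ref=e2]
- input "Password" [ref=e3]
- button "Sign in" [ref=e4]
- paragraph "Forgot your password?" [ref=e5]
```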
camel/toolkits/non_visual_browser_toolkit/snapshot.py (new file)
@@ -0,0 +1,164 @@
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Dict, List, Optional
+
+ if TYPE_CHECKING:
+     from playwright.async_api import Page
+
+ # Logging support
+ from camel.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class PageSnapshot:
+     """Utility for capturing YAML-like page snapshots and diff-only
+     variants."""
+
+     MAX_TIMEOUT_MS = 5000  # wait_for_load_state timeout
+
+     def __init__(self, page: "Page"):
+         self.page = page
+         self.snapshot_data: Optional[str] = None  # last full snapshot
+         self._last_url: Optional[str] = None
+         self.last_info: Dict[str, List[int] | bool] = {
+             "is_diff": False,
+             "priorities": [1, 2, 3],
+         }
+
+     # ---------------------------------------------------------------------
+     # Public API
+     # ---------------------------------------------------------------------
+     async def capture(
+         self, *, force_refresh: bool = False, diff_only: bool = False
+     ) -> str:
+         """Return current snapshot or just the diff to previous one."""
+         try:
+             current_url = self.page.url
+
+             # Serve cached copy (unless diff requested)
+             if (
+                 not force_refresh
+                 and current_url == self._last_url
+                 and self.snapshot_data
+                 and not diff_only
+             ):
+                 return self.snapshot_data
+
+             # ensure DOM stability
+             await self.page.wait_for_load_state(
+                 'domcontentloaded', timeout=self.MAX_TIMEOUT_MS
+             )
+
+             logger.debug("Capturing page snapshot …")
+             snapshot_text = await self._get_snapshot_direct()
+             formatted = self._format_snapshot(snapshot_text or "<empty>")
+
+             output = formatted
+             if diff_only and self.snapshot_data:
+                 output = self._compute_diff(self.snapshot_data, formatted)
+
+             # update cache with *full* snapshot (not diff)
+             self._last_url = current_url
+             self.snapshot_data = formatted
+
+             # analyse priorities present (only for non-diff)
+             priorities_included = self._detect_priorities(
+                 formatted if not diff_only else self.snapshot_data or formatted
+             )
+             self.last_info = {
+                 "is_diff": diff_only and self.snapshot_data is not None,
+                 "priorities": priorities_included,
+             }
+
+             logger.debug(
+                 "Snapshot captured. Diff_only=%s, priorities=%s",
+                 diff_only,
+                 self.last_info["priorities"],
+             )
+             return output
+         except Exception as exc:
+             logger.error("Snapshot capture failed: %s", exc)
+             return f"Error: Could not capture page snapshot {exc}"
+
+     # ------------------------------------------------------------------
+     # Internal helpers
+     # ------------------------------------------------------------------
+     _snapshot_js_cache: Optional[str] = None  # class-level cache
+
+     async def _get_snapshot_direct(self) -> Optional[str]:
+         try:
+             if PageSnapshot._snapshot_js_cache is None:
+                 js_path = Path(__file__).parent / "snapshot.js"
+                 PageSnapshot._snapshot_js_cache = js_path.read_text(
+                     encoding="utf-8"
+                 )
+             return await self.page.evaluate(PageSnapshot._snapshot_js_cache)
+         except Exception as e:
+             logger.warning("Failed to execute snapshot JavaScript: %s", e)
+             return None
+
+     @staticmethod
+     def _format_snapshot(text: str) -> str:
+         return "\n".join(["- Page Snapshot", "```yaml", text, "```"])
+
+     @staticmethod
+     def _compute_diff(old: str, new: str) -> str:
+         if not old or not new:
+             return "- Page Snapshot (error: missing data for diff)"
+
+         import difflib
+
+         diff = list(
+             difflib.unified_diff(
+                 old.splitlines(False),
+                 new.splitlines(False),
+                 fromfile='prev',
+                 tofile='curr',
+                 lineterm='',
+             )
+         )
+         if not diff:
+             return "- Page Snapshot (no structural changes)"
+         return "\n".join(["- Page Snapshot (diff)", "```diff", *diff, "```"])
+
+     # ------------------------------------------------------------------
+     def _detect_priorities(self, snapshot_yaml: str) -> List[int]:
+         """Return sorted list of priorities present (1,2,3)."""
+         priorities = set()
+         for line in snapshot_yaml.splitlines():
+             if '[ref=' not in line:
+                 continue
+             lower_line = line.lower()
+             if any(
+                 r in lower_line
+                 for r in (
+                     "input",
+                     "button",
+                     "select",
+                     "textarea",
+                     "checkbox",
+                     "radio",
+                     "link",
+                 )
+             ):
+                 priorities.add(1)
+             elif 'label' in lower_line:
+                 priorities.add(2)
+             else:
+                 priorities.add(3)
+         if not priorities:
+             priorities.add(3)
+         return sorted(priorities)
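A minimal usage sketch of the class above, assuming Playwright and a Chromium build are installed (the URL and selector are illustrative, not part of the package):

```python
import asyncio

from playwright.async_api import async_playwright

from camel.toolkits.non_visual_browser_toolkit.snapshot import PageSnapshot


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")  # illustrative URL

        snapshot = PageSnapshot(page)
        print(await snapshot.capture())  # full YAML-like snapshot, then cached

        await page.click("a")  # illustrative interaction that mutates the DOM
        # Second call returns a unified diff against the cached full snapshot.
        print(await snapshot.capture(diff_only=True))

        await browser.close()


asyncio.run(main())
```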
camel/toolkits/pptx_toolkit.py
@@ -62,7 +62,7 @@ class PPTXToolkit(BaseToolkit):
              output_dir (str): The default directory for output files.
                  Defaults to the current working directory.
              timeout (Optional[float]): The timeout for the toolkit.
-                 (default: :obj: `None`)
+                 (default: :obj:`None`)
          """
          super().__init__(timeout=timeout)
          self.output_dir = Path(output_dir).resolve()
@@ -120,7 +120,7 @@ class PPTXToolkit(BaseToolkit):
              frame_paragraph: The paragraph to format.
              text (str): The text to format.
              set_color_to_white (bool): Whether to set the color to white.
-                 (default: :obj: `False`)
+                 (default: :obj:`False`)
          """
          from pptx.dml.color import RGBColor

@@ -170,7 +170,7 @@ class PPTXToolkit(BaseToolkit):
              flat_items_list (List[Tuple[str, int]]): The list of items to be
                  displayed.
              set_color_to_white (bool): Whether to set the font color to white.
-                 (default: :obj: `False`)
+                 (default: :obj:`False`)
          """
          if not flat_items_list:
              logger.warning("Empty bullet point list provided")
@@ -368,7 +368,7 @@ class PPTXToolkit(BaseToolkit):
                  supplied, it is resolved to self.output_dir.
              template (Optional[str]): The path to the template PPTX file.
                  Initializes a presentation from a given template file Or PPTX
-                 file. (default: :obj: `None`)
+                 file. (default: :obj:`None`)

          Returns:
              str: A success message indicating the file was created.
camel/toolkits/sympy_toolkit.py
@@ -39,7 +39,7 @@ class SymPyToolkit(BaseToolkit):

          Args:
              default_variable (str): The default variable for
-                 operations (default: :obj: `x`)
+                 operations (default: :obj:`x`)
          """
          super().__init__(timeout=timeout)
          self.default_variable = default_variable
camel/toolkits/task_planning_toolkit.py
@@ -32,7 +32,7 @@ class TaskPlanningToolkit(BaseToolkit):

          Args:
              timeout (Optional[float]): The timeout for the toolkit.
-                 (default: :obj: `None`)
+                 (default: :obj:`None`)
          """
          super().__init__(timeout=timeout)

@@ -53,7 +53,7 @@ class TaskPlanningToolkit(BaseToolkit):
                  string is the content for a new sub-task.
              original_task_id (Optional[str]): The id of the task to be
                  decomposed. If not provided, a new id will be generated.
-                 (default: :obj: `None`)
+                 (default: :obj:`None`)

          Returns:
              List[Task]: A list of newly created sub-task objects.
@@ -99,7 +99,7 @@ class TaskPlanningToolkit(BaseToolkit):
              sub_task_contents (List[str]): A list of strings, where each
                  string is the content for a new sub-task.
              original_task_id (Optional[str]): The id of the task to be
-                 decomposed. (default: :obj: `None`)
+                 decomposed. (default: :obj:`None`)

          Returns:
              List[Task]: Reordered or modified tasks.
camel/toolkits/thinking_toolkit.py
@@ -32,7 +32,7 @@ class ThinkingToolkit(BaseToolkit):

          Args:
              timeout (Optional[float]): The timeout for the toolkit.
-                 (default: :obj: `None`)
+                 (default: :obj:`None`)
          """
          super().__init__(timeout=timeout)
          self.plans: List[str] = []
camel/toolkits/video_analysis_toolkit.py
@@ -17,6 +17,7 @@ from __future__ import annotations

  import io
  import os
+ import re
  import tempfile
  from pathlib import Path
  from typing import List, Optional
@@ -41,6 +42,11 @@ VIDEO_QA_PROMPT = """
  Analyze the provided video frames and corresponding audio transcription to \
  answer the given question(s) thoroughly and accurately.

+ The transcriptions may come from two sources:
+ 1. **Audio Transcription**: The spoken words in the video.
+ 2. **Visual Text (OCR)**: Text extracted from the video frames (like \
+ captions, on-screen text, etc.).
+
  Instructions:
  1. Visual Analysis:
     - Examine the video frames to identify visible entities.
@@ -49,11 +55,13 @@ such as size, color, shape, texture, or behavior.
     - Note significant groupings, interactions, or contextual patterns \
  relevant to the analysis.

- 2. Audio Integration:
+ 2. Audio and Text Integration:
     - Use the audio transcription to complement or clarify your visual \
  observations.
+    - Use the visual text (OCR) to get exact textual information that may \
+ not be accurately readable from the images alone.
     - Identify names, descriptions, or contextual hints in the \
- transcription that help confirm or refine your visual analysis.
+ transcriptions that help confirm or refine your visual analysis.

  3. Detailed Reasoning and Justification:
     - Provide a brief explanation of how you identified and distinguished \
@@ -65,7 +73,7 @@ your reasoning.
     - Specify the total number of distinct species or object types \
  identified in the video.
     - Describe the defining characteristics and any supporting evidence \
- from the video and transcription.
+ from the video and transcription sources.

  5. Important Considerations:
     - Pay close attention to subtle differences that could distinguish \
@@ -76,6 +84,9 @@ similar-looking species or objects

  **Audio Transcription:**
  {audio_transcription}

+ **Visual Text (OCR):**
+ {visual_text}
+
  **Question:**
  {question}
  """
@@ -96,6 +107,8 @@ class VideoAnalysisToolkit(BaseToolkit):
              transcription using OpenAI's audio models. Requires a valid OpenAI
              API key. When disabled, video analysis will be based solely on
              visual content. (default: :obj:`False`)
+         use_ocr (bool, optional): Whether to enable OCR for extracting text
+             from video frames. (default: :obj:`False`)
          frame_interval (float, optional): Interval in seconds between frames
              to extract from the video. (default: :obj:`4.0`)
          output_language (str, optional): The language for output responses.
@@ -113,6 +126,7 @@ class VideoAnalysisToolkit(BaseToolkit):
          download_directory: Optional[str] = None,
          model: Optional[BaseModelBackend] = None,
          use_audio_transcription: bool = False,
+         use_ocr: bool = False,
          frame_interval: float = 4.0,
          output_language: str = "English",
          cookies_path: Optional[str] = None,
@@ -122,6 +136,7 @@ class VideoAnalysisToolkit(BaseToolkit):
          self._cleanup = download_directory is None
          self._temp_files: list[str] = []  # Track temporary files for cleanup
          self._use_audio_transcription = use_audio_transcription
+         self._use_ocr = use_ocr
          self.output_language = output_language
          self.frame_interval = frame_interval

@@ -211,6 +226,53 @@ class VideoAnalysisToolkit(BaseToolkit):
                  f"{self._download_directory}: {e}"
              )

+     @dependencies_required("pytesseract", "cv2", "numpy")
+     def _extract_text_from_frame(self, frame: Image.Image) -> str:
+         r"""Extract text from a video frame using OCR.
+
+         Args:
+             frame (Image.Image): PIL image frame to process.
+
+         Returns:
+             str: Extracted text from the frame.
+         """
+         import cv2
+         import numpy as np
+         import pytesseract
+
+         try:
+             # Convert to OpenCV format for preprocessing
+             cv_image = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
+
+             # Preprocessing for better OCR results
+             gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+             blur = cv2.GaussianBlur(gray, (3, 3), 0)
+             _, threshold = cv2.threshold(
+                 blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
+             )
+
+             # Convert back to PIL image for OCR
+             preprocessed_frame = Image.fromarray(threshold)
+             return pytesseract.image_to_string(preprocessed_frame).strip()
+         except Exception as e:
+             logger.error(f"OCR failed: {e}")
+             return ""
+
+     def _process_extracted_text(self, text: str) -> str:
+         r"""Clean and format OCR-extracted text.
+
+         Args:
+             text (str): Raw extracted OCR text.
+
+         Returns:
+             str: Cleaned and formatted text.
+         """
+         # Filter irrelevant characters and noise
+         text = re.sub(r'[^\w\s,.?!:;\'"()-]', '', text)
+         # Remove excessive whitespace
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text
+
      def _extract_audio_from_video(
          self, video_path: str, output_format: str = "mp3"
      ) -> str:
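To see the OCR step in isolation, here is a self-contained sketch of the same grayscale, Gaussian blur, Otsu binarization, and Tesseract pipeline (assuming `opencv-python`, `numpy`, `pytesseract`, and a local Tesseract binary are installed; `frame.png` is a hypothetical input file):

```python
import re

import cv2
import numpy as np
import pytesseract
from PIL import Image

# Hypothetical input frame; in the toolkit this comes from _extract_keyframes.
frame = Image.open("frame.png").convert("RGB")

# Mirror _extract_text_from_frame: grayscale -> Gaussian blur -> Otsu threshold.
cv_image = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (3, 3), 0)
_, threshold = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

raw = pytesseract.image_to_string(Image.fromarray(threshold)).strip()

# Mirror _process_extracted_text: drop noise characters, squeeze whitespace.
clean = re.sub(r'\s+', ' ', re.sub(r'[^\w\s,.?!:;\'"()-]', '', raw)).strip()
print(clean)
```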
@@ -511,9 +573,21 @@ class VideoAnalysisToolkit(BaseToolkit):
          audio_path = self._extract_audio_from_video(video_path)
          audio_transcript = self._transcribe_audio(audio_path)

+         # Extract visual text with OCR
+         visual_text = ""
          video_frames = self._extract_keyframes(video_path)
+         # Build visual text only if OCR is enabled
+         if self._use_ocr:
+             for frame in video_frames:
+                 text = self._extract_text_from_frame(frame)
+                 processed = self._process_extracted_text(text)
+                 if processed:
+                     visual_text += processed + "\n"
+             visual_text = visual_text.strip() or "No visual text detected."
+
          prompt = VIDEO_QA_PROMPT.format(
              audio_transcription=audio_transcript,
+             visual_text=visual_text,
              question=question,
          )

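From user code, the new behavior is opt-in through the constructor; a hedged sketch (the public question-answering method is defined outside this hunk, so the call at the end is an assumption and left commented):

```python
from camel.toolkits.video_analysis_toolkit import VideoAnalysisToolkit

# use_ocr is the flag added in this release: when True, every extracted
# keyframe is OCR'd and the cleaned text is injected into VIDEO_QA_PROMPT
# as {visual_text}; when False, {visual_text} stays empty.
toolkit = VideoAnalysisToolkit(
    use_ocr=True,
    use_audio_transcription=False,
    frame_interval=4.0,
)

# Assumed entry point (not shown in this diff):
# answer = toolkit.ask_question_about_video("clip.mp4", "What text appears on screen?")
```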
camel_ai-0.2.69a2.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: camel-ai
- Version: 0.2.68
+ Version: 0.2.69a2
  Summary: Communicative Agents for AI Society Study
  Project-URL: Homepage, https://www.camel-ai.org/
  Project-URL: Repository, https://github.com/camel-ai/camel
@@ -99,6 +99,7 @@ Requires-Dist: pymupdf<2,>=1.22.5; extra == 'all'
  Requires-Dist: pyobvector>=0.1.18; extra == 'all'
  Requires-Dist: pyowm<4,>=3.3.0; extra == 'all'
  Requires-Dist: pytelegrambotapi<5,>=4.18.0; extra == 'all'
+ Requires-Dist: pytesseract>=0.3.13; extra == 'all'
  Requires-Dist: pytest-asyncio<0.24,>=0.23.0; extra == 'all'
  Requires-Dist: pytest-cov<5,>=4; extra == 'all'
  Requires-Dist: pytest<8,>=7; extra == 'all'
@@ -226,6 +227,7 @@ Provides-Extra: media-tools
  Requires-Dist: ffmpeg-python<0.3,>=0.2.0; extra == 'media-tools'
  Requires-Dist: imageio[pyav]<3,>=2.34.2; extra == 'media-tools'
  Requires-Dist: pydub<0.26,>=0.25.1; extra == 'media-tools'
+ Requires-Dist: pytesseract>=0.3.13; extra == 'media-tools'
  Requires-Dist: scenedetect>=0.6.5.2; extra == 'media-tools'
  Requires-Dist: yt-dlp<2025,>=2024.11.4; extra == 'media-tools'
  Provides-Extra: model-platforms
@@ -252,6 +254,7 @@ Requires-Dist: ffmpeg-python<0.3,>=0.2.0; extra == 'owl'
  Requires-Dist: fpdf>=1.7.2; extra == 'owl'
  Requires-Dist: html2text>=2024.2.26; extra == 'owl'
  Requires-Dist: imageio[pyav]<3,>=2.34.2; extra == 'owl'
+ Requires-Dist: markitdown==0.1.1; extra == 'owl'
  Requires-Dist: mcp-server-fetch==2025.1.17; extra == 'owl'
  Requires-Dist: mcp-simple-arxiv==0.2.2; extra == 'owl'
  Requires-Dist: newspaper3k<0.3,>=0.2.8; extra == 'owl'
@@ -266,6 +269,7 @@ Requires-Dist: pyautogui<0.10,>=0.9.54; extra == 'owl'
  Requires-Dist: pydub<0.26,>=0.25.1; extra == 'owl'
  Requires-Dist: pylatex>=1.4.2; extra == 'owl'
  Requires-Dist: pymupdf<2,>=1.22.5; extra == 'owl'
+ Requires-Dist: pytesseract>=0.3.13; extra == 'owl'
  Requires-Dist: python-dotenv<2,>=1.0.0; extra == 'owl'
  Requires-Dist: python-pptx>=1.0.2; extra == 'owl'
  Requires-Dist: requests-oauthlib<2,>=1.3.1; extra == 'owl'