camel-ai 0.2.68__py3-none-any.whl → 0.2.69a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +170 -11
- camel/configs/vllm_config.py +2 -0
- camel/datagen/self_improving_cot.py +1 -1
- camel/memories/context_creators/score_based.py +129 -87
- camel/runtimes/configs.py +11 -11
- camel/runtimes/daytona_runtime.py +4 -4
- camel/runtimes/docker_runtime.py +6 -6
- camel/runtimes/remote_http_runtime.py +5 -5
- camel/societies/workforce/prompts.py +13 -12
- camel/societies/workforce/single_agent_worker.py +263 -26
- camel/societies/workforce/utils.py +10 -2
- camel/societies/workforce/worker.py +21 -45
- camel/societies/workforce/workforce.py +43 -17
- camel/tasks/task.py +18 -12
- camel/toolkits/__init__.py +2 -0
- camel/toolkits/aci_toolkit.py +19 -19
- camel/toolkits/arxiv_toolkit.py +6 -6
- camel/toolkits/dappier_toolkit.py +5 -5
- camel/toolkits/file_write_toolkit.py +10 -10
- camel/toolkits/function_tool.py +4 -3
- camel/toolkits/github_toolkit.py +3 -3
- camel/toolkits/non_visual_browser_toolkit/__init__.py +18 -0
- camel/toolkits/non_visual_browser_toolkit/actions.py +196 -0
- camel/toolkits/non_visual_browser_toolkit/agent.py +278 -0
- camel/toolkits/non_visual_browser_toolkit/browser_non_visual_toolkit.py +363 -0
- camel/toolkits/non_visual_browser_toolkit/nv_browser_session.py +175 -0
- camel/toolkits/non_visual_browser_toolkit/snapshot.js +188 -0
- camel/toolkits/non_visual_browser_toolkit/snapshot.py +164 -0
- camel/toolkits/pptx_toolkit.py +4 -4
- camel/toolkits/sympy_toolkit.py +1 -1
- camel/toolkits/task_planning_toolkit.py +3 -3
- camel/toolkits/thinking_toolkit.py +1 -1
- camel/toolkits/video_analysis_toolkit.py +77 -3
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/METADATA +5 -1
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/RECORD +38 -31
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
(() => {
  // Collect candidate snapshot lines as {text, priority, depth} records.
  const elements = [];

  // Maximum lines allowed before we start dropping lower-priority nodes.
  const MAX_LINES = 400;

  // Priority helper – lower number = higher priority.
  function getPriority(tag, role, text) {
    // 1. Interactive elements
    if (["input", "button", "a", "select", "textarea"].includes(tag)) return 1;
    if (["checkbox", "radio"].includes(role)) return 1;

    // 2. Labels / descriptive adjacent text (label elements)
    if (tag === "label") return 2;

    // 3. General visible text
    if (text) return 3;

    // 4. Low-value structural nodes
    return 4;
  }

  // Visibility check based on layout box and computed style.
  function isVisible(node) {
    const rect = node.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) return false;

    const style = window.getComputedStyle(node);
    if (style.display === 'none' || style.visibility === 'hidden') return false;

    return true;
  }

  // Map an element to a coarse ARIA-like role string; an explicit
  // role attribute always wins.
  function getRole(node) {
    const tag = node.tagName.toLowerCase();
    const type = node.getAttribute('type');

    if (node.getAttribute('role')) return node.getAttribute('role');

    if (tag === 'input') {
      if (type === 'checkbox') return 'checkbox';
      if (type === 'radio') return 'radio';
      return 'input';
    }

    if (tag === 'button') return 'button';
    if (tag === 'a') return 'link';
    if (tag === 'select') return 'select';
    if (tag === 'textarea') return 'textarea';
    if (tag === 'p') return 'paragraph';
    if (tag === 'span') return 'text';

    return 'generic';
  }

  // Best-effort accessible name: aria-label, then aria-labelledby,
  // then title, then filtered textContent.
  function getAccessibleName(node) {
    if (node.hasAttribute('aria-label')) {
      return node.getAttribute('aria-label');
    }
    if (node.hasAttribute('aria-labelledby')) {
      // FIX: aria-labelledby may hold several space-separated element ids;
      // per the ARIA accessible-name computation, the name is the
      // concatenated text of ALL referenced elements, not just the first.
      const ids = node.getAttribute('aria-labelledby').split(/\s+/).filter(Boolean);
      const parts = [];
      for (const id of ids) {
        const labelEl = document.getElementById(id);
        if (labelEl) parts.push(labelEl.textContent.trim());
      }
      if (parts.length > 0) return parts.join(' ');
      // Fall through when none of the referenced ids resolve.
    }
    if (node.hasAttribute('title')) {
      return node.getAttribute('title');
    }

    const tagName = node.tagName?.toLowerCase();
    if (['style', 'script', 'meta', 'noscript', 'svg'].includes(tagName)) {
      return '';
    }

    const text = node.textContent?.trim() || '';

    // Ignore styles, tokens, or long CSS-like expressions
    if (/^[.#]?[a-zA-Z0-9\-_]+\s*\{[^}]*\}/.test(text)) return '';
    if ((text.match(/[;:{}]/g)?.length || 0) > 2) return '';

    // NOTE(review): the trailing "()()" in this class may have been a
    // fullwidth pair （） in the upstream source garbled by rendering —
    // kept verbatim; confirm against the published wheel.
    return text.replace(/[^\w\u4e00-\u9fa5\s\-.,?!'"()()]/g, '').trim();
  }

  let refCounter = 1;

  // Depth-first walk; records one line per "interesting" element and tags
  // it with an aria-ref attribute so Playwright can locate it later.
  function traverse(node, depth) {
    if (node.nodeType !== Node.ELEMENT_NODE) return;
    if (!isVisible(node)) return;

    const tagName = node.tagName.toLowerCase();
    const text = getAccessibleName(node).slice(0, 50);

    // Skip unlabeled links (anchors without any accessible name)
    if (tagName === 'a' && !text) {
      // Skip unlabeled links; process children if any
      for (const child of node.children) {
        traverse(child, depth + 1);
      }
      return;
    }

    const hasRoleOrText = ['button', 'a', 'input', 'select', 'textarea', 'p', 'span'].includes(tagName) ||
      node.getAttribute('role') || text;

    if (hasRoleOrText) {
      const role = getRole(node);
      const ref = `e${refCounter++}`;
      const label = text ? `"${text}"` : '';

      // Raw line (without indent) – we will apply indentation later once we know
      // which ancestor lines survive filtering so that indentation always reflects
      // the visible hierarchy.
      const lineText = `- ${role}${label ? ` ${label}` : ''} [ref=${ref}]`;
      const priority = getPriority(tagName, role, text);

      elements.push({ text: lineText, priority, depth });

      // Always inject ref so Playwright can still locate the element even if line is later filtered out.
      node.setAttribute('aria-ref', ref);
    }

    for (const child of node.children) {
      traverse(child, depth + 1);
    }
  }

  // Walk a document's body, then recurse into same-origin iframes.
  function processDocument(doc, depth = 0) {
    try {
      traverse(doc.body, depth);
    } catch (e) {
      // Handle docs without body (e.g., about:blank)
    }

    const frames = doc.querySelectorAll('iframe');
    for (const frame of frames) {
      try {
        if (frame.contentDocument) {
          processDocument(frame.contentDocument, depth + 1);
        }
      } catch (e) {
        // Skip cross-origin iframes
      }
    }
  }

  processDocument(document);

  // Always drop priority-4 nodes (low-value structural or invisible)
  let finalElements = elements.filter(el => el.priority <= 3);

  // Additional size condensation when still exceeding MAX_LINES
  if (finalElements.length > MAX_LINES) {
    const filterBy = (maxPriority) => finalElements.filter(el => el.priority <= maxPriority);

    // Progressively tighten: keep 1-3, then 1-2, finally only 1
    for (const limit of [3, 2, 1]) {
      const candidate = filterBy(limit);
      if (candidate.length <= MAX_LINES || limit === 1) {
        finalElements = candidate;
        break;
      }
    }
  }

  // ------------------------------------------------------------------
  // Re-apply indentation so that it matches the *visible* hierarchy only.
  // Whenever an ancestor element is removed due to priority rules, its
  // children will be re-indented one level up so the structure remains
  // intuitive.
  // ------------------------------------------------------------------
  const outputLines = [];
  const depthStack = []; // keeps track of kept original depths

  for (const el of finalElements) {
    // Pop depths that are not ancestors of current element
    while (depthStack.length && depthStack[depthStack.length - 1] >= el.depth) {
      depthStack.pop();
    }

    // Push the current depth so future descendants know their ancestor chain
    depthStack.push(el.depth);

    const compressedDepth = depthStack.length - 1; // root level has zero indent
    const indent = '\t'.repeat(compressedDepth);
    outputLines.push(indent + el.text);
  }

  return outputLines.join('\n');
})();
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from playwright.async_api import Page
|
|
19
|
+
|
|
20
|
+
# Logging support
|
|
21
|
+
from camel.logger import get_logger
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PageSnapshot:
    r"""Utility for capturing YAML-like page snapshots and diff-only
    variants.

    The snapshot text itself is produced by ``snapshot.js`` evaluated in
    the page; this class adds URL-based caching, unified diffing against
    the previous snapshot, and detection of which priority classes of
    elements are present.
    """

    # Timeout (ms) passed to ``page.wait_for_load_state``.
    MAX_TIMEOUT_MS = 5000  # wait_for_load_state timeout

    def __init__(self, page: "Page"):
        self.page = page
        # Last *full* formatted snapshot (never a diff).
        self.snapshot_data: Optional[str] = None  # last full snapshot
        # URL the cached snapshot was captured from.
        self._last_url: Optional[str] = None
        # Metadata describing the most recent ``capture`` call.
        self.last_info: Dict[str, List[int] | bool] = {
            "is_diff": False,
            "priorities": [1, 2, 3],
        }

    # ---------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------
    async def capture(
        self, *, force_refresh: bool = False, diff_only: bool = False
    ) -> str:
        r"""Return current snapshot or just the diff to previous one.

        Args:
            force_refresh (bool): Bypass the URL-based cache and re-run the
                snapshot script. (default: :obj:`False`)
            diff_only (bool): Return a unified diff against the previous
                snapshot instead of the full text; falls back to the full
                snapshot when no previous one exists.
                (default: :obj:`False`)

        Returns:
            str: Formatted snapshot (or diff), or an error string when
                capture fails.
        """
        try:
            current_url = self.page.url

            # Serve cached copy (unless diff requested)
            if (
                not force_refresh
                and current_url == self._last_url
                and self.snapshot_data
                and not diff_only
            ):
                return self.snapshot_data

            # ensure DOM stability
            await self.page.wait_for_load_state(
                'domcontentloaded', timeout=self.MAX_TIMEOUT_MS
            )

            logger.debug("Capturing page snapshot …")
            snapshot_text = await self._get_snapshot_direct()
            formatted = self._format_snapshot(snapshot_text or "<empty>")

            # Fix: remember whether a previous snapshot existed *before*
            # overwriting the cache. The original evaluated
            # ``self.snapshot_data is not None`` after the assignment
            # below, so ``is_diff`` was wrongly True on the first
            # diff-only capture even though no diff was produced.
            had_previous = bool(self.snapshot_data)

            output = formatted
            if diff_only and had_previous:
                output = self._compute_diff(self.snapshot_data, formatted)

            # update cache with *full* snapshot (not diff)
            self._last_url = current_url
            self.snapshot_data = formatted

            # Analyse priorities present in the freshly captured full
            # snapshot (at this point the cache equals ``formatted``).
            priorities_included = self._detect_priorities(formatted)
            self.last_info = {
                "is_diff": diff_only and had_previous,
                "priorities": priorities_included,
            }

            logger.debug(
                "Snapshot captured. Diff_only=%s, priorities=%s",
                diff_only,
                self.last_info["priorities"],
            )
            return output
        except Exception as exc:
            logger.error("Snapshot capture failed: %s", exc)
            return f"Error: Could not capture page snapshot {exc}"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    # Class-level cache of the snapshot.js source so the file is read once
    # per process rather than per instance.
    _snapshot_js_cache: Optional[str] = None  # class-level cache

    async def _get_snapshot_direct(self) -> Optional[str]:
        r"""Evaluate ``snapshot.js`` in the page and return its raw output.

        Returns:
            Optional[str]: Raw snapshot text, or ``None`` if evaluation
                failed (e.g., script error or navigation race).
        """
        try:
            if PageSnapshot._snapshot_js_cache is None:
                js_path = Path(__file__).parent / "snapshot.js"
                PageSnapshot._snapshot_js_cache = js_path.read_text(
                    encoding="utf-8"
                )
            return await self.page.evaluate(PageSnapshot._snapshot_js_cache)
        except Exception as e:
            logger.warning("Failed to execute snapshot JavaScript: %s", e)
            return None

    @staticmethod
    def _format_snapshot(text: str) -> str:
        r"""Wrap raw snapshot text in the standard YAML fenced block."""
        return "\n".join(["- Page Snapshot", "```yaml", text, "```"])

    @staticmethod
    def _compute_diff(old: str, new: str) -> str:
        r"""Return a unified diff between two formatted snapshots.

        Args:
            old (str): Previous formatted snapshot.
            new (str): Current formatted snapshot.

        Returns:
            str: A fenced ``diff`` block, a "no structural changes"
                marker, or an error marker when either input is empty.
        """
        if not old or not new:
            return "- Page Snapshot (error: missing data for diff)"

        import difflib

        diff = list(
            difflib.unified_diff(
                old.splitlines(False),
                new.splitlines(False),
                fromfile='prev',
                tofile='curr',
                lineterm='',
            )
        )
        if not diff:
            return "- Page Snapshot (no structural changes)"
        return "\n".join(["- Page Snapshot (diff)", "```diff", *diff, "```"])

    # ------------------------------------------------------------------
    def _detect_priorities(self, snapshot_yaml: str) -> List[int]:
        r"""Return sorted list of priorities present (1,2,3).

        Priorities are inferred from role keywords on ``[ref=...]`` lines:
        interactive roles map to 1, labels to 2, everything else to 3.
        An empty snapshot defaults to ``[3]``.
        """
        priorities = set()
        for line in snapshot_yaml.splitlines():
            if '[ref=' not in line:
                continue
            lower_line = line.lower()
            if any(
                r in lower_line
                for r in (
                    "input",
                    "button",
                    "select",
                    "textarea",
                    "checkbox",
                    "radio",
                    "link",
                )
            ):
                priorities.add(1)
            elif 'label' in lower_line:
                priorities.add(2)
            else:
                priorities.add(3)
        if not priorities:
            priorities.add(3)
        return sorted(priorities)
|
camel/toolkits/pptx_toolkit.py
CHANGED
|
@@ -62,7 +62,7 @@ class PPTXToolkit(BaseToolkit):
|
|
|
62
62
|
output_dir (str): The default directory for output files.
|
|
63
63
|
Defaults to the current working directory.
|
|
64
64
|
timeout (Optional[float]): The timeout for the toolkit.
|
|
65
|
-
(default: :obj
|
|
65
|
+
(default: :obj:`None`)
|
|
66
66
|
"""
|
|
67
67
|
super().__init__(timeout=timeout)
|
|
68
68
|
self.output_dir = Path(output_dir).resolve()
|
|
@@ -120,7 +120,7 @@ class PPTXToolkit(BaseToolkit):
|
|
|
120
120
|
frame_paragraph: The paragraph to format.
|
|
121
121
|
text (str): The text to format.
|
|
122
122
|
set_color_to_white (bool): Whether to set the color to white.
|
|
123
|
-
(default: :obj
|
|
123
|
+
(default: :obj:`False`)
|
|
124
124
|
"""
|
|
125
125
|
from pptx.dml.color import RGBColor
|
|
126
126
|
|
|
@@ -170,7 +170,7 @@ class PPTXToolkit(BaseToolkit):
|
|
|
170
170
|
flat_items_list (List[Tuple[str, int]]): The list of items to be
|
|
171
171
|
displayed.
|
|
172
172
|
set_color_to_white (bool): Whether to set the font color to white.
|
|
173
|
-
(default: :obj
|
|
173
|
+
(default: :obj:`False`)
|
|
174
174
|
"""
|
|
175
175
|
if not flat_items_list:
|
|
176
176
|
logger.warning("Empty bullet point list provided")
|
|
@@ -368,7 +368,7 @@ class PPTXToolkit(BaseToolkit):
|
|
|
368
368
|
supplied, it is resolved to self.output_dir.
|
|
369
369
|
template (Optional[str]): The path to the template PPTX file.
|
|
370
370
|
Initializes a presentation from a given template file Or PPTX
|
|
371
|
-
file. (default: :obj
|
|
371
|
+
file. (default: :obj:`None`)
|
|
372
372
|
|
|
373
373
|
Returns:
|
|
374
374
|
str: A success message indicating the file was created.
|
camel/toolkits/sympy_toolkit.py
CHANGED
|
@@ -32,7 +32,7 @@ class TaskPlanningToolkit(BaseToolkit):
|
|
|
32
32
|
|
|
33
33
|
Args:
|
|
34
34
|
timeout (Optional[float]): The timeout for the toolkit.
|
|
35
|
-
(default: :obj
|
|
35
|
+
(default: :obj:`None`)
|
|
36
36
|
"""
|
|
37
37
|
super().__init__(timeout=timeout)
|
|
38
38
|
|
|
@@ -53,7 +53,7 @@ class TaskPlanningToolkit(BaseToolkit):
|
|
|
53
53
|
string is the content for a new sub-task.
|
|
54
54
|
original_task_id (Optional[str]): The id of the task to be
|
|
55
55
|
decomposed. If not provided, a new id will be generated.
|
|
56
|
-
(default: :obj
|
|
56
|
+
(default: :obj:`None`)
|
|
57
57
|
|
|
58
58
|
Returns:
|
|
59
59
|
List[Task]: A list of newly created sub-task objects.
|
|
@@ -99,7 +99,7 @@ class TaskPlanningToolkit(BaseToolkit):
|
|
|
99
99
|
sub_task_contents (List[str]): A list of strings, where each
|
|
100
100
|
string is the content for a new sub-task.
|
|
101
101
|
original_task_id (Optional[str]): The id of the task to be
|
|
102
|
-
decomposed. (default: :obj
|
|
102
|
+
decomposed. (default: :obj:`None`)
|
|
103
103
|
|
|
104
104
|
Returns:
|
|
105
105
|
List[Task]: Reordered or modified tasks.
|
|
@@ -17,6 +17,7 @@ from __future__ import annotations
|
|
|
17
17
|
|
|
18
18
|
import io
|
|
19
19
|
import os
|
|
20
|
+
import re
|
|
20
21
|
import tempfile
|
|
21
22
|
from pathlib import Path
|
|
22
23
|
from typing import List, Optional
|
|
@@ -41,6 +42,11 @@ VIDEO_QA_PROMPT = """
|
|
|
41
42
|
Analyze the provided video frames and corresponding audio transcription to \
|
|
42
43
|
answer the given question(s) thoroughly and accurately.
|
|
43
44
|
|
|
45
|
+
The transcriptions may come from two sources:
|
|
46
|
+
1. **Audio Transcription**: The spoken words in the video.
|
|
47
|
+
2. **Visual Text (OCR)**: Text extracted from the video frames (like \
|
|
48
|
+
captions, on-screen text, etc.).
|
|
49
|
+
|
|
44
50
|
Instructions:
|
|
45
51
|
1. Visual Analysis:
|
|
46
52
|
- Examine the video frames to identify visible entities.
|
|
@@ -49,11 +55,13 @@ such as size, color, shape, texture, or behavior.
|
|
|
49
55
|
- Note significant groupings, interactions, or contextual patterns \
|
|
50
56
|
relevant to the analysis.
|
|
51
57
|
|
|
52
|
-
2. Audio Integration:
|
|
58
|
+
2. Audio and Text Integration:
|
|
53
59
|
- Use the audio transcription to complement or clarify your visual \
|
|
54
60
|
observations.
|
|
61
|
+
- Use the visual text (OCR) to get exact textual information that may \
|
|
62
|
+
not be accurately readable from the images alone.
|
|
55
63
|
- Identify names, descriptions, or contextual hints in the \
|
|
56
|
-
|
|
64
|
+
transcriptions that help confirm or refine your visual analysis.
|
|
57
65
|
|
|
58
66
|
3. Detailed Reasoning and Justification:
|
|
59
67
|
- Provide a brief explanation of how you identified and distinguished \
|
|
@@ -65,7 +73,7 @@ your reasoning.
|
|
|
65
73
|
- Specify the total number of distinct species or object types \
|
|
66
74
|
identified in the video.
|
|
67
75
|
- Describe the defining characteristics and any supporting evidence \
|
|
68
|
-
from the video and transcription.
|
|
76
|
+
from the video and transcription sources.
|
|
69
77
|
|
|
70
78
|
5. Important Considerations:
|
|
71
79
|
- Pay close attention to subtle differences that could distinguish \
|
|
@@ -76,6 +84,9 @@ similar-looking species or objects
|
|
|
76
84
|
**Audio Transcription:**
|
|
77
85
|
{audio_transcription}
|
|
78
86
|
|
|
87
|
+
**Visual Text (OCR):**
|
|
88
|
+
{visual_text}
|
|
89
|
+
|
|
79
90
|
**Question:**
|
|
80
91
|
{question}
|
|
81
92
|
"""
|
|
@@ -96,6 +107,8 @@ class VideoAnalysisToolkit(BaseToolkit):
|
|
|
96
107
|
transcription using OpenAI's audio models. Requires a valid OpenAI
|
|
97
108
|
API key. When disabled, video analysis will be based solely on
|
|
98
109
|
visual content. (default: :obj:`False`)
|
|
110
|
+
use_ocr (bool, optional): Whether to enable OCR for extracting text
|
|
111
|
+
from video frames. (default: :obj:`False`)
|
|
99
112
|
frame_interval (float, optional): Interval in seconds between frames
|
|
100
113
|
to extract from the video. (default: :obj:`4.0`)
|
|
101
114
|
output_language (str, optional): The language for output responses.
|
|
@@ -113,6 +126,7 @@ class VideoAnalysisToolkit(BaseToolkit):
|
|
|
113
126
|
download_directory: Optional[str] = None,
|
|
114
127
|
model: Optional[BaseModelBackend] = None,
|
|
115
128
|
use_audio_transcription: bool = False,
|
|
129
|
+
use_ocr: bool = False,
|
|
116
130
|
frame_interval: float = 4.0,
|
|
117
131
|
output_language: str = "English",
|
|
118
132
|
cookies_path: Optional[str] = None,
|
|
@@ -122,6 +136,7 @@ class VideoAnalysisToolkit(BaseToolkit):
|
|
|
122
136
|
self._cleanup = download_directory is None
|
|
123
137
|
self._temp_files: list[str] = [] # Track temporary files for cleanup
|
|
124
138
|
self._use_audio_transcription = use_audio_transcription
|
|
139
|
+
self._use_ocr = use_ocr
|
|
125
140
|
self.output_language = output_language
|
|
126
141
|
self.frame_interval = frame_interval
|
|
127
142
|
|
|
@@ -211,6 +226,53 @@ class VideoAnalysisToolkit(BaseToolkit):
|
|
|
211
226
|
f"{self._download_directory}: {e}"
|
|
212
227
|
)
|
|
213
228
|
|
|
229
|
+
@dependencies_required("pytesseract", "cv2", "numpy")
|
|
230
|
+
def _extract_text_from_frame(self, frame: Image.Image) -> str:
|
|
231
|
+
r"""Extract text from a video frame using OCR.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
frame (Image.Image): PIL image frame to process.
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
str: Extracted text from the frame.
|
|
238
|
+
"""
|
|
239
|
+
import cv2
|
|
240
|
+
import numpy as np
|
|
241
|
+
import pytesseract
|
|
242
|
+
|
|
243
|
+
try:
|
|
244
|
+
# Convert to OpenCV format for preprocessing
|
|
245
|
+
cv_image = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
|
|
246
|
+
|
|
247
|
+
# Preprocessing for better OCR results
|
|
248
|
+
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
|
|
249
|
+
blur = cv2.GaussianBlur(gray, (3, 3), 0)
|
|
250
|
+
_, threshold = cv2.threshold(
|
|
251
|
+
blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
# Convert back to PIL image for OCR
|
|
255
|
+
preprocessed_frame = Image.fromarray(threshold)
|
|
256
|
+
return pytesseract.image_to_string(preprocessed_frame).strip()
|
|
257
|
+
except Exception as e:
|
|
258
|
+
logger.error(f"OCR failed: {e}")
|
|
259
|
+
return ""
|
|
260
|
+
|
|
261
|
+
def _process_extracted_text(self, text: str) -> str:
|
|
262
|
+
r"""Clean and format OCR-extracted text.
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
text (str): Raw extracted OCR text.
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
str: Cleaned and formatted text.
|
|
269
|
+
"""
|
|
270
|
+
# Filter irrelevant characters and noise
|
|
271
|
+
text = re.sub(r'[^\w\s,.?!:;\'"()-]', '', text)
|
|
272
|
+
# Remove excessive whitespace
|
|
273
|
+
text = re.sub(r'\s+', ' ', text).strip()
|
|
274
|
+
return text
|
|
275
|
+
|
|
214
276
|
def _extract_audio_from_video(
|
|
215
277
|
self, video_path: str, output_format: str = "mp3"
|
|
216
278
|
) -> str:
|
|
@@ -511,9 +573,21 @@ class VideoAnalysisToolkit(BaseToolkit):
|
|
|
511
573
|
audio_path = self._extract_audio_from_video(video_path)
|
|
512
574
|
audio_transcript = self._transcribe_audio(audio_path)
|
|
513
575
|
|
|
576
|
+
# Extract visual text with OCR
|
|
577
|
+
visual_text = ""
|
|
514
578
|
video_frames = self._extract_keyframes(video_path)
|
|
579
|
+
# Build visual text only if OCR is enabled
|
|
580
|
+
if self._use_ocr:
|
|
581
|
+
for frame in video_frames:
|
|
582
|
+
text = self._extract_text_from_frame(frame)
|
|
583
|
+
processed = self._process_extracted_text(text)
|
|
584
|
+
if processed:
|
|
585
|
+
visual_text += processed + "\n"
|
|
586
|
+
visual_text = visual_text.strip() or "No visual text detected."
|
|
587
|
+
|
|
515
588
|
prompt = VIDEO_QA_PROMPT.format(
|
|
516
589
|
audio_transcription=audio_transcript,
|
|
590
|
+
visual_text=visual_text,
|
|
517
591
|
question=question,
|
|
518
592
|
)
|
|
519
593
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: camel-ai
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.69a2
|
|
4
4
|
Summary: Communicative Agents for AI Society Study
|
|
5
5
|
Project-URL: Homepage, https://www.camel-ai.org/
|
|
6
6
|
Project-URL: Repository, https://github.com/camel-ai/camel
|
|
@@ -99,6 +99,7 @@ Requires-Dist: pymupdf<2,>=1.22.5; extra == 'all'
|
|
|
99
99
|
Requires-Dist: pyobvector>=0.1.18; extra == 'all'
|
|
100
100
|
Requires-Dist: pyowm<4,>=3.3.0; extra == 'all'
|
|
101
101
|
Requires-Dist: pytelegrambotapi<5,>=4.18.0; extra == 'all'
|
|
102
|
+
Requires-Dist: pytesseract>=0.3.13; extra == 'all'
|
|
102
103
|
Requires-Dist: pytest-asyncio<0.24,>=0.23.0; extra == 'all'
|
|
103
104
|
Requires-Dist: pytest-cov<5,>=4; extra == 'all'
|
|
104
105
|
Requires-Dist: pytest<8,>=7; extra == 'all'
|
|
@@ -226,6 +227,7 @@ Provides-Extra: media-tools
|
|
|
226
227
|
Requires-Dist: ffmpeg-python<0.3,>=0.2.0; extra == 'media-tools'
|
|
227
228
|
Requires-Dist: imageio[pyav]<3,>=2.34.2; extra == 'media-tools'
|
|
228
229
|
Requires-Dist: pydub<0.26,>=0.25.1; extra == 'media-tools'
|
|
230
|
+
Requires-Dist: pytesseract>=0.3.13; extra == 'media-tools'
|
|
229
231
|
Requires-Dist: scenedetect>=0.6.5.2; extra == 'media-tools'
|
|
230
232
|
Requires-Dist: yt-dlp<2025,>=2024.11.4; extra == 'media-tools'
|
|
231
233
|
Provides-Extra: model-platforms
|
|
@@ -252,6 +254,7 @@ Requires-Dist: ffmpeg-python<0.3,>=0.2.0; extra == 'owl'
|
|
|
252
254
|
Requires-Dist: fpdf>=1.7.2; extra == 'owl'
|
|
253
255
|
Requires-Dist: html2text>=2024.2.26; extra == 'owl'
|
|
254
256
|
Requires-Dist: imageio[pyav]<3,>=2.34.2; extra == 'owl'
|
|
257
|
+
Requires-Dist: markitdown==0.1.1; extra == 'owl'
|
|
255
258
|
Requires-Dist: mcp-server-fetch==2025.1.17; extra == 'owl'
|
|
256
259
|
Requires-Dist: mcp-simple-arxiv==0.2.2; extra == 'owl'
|
|
257
260
|
Requires-Dist: newspaper3k<0.3,>=0.2.8; extra == 'owl'
|
|
@@ -266,6 +269,7 @@ Requires-Dist: pyautogui<0.10,>=0.9.54; extra == 'owl'
|
|
|
266
269
|
Requires-Dist: pydub<0.26,>=0.25.1; extra == 'owl'
|
|
267
270
|
Requires-Dist: pylatex>=1.4.2; extra == 'owl'
|
|
268
271
|
Requires-Dist: pymupdf<2,>=1.22.5; extra == 'owl'
|
|
272
|
+
Requires-Dist: pytesseract>=0.3.13; extra == 'owl'
|
|
269
273
|
Requires-Dist: python-dotenv<2,>=1.0.0; extra == 'owl'
|
|
270
274
|
Requires-Dist: python-pptx>=1.0.2; extra == 'owl'
|
|
271
275
|
Requires-Dist: requests-oauthlib<2,>=1.3.1; extra == 'owl'
|