utim-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- utim_cli/__init__.py +40 -0
- utim_cli/agent.py +359 -0
- utim_cli/auth.py +208 -0
- utim_cli/backup.py +101 -0
- utim_cli/billing.py +40 -0
- utim_cli/blender_agent.py +1018 -0
- utim_cli/bootstrap.py +324 -0
- utim_cli/client_utils.py +135 -0
- utim_cli/config.py +194 -0
- utim_cli/context_pruner.py +504 -0
- utim_cli/doctor.py +118 -0
- utim_cli/knowledge_graph.py +462 -0
- utim_cli/logger.py +121 -0
- utim_cli/mcp_clean_wrapper.py +55 -0
- utim_cli/mcp_client.py +198 -0
- utim_cli/mcp_registry.json +1102 -0
- utim_cli/orchestrator.py +3209 -0
- utim_cli/reflection.py +200 -0
- utim_cli/report.py +100 -0
- utim_cli/scrapy_search.py +229 -0
- utim_cli/share.py +320 -0
- utim_cli/share_tui.py +554 -0
- utim_cli/situational_scoring.py +269 -0
- utim_cli/state.py +15 -0
- utim_cli/tools.py +3381 -0
- utim_cli/utim.py +4051 -0
- utim_cli/vector_memory.py +629 -0
- utim_cli/workspace.py +33 -0
- utim_cli-1.0.0.dist-info/METADATA +134 -0
- utim_cli-1.0.0.dist-info/RECORD +34 -0
- utim_cli-1.0.0.dist-info/WHEEL +5 -0
- utim_cli-1.0.0.dist-info/entry_points.txt +2 -0
- utim_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
- utim_cli-1.0.0.dist-info/top_level.txt +1 -0
utim_cli/reflection.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Automated Task Reflection & Experience System — Powered by Hugging Face Vector Embeddings.
|
|
3
|
+
|
|
4
|
+
This module captures learnings, architecture rules, user preferences, and failure corrections
|
|
5
|
+
at the end of agent tasks, indexing them into ChromaDB using Hugging Face model embeddings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
import uuid
|
|
12
|
+
import sqlite3
|
|
13
|
+
from typing import Dict, List, Optional
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
import requests
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
MEMORY_FILE = ".utim_tmp/task_reflections.json"
|
|
19
|
+
CONVENTIONS_FILE = ".utim_conventions.md"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_learnings(user_message: str, assistant_content: str,
                      tool_results: List[Dict], llm_key: str, elapsed_seconds: int = 0, iterations: int = 0) -> Dict:
    """
    Ask an LLM to distil a completed interaction into short, reusable learnings.

    Args:
        user_message: The task text submitted by the user (sent in full).
        assistant_content: The assistant's answer; only the first 600 characters are sent.
        tool_results: Tool-call records. The first 10 dict entries are summarised from
            their "func_name"/"name" and "result" keys; non-dict entries are skipped.
        llm_key: Bearer token for the chat-completions endpoint. When falsy no request is made.
        elapsed_seconds: Accepted for interface compatibility; not used by this function.
        iterations: Accepted for interface compatibility; not used by this function.

    Returns:
        The JSON object returned by the model (the prompt requests the keys preferences,
        conventions, rules, corrections, time_reflection and subagent_learnings), or {}
        when there is no key, the request fails, the status is not 200, or the reply is
        not a JSON object.
    """
    if not llm_key:
        return {}

    import re  # kept local, as in the original block

    # Prepare context for reflection: at most 10 tools, 100 characters of output each.
    tool_summary = []
    for r in (tool_results or [])[:10]:
        if not isinstance(r, dict):
            continue
        name = r.get("func_name", "") or r.get("name", "")
        result = str(r.get("result", ""))[:300]
        tool_summary.append(f"- {name}: {result[:100]}...")

    tool_text = "\n".join(tool_summary)

    prompt = f"""Analyze this completed interaction and extract ULTRA-CONCISE actionable rules (each strictly under 12 words, no long prose):
1. User preferences (e.g., "always use pytest")
2. Project conventions discovered (naming/structure rules)
3. Architectural rules learned
4. Failure corrections or reasoning lessons learned
5. Sub-agent specific rules (for `project_res`, `plan_project`, `web_search`)
6. Time-management reflection (single rule under 80 chars)

Task:
User: {user_message}

A: {assistant_content[:600]}

Tools used:
{tool_text}

Return ONLY a JSON object with the following structure:
{{
"preferences": ["..."],
"conventions": ["..."],
"rules": ["..."],
"corrections": ["..."],
"time_reflection": "...",
"subagent_learnings": {{
"project_res": {{ "rules": ["..."], "experiences": ["..."] }},
"plan_project": {{ "rules": ["..."], "experiences": ["..."] }},
"web_search": {{ "rules": ["..."], "experiences": ["..."] }}
}}
}}"""

    try:
        # Prefer direct API endpoint or router
        api_url = os.environ.get("ROUTER_API_URL", "https://openrouter.ai/api/v1/chat/completions")
        resp = requests.post(
            api_url,
            json={
                "model": "cohere/north-mini-code:free",
                "messages": [
                    {"role": "system", "content": "You are a reflection engine. Extract technical learnings, logical corrections, and time adaptations from completed work."},
                    {"role": "user", "content": prompt}
                ]
            },
            headers={"Authorization": f"Bearer {llm_key}"},
            timeout=15
        )
        if resp.status_code != 200:
            return {}

        content = resp.json()["choices"][0]["message"]["content"]
        # Drop any <think>/<thinking> reasoning block the model emitted before the JSON.
        content = re.sub(r"<think(?:ing)?>.*?</think(?:ing)?>", "", content, flags=re.DOTALL).strip()

        if content.startswith("```"):
            # Remove the opening fence line and, if present, the closing fence.
            lines = content.splitlines()[1:]
            if lines and lines[-1].startswith("```"):
                lines = lines[:-1]
            content = "\n".join(lines).strip()
        if content.startswith("json"):
            content = content[4:].strip()

        parsed = json.loads(content)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Reflection is best-effort: transport failures and malformed replies yield
        # "no learnings" instead of interrupting the caller.
        return {}

    # The caller treats the result as a mapping; a JSON list or scalar is not usable.
    return parsed if isinstance(parsed, dict) else {}
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def save_learnings(learnings: Dict, project_dir: str = ".", user_message: str = "", assistant_content: str = "", elapsed_seconds: int = 0, iterations: int = 0):
    """
    Persist learnings to the reflections history, the conventions file and the vector store.

    Three independent steps are performed:
      1. Append an entry to MEMORY_FILE (a JSON list capped at the 100 newest entries)
         when at least one learning field is non-empty. This path is relative to the
         current working directory, not to project_dir.
      2. Append conventions / rules / preferences as markdown to CONVENTIONS_FILE inside
         project_dir. The file is opened in append mode so existing content is never
         rewritten (and therefore cannot be lost if it is unreadable).
      3. Store preferences, rules, conventions, corrections and a task summary through
         utim_cli.vector_memory.store_reflection. Any failure in this step is ignored.

    Args:
        learnings: Mapping as produced by extract_learnings; anything that is not a dict
            is treated as empty.
        project_dir: Directory that holds the conventions file.
        user_message: Original task text (first 200 chars go to the history entry).
        assistant_content: Assistant answer, used for the task summary.
        elapsed_seconds: Accepted for interface compatibility; not used here.
        iterations: Accepted for interface compatibility; not used here.
    """
    if not isinstance(learnings, dict):
        learnings = {}

    os.makedirs(os.path.dirname(MEMORY_FILE) or ".", exist_ok=True)

    # 1. Update task reflections history
    tracked_keys = ("conventions", "rules", "preferences", "corrections", "subagent_learnings", "time_reflection")
    if any(learnings.get(k) for k in tracked_keys):
        reflections = []
        try:
            with open(MEMORY_FILE, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            # A corrupted or hand-edited file may hold something other than a list.
            if isinstance(loaded, list):
                reflections = loaded
        except (OSError, ValueError):
            # Missing, unreadable or invalid history: start a fresh one.
            reflections = []

        reflections.append({
            "timestamp": datetime.now().isoformat(),
            "user_task": user_message[:200],
            "learnings": learnings
        })
        reflections = reflections[-100:]

        with open(MEMORY_FILE, "w", encoding="utf-8") as f:
            json.dump(reflections, f, indent=2)

    # 2. Update project conventions file
    conventions = _learning_items(learnings.get("conventions"))
    rules = _learning_items(learnings.get("rules"))
    preferences = _learning_items(learnings.get("preferences"))

    parts = []
    if conventions:
        parts.append(f"\n\n## Conventions (added {datetime.now().strftime('%Y-%m-%d %H:%M')})\n")
        parts.extend(f"- {c}\n" for c in conventions)
    if rules:
        parts.append("\n### Architectural Rules\n")
        parts.extend(f"- {r}\n" for r in rules)
    if preferences:
        parts.append("\n### Preferences\n")
        parts.extend(f"- {p}\n" for p in preferences)
    new_section = "".join(parts)

    if new_section:
        conventions_path = os.path.join(project_dir, CONVENTIONS_FILE)
        try:
            with open(conventions_path, "a", encoding="utf-8") as f:
                f.write(new_section)
        except OSError:
            # Best-effort: an unwritable conventions file must not abort the remaining steps.
            pass

    # 3. Save to Vector Memory DB using Hugging Face model
    try:
        from utim_cli.vector_memory import store_reflection

        categories = (
            ("preferences", "user_preference"),
            ("rules", "architectural_rule"),
            ("conventions", "project_convention"),
            ("corrections", "failure_correction"),
        )
        for key, category in categories:
            for item in _learning_items(learnings.get(key)):
                store_reflection(content=item, category=category, task_prompt=user_message)

        if user_message and assistant_content:
            summary = f"Task: {user_message[:300]}\nResolution: {assistant_content[:400]}"
            store_reflection(content=summary, category="task_experience", task_prompt=user_message)
    except Exception:
        # The vector store is optional; import or storage errors are deliberately ignored.
        pass


def _learning_items(value) -> List[str]:
    """Normalise one learnings field into a list of non-empty strings."""
    if isinstance(value, str):
        # A bare string would otherwise be iterated character by character.
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    return [str(item) for item in value if item]
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def run_reflection_phase(user_message: str, assistant_content: str,
                         tool_results: List[Dict], elapsed_seconds: int = 0, iterations: int = 0) -> Dict:
    """
    Run the post-task reflection step and return what was learned.

    The API key is read from OPENROUTER_API_KEY, then UTIM_API_KEY. Without a key
    (or with the literal placeholder "mock_key") no LLM call is made and the
    learnings stay empty. save_learnings is invoked in every case.
    """
    api_key = os.getenv("OPENROUTER_API_KEY") or os.getenv("UTIM_API_KEY") or "mock_key"

    if api_key == "mock_key":
        # Placeholder means "no credentials": skip the extraction request.
        learnings = {}
    else:
        learnings = extract_learnings(
            user_message,
            assistant_content,
            tool_results,
            api_key,
            elapsed_seconds,
            iterations,
        )

    save_learnings(
        learnings,
        user_message=user_message,
        assistant_content=assistant_content,
        elapsed_seconds=elapsed_seconds,
        iterations=iterations,
    )
    return learnings
|
utim_cli/report.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import sys
|
|
4
|
+
import shutil
|
|
5
|
+
import zipfile
|
|
6
|
+
import io
|
|
7
|
+
from utim_cli.logger import redact_text, LOG_FILE
|
|
8
|
+
from utim_cli.doctor import run_diagnostics
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
# ── Unicode → ASCII symbol map ────────────────────────────────────────────────
|
|
12
|
+
_UNICODE_TO_ASCII: list = [
    # Rich / doctor symbols
    ("\u2713", "[OK]"),  # ✓
    ("\u2717", "[FAIL]"),  # ✗
    ("\u2022", "-"),  # •
    ("\u2b21", "#"),  # ⬡
    ("\u2026", "..."),  # …
    ("\u2192", "->"),  # →
    ("\u2714", "[OK]"),  # ✔
    ("\u2718", "[FAIL]"),  # ✘
    ("\u25b6", ">"),  # ▶
    ("\u25cf", "*"),  # ●
    # Emoji used in doctor / report
    ("\U0001f4c4", "[file]"),
    ("\u270f", "[edit]"),
    ("\U0001f5d1", "[del]"),
    ("\U0001f4e6", "[pkg]"),
    ("\u26a1", "[run]"),
    ("\U0001f4c1", "[dir]"),
    ("\U0001f50d", "[search]"),
    ("\U0001f9e0", "[ai]"),
]

_ANSI_RE = re.compile(r"\x1b\[[0-9;]*[mKHFABCDJsu]")


def _to_ascii(text: str) -> str:
    """Return *text* as pure 7-bit ASCII.

    ANSI escape sequences matched by ``_ANSI_RE`` are removed, every symbol
    listed in ``_UNICODE_TO_ASCII`` is swapped for its ASCII stand-in, and
    whatever non-ASCII characters are still left become '?'.
    """
    without_escapes = _ANSI_RE.sub("", text)
    # Every key in the map is a single code point and every replacement is plain
    # ASCII, so one translate() pass yields the same text as successive replace() calls.
    symbol_table = str.maketrans(dict(_UNICODE_TO_ASCII))
    mapped = without_escapes.translate(symbol_table)
    return mapped.encode("ascii", errors="replace").decode("ascii")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_report_bundle() -> str:
    """Create a support report zip bundle with sensitive data redacted.

    The report contains the output of run_diagnostics and the contents of
    LOG_FILE, both passed through redact_text and then _to_ascii, so the file
    inside the zip is ASCII-only and can be printed on any Windows console
    regardless of the active code page.

    Returns:
        Path of the created zip file (".utim_tmp/report_bundle.zip").

    Raises:
        RuntimeError: If collecting diagnostics, writing the report or zipping
            fails; the underlying exception is chained as the cause.
    """
    report_dir = ".utim_tmp"
    os.makedirs(report_dir, exist_ok=True)

    report_txt_path = os.path.join(report_dir, "support_report.txt")
    bundle_zip_path = os.path.join(report_dir, "report_bundle.zip")

    try:
        # Capture Rich diagnostics into a StringIO buffer (no terminal needed)
        buf = io.StringIO()
        buf_console = Console(file=buf, force_terminal=False, width=100)
        run_diagnostics(buf_console)
        diagnostics_text = buf.getvalue()

        with open(report_txt_path, "w", encoding="utf-8") as f:
            f.write("=== UTIM SUPPORT REPORT ===\n")
            # The "timestamp" is the raw modification time of the debug log, if there is one.
            ts = os.path.getmtime(LOG_FILE) if os.path.exists(LOG_FILE) else "unknown"
            f.write(f"Timestamp: {ts}\n\n")

            f.write("=== DIAGNOSTICS ===\n")
            # Redact first, then ASCII-ify so redaction markers stay readable
            f.write(_to_ascii(redact_text(diagnostics_text)) + "\n\n")

            f.write("=== REDACTED DEBUG LOG ===\n")
            if os.path.exists(LOG_FILE):
                # errors="replace": a stray non-UTF-8 byte in the log must not abort the
                # whole bundle; replacement characters end up as '?' after _to_ascii.
                with open(LOG_FILE, "r", encoding="utf-8", errors="replace") as lf:
                    log_content = lf.read()
                f.write(_to_ascii(redact_text(log_content)))
            else:
                f.write("(no debug log found)\n")

        with zipfile.ZipFile(bundle_zip_path, "w", zipfile.ZIP_DEFLATED) as z:
            z.write(report_txt_path, "support_report.txt")

        return bundle_zip_path
    except Exception as e:
        raise RuntimeError(f"Failed to create support bundle: {e}") from e
    finally:
        # Remove the intermediate text file on success and on failure alike.
        try:
            os.remove(report_txt_path)
        except OSError:
            pass
|
|
utim_cli/scrapy_search.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scrapy-based web scraping enhancement for the web_search tool.
|
|
3
|
+
Provides robust, async-capable scraping with proper HTTP semantics.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import os
|
|
8
|
+
from typing import Dict, List, Optional
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ScrapedContent:
    """Record describing one scraped page.

    The scraping functions in this module return plain ``Dict[str, str]``
    mappings and do not construct this class themselves.
    """

    # Three required string fields, positional in this order.
    url: str
    title: str
    text: str
    # Optional failure description; defaults to None.
    error: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
async def scrape_urls_with_playwright(urls: List[str], use_js: bool = False, timeout: int = 10) -> Dict[str, str]:
    """
    Scrape URLs, rendering JavaScript with Playwright when requested.

    This function only dispatches. Optional dependencies (Playwright, Scrapy)
    are imported by the helper that needs them, so a missing Scrapy install
    no longer prevents the Playwright path from running.

    Args:
        urls: List of URLs to scrape
        use_js: Whether to use Playwright for JavaScript-heavy sites
        timeout: Request timeout in seconds

    Returns:
        Dictionary mapping URL to scraped text content
    """
    if use_js and urls:
        # Use Playwright for JS rendering
        return await _scrape_with_playwright(urls, timeout)

    # Static content (or nothing to render): use the Scrapy-based path
    return await _scrape_static(urls, timeout)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def _scrape_with_playwright(urls: List[str], timeout: int) -> Dict[str, str]:
    """Scrape URLs using Playwright for JavaScript rendering.

    Each URL is opened in its own page of a single headless Chromium context.
    The rendered HTML is reduced to text with BeautifulSoup (script, style, nav,
    header, footer and aside elements removed) and truncated to 6000 characters.

    Args:
        urls: URLs to load.
        timeout: Navigation timeout in seconds (converted to milliseconds for Playwright).

    Returns:
        Mapping of URL to extracted text, or to "Error: <message>" when loading or
        parsing that URL failed. Falls back to _scrape_static when Playwright is
        not importable. An empty input returns {} without starting a browser.
    """
    if not urls:
        return {}

    import contextlib

    try:
        from playwright.async_api import async_playwright
    except ImportError:
        # Fallback to static scraping if Playwright not available
        return await _scrape_static(urls, timeout)

    results = {}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
            )

            for url in urls:
                page = None
                try:
                    page = await context.new_page()
                    await page.goto(url, wait_until="networkidle", timeout=timeout * 1000)
                    content = await page.content()

                    # Extract text content
                    from bs4 import BeautifulSoup
                    soup = BeautifulSoup(content, 'html.parser')

                    # Remove script, style and page-chrome elements
                    for element in soup(["script", "style", "nav", "header", "footer", "aside"]):
                        element.decompose()

                    text = soup.get_text(separator='\n', strip=True)
                    # Clean up whitespace
                    lines = [line.strip() for line in text.splitlines() if line.strip()]
                    results[url] = '\n'.join(lines)[:6000]
                except Exception as e:
                    # One failing URL must not stop the remaining ones.
                    results[url] = f"Error: {str(e)}"
                finally:
                    if page is not None:
                        # Close the page on failure too; a failed close must not
                        # discard the text already captured for this URL.
                        with contextlib.suppress(Exception):
                            await page.close()
        finally:
            # Closing the browser also closes the context and any pages it still owns.
            await browser.close()

    return results
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
async def _scrape_static(urls: List[str], timeout: int) -> Dict[str, str]:
    """Scrape URLs using standard Scrapy (no JavaScript).

    Builds Scrapy settings, defines a one-off spider that stores the extracted
    text of each response in a dict, and runs it through CrawlerRunner on the
    Twisted reactor. Falls back to _fallback_requests_scraping when the crawler
    imports fail, when running the crawl raises, or when the spider collected
    nothing.

    Args:
        urls: URLs to fetch.
        timeout: Download timeout in seconds.

    Returns:
        Mapping of URL to extracted text (at most 6000 characters) or to an
        error string.
    """
    import shutil

    # True when a `playwright` executable is on PATH. This checks the CLI only;
    # NOTE(review): it does not verify that the scrapy-playwright plugin is installed.
    use_playwright = shutil.which('playwright') is not None

    # Create a temporary Scrapy project settings
    # NOTE(review): this import is outside the try/except below, so a missing
    # Scrapy install raises ImportError here instead of using the fallback.
    from scrapy.settings import Settings

    settings = Settings()
    settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    settings.set('ROBOTSTXT_OBEY', True)
    settings.set('DOWNLOAD_TIMEOUT', timeout)
    # With an empty `urls` list this sets CONCURRENT_REQUESTS to 0.
    settings.set('CONCURRENT_REQUESTS', min(len(urls), 8))
    settings.set('CONCURRENT_REQUESTS_PER_DOMAIN', 2)
    settings.set('RETRY_TIMES', 2)
    settings.set('LOG_LEVEL', 'ERROR')

    # Configure Playwright if available
    # NOTE(review): only the navigation timeout is set; no Playwright download
    # handler is configured in this function, so this setting presumably has no effect.
    if use_playwright:
        settings.set('PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT', timeout * 1000)

    # Import here to avoid issues if not installed
    try:
        from scrapy.crawler import CrawlerRunner
        from twisted.internet import asyncioreactor
        # NOTE(review): only ImportError is handled; if install() fails because a
        # reactor is already installed (e.g. on a second call), that error propagates.
        asyncioreactor.install()
    except ImportError:
        # Fall back to simple requests if Scrapy has issues
        return _fallback_requests_scraping(urls, timeout)

    # Create inline spider
    from scrapy import Spider
    from itemadapter import ItemAdapter  # imported but not used in this function

    class InlineSpider(Spider):
        name = 'inline_spider'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Maps URL -> extracted text or error string.
            self.results = {}
            # `urls` is captured from the enclosing function.
            self.urls_to_scrape = urls

        def start_requests(self):
            for url in self.urls_to_scrape:
                # NOTE(review): `Request` is not imported in this function or at module
                # level (the only import of it is local to scrape_urls_with_playwright),
                # so this line would raise NameError when executed.
                yield Request(url=url, callback=self.parse, errback=self.errback)

        def parse(self, response):
            # Extract text with BeautifulSoup
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove unwanted elements
                for elem in soup(["script", "style", "nav", "header", "footer", "aside"]):
                    elem.decompose()

                text = soup.get_text(separator='\n', strip=True)
                lines = [line.strip() for line in text.splitlines() if line.strip()]
                # Keyed by response.url; presumably differs from the requested URL
                # after a redirect - TODO confirm.
                self.results[response.url] = '\n'.join(lines)[:6000]
            except Exception as e:
                self.results[response.url] = f"Parse error: {str(e)}"

            yield {'url': response.url}

        def errback(self, failure):
            # Record download failures under the requested URL.
            self.results[failure.request.url] = f"Error: {str(failure.value)}"

    # Run the spider
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner

    runner = CrawlerRunner(settings)
    spider = InlineSpider()

    try:
        # NOTE(review): Scrapy documents CrawlerRunner.crawl() as taking a spider class
        # or a Crawler, not a spider instance - confirm; if it rejects the instance,
        # the except below makes this function always use the requests fallback.
        d = runner.crawl(spider)
        d.addBoth(lambda _: reactor.stop())
        # NOTE(review): reactor.run() is a blocking call made inside a coroutine;
        # presumably it cannot start while the asyncio loop is already running - verify.
        reactor.run()
    except Exception:
        # Fallback if reactor issues
        return _fallback_requests_scraping(urls, timeout)

    # An empty result dict also triggers the requests-based fallback.
    return spider.results or _fallback_requests_scraping(urls, timeout)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _fallback_requests_scraping(urls: List[str], timeout: int) -> Dict[str, str]:
    """Fallback to requests-based scraping if Scrapy fails.

    Each URL is fetched with requests, script/style elements and all remaining
    tags are stripped with regular expressions, HTML entities are unescaped and
    blank lines are dropped. The text is truncated to 6000 characters.

    Args:
        urls: URLs to fetch, one request per URL, sequentially.
        timeout: Per-request timeout in seconds.

    Returns:
        Mapping with one entry per URL: the extracted text, or "Error: ..." when
        the request raised or the server answered with a status other than 200.
    """
    if not urls:
        return {}

    import requests
    import re
    import html as html_lib

    # Built once: they are identical for every URL.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    script_style_re = re.compile(r'<(script|style).*?>.*?</\1>', flags=re.DOTALL | re.IGNORECASE)
    # [^>]* instead of .*? so that tags spread over several lines are removed too.
    tag_re = re.compile(r'<[^>]*>')

    results = {}

    for url in urls:
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            if r.status_code != 200:
                # Report the failure instead of silently omitting the URL.
                results[url] = f"Error: HTTP {r.status_code}"
                continue

            # Remove scripts, styles
            html_content = script_style_re.sub('', r.text)
            # Remove html tags
            text = tag_re.sub(' ', html_content)
            text = html_lib.unescape(text)
            # Format whitespace
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            results[url] = '\n'.join(lines)[:6000]
        except Exception as e:
            # Isolate failures per URL so the remaining URLs are still fetched.
            results[url] = f"Error: {str(e)}"

    return results
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def enhanced_scrape_urls(urls: List[str], use_js: bool = False, timeout: int = 10) -> Dict[str, str]:
    """
    Synchronous wrapper for scrape_urls_with_playwright.

    This is the main entry point used by web_search.

    The coroutine is run with asyncio.run(). When the calling thread already has
    a running event loop (where asyncio.run() would raise), it is run in a
    short-lived worker thread instead; the caller blocks until it finishes.
    Any failure falls back to the requests-based scraper.

    Args:
        urls: URLs to scrape.
        use_js: Whether to render pages with Playwright.
        timeout: Request timeout in seconds.

    Returns:
        Dictionary mapping URL to scraped text content (or an error string).
    """
    from concurrent.futures import ThreadPoolExecutor

    def _run_scrape() -> Dict[str, str]:
        return asyncio.run(scrape_urls_with_playwright(urls, use_js, timeout))

    try:
        asyncio.get_running_loop()
        loop_is_running = True
    except RuntimeError:
        # No event loop is running in this thread.
        loop_is_running = False

    try:
        if loop_is_running:
            # asyncio.run() cannot be nested inside a running loop, so give the
            # coroutine its own loop in a separate thread.
            with ThreadPoolExecutor(max_workers=1) as pool:
                return pool.submit(_run_scrape).result()
        return _run_scrape()
    except Exception:
        # Fallback to sync requests
        return _fallback_requests_scraping(urls, timeout)
|