h_ai_brain-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h_ai/__init__.py +5 -0
- h_ai/application/__init__.py +0 -0
- h_ai/application/hai_service.py +48 -0
- h_ai/application/system_prompts/__init__.py +0 -0
- h_ai/application/system_prompts/roles/__init__.py +0 -0
- h_ai/application/web_docs_service.py +35 -0
- h_ai/domain/__init__.py +0 -0
- h_ai/domain/reasoning/__init__.py +0 -0
- h_ai/domain/reasoning/llm_chat_repository.py +9 -0
- h_ai/domain/reasoning/llm_generate_respository.py +6 -0
- h_ai/domain/reasoning/llm_tool_repository.py +14 -0
- h_ai/domain/reasoning/text_analysis.py +149 -0
- h_ai/domain/reasoning/tool_message.py +4 -0
- h_ai/domain/web_docs/__init__.py +0 -0
- h_ai/domain/web_docs/doc_link_scorer_service.py +45 -0
- h_ai/domain/web_docs/documentation_pattern_repository.py +44 -0
- h_ai/domain/web_docs/gitbook/__init__.py +0 -0
- h_ai/domain/web_docs/gitbook/text_chapter.py +18 -0
- h_ai/domain/web_docs/gitbook/text_page.py +46 -0
- h_ai/domain/web_docs/gitbook_web_fetcher_service.py +172 -0
- h_ai/domain/web_docs/web_docs_link_detector.py +26 -0
- h_ai/domain/web_docs/web_link.py +11 -0
- h_ai/domain/webpages/__init__.py +0 -0
- h_ai/domain/webpages/web_fetcher_repository.py +10 -0
- h_ai/domain/webpages/web_text_fetcher_repository.py +12 -0
- h_ai/infrastructure/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/soup_processor.py +240 -0
- h_ai/infrastructure/llm/__init__.py +0 -0
- h_ai/infrastructure/llm/data_handler.py +30 -0
- h_ai/infrastructure/llm/llm_response_cleaner.py +21 -0
- h_ai/infrastructure/llm/ollama/__init__.py +0 -0
- h_ai/infrastructure/llm/ollama/models/__init__.py +0 -0
- h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py +13 -0
- h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py +12 -0
- h_ai/infrastructure/llm/ollama/ollama_chat_repository.py +56 -0
- h_ai/infrastructure/llm/ollama/ollama_generate_repository.py +53 -0
- h_ai/infrastructure/llm/ollama/ollama_tool_repository.py +138 -0
- h_ai/infrastructure/llm/prompt_helper.py +7 -0
- h_ai/infrastructure/llm/prompt_loader.py +18 -0
- h_ai/infrastructure/playwright/__init__.py +0 -0
- h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +48 -0
- h_ai_brain-0.0.1.dist-info/METADATA +22 -0
- h_ai_brain-0.0.1.dist-info/RECORD +48 -0
- h_ai_brain-0.0.1.dist-info/WHEEL +5 -0
- h_ai_brain-0.0.1.dist-info/licenses/LICENSE +202 -0
- h_ai_brain-0.0.1.dist-info/licenses/NOTICE.txt +19 -0
- h_ai_brain-0.0.1.dist-info/top_level.txt +1 -0
h_ai/domain/webpages/web_text_fetcher_repository.py
@@ -0,0 +1,12 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from ...domain.web_docs.gitbook.text_page import TextPage
+
+
+class WebTextFetcherRepository(ABC):
+
+    @abstractmethod
+    async def fetch(self) -> Optional[List[TextPage]]:
+        """Fetch all content"""
+        pass
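For orientation, a minimal sketch of a concrete implementation of this interface (the class below is hypothetical; TextPage's constructor is not shown in this diff, so the sketch simply returns an empty list):

from typing import List, Optional

from h_ai.domain.web_docs.gitbook.text_page import TextPage
from h_ai.domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository


class StaticTextFetcher(WebTextFetcherRepository):
    """Hypothetical fetcher that performs no network I/O."""

    async def fetch(self) -> Optional[List[TextPage]]:
        # A real implementation would download and parse pages here.
        return []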
File without changes
File without changes
h_ai/infrastructure/beautifulsoup/soup_processor.py
@@ -0,0 +1,240 @@
+import logging
+import re
+from typing import List, Optional
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup, Tag
+
+from ...domain.web_docs.gitbook.text_chapter import TextChapter
+from ...domain.web_docs.web_link import WebLink
+
+logger = logging.getLogger(__name__)
+
+class SoupProcessor:
+    def __init__(self, html_content:str):
+        self.soup = BeautifulSoup(html_content, 'html.parser')
+
+    def extract_links(self, base_url: str) -> List[WebLink]:
+        """Extract links from a page"""
+        web_links = []
+        links = self.soup.find_all('a', href=True)
+        for link in links:
+            href = link.get('href', '').strip()
+            if not href or href.startswith('#') or href.startswith('javascript:'):
+                continue
+
+            full_url = urljoin(base_url, href)
+            link_text = link.get_text().strip()
+            web_link = WebLink(url=full_url, title=link_text)
+            web_links.append(web_link)
+        return web_links
+
+    def normalize_url(self, href, base_url) -> Optional[str]:
+        """Normalize URL to absolute form and filter out non-content URLs"""
+        # Skip fragment-only URLs
+        if href.startswith('#'):
+            return None
+
+        # Skip external links
+        if href.startswith(('http://', 'https://')) and not href.startswith(base_url):
+            return None
+
+        # Skip resource URLs
+        if href.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf', '.zip', '.js', '.css')):
+            return None
+
+        # Convert to absolute URL if needed
+        full_url = href
+        if not href.startswith(('http://', 'https://')):
+            full_url = urljoin(base_url, href)
+
+        # Make sure URL belongs to the same domain
+        if not full_url.startswith(base_url):
+            return None
+
+        return full_url
+
+    def extract_last_updated_refs_from_soup(self) -> str:
+        datetime_value = ""
+
+        # Find and remove elements containing "Last updated" text
+        for element in self.soup.find_all(string=lambda text: text and "Last updated" in text):
+            # Get the parent element and remove it
+            parent = element.parent
+            if parent:
+                parent.decompose()
+
+        return datetime_value
+
+    def extract_title(self) -> Optional[str]:
+        """Extract the title of the page using multiple strategies"""
+        # Strategy 1: Look for h1
+        h1 = self.soup.find('h1')
+        if h1:
+            return h1.get_text(strip=True)
+
+        # Strategy 2: Look for title tag
+        title_tag = self.soup.find('title')
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            title_parts = re.split(r'[|\-–]', title_text)
+            return title_parts[0].strip()
+
+        # Strategy 3: Try to find GitBook-specific title elements
+        gitbook_title = self.soup.find('span', {'data-testid': 'page.title'})
+        if gitbook_title:
+            return gitbook_title.get_text(strip=True)
+
+        return None
+
+    def find_body_content(self) -> Optional[Tag]:
+        body_content = self.soup.find('body')
+        if body_content:
+            return body_content
+        return None
+
+    def gitbook_extract_modern_nav(self, base_url, processed_urls):
+        """Extract navigation from modern GitBook layout"""
+        nav_links = []
+
+        # Look for navigation sidebar
+        sidebar = self.soup.select_one('div[data-testid="page.desktopTableOfContents"]')
+        if sidebar:
+            for link in sidebar.find_all('a', href=True):
+                full_url = self.normalize_url(link['href'], base_url)
+                if full_url and full_url not in processed_urls:
+                    nav_links.append(full_url)
+                    processed_urls.add(full_url)
+
+        return nav_links
+
+    def gitbook_extract_traditional_nav(self, base_url, processed_urls):
+        """Extract navigation from traditional GitBook layout"""
+        nav_links = []
+
+        # Find GitBook navigation elements
+        nav_elements = self.soup.find_all(['nav', 'aside'])
+        for nav in nav_elements:
+            # Look for lists that typically contain the navigation
+            nav_lists = nav.find_all(['ol', 'ul'])
+            for nav_list in nav_lists:
+                for li in nav_list.find_all('li'):
+                    link = li.find('a', href=True)
+                    if link:
+                        full_url = self.normalize_url(link['href'], base_url)
+                        if full_url and full_url not in processed_urls:
+                            nav_links.append(full_url)
+                            processed_urls.add(full_url)
+
+        # Try summary element which is common in GitBook
+        summary = self.soup.find('ul', {'class': 'summary'})
+        if summary:
+            for link in summary.find_all('a', href=True):
+                full_url = self.normalize_url(link['href'], base_url)
+                if full_url and full_url not in processed_urls:
+                    nav_links.append(full_url)
+                    processed_urls.add(full_url)
+
+        return nav_links
+
+    def gitbook_extract_pagination_links(self, base_url, processed_urls):
+        """Extract navigation from pagination elements"""
+        nav_links = []
+
+        # Find pagination links (next/prev)
+        selectors = [
+            'a[aria-label="Next"]',
+            'a[aria-label="Previous"]',
+            'a.navigation-next',
+            'a.navigation-prev',
+            'a:has(svg[data-icon="arrow-right"])',
+            'a:has(svg[data-icon="arrow-left"])'
+        ]
+
+        for selector in selectors:
+            try:
+                for link in self.soup.select(selector):
+                    if link.has_attr('href'):
+                        full_url = self.normalize_url(link['href'], base_url)
+                        if full_url and full_url not in processed_urls:
+                            nav_links.append(full_url)
+                            processed_urls.add(full_url)
+            except Exception:
+                continue
+
+        return nav_links
+
+    def gitbook_extract_class_based_nav(self, base_url, processed_urls):
+        """Extract navigation based on common GitBook class patterns"""
+        nav_links = []
+
+        # Common class patterns for navigation in GitBook
+        class_patterns = [
+            'nav-', 'menu-', 'sidebar-', 'toc-', '-nav', '-menu', '-sidebar', '-toc'
+        ]
+
+        # Look for elements with these class patterns
+        for pattern in class_patterns:
+            elements = self.soup.find_all(class_=lambda c: c and pattern in c)
+            for element in elements:
+                for link in element.find_all('a', href=True):
+                    full_url = self.normalize_url(link['href'], base_url)
+                    if full_url and full_url not in processed_urls:
+                        nav_links.append(full_url)
+                        processed_urls.add(full_url)
+
+        return nav_links
+
+    @staticmethod
+    def clean_template_usage(content: Tag):
+        if not content or not isinstance(content, Tag):
+            return None
+        # Step 1: Build a mapping of template IDs to hidden div content
+        template_map = {}
+        # Find all hidden divs with IDs like S:*
+        for hidden_div in content.find_all('div', {'hidden': True}, id=re.compile(r'S:\d+')):
+            div_id = hidden_div.get('id')
+            # Store the first child (e.g., <a> tag) or the entire content
+            if hidden_div.contents:
+                template_map[div_id] = hidden_div.contents[0] if len(hidden_div.contents) == 1 else hidden_div
+
+        # Step 2: Replace <template> tags with content from hidden divs based on $RS logic
+        for template in content.find_all('template', id=re.compile(r'P:\d+')):
+            template_id = template.get('id') # e.g., P:2
+            # Convert P:* to S:* to match the hidden div (assuming $RS("S:2", "P:2") pattern)
+            source_id = f"S:{template_id.split(':')[1]}" # e.g., S:2
+            if source_id in template_map:
+                # Replace the template with the content from the hidden div
+                replacement = template_map[source_id]
+                # If it's a Tag, use it directly; if it's a div, extract its contents
+                if isinstance(replacement, Tag):
+                    template.replace_with(replacement)
+                else:
+                    template.replace_with(replacement.contents[0])
+
+    @staticmethod
+    def extract_chapters(content: Tag) -> List[TextChapter]:
+        chapters = []
+
+        # Create a default chapter for content before any heading
+        default_chapter = TextChapter(heading="Introduction", level=0)
+        current_chapter = default_chapter
+        chapters.append(default_chapter)
+
+        for element in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+            if element.name.startswith('h'):
+                # Extract heading level (h1=1, h2=2, etc.)
+                level = int(element.name[1])
+                heading_text = element.get_text(strip=True)
+
+                # Create a new chapter
+                current_chapter = TextChapter(heading=heading_text, level=level)
+                chapters.append(current_chapter)
+
+            elif element.name == 'p' and current_chapter is not None:
+                paragraph_text = element.get_text(strip=True)
+                if paragraph_text:
+                    current_chapter.paragraphs.append(paragraph_text)
+
+        # Remove any chapters without content if they're not top-level
+        return [ch for ch in chapters if ch.paragraphs or ch.level <= 2]
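For orientation, a minimal usage sketch of this processor against inline HTML (the markup, base URL, and printed values are illustrative, not part of the package):

from h_ai.infrastructure.beautifulsoup.soup_processor import SoupProcessor

html = """
<html>
  <head><title>Guide | Example Docs</title></head>
  <body>
    <h1>Getting started</h1>
    <p>Install the package and configure the client.</p>
    <a href="/setup">Setup</a>
    <a href="#skip-me">Skip</a>
  </body>
</html>
"""

processor = SoupProcessor(html)
print(processor.extract_title())              # "Getting started" (the h1 strategy wins over <title>)
links = processor.extract_links("https://docs.example.com")
print(len(links))                             # 1 -- the fragment-only link is skipped
chapters = SoupProcessor.extract_chapters(processor.find_body_content())
print([(chapter.heading, len(chapter.paragraphs)) for chapter in chapters])
# [('Introduction', 0), ('Getting started', 1)]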
File without changes
h_ai/infrastructure/llm/data_handler.py
@@ -0,0 +1,30 @@
+import json
+
+
+def parse_json_data(json_string: str) -> dict |None:
+    try:
+        # Check for empty string
+        if not json_string or not json_string.strip():
+            print("Empty JSON string provided")
+            return {}
+
+        # First try to extract JSON from Markdown code blocks
+        import re
+        match = re.search(r'```(?:json)?\s*\n(.*?)\n```', json_string, re.DOTALL)
+        if match:
+            json_string = match.group(1).strip()
+        else:
+            # If no code block found, try the existing logic
+            if json_string.strip().startswith('json'):
+                json_string = json_string.strip()[4:].strip()
+
+        # Parse the JSON data
+        data = json.loads(json_string)
+
+        return data
+    except json.JSONDecodeError as e:
+        print(f"Error parsing JSON data: {e}")
+        return {}
+    except Exception as e:
+        print(f"Unexpected error while parsing JSON data: {e}")
+        return {}
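A short sketch of how parse_json_data handles an LLM reply that wraps JSON in a Markdown code fence (the reply string is illustrative):

from h_ai.infrastructure.llm.data_handler import parse_json_data

reply = '```json\n{"tool": "get_weather", "parameters": {"city": "Berlin"}}\n```'
print(parse_json_data(reply))   # {'tool': 'get_weather', 'parameters': {'city': 'Berlin'}}
print(parse_json_data("   "))   # {} (after printing "Empty JSON string provided")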
h_ai/infrastructure/llm/llm_response_cleaner.py
@@ -0,0 +1,21 @@
+
+
+def clean_llm_response(response_text: str) -> str:
+    clean_deepseek = _strip_think_tag_from_deepseek_response(_replace_role_tags_from_response(response_text))
+    clean_tags = _replace_role_tags_from_response(clean_deepseek)
+    return clean_tags
+
+def _strip_think_tag_from_deepseek_response(response_text: str) -> str:
+    """
+    Removes any content enclosed within the "<think>" tags from the response text,
+    """
+    think_start = response_text.find("<think>")
+    think_end = response_text.find("</think>")
+    if think_start != -1 and think_end != -1:
+        return response_text[think_end + 8:].strip()
+    return response_text
+
+def _replace_role_tags_from_response(response_text: str) -> str:
+    return (response_text.replace("<|assistant|>", "")
+            .replace("<|user|>", "")
+            .replace("<|system|>", ""))
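A small sketch of the cleaner applied to a DeepSeek-style reply (the response text is illustrative):

from h_ai.infrastructure.llm.llm_response_cleaner import clean_llm_response

raw = "<think>Let me reason about this.</think><|assistant|>The answer is 42."
print(clean_llm_response(raw))  # "The answer is 42."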
File without changes
File without changes
h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py
@@ -0,0 +1,12 @@
+from typing import List
+
+from .....infrastructure.llm.ollama.models.ollama_chat_message import OllamaChatMessage
+
+
+class OllamaChatSession:
+    def __init__(self, session_id: str, messages: List[OllamaChatMessage]):
+        self.session_id = session_id
+        self.messages = messages
+
+    def add_message(self, message: OllamaChatMessage):
+        self.messages.append(message)
h_ai/infrastructure/llm/ollama/ollama_chat_repository.py
@@ -0,0 +1,56 @@
+from typing import Optional, List
+
+import requests
+
+from ....domain.reasoning.llm_chat_repository import LlmChatRepository
+from ....infrastructure.llm.llm_response_cleaner import clean_llm_response
+from ....infrastructure.llm.ollama.models.ollama_chat_message import OllamaChatMessage
+from ....infrastructure.llm.ollama.models.ollama_chat_session import OllamaChatSession
+
+
+class OllamaChatRepository(LlmChatRepository):
+
+    def __init__(self, api_url: str, model_name: str, system_prompts: list[str] = None, temperature: float = None, seed: int = None):
+        self.api_url = api_url
+        self.model_name = model_name
+        self.temperature = temperature
+        self.seed = seed
+        self.system_prompts = system_prompts
+
+    def chat(self, user_message: str, session_id: str) -> Optional[str]:
+
+        messages = [OllamaChatMessage("user", user_message)]
+        for system_prompt in self.system_prompts:
+            messages.append(OllamaChatMessage("system", system_prompt))
+        session = OllamaChatSession(session_id, messages)
+
+        return self._call_ollama_api(session.messages)
+
+    def _call_ollama_api(self, messages: List[OllamaChatMessage]) -> Optional[str]:
+        url = f"{self.api_url}/chat"
+        formatted_messages = [message.to_dict() for message in messages]
+        payload = {
+            "model": self.model_name,
+            "messages": formatted_messages,
+            "stream": False,
+            "temperature": "0.6"
+        }
+        if self.temperature:
+            payload["temperature"] = self.temperature
+        if self.seed:
+            payload["seed"] = self.seed
+
+        try:
+            print(payload)
+            response = requests.post(url, json=payload)
+            response.raise_for_status()
+            full_response = response.json()["message"]["content"]
+            print(full_response)
+            return clean_llm_response(full_response)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error occurred during API call: {e}")
+            return None
+
+
+
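A minimal usage sketch against a locally running Ollama server (the base URL, model name, and prompts are assumptions, not values shipped with the package; note that chat() iterates self.system_prompts, so the constructor's system_prompts=None default only works if a list is supplied here):

from h_ai.infrastructure.llm.ollama.ollama_chat_repository import OllamaChatRepository

repo = OllamaChatRepository(
    api_url="http://localhost:11434/api",    # assumed local Ollama endpoint
    model_name="llama3",                      # assumed model
    system_prompts=["You are a concise assistant."],
    temperature=0.2,
)
answer = repo.chat("Summarise what a GitBook space is.", session_id="demo-session")
print(answer)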
h_ai/infrastructure/llm/ollama/ollama_generate_repository.py
@@ -0,0 +1,53 @@
+import uuid
+
+import requests
+
+from ....domain.reasoning.llm_generate_respository import LlmGenerateRepository
+
+
+class OllamaGenerateRepository(LlmGenerateRepository):
+
+    def __init__(self, api_url: str, model_name: str, system_prompt: str = None, temperature: float = None, seed: int = None):
+        self.model_name = model_name
+        self.system_prompt = system_prompt
+        self.api_url = api_url
+        self.temperature = temperature
+        self.seed = seed
+
+
+    def generate(self, user_prompt: str, system_prompt: str = None, session_id: str = None) -> str|None:
+        url = f"{self.api_url}/generate"
+        random_guid = uuid.uuid4()
+        guid_str = str(random_guid)
+        system_prompt = system_prompt or self.system_prompt
+        payload = {
+            "model": self.model_name,
+            "prompt": user_prompt,
+            "system": system_prompt,
+            "stream": False,
+            "session": guid_str,
+            "num_ctx": "5000",
+            "temperature": "0.6"
+        }
+
+        if session_id:
+            payload["session"] = session_id
+        if self.seed:
+            payload["seed"] = self.seed
+        if self.temperature:
+            payload["temperature"] = self.temperature
+
+        try:
+            print(payload)
+            response = requests.post(url, json=payload)
+            response.raise_for_status()
+
+            print(response.json())
+
+            response_content = response.json()["response"]
+            return clean_llm_response(response_content)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error occurred during API call: {e}")
+            return None
+
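Note that generate() references clean_llm_response, but this hunk contains no corresponding import, so calling it as published would raise a NameError once a response arrives. A minimal usage sketch, assuming that import is added (for example from the sibling llm_response_cleaner module) and a local Ollama server (base URL and model name are assumptions):

from h_ai.infrastructure.llm.ollama.ollama_generate_repository import OllamaGenerateRepository

repo = OllamaGenerateRepository(
    api_url="http://localhost:11434/api",    # assumed local Ollama endpoint
    model_name="llama3",                      # assumed model
    system_prompt="Answer in one sentence.",
)
print(repo.generate("What does the h-ai-brain package do?"))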
h_ai/infrastructure/llm/ollama/ollama_tool_repository.py
@@ -0,0 +1,138 @@
+import json
+from typing import Dict, Any
+
+import requests
+
+from ..prompt_helper import replace_placeholders
+from ....domain.reasoning.llm_tool_repository import LlmToolRepository
+from ....domain.reasoning.tool_message import ToolMessage
+
+
+class OllamaToolRepository(LlmToolRepository):
+
+    def __init__(self, api_url: str, model_name: str, tools: list, tool_prompt: str, tool_response_prompt: str, temperature: float = None, seed: int = None):
+        self.model_name = model_name
+        self.api_url = api_url
+        self.tools = tools
+        self.tool_prompt = tool_prompt
+        self.tool_response_prompt = tool_response_prompt
+        self.temperature = temperature
+        self.seed = seed
+
+    def find_tools_in_message(self, message: str) -> list[ToolMessage] | None:
+        url = f"{self.api_url}/generate"
+
+        placeholders = {
+            "$available_tools_placeholder": {json.dumps(self.tools, indent=2)},
+        }
+        system_message_tools = replace_placeholders(self.tool_prompt, placeholders)
+
+        payload = {
+            "model": self.model_name,
+            "prompt": message,
+            "stream": False,
+            "system": system_message_tools,
+            "num_ctx": "2500",
+            "temperature": "0.6"
+        }
+
+        if self.seed:
+            payload["seed"] = self.seed
+        if self.temperature:
+            payload["temperature"] = self.temperature
+
+        try:
+            print(payload)
+            response = requests.post(url, json=payload)
+            response.raise_for_status()
+
+            print(response.json())
+
+            found_tool_definitions = self._process_tool_response(response.json())
+            if found_tool_definitions:
+                tool_messages = []
+                for tool_definition in found_tool_definitions:
+                    tool_message = ToolMessage()
+                    tool_message.method_name = tool_definition.get("tool")
+                    tool_message.method_params = tool_definition.get("parameters",{})
+                    tool_messages.append(tool_message)
+
+                return tool_messages
+            else:
+                return None
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error occurred during API call: {e}")
+            return None
+
+    def build_tool_response_prompt(self, question:str, tool_results: list[str]):
+        placeholders = {
+            "$tools_question_placeholder": question,
+            "$tools_response_context_placeholder": tool_results,
+        }
+        return replace_placeholders(self.tool_response_prompt, placeholders)
+
+    def _process_tool_response(self, response: Dict[str, Any]) -> list[dict] | None:
+        try:
+            text = response.get('response', '')
+
+            tool_calls = self._parse_json_string(text)
+
+            if isinstance(tool_calls, dict):
+                tool_calls = [tool_calls]
+
+            if isinstance(tool_calls, list):
+                valid_calls = [
+                    call for call in tool_calls
+                    if isinstance(call, dict) and 'tool' in call
+                ]
+                return valid_calls if valid_calls else None
+
+        except json.JSONDecodeError:
+            return None
+
+        return None
+
+    def _parse_json_string(self, input_str: str) -> list[dict] | None:
+        if not input_str or not input_str.strip():
+            return None
+
+        try:
+            return json.loads(input_str)
+        except json.JSONDecodeError:
+            cleaned = self._clean_json_string(input_str)
+            if not cleaned:
+                return None
+
+            json_objects = []
+            current_pos = 0
+
+            while True:
+                try:
+                    start = cleaned.find('{', current_pos)
+                    if start == -1:
+                        break
+
+                    decoder = json.JSONDecoder()
+                    obj, end = decoder.raw_decode(cleaned[start:])
+                    json_objects.append(obj)
+                    current_pos = start + end
+                except json.JSONDecodeError:
+                    current_pos += 1
+                    continue
+                except Exception:
+                    break
+
+            if not json_objects:
+                return None
+            return json_objects
+
+    @staticmethod
+    def _clean_json_string(input_str: str) -> str:
+        cleaned = input_str.replace('```json', '').replace('```', '')
+        cleaned = cleaned.strip("'").strip('"')
+        cleaned = cleaned.strip()
+        return cleaned
+
+
+
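A usage sketch wiring up the tool repository (the tool schema, endpoint, model, and the two prompt templates are invented for illustration; the only constraints taken from the code are the $available_tools_placeholder, $tools_question_placeholder, and $tools_response_context_placeholder markers consumed by replace_placeholders). Note also that find_tools_in_message builds the placeholder value as a one-element set, {json.dumps(self.tools, indent=2)}, rather than a plain string, which prompt_helper.replace_placeholders has to cope with:

from h_ai.infrastructure.llm.ollama.ollama_tool_repository import OllamaToolRepository

tools = [{"tool": "get_weather", "description": "Return the weather for a city",
          "parameters": {"city": "string"}}]

repo = OllamaToolRepository(
    api_url="http://localhost:11434/api",    # assumed local Ollama endpoint
    model_name="llama3",                      # assumed model
    tools=tools,
    tool_prompt="Pick a tool from: $available_tools_placeholder",
    tool_response_prompt=("Question: $tools_question_placeholder\n"
                          "Tool output: $tools_response_context_placeholder"),
)

calls = repo.find_tools_in_message("What is the weather in Berlin?")
if calls:
    for call in calls:
        print(call.method_name, call.method_params)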
h_ai/infrastructure/llm/prompt_loader.py
@@ -0,0 +1,18 @@
+import json
+import os
+
+
+class PromptLoader:
+    def __init__(self, file_path):
+        # Resolve relative path to absolute path
+        absolute_file_path = os.path.abspath(file_path)
+
+        with open(absolute_file_path, "r") as file:
+            self.config = json.load(file)
+
+    def get_config_value(self, key):
+        return self.config.get(key)
+
+    def get_entire_config(self):
+        return json.dumps(self.config, indent=2)
+
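A short sketch of the loader reading a JSON prompt config (the file name and keys are illustrative):

import json

from h_ai.infrastructure.llm.prompt_loader import PromptLoader

with open("prompts.json", "w") as f:
    json.dump({"system_prompt": "You are a helpful assistant."}, f)

loader = PromptLoader("prompts.json")
print(loader.get_config_value("system_prompt"))
print(loader.get_entire_config())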
File without changes
h_ai/infrastructure/playwright/playwright_web_content_fetcher.py
@@ -0,0 +1,48 @@
+import logging
+from typing import Optional
+
+from playwright.async_api import async_playwright
+
+from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
+
+logger = logging.getLogger(__name__)
+
+class PlayWrightWebContentFetcher(WebFetcherRepository):
+    def __init__(self):
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5'
+        }
+        self.page_load_timeout = 30
+        self.wait_for_idle = True
+
+    async def fetch(self, url: str) -> Optional[str]:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch()
+            try:
+                context = await browser.new_context(
+                    user_agent=self.headers.get('User-Agent')
+                )
+                page = await context.new_page()
+
+                # Set timeout
+                page.set_default_timeout(self.page_load_timeout * 1000) # Convert to ms
+
+                # Navigate to the URL
+                await page.goto(url)
+
+                # Wait for network to be idle if requested
+                if self.wait_for_idle:
+                    await page.wait_for_load_state("networkidle")
+
+                logger.debug(f"Successfully fetched {url} with headless browser")
+
+                # Get the rendered HTML
+                return await page.content()
+
+            except Exception as e:
+                logger.error(f"Error fetching {url} with headless browser: {str(e)}")
+                return None
+            finally:
+                await browser.close()