h_ai_brain-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. h_ai/__init__.py +5 -0
  2. h_ai/application/__init__.py +0 -0
  3. h_ai/application/hai_service.py +48 -0
  4. h_ai/application/system_prompts/__init__.py +0 -0
  5. h_ai/application/system_prompts/roles/__init__.py +0 -0
  6. h_ai/application/web_docs_service.py +35 -0
  7. h_ai/domain/__init__.py +0 -0
  8. h_ai/domain/reasoning/__init__.py +0 -0
  9. h_ai/domain/reasoning/llm_chat_repository.py +9 -0
  10. h_ai/domain/reasoning/llm_generate_respository.py +6 -0
  11. h_ai/domain/reasoning/llm_tool_repository.py +14 -0
  12. h_ai/domain/reasoning/text_analysis.py +149 -0
  13. h_ai/domain/reasoning/tool_message.py +4 -0
  14. h_ai/domain/web_docs/__init__.py +0 -0
  15. h_ai/domain/web_docs/doc_link_scorer_service.py +45 -0
  16. h_ai/domain/web_docs/documentation_pattern_repository.py +44 -0
  17. h_ai/domain/web_docs/gitbook/__init__.py +0 -0
  18. h_ai/domain/web_docs/gitbook/text_chapter.py +18 -0
  19. h_ai/domain/web_docs/gitbook/text_page.py +46 -0
  20. h_ai/domain/web_docs/gitbook_web_fetcher_service.py +172 -0
  21. h_ai/domain/web_docs/web_docs_link_detector.py +26 -0
  22. h_ai/domain/web_docs/web_link.py +11 -0
  23. h_ai/domain/webpages/__init__.py +0 -0
  24. h_ai/domain/webpages/web_fetcher_repository.py +10 -0
  25. h_ai/domain/webpages/web_text_fetcher_repository.py +12 -0
  26. h_ai/infrastructure/__init__.py +0 -0
  27. h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
  28. h_ai/infrastructure/beautifulsoup/soup_processor.py +240 -0
  29. h_ai/infrastructure/llm/__init__.py +0 -0
  30. h_ai/infrastructure/llm/data_handler.py +30 -0
  31. h_ai/infrastructure/llm/llm_response_cleaner.py +21 -0
  32. h_ai/infrastructure/llm/ollama/__init__.py +0 -0
  33. h_ai/infrastructure/llm/ollama/models/__init__.py +0 -0
  34. h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py +13 -0
  35. h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py +12 -0
  36. h_ai/infrastructure/llm/ollama/ollama_chat_repository.py +56 -0
  37. h_ai/infrastructure/llm/ollama/ollama_generate_repository.py +53 -0
  38. h_ai/infrastructure/llm/ollama/ollama_tool_repository.py +138 -0
  39. h_ai/infrastructure/llm/prompt_helper.py +7 -0
  40. h_ai/infrastructure/llm/prompt_loader.py +18 -0
  41. h_ai/infrastructure/playwright/__init__.py +0 -0
  42. h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +48 -0
  43. h_ai_brain-0.0.1.dist-info/METADATA +22 -0
  44. h_ai_brain-0.0.1.dist-info/RECORD +48 -0
  45. h_ai_brain-0.0.1.dist-info/WHEEL +5 -0
  46. h_ai_brain-0.0.1.dist-info/licenses/LICENSE +202 -0
  47. h_ai_brain-0.0.1.dist-info/licenses/NOTICE.txt +19 -0
  48. h_ai_brain-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,12 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Optional
+
+ from ...domain.web_docs.gitbook.text_page import TextPage
+
+
+ class WebTextFetcherRepository(ABC):
+
+     @abstractmethod
+     async def fetch(self) -> Optional[List[TextPage]]:
+         """Fetch all content"""
+         pass
File without changes
File without changes
@@ -0,0 +1,240 @@
+ import logging
+ import re
+ from typing import List, Optional
+ from urllib.parse import urljoin
+
+ from bs4 import BeautifulSoup, Tag
+
+ from ...domain.web_docs.gitbook.text_chapter import TextChapter
+ from ...domain.web_docs.web_link import WebLink
+
+ logger = logging.getLogger(__name__)
+
+ class SoupProcessor:
+     def __init__(self, html_content: str):
+         self.soup = BeautifulSoup(html_content, 'html.parser')
+
+     def extract_links(self, base_url: str) -> List[WebLink]:
+         """Extract links from a page"""
+         web_links = []
+         links = self.soup.find_all('a', href=True)
+         for link in links:
+             href = link.get('href', '').strip()
+             if not href or href.startswith('#') or href.startswith('javascript:'):
+                 continue
+
+             full_url = urljoin(base_url, href)
+             link_text = link.get_text().strip()
+             web_link = WebLink(url=full_url, title=link_text)
+             web_links.append(web_link)
+         return web_links
+
+     def normalize_url(self, href, base_url) -> Optional[str]:
+         """Normalize URL to absolute form and filter out non-content URLs"""
+         # Skip fragment-only URLs
+         if href.startswith('#'):
+             return None
+
+         # Skip external links
+         if href.startswith(('http://', 'https://')) and not href.startswith(base_url):
+             return None
+
+         # Skip resource URLs
+         if href.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf', '.zip', '.js', '.css')):
+             return None
+
+         # Convert to absolute URL if needed
+         full_url = href
+         if not href.startswith(('http://', 'https://')):
+             full_url = urljoin(base_url, href)
+
+         # Make sure URL belongs to the same domain
+         if not full_url.startswith(base_url):
+             return None
+
+         return full_url
+
+     def extract_last_updated_refs_from_soup(self) -> str:
+         datetime_value = ""
+
+         # Find and remove elements containing "Last updated" text
+         for element in self.soup.find_all(string=lambda text: text and "Last updated" in text):
+             # Get the parent element and remove it
+             parent = element.parent
+             if parent:
+                 parent.decompose()
+
+         return datetime_value
+
+     def extract_title(self) -> Optional[str]:
+         """Extract the title of the page using multiple strategies"""
+         # Strategy 1: Look for h1
+         h1 = self.soup.find('h1')
+         if h1:
+             return h1.get_text(strip=True)
+
+         # Strategy 2: Look for title tag
+         title_tag = self.soup.find('title')
+         if title_tag:
+             title_text = title_tag.get_text(strip=True)
+             title_parts = re.split(r'[|\-–]', title_text)
+             return title_parts[0].strip()
+
+         # Strategy 3: Try to find GitBook-specific title elements
+         gitbook_title = self.soup.find('span', {'data-testid': 'page.title'})
+         if gitbook_title:
+             return gitbook_title.get_text(strip=True)
+
+         return None
+
+     def find_body_content(self) -> Optional[Tag]:
+         body_content = self.soup.find('body')
+         if body_content:
+             return body_content
+         return None
+
+     def gitbook_extract_modern_nav(self, base_url, processed_urls):
+         """Extract navigation from modern GitBook layout"""
+         nav_links = []
+
+         # Look for navigation sidebar
+         sidebar = self.soup.select_one('div[data-testid="page.desktopTableOfContents"]')
+         if sidebar:
+             for link in sidebar.find_all('a', href=True):
+                 full_url = self.normalize_url(link['href'], base_url)
+                 if full_url and full_url not in processed_urls:
+                     nav_links.append(full_url)
+                     processed_urls.add(full_url)
+
+         return nav_links
+
+     def gitbook_extract_traditional_nav(self, base_url, processed_urls):
+         """Extract navigation from traditional GitBook layout"""
+         nav_links = []
+
+         # Find GitBook navigation elements
+         nav_elements = self.soup.find_all(['nav', 'aside'])
+         for nav in nav_elements:
+             # Look for lists that typically contain the navigation
+             nav_lists = nav.find_all(['ol', 'ul'])
+             for nav_list in nav_lists:
+                 for li in nav_list.find_all('li'):
+                     link = li.find('a', href=True)
+                     if link:
+                         full_url = self.normalize_url(link['href'], base_url)
+                         if full_url and full_url not in processed_urls:
+                             nav_links.append(full_url)
+                             processed_urls.add(full_url)
+
+         # Try summary element which is common in GitBook
+         summary = self.soup.find('ul', {'class': 'summary'})
+         if summary:
+             for link in summary.find_all('a', href=True):
+                 full_url = self.normalize_url(link['href'], base_url)
+                 if full_url and full_url not in processed_urls:
+                     nav_links.append(full_url)
+                     processed_urls.add(full_url)
+
+         return nav_links
+
+     def gitbook_extract_pagination_links(self, base_url, processed_urls):
+         """Extract navigation from pagination elements"""
+         nav_links = []
+
+         # Find pagination links (next/prev)
+         selectors = [
+             'a[aria-label="Next"]',
+             'a[aria-label="Previous"]',
+             'a.navigation-next',
+             'a.navigation-prev',
+             'a:has(svg[data-icon="arrow-right"])',
+             'a:has(svg[data-icon="arrow-left"])'
+         ]
+
+         for selector in selectors:
+             try:
+                 for link in self.soup.select(selector):
+                     if link.has_attr('href'):
+                         full_url = self.normalize_url(link['href'], base_url)
+                         if full_url and full_url not in processed_urls:
+                             nav_links.append(full_url)
+                             processed_urls.add(full_url)
+             except Exception:
+                 continue
+
+         return nav_links
+
+     def gitbook_extract_class_based_nav(self, base_url, processed_urls):
+         """Extract navigation based on common GitBook class patterns"""
+         nav_links = []
+
+         # Common class patterns for navigation in GitBook
+         class_patterns = [
+             'nav-', 'menu-', 'sidebar-', 'toc-', '-nav', '-menu', '-sidebar', '-toc'
+         ]
+
+         # Look for elements with these class patterns
+         for pattern in class_patterns:
+             elements = self.soup.find_all(class_=lambda c: c and pattern in c)
+             for element in elements:
+                 for link in element.find_all('a', href=True):
+                     full_url = self.normalize_url(link['href'], base_url)
+                     if full_url and full_url not in processed_urls:
+                         nav_links.append(full_url)
+                         processed_urls.add(full_url)
+
+         return nav_links
+
+     @staticmethod
+     def clean_template_usage(content: Tag):
+         if not content or not isinstance(content, Tag):
+             return None
+         # Step 1: Build a mapping of template IDs to hidden div content
+         template_map = {}
+         # Find all hidden divs with IDs like S:*
+         for hidden_div in content.find_all('div', {'hidden': True}, id=re.compile(r'S:\d+')):
+             div_id = hidden_div.get('id')
+             # Store the first child (e.g., <a> tag) or the entire content
+             if hidden_div.contents:
+                 template_map[div_id] = hidden_div.contents[0] if len(hidden_div.contents) == 1 else hidden_div
+
+         # Step 2: Replace <template> tags with content from hidden divs based on $RS logic
+         for template in content.find_all('template', id=re.compile(r'P:\d+')):
+             template_id = template.get('id')  # e.g., P:2
+             # Convert P:* to S:* to match the hidden div (assuming $RS("S:2", "P:2") pattern)
+             source_id = f"S:{template_id.split(':')[1]}"  # e.g., S:2
+             if source_id in template_map:
+                 # Replace the template with the content from the hidden div
+                 replacement = template_map[source_id]
+                 # If it's a Tag, use it directly; if it's a div, extract its contents
+                 if isinstance(replacement, Tag):
+                     template.replace_with(replacement)
+                 else:
+                     template.replace_with(replacement.contents[0])
+
+     @staticmethod
+     def extract_chapters(content: Tag) -> List[TextChapter]:
+         chapters = []
+
+         # Create a default chapter for content before any heading
+         default_chapter = TextChapter(heading="Introduction", level=0)
+         current_chapter = default_chapter
+         chapters.append(default_chapter)
+
+         for element in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+             if element.name.startswith('h'):
+                 # Extract heading level (h1=1, h2=2, etc.)
+                 level = int(element.name[1])
+                 heading_text = element.get_text(strip=True)
+
+                 # Create a new chapter
+                 current_chapter = TextChapter(heading=heading_text, level=level)
+                 chapters.append(current_chapter)
+
+             elif element.name == 'p' and current_chapter is not None:
+                 paragraph_text = element.get_text(strip=True)
+                 if paragraph_text:
+                     current_chapter.paragraphs.append(paragraph_text)
+
+         # Remove any chapters without content if they're not top-level
+         return [ch for ch in chapters if ch.paragraphs or ch.level <= 2]
File without changes
@@ -0,0 +1,30 @@
+ import json
+ import re
+
+
+ def parse_json_data(json_string: str) -> dict | None:
+     try:
+         # Check for empty string
+         if not json_string or not json_string.strip():
+             print("Empty JSON string provided")
+             return {}
+
+         # First try to extract JSON from Markdown code blocks
+         match = re.search(r'```(?:json)?\s*\n(.*?)\n```', json_string, re.DOTALL)
+         if match:
+             json_string = match.group(1).strip()
+         else:
+             # If no code block found, try the existing logic
+             if json_string.strip().startswith('json'):
+                 json_string = json_string.strip()[4:].strip()
+
+         # Parse the JSON data
+         data = json.loads(json_string)
+
+         return data
+     except json.JSONDecodeError as e:
+         print(f"Error parsing JSON data: {e}")
+         return {}
+     except Exception as e:
+         print(f"Unexpected error while parsing JSON data: {e}")
+         return {}
@@ -0,0 +1,21 @@
+
+
+ def clean_llm_response(response_text: str) -> str:
+     clean_deepseek = _strip_think_tag_from_deepseek_response(_replace_role_tags_from_response(response_text))
+     clean_tags = _replace_role_tags_from_response(clean_deepseek)
+     return clean_tags
+
+ def _strip_think_tag_from_deepseek_response(response_text: str) -> str:
+     """
+     Removes any content enclosed within the "<think>" tags from the response text.
+     """
+     think_start = response_text.find("<think>")
+     think_end = response_text.find("</think>")
+     if think_start != -1 and think_end != -1:
+         return response_text[think_end + 8:].strip()
+     return response_text
+
+ def _replace_role_tags_from_response(response_text: str) -> str:
+     return (response_text.replace("<|assistant|>", "")
+             .replace("<|user|>", "")
+             .replace("<|system|>", ""))
File without changes
File without changes
@@ -0,0 +1,13 @@
+ from typing import Dict
+
+
+ class OllamaChatMessage:
+     def __init__(self, role: str, content: str):
+         self.role = role
+         self.content = content
+
+     def to_dict(self) -> Dict[str, str]:
+         return {
+             "role": self.role,
+             "content": self.content
+         }
@@ -0,0 +1,12 @@
+ from typing import List
+
+ from .....infrastructure.llm.ollama.models.ollama_chat_message import OllamaChatMessage
+
+
+ class OllamaChatSession:
+     def __init__(self, session_id: str, messages: List[OllamaChatMessage]):
+         self.session_id = session_id
+         self.messages = messages
+
+     def add_message(self, message: OllamaChatMessage):
+         self.messages.append(message)
@@ -0,0 +1,56 @@
+ from typing import Optional, List
+
+ import requests
+
+ from ....domain.reasoning.llm_chat_repository import LlmChatRepository
+ from ....infrastructure.llm.llm_response_cleaner import clean_llm_response
+ from ....infrastructure.llm.ollama.models.ollama_chat_message import OllamaChatMessage
+ from ....infrastructure.llm.ollama.models.ollama_chat_session import OllamaChatSession
+
+
+ class OllamaChatRepository(LlmChatRepository):
+
+     def __init__(self, api_url: str, model_name: str, system_prompts: list[str] = None, temperature: float = None, seed: int = None):
+         self.api_url = api_url
+         self.model_name = model_name
+         self.temperature = temperature
+         self.seed = seed
+         self.system_prompts = system_prompts
+
+     def chat(self, user_message: str, session_id: str) -> Optional[str]:
+         messages = [OllamaChatMessage("user", user_message)]
+         for system_prompt in self.system_prompts:
+             messages.append(OllamaChatMessage("system", system_prompt))
+         session = OllamaChatSession(session_id, messages)
+
+         return self._call_ollama_api(session.messages)
+
+     def _call_ollama_api(self, messages: List[OllamaChatMessage]) -> Optional[str]:
+         url = f"{self.api_url}/chat"
+         formatted_messages = [message.to_dict() for message in messages]
+         payload = {
+             "model": self.model_name,
+             "messages": formatted_messages,
+             "stream": False,
+             "temperature": "0.6"
+         }
+         if self.temperature:
+             payload["temperature"] = self.temperature
+         if self.seed:
+             payload["seed"] = self.seed
+
+         try:
+             print(payload)
+             response = requests.post(url, json=payload)
+             response.raise_for_status()
+             full_response = response.json()["message"]["content"]
+             print(full_response)
+             return clean_llm_response(full_response)
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error occurred during API call: {e}")
+             return None
@@ -0,0 +1,53 @@
+ import uuid
+
+ import requests
+
+ from ....domain.reasoning.llm_generate_respository import LlmGenerateRepository
+ from ....infrastructure.llm.llm_response_cleaner import clean_llm_response
+
+
+ class OllamaGenerateRepository(LlmGenerateRepository):
+
+     def __init__(self, api_url: str, model_name: str, system_prompt: str = None, temperature: float = None, seed: int = None):
+         self.model_name = model_name
+         self.system_prompt = system_prompt
+         self.api_url = api_url
+         self.temperature = temperature
+         self.seed = seed
+
+     def generate(self, user_prompt: str, system_prompt: str = None, session_id: str = None) -> str | None:
+         url = f"{self.api_url}/generate"
+         random_guid = uuid.uuid4()
+         guid_str = str(random_guid)
+         system_prompt = system_prompt or self.system_prompt
+         payload = {
+             "model": self.model_name,
+             "prompt": user_prompt,
+             "system": system_prompt,
+             "stream": False,
+             "session": guid_str,
+             "num_ctx": "5000",
+             "temperature": "0.6"
+         }
+
+         if session_id:
+             payload["session"] = session_id
+         if self.seed:
+             payload["seed"] = self.seed
+         if self.temperature:
+             payload["temperature"] = self.temperature
+
+         try:
+             print(payload)
+             response = requests.post(url, json=payload)
+             response.raise_for_status()
+
+             print(response.json())
+
+             response_content = response.json()["response"]
+             return clean_llm_response(response_content)
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error occurred during API call: {e}")
+             return None
@@ -0,0 +1,138 @@
+ import json
+ from typing import Dict, Any
+
+ import requests
+
+ from ..prompt_helper import replace_placeholders
+ from ....domain.reasoning.llm_tool_repository import LlmToolRepository
+ from ....domain.reasoning.tool_message import ToolMessage
+
+
+ class OllamaToolRepository(LlmToolRepository):
+
+     def __init__(self, api_url: str, model_name: str, tools: list, tool_prompt: str, tool_response_prompt: str, temperature: float = None, seed: int = None):
+         self.model_name = model_name
+         self.api_url = api_url
+         self.tools = tools
+         self.tool_prompt = tool_prompt
+         self.tool_response_prompt = tool_response_prompt
+         self.temperature = temperature
+         self.seed = seed
+
+     def find_tools_in_message(self, message: str) -> list[ToolMessage] | None:
+         url = f"{self.api_url}/generate"
+
+         placeholders = {
+             "$available_tools_placeholder": json.dumps(self.tools, indent=2),
+         }
+         system_message_tools = replace_placeholders(self.tool_prompt, placeholders)
+
+         payload = {
+             "model": self.model_name,
+             "prompt": message,
+             "stream": False,
+             "system": system_message_tools,
+             "num_ctx": "2500",
+             "temperature": "0.6"
+         }
+
+         if self.seed:
+             payload["seed"] = self.seed
+         if self.temperature:
+             payload["temperature"] = self.temperature
+
+         try:
+             print(payload)
+             response = requests.post(url, json=payload)
+             response.raise_for_status()
+
+             print(response.json())
+
+             found_tool_definitions = self._process_tool_response(response.json())
+             if found_tool_definitions:
+                 tool_messages = []
+                 for tool_definition in found_tool_definitions:
+                     tool_message = ToolMessage()
+                     tool_message.method_name = tool_definition.get("tool")
+                     tool_message.method_params = tool_definition.get("parameters", {})
+                     tool_messages.append(tool_message)
+
+                 return tool_messages
+             else:
+                 return None
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error occurred during API call: {e}")
+             return None
+
+     def build_tool_response_prompt(self, question: str, tool_results: list[str]):
+         placeholders = {
+             "$tools_question_placeholder": question,
+             "$tools_response_context_placeholder": tool_results,
+         }
+         return replace_placeholders(self.tool_response_prompt, placeholders)
+
+     def _process_tool_response(self, response: Dict[str, Any]) -> list[dict] | None:
+         try:
+             text = response.get('response', '')
+
+             tool_calls = self._parse_json_string(text)
+
+             if isinstance(tool_calls, dict):
+                 tool_calls = [tool_calls]
+
+             if isinstance(tool_calls, list):
+                 valid_calls = [
+                     call for call in tool_calls
+                     if isinstance(call, dict) and 'tool' in call
+                 ]
+                 return valid_calls if valid_calls else None
+
+         except json.JSONDecodeError:
+             return None
+
+         return None
+
+     def _parse_json_string(self, input_str: str) -> list[dict] | None:
+         if not input_str or not input_str.strip():
+             return None
+
+         try:
+             return json.loads(input_str)
+         except json.JSONDecodeError:
+             cleaned = self._clean_json_string(input_str)
+             if not cleaned:
+                 return None
+
+             json_objects = []
+             current_pos = 0
+
+             while True:
+                 try:
+                     start = cleaned.find('{', current_pos)
+                     if start == -1:
+                         break
+
+                     decoder = json.JSONDecoder()
+                     obj, end = decoder.raw_decode(cleaned[start:])
+                     json_objects.append(obj)
+                     current_pos = start + end
+                 except json.JSONDecodeError:
+                     current_pos += 1
+                     continue
+                 except Exception:
+                     break
+
+             if not json_objects:
+                 return None
+             return json_objects
+
+     @staticmethod
+     def _clean_json_string(input_str: str) -> str:
+         cleaned = input_str.replace('```json', '').replace('```', '')
+         cleaned = cleaned.strip("'").strip('"')
+         cleaned = cleaned.strip()
+         return cleaned
@@ -0,0 +1,7 @@
+ def replace_placeholders(prompt_string, placeholder_dict):
+     result = prompt_string
+
+     for placeholder, replacement in placeholder_dict.items():
+         result = result.replace(placeholder, str(replacement))
+
+     return result
@@ -0,0 +1,18 @@
+ import json
+ import os
+
+
+ class PromptLoader:
+     def __init__(self, file_path):
+         # Resolve relative path to absolute path
+         absolute_file_path = os.path.abspath(file_path)
+
+         with open(absolute_file_path, "r") as file:
+             self.config = json.load(file)
+
+     def get_config_value(self, key):
+         return self.config.get(key)
+
+     def get_entire_config(self):
+         return json.dumps(self.config, indent=2)
File without changes
@@ -0,0 +1,48 @@
+ import logging
+ from typing import Optional
+
+ from playwright.async_api import async_playwright
+
+ from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
+
+ logger = logging.getLogger(__name__)
+
+ class PlayWrightWebContentFetcher(WebFetcherRepository):
+     def __init__(self):
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5'
+         }
+         self.page_load_timeout = 30
+         self.wait_for_idle = True
+
+     async def fetch(self, url: str) -> Optional[str]:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch()
+             try:
+                 context = await browser.new_context(
+                     user_agent=self.headers.get('User-Agent')
+                 )
+                 page = await context.new_page()
+
+                 # Set timeout
+                 page.set_default_timeout(self.page_load_timeout * 1000)  # Convert to ms
+
+                 # Navigate to the URL
+                 await page.goto(url)
+
+                 # Wait for network to be idle if requested
+                 if self.wait_for_idle:
+                     await page.wait_for_load_state("networkidle")
+
+                 logger.debug(f"Successfully fetched {url} with headless browser")
+
+                 # Get the rendered HTML
+                 return await page.content()
+
+             except Exception as e:
+                 logger.error(f"Error fetching {url} with headless browser: {str(e)}")
+                 return None
+             finally:
+                 await browser.close()