h-ai-brain 0.0.13__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (34)
  1. h_ai/__init__.py +1 -3
  2. h_ai/application/hai_service.py +0 -51
  3. {h_ai_brain-0.0.13.dist-info → h_ai_brain-0.0.16.dist-info}/METADATA +2 -8
  4. h_ai_brain-0.0.16.dist-info/RECORD +31 -0
  5. {h_ai_brain-0.0.13.dist-info → h_ai_brain-0.0.16.dist-info}/WHEEL +1 -1
  6. h_ai/application/priority_queue_service.py +0 -30
  7. h_ai/application/web_docs_service.py +0 -36
  8. h_ai/domain/priorityqueue/__init__.py +0 -0
  9. h_ai/domain/priorityqueue/priority_queue_repository.py +0 -34
  10. h_ai/domain/priorityqueue/queue_item.py +0 -43
  11. h_ai/domain/web_docs/__init__.py +0 -0
  12. h_ai/domain/web_docs/doc_link_scorer_service.py +0 -45
  13. h_ai/domain/web_docs/documentation_pattern_repository.py +0 -44
  14. h_ai/domain/web_docs/ecosystem_link_scorer_service.py +0 -83
  15. h_ai/domain/web_docs/ecosystem_pattern_repository.py +0 -182
  16. h_ai/domain/web_docs/gitbook/__init__.py +0 -0
  17. h_ai/domain/web_docs/gitbook/text_chapter.py +0 -18
  18. h_ai/domain/web_docs/gitbook/text_page.py +0 -46
  19. h_ai/domain/web_docs/gitbook_web_fetcher_service.py +0 -171
  20. h_ai/domain/web_docs/web_docs_link_detector.py +0 -28
  21. h_ai/domain/web_docs/web_link.py +0 -11
  22. h_ai/domain/webpages/__init__.py +0 -0
  23. h_ai/domain/webpages/web_fetcher_repository.py +0 -10
  24. h_ai/domain/webpages/web_text_fetcher_repository.py +0 -12
  25. h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
  26. h_ai/infrastructure/beautifulsoup/soup_processor.py +0 -240
  27. h_ai/infrastructure/playwright/__init__.py +0 -0
  28. h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +0 -64
  29. h_ai/infrastructure/priorityqueue/__init__.py +0 -0
  30. h_ai/infrastructure/priorityqueue/in_memory_priority_queue_repository.py +0 -98
  31. h_ai_brain-0.0.13.dist-info/RECORD +0 -56
  32. {h_ai_brain-0.0.13.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/LICENSE +0 -0
  33. {h_ai_brain-0.0.13.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/NOTICE.txt +0 -0
  34. {h_ai_brain-0.0.13.dist-info → h_ai_brain-0.0.16.dist-info}/top_level.txt +0 -0
h_ai/domain/web_docs/gitbook/text_page.py
@@ -1,46 +0,0 @@
- import hashlib
- from dataclasses import dataclass, field
- from typing import Dict, List
-
- from ....domain.web_docs.gitbook.text_chapter import TextChapter
-
-
- @dataclass
- class TextPage:
-     """Represents text on a page from a web document"""
-     url: str = ""
-     title: str = ""
-     content: str = ""
-     last_updated: str = ""
-
-     index: int = 0
-     toc_level: int = 0
-     parent_id: str = ""
-
-     chapters: List[TextChapter] = field(default_factory=list)
-     links: Dict[str, str] = field(default_factory=dict) # Text -> URL
-
-     id: str = field(init=False, default="")
-     content_hash: str = field(init=False, default="")
-
-     def __post_init__(self):
-         self.id = hashlib.md5(self.url.encode()).hexdigest()
-         # Generate a content hash for deduplication
-         self.content_hash = hashlib.md5(self.content.encode()).hexdigest() if self.content else ""
-
-     def to_dict(self):
-         """Convert this TextPage instance to a serializable dictionary"""
-         result = {
-             'url': self.url,
-             'title': self.title,
-             #'content': self.content,
-             'last_updated': self.last_updated,
-             'index': self.index,
-             'toc_level': self.toc_level,
-             'parent_id': self.parent_id,
-             'id': self.id,
-             'content_hash': self.content_hash,
-             'links': self.links,
-             'chapters': [chapter.to_dict() for chapter in self.chapters]
-         }
-         return result
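The hunk above removes the TextPage dataclass that 0.0.13 used to represent a fetched documentation page. As a hedged illustration of the dropped API (the import path is assumed to mirror the file layout listed above, and the sample values are made up), constructing and serialising a page looked roughly like this:

```python
# Minimal sketch of the removed 0.0.13 TextPage API; the import path is
# assumed from the package layout shown in this diff, not verified.
from h_ai.domain.web_docs.gitbook.text_page import TextPage

page = TextPage(
    url="https://docs.example.com/guide",  # hypothetical URL
    title="Guide",
    content="<p>Hello</p>",
)

# __post_init__ derives both identifiers from MD5 hashes.
print(page.id)            # md5 of the URL
print(page.content_hash)  # md5 of the content, used for deduplication
print(page.to_dict())     # serialisable dict; note 'content' is deliberately omitted
```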
h_ai/domain/web_docs/gitbook_web_fetcher_service.py
@@ -1,171 +0,0 @@
- import asyncio
- from typing import Optional, Set, Dict, List
- from urllib.parse import urlparse
-
- import aiohttp
-
- from ...domain.web_docs.gitbook.text_page import TextPage
- from ...domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository
- from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
-
-
- class GitbookWebFetcherService(WebTextFetcherRepository):
-
-     def __init__(self, url: str):
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.5'
-         }
-         self.base_url = url.rstrip('/')
-         self.base_domain = urlparse(self.base_url).netloc
-
-         self.delay = 1.0 # Delay between requests in seconds
-         self.max_retries = 3
-         self.retry_delay = 2 # Initial retry delay in seconds
-         self.concurrent_requests = 3 # Number of concurrent requests
-
-         self.visited_urls: Set[str] = set()
-         self.content_hashes: Dict[str, str] = {} # Hash -> URL
-         self.pages: Dict[str, TextPage] = {}
-
-     async def fetch(self) -> Optional[List[TextPage]]:
-         timeout = aiohttp.ClientTimeout(total=180) # 3 minutes total timeout
-         async with aiohttp.ClientSession(headers=self.headers, timeout=timeout) as session:
-             # Start with main page
-
-             await self._process_url(session, self.base_url)
-
-             # Wait for all tasks to complete
-             await asyncio.sleep(0)
-
-             # Sort pages by index
-             sorted_pages = sorted(
-                 self.pages.values(),
-                 key=lambda p: p.index
-             )
-             return sorted_pages
-
-     async def _process_url(self, session: aiohttp.ClientSession, url: str) -> None:
-         if url in self.visited_urls:
-             return
-         print(f"Processing {url}")
-         self.visited_urls.add(url)
-
-         # Fetch page content
-         html_content = await self._fetch_page(session, url)
-         if not html_content:
-             return
-
-         # Extract page content
-         page = await GitbookWebFetcherService.extract_page_content(url, html_content)
-         if not page:
-             return
-
-         # Check for duplicate content
-         if page.content_hash in self.content_hashes:
-             return
-
-         # Set page index
-         page.index = len(self.pages)
-
-         # Add page to collection
-         self.pages[url] = page
-         self.content_hashes[page.content_hash] = url
-
-         # Extract navigation links from this page
-         nav_links = await GitbookWebFetcherService.gitbook_extract_navigation(self.base_url, html_content)
-
-         #Process the discovered links
-         for link in nav_links:
-             if link not in self.visited_urls:
-                 # Add delay between requests
-                 await asyncio.sleep(self.delay)
-                 # Process the URL
-                 await self._process_url(session, link)
-
-     @staticmethod
-     async def extract_page_content(url: str, html_content: str) -> Optional[TextPage]:
-         try:
-             soup_processor = SoupProcessor(html_content)
-
-             title = soup_processor.extract_title()
-             if not title:
-                 title = urlparse(url).path.split('/')[-1] or "Index"
-                 title = title.replace('-', ' ').replace('_', ' ').title()
-
-             last_updated = soup_processor.extract_last_updated_refs_from_soup()
-
-             body_tag = soup_processor.find_body_content()
-             if body_tag is None:
-                 return None
-             soup_processor.clean_template_usage(body_tag)
-             chapters = soup_processor.extract_chapters(content=body_tag)
-
-             return TextPage(
-                 url=url,
-                 title=title,
-                 content=html_content,
-                 last_updated=last_updated,
-                 chapters=chapters,
-             )
-         except Exception as e:
-             return None
-
-     @staticmethod
-     async def gitbook_extract_navigation(base_url: str, html_content: str) -> List[str]:
-         """Extract navigation links from a page"""
-         try:
-
-             soup_processor = SoupProcessor(html_content)
-
-             nav_links = []
-             processed_urls = set()
-
-             # Extract links from modern layout
-             nav_links.extend(soup_processor.gitbook_extract_modern_nav(base_url, processed_urls))
-
-             # Extract links from traditional layout
-             nav_links.extend(soup_processor.gitbook_extract_traditional_nav(base_url, processed_urls))
-
-             # Extract links from pagination elements
-             nav_links.extend(soup_processor.gitbook_extract_pagination_links(base_url, processed_urls))
-
-             # Extract links from search for specific class patterns
-             nav_links.extend(soup_processor.gitbook_extract_class_based_nav(base_url, processed_urls))
-
-             # Remove duplicates while preserving order
-             return list(dict.fromkeys(nav_links))
-
-         except Exception as e:
-             return []
-
-     async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
-         """Fetch a page with retry logic"""
-         retry_count = 0
-         current_delay = self.retry_delay
-
-         while retry_count < self.max_retries:
-             try:
-                 async with session.get(url) as response:
-                     if response.status == 429: # Rate limit
-                         retry_after = response.headers.get('Retry-After', '60')
-                         wait_time = int(retry_after)
-
-                         await asyncio.sleep(wait_time)
-                         retry_count += 1
-                         continue
-
-                     if response.status == 200:
-                         return await response.text()
-                     else:
-                         return None
-
-             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
-                 if retry_count < self.max_retries - 1:
-                     await asyncio.sleep(current_delay)
-                     current_delay *= 2 # Exponential backoff
-                     retry_count += 1
-                 else:
-                     return None
-         return None
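The 171-line hunk above is the entire GitBook crawler: it walks navigation links from a base URL, dedupes pages by content hash, and retries failed fetches with exponential backoff plus a Retry-After wait on HTTP 429. A hedged sketch of how the removed 0.0.13 class could have been driven (the import path is assumed from the layout above, and the URL is hypothetical):

```python
import asyncio

# Assumed import path, mirroring h_ai/domain/web_docs/gitbook_web_fetcher_service.py.
from h_ai.domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService

async def main() -> None:
    fetcher = GitbookWebFetcherService("https://docs.example.com")  # hypothetical GitBook site
    pages = await fetcher.fetch()  # crawls nav links, returns pages sorted by discovery index
    for page in pages or []:
        print(page.index, page.title, page.content_hash)

if __name__ == "__main__":
    asyncio.run(main())
```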
h_ai/domain/web_docs/web_docs_link_detector.py
@@ -1,28 +0,0 @@
- from typing import List
-
- from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
- from ...domain.web_docs.doc_link_scorer_service import DocLinkScorerService
- from ...domain.web_docs.web_link import WebLink
- from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
-
-
- class WebDocsLinkDetector:
-     def __init__(self, doc_link_scorer: DocLinkScorerService, web_fetcher: WebFetcherRepository, confidence_threshold: float = 0.5):
-         self.doc_link_scorer = doc_link_scorer
-         self.web_fetcher = web_fetcher
-         self.confidence_threshold = confidence_threshold
-
-     async def find_docs_links(self, website_url: str) -> List[WebLink]:
-         doc_links = []
-
-         web_content = await self.web_fetcher.fetch(website_url)
-         if not web_content:
-             return doc_links
-
-         soup_processor = SoupProcessor(web_content)
-         web_links = soup_processor.extract_links(website_url)
-         for web_link in web_links:
-             score = self.doc_link_scorer.score(web_link.url, web_link.title)
-             if score >= self.confidence_threshold:
-                 doc_links.append(web_link)
-         return doc_links
h_ai/domain/web_docs/web_link.py
@@ -1,11 +0,0 @@
- class WebLink:
-     def __init__(self, url: str, title: str):
-         self.url = url
-         self.title = title
-
-     def to_dict(self):
-         """Convert this WebLink instance to a serializable dictionary"""
-         return {
-             'url': self.url,
-             'title': self.title
-         }
h_ai/domain/webpages/__init__.py
File without changes
h_ai/domain/webpages/web_fetcher_repository.py
@@ -1,10 +0,0 @@
- from abc import ABC, abstractmethod
- from typing import Optional
-
-
- class WebFetcherRepository(ABC):
-
-     @abstractmethod
-     async def fetch(self, url: str) -> Optional[str]:
-         """Fetch the content of the given URL."""
-         pass
h_ai/domain/webpages/web_text_fetcher_repository.py
@@ -1,12 +0,0 @@
- from abc import ABC, abstractmethod
- from typing import List, Optional
-
- from ...domain.web_docs.gitbook.text_page import TextPage
-
-
- class WebTextFetcherRepository(ABC):
-
-     @abstractmethod
-     async def fetch(self) -> Optional[List[TextPage]]:
-         """Fetch all content"""
-         pass
h_ai/infrastructure/beautifulsoup/__init__.py
File without changes
h_ai/infrastructure/beautifulsoup/soup_processor.py
@@ -1,240 +0,0 @@
- import logging
- import re
- from typing import List, Optional
- from urllib.parse import urljoin
-
- from bs4 import BeautifulSoup, Tag
-
- from ...domain.web_docs.gitbook.text_chapter import TextChapter
- from ...domain.web_docs.web_link import WebLink
-
- logger = logging.getLogger(__name__)
-
- class SoupProcessor:
-     def __init__(self, html_content:str):
-         self.soup = BeautifulSoup(html_content, 'html.parser')
-
-     def extract_links(self, base_url: str) -> List[WebLink]:
-         """Extract links from a page"""
-         web_links = []
-         links = self.soup.find_all('a', href=True)
-         for link in links:
-             href = link.get('href', '').strip()
-             if not href or href.startswith('#') or href.startswith('javascript:'):
-                 continue
-
-             full_url = urljoin(base_url, href)
-             link_text = link.get_text().strip()
-             web_link = WebLink(url=full_url, title=link_text)
-             web_links.append(web_link)
-         return web_links
-
-     def normalize_url(self, href, base_url) -> Optional[str]:
-         """Normalize URL to absolute form and filter out non-content URLs"""
-         # Skip fragment-only URLs
-         if href.startswith('#'):
-             return None
-
-         # Skip external links
-         if href.startswith(('http://', 'https://')) and not href.startswith(base_url):
-             return None
-
-         # Skip resource URLs
-         if href.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf', '.zip', '.js', '.css')):
-             return None
-
-         # Convert to absolute URL if needed
-         full_url = href
-         if not href.startswith(('http://', 'https://')):
-             full_url = urljoin(base_url, href)
-
-         # Make sure URL belongs to the same domain
-         if not full_url.startswith(base_url):
-             return None
-
-         return full_url
-
-     def extract_last_updated_refs_from_soup(self) -> str:
-         datetime_value = ""
-
-         # Find and remove elements containing "Last updated" text
-         for element in self.soup.find_all(string=lambda text: text and "Last updated" in text):
-             # Get the parent element and remove it
-             parent = element.parent
-             if parent:
-                 parent.decompose()
-
-         return datetime_value
-
-     def extract_title(self) -> Optional[str]:
-         """Extract the title of the page using multiple strategies"""
-         # Strategy 1: Look for h1
-         h1 = self.soup.find('h1')
-         if h1:
-             return h1.get_text(strip=True)
-
-         # Strategy 2: Look for title tag
-         title_tag = self.soup.find('title')
-         if title_tag:
-             title_text = title_tag.get_text(strip=True)
-             title_parts = re.split(r'[|\-–]', title_text)
-             return title_parts[0].strip()
-
-         # Strategy 3: Try to find GitBook-specific title elements
-         gitbook_title = self.soup.find('span', {'data-testid': 'page.title'})
-         if gitbook_title:
-             return gitbook_title.get_text(strip=True)
-
-         return None
-
-     def find_body_content(self) -> Optional[Tag]:
-         body_content = self.soup.find('body')
-         if body_content:
-             return body_content
-         return None
-
-     def gitbook_extract_modern_nav(self, base_url, processed_urls):
-         """Extract navigation from modern GitBook layout"""
-         nav_links = []
-
-         # Look for navigation sidebar
-         sidebar = self.soup.select_one('div[data-testid="page.desktopTableOfContents"]')
-         if sidebar:
-             for link in sidebar.find_all('a', href=True):
-                 full_url = self.normalize_url(link['href'], base_url)
-                 if full_url and full_url not in processed_urls:
-                     nav_links.append(full_url)
-                     processed_urls.add(full_url)
-
-         return nav_links
-
-     def gitbook_extract_traditional_nav(self, base_url, processed_urls):
-         """Extract navigation from traditional GitBook layout"""
-         nav_links = []
-
-         # Find GitBook navigation elements
-         nav_elements = self.soup.find_all(['nav', 'aside'])
-         for nav in nav_elements:
-             # Look for lists that typically contain the navigation
-             nav_lists = nav.find_all(['ol', 'ul'])
-             for nav_list in nav_lists:
-                 for li in nav_list.find_all('li'):
-                     link = li.find('a', href=True)
-                     if link:
-                         full_url = self.normalize_url(link['href'], base_url)
-                         if full_url and full_url not in processed_urls:
-                             nav_links.append(full_url)
-                             processed_urls.add(full_url)
-
-         # Try summary element which is common in GitBook
-         summary = self.soup.find('ul', {'class': 'summary'})
-         if summary:
-             for link in summary.find_all('a', href=True):
-                 full_url = self.normalize_url(link['href'], base_url)
-                 if full_url and full_url not in processed_urls:
-                     nav_links.append(full_url)
-                     processed_urls.add(full_url)
-
-         return nav_links
-
-     def gitbook_extract_pagination_links(self, base_url, processed_urls):
-         """Extract navigation from pagination elements"""
-         nav_links = []
-
-         # Find pagination links (next/prev)
-         selectors = [
-             'a[aria-label="Next"]',
-             'a[aria-label="Previous"]',
-             'a.navigation-next',
-             'a.navigation-prev',
-             'a:has(svg[data-icon="arrow-right"])',
-             'a:has(svg[data-icon="arrow-left"])'
-         ]
-
-         for selector in selectors:
-             try:
-                 for link in self.soup.select(selector):
-                     if link.has_attr('href'):
-                         full_url = self.normalize_url(link['href'], base_url)
-                         if full_url and full_url not in processed_urls:
-                             nav_links.append(full_url)
-                             processed_urls.add(full_url)
-             except Exception:
-                 continue
-
-         return nav_links
-
-     def gitbook_extract_class_based_nav(self, base_url, processed_urls):
-         """Extract navigation based on common GitBook class patterns"""
-         nav_links = []
-
-         # Common class patterns for navigation in GitBook
-         class_patterns = [
-             'nav-', 'menu-', 'sidebar-', 'toc-', '-nav', '-menu', '-sidebar', '-toc'
-         ]
-
-         # Look for elements with these class patterns
-         for pattern in class_patterns:
-             elements = self.soup.find_all(class_=lambda c: c and pattern in c)
-             for element in elements:
-                 for link in element.find_all('a', href=True):
-                     full_url = self.normalize_url(link['href'], base_url)
-                     if full_url and full_url not in processed_urls:
-                         nav_links.append(full_url)
-                         processed_urls.add(full_url)
-
-         return nav_links
-
-     @staticmethod
-     def clean_template_usage(content: Tag):
-         if not content or not isinstance(content, Tag):
-             return None
-         # Step 1: Build a mapping of template IDs to hidden div content
-         template_map = {}
-         # Find all hidden divs with IDs like S:*
-         for hidden_div in content.find_all('div', {'hidden': True}, id=re.compile(r'S:\d+')):
-             div_id = hidden_div.get('id')
-             # Store the first child (e.g., <a> tag) or the entire content
-             if hidden_div.contents:
-                 template_map[div_id] = hidden_div.contents[0] if len(hidden_div.contents) == 1 else hidden_div
-
-         # Step 2: Replace <template> tags with content from hidden divs based on $RS logic
-         for template in content.find_all('template', id=re.compile(r'P:\d+')):
-             template_id = template.get('id') # e.g., P:2
-             # Convert P:* to S:* to match the hidden div (assuming $RS("S:2", "P:2") pattern)
-             source_id = f"S:{template_id.split(':')[1]}" # e.g., S:2
-             if source_id in template_map:
-                 # Replace the template with the content from the hidden div
-                 replacement = template_map[source_id]
-                 # If it's a Tag, use it directly; if it's a div, extract its contents
-                 if isinstance(replacement, Tag):
-                     template.replace_with(replacement)
-                 else:
-                     template.replace_with(replacement.contents[0])
-
-     @staticmethod
-     def extract_chapters(content: Tag) -> List[TextChapter]:
-         chapters = []
-
-         # Create a default chapter for content before any heading
-         default_chapter = TextChapter(heading="Introduction", level=0)
-         current_chapter = default_chapter
-         chapters.append(default_chapter)
-
-         for element in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
-             if element.name.startswith('h'):
-                 # Extract heading level (h1=1, h2=2, etc.)
-                 level = int(element.name[1])
-                 heading_text = element.get_text(strip=True)
-
-                 # Create a new chapter
-                 current_chapter = TextChapter(heading=heading_text, level=level)
-                 chapters.append(current_chapter)
-
-             elif element.name == 'p' and current_chapter is not None:
-                 paragraph_text = element.get_text(strip=True)
-                 if paragraph_text:
-                     current_chapter.paragraphs.append(paragraph_text)
-
-         # Remove any chapters without content if they're not top-level
-         return [ch for ch in chapters if ch.paragraphs or ch.level <= 2]
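SoupProcessor, removed above, wrapped BeautifulSoup with GitBook-specific helpers: link extraction, title fallbacks (h1, then the title tag, then the page.title test id), navigation scraping, and chapter splitting by heading level. A hedged usage sketch against a tiny inline HTML snippet (import path assumed from the layout above):

```python
# Sketch of the removed 0.0.13 SoupProcessor; import path assumed, sample HTML made up.
from h_ai.infrastructure.beautifulsoup.soup_processor import SoupProcessor

html = "<html><body><h1>Docs</h1><a href='/guide'>Guide</a></body></html>"
processor = SoupProcessor(html)

print(processor.extract_title())  # "Docs" (the h1 strategy wins)
for link in processor.extract_links("https://docs.example.com"):
    print(link.to_dict())  # {'url': 'https://docs.example.com/guide', 'title': 'Guide'}
```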
h_ai/infrastructure/playwright/__init__.py
File without changes
h_ai/infrastructure/playwright/playwright_web_content_fetcher.py
@@ -1,64 +0,0 @@
- import logging
- from typing import Optional
-
- from playwright.async_api import async_playwright
-
- from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
-
- logger = logging.getLogger(__name__)
-
- class PlayWrightWebContentFetcher(WebFetcherRepository):
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.5'
-         }
-         self.page_load_timeout = 60
-         self.wait_for_idle = True
-
-     async def fetch(self, url: str) -> Optional[str]:
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True, args=[
-                 '--disable-dev-shm-usage', # Required for Docker
-                 '--no-sandbox', # Required for Docker non-root user
-                 '--disable-setuid-sandbox', # Required for Docker security
-                 '--disable-gpu', # Reduces resource usage
-             ])
-
-             logger.debug(
-                 f"Launching headless browser with user agent: {self.headers.get('User-Agent')}"
-             )
-             try:
-                 context = await browser.new_context(
-                     user_agent=self.headers.get('User-Agent')
-                 )
-                 page = await context.new_page()
-
-                 # Set timeout
-                 page.set_default_timeout(self.page_load_timeout * 1000) # Convert to ms
-
-                 page.on("console", lambda msg: logger.debug(f"Browser console {url}: {msg.text}"))
-
-                 # Navigate to the URL
-                 await page.goto(url, timeout=self.page_load_timeout * 1000, wait_until='domcontentloaded')
-
-                 # Wait for network to be idle if requested
-                 if self.wait_for_idle:
-                     logger.debug(f"Waiting for network idle on {url}")
-
-                     await page.wait_for_load_state("networkidle", timeout=self.page_load_timeout * 1000)
-                     logger.debug(
-                         f"Network idle on {url} after {self.page_load_timeout} seconds"
-                     )
-
-                 logger.debug(f"Successfully fetched {url} with headless browser")
-
-                 # Get the rendered HTML
-                 return await page.content()
-
-             except Exception as e:
-                 logger.error(f"Error fetching {url} with headless browser: {str(e)}")
-                 return None
-             finally:
-                 await browser.close()
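The removed PlayWrightWebContentFetcher implemented the WebFetcherRepository port: it launched headless Chromium with Docker-friendly flags, optionally waited for network idle, and returned the rendered HTML. A hedged sketch of calling it (import path assumed from the layout above; requires Playwright and its Chromium browser to be installed):

```python
import asyncio

# Assumed import path, mirroring h_ai/infrastructure/playwright/playwright_web_content_fetcher.py.
from h_ai.infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher

async def main() -> None:
    fetcher = PlayWrightWebContentFetcher()
    html = await fetcher.fetch("https://example.com")  # hypothetical URL; returns None on failure
    print(len(html) if html else "fetch failed")

asyncio.run(main())
```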
h_ai/infrastructure/priorityqueue/__init__.py
File without changes