h-ai-brain 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h_ai/__init__.py +1 -3
- h_ai/application/hai_service.py +0 -51
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/METADATA +2 -8
- h_ai_brain-0.0.16.dist-info/RECORD +31 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/WHEEL +1 -1
- h_ai/application/priority_queue_service.py +0 -30
- h_ai/application/web_docs_service.py +0 -36
- h_ai/domain/priorityqueue/__init__.py +0 -0
- h_ai/domain/priorityqueue/priority_queue_repository.py +0 -34
- h_ai/domain/priorityqueue/queue_item.py +0 -43
- h_ai/domain/web_docs/__init__.py +0 -0
- h_ai/domain/web_docs/doc_link_scorer_service.py +0 -45
- h_ai/domain/web_docs/documentation_pattern_repository.py +0 -44
- h_ai/domain/web_docs/ecosystem_link_scorer_service.py +0 -83
- h_ai/domain/web_docs/ecosystem_pattern_repository.py +0 -182
- h_ai/domain/web_docs/gitbook/__init__.py +0 -0
- h_ai/domain/web_docs/gitbook/text_chapter.py +0 -18
- h_ai/domain/web_docs/gitbook/text_page.py +0 -46
- h_ai/domain/web_docs/gitbook_web_fetcher_service.py +0 -171
- h_ai/domain/web_docs/web_docs_link_detector.py +0 -28
- h_ai/domain/web_docs/web_link.py +0 -11
- h_ai/domain/webpages/__init__.py +0 -0
- h_ai/domain/webpages/web_fetcher_repository.py +0 -10
- h_ai/domain/webpages/web_text_fetcher_repository.py +0 -12
- h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/soup_processor.py +0 -240
- h_ai/infrastructure/playwright/__init__.py +0 -0
- h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +0 -64
- h_ai/infrastructure/priorityqueue/__init__.py +0 -0
- h_ai/infrastructure/priorityqueue/in_memory_priority_queue_repository.py +0 -98
- h_ai_brain-0.0.15.dist-info/RECORD +0 -56
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/LICENSE +0 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/NOTICE.txt +0 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/top_level.txt +0 -0
h_ai/domain/web_docs/gitbook/text_page.py
DELETED
@@ -1,46 +0,0 @@
-import hashlib
-from dataclasses import dataclass, field
-from typing import Dict, List
-
-from ....domain.web_docs.gitbook.text_chapter import TextChapter
-
-
-@dataclass
-class TextPage:
-    """Represents text on a page from a web document"""
-    url: str = ""
-    title: str = ""
-    content: str = ""
-    last_updated: str = ""
-
-    index: int = 0
-    toc_level: int = 0
-    parent_id: str = ""
-
-    chapters: List[TextChapter] = field(default_factory=list)
-    links: Dict[str, str] = field(default_factory=dict)  # Text -> URL
-
-    id: str = field(init=False, default="")
-    content_hash: str = field(init=False, default="")
-
-    def __post_init__(self):
-        self.id = hashlib.md5(self.url.encode()).hexdigest()
-        # Generate a content hash for deduplication
-        self.content_hash = hashlib.md5(self.content.encode()).hexdigest() if self.content else ""
-
-    def to_dict(self):
-        """Convert this TextPage instance to a serializable dictionary"""
-        result = {
-            'url': self.url,
-            'title': self.title,
-            #'content': self.content,
-            'last_updated': self.last_updated,
-            'index': self.index,
-            'toc_level': self.toc_level,
-            'parent_id': self.parent_id,
-            'id': self.id,
-            'content_hash': self.content_hash,
-            'links': self.links,
-            'chapters': [chapter.to_dict() for chapter in self.chapters]
-        }
-        return result
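
Below is a minimal usage sketch of the TextPage dataclass removed above. It targets the 0.0.15 layout; the URL and HTML string are made up for illustration and do not come from the package.

# Hypothetical usage of the removed TextPage (0.0.15); values are illustrative.
from h_ai.domain.web_docs.gitbook.text_page import TextPage

page = TextPage(
    url="https://docs.example.com/getting-started",  # placeholder URL
    title="Getting Started",
    content="<html><body><p>Hello</p></body></html>",
)

# __post_init__ derives both identifiers: id from the URL, content_hash from the content.
print(page.id, page.content_hash)

# to_dict() deliberately leaves the raw 'content' out (it is commented out in the source)
# and serializes metadata, links, and chapters instead.
print(page.to_dict())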
h_ai/domain/web_docs/gitbook_web_fetcher_service.py
DELETED
@@ -1,171 +0,0 @@
-import asyncio
-from typing import Optional, Set, Dict, List
-from urllib.parse import urlparse
-
-import aiohttp
-
-from ...domain.web_docs.gitbook.text_page import TextPage
-from ...domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository
-from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
-
-
-class GitbookWebFetcherService(WebTextFetcherRepository):
-
-    def __init__(self, url: str):
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5'
-        }
-        self.base_url = url.rstrip('/')
-        self.base_domain = urlparse(self.base_url).netloc
-
-        self.delay = 1.0  # Delay between requests in seconds
-        self.max_retries = 3
-        self.retry_delay = 2  # Initial retry delay in seconds
-        self.concurrent_requests = 3  # Number of concurrent requests
-
-        self.visited_urls: Set[str] = set()
-        self.content_hashes: Dict[str, str] = {}  # Hash -> URL
-        self.pages: Dict[str, TextPage] = {}
-
-    async def fetch(self) -> Optional[List[TextPage]]:
-        timeout = aiohttp.ClientTimeout(total=180)  # 3 minutes total timeout
-        async with aiohttp.ClientSession(headers=self.headers, timeout=timeout) as session:
-            # Start with main page
-
-            await self._process_url(session, self.base_url)
-
-            # Wait for all tasks to complete
-            await asyncio.sleep(0)
-
-            # Sort pages by index
-            sorted_pages = sorted(
-                self.pages.values(),
-                key=lambda p: p.index
-            )
-            return sorted_pages
-
-    async def _process_url(self, session: aiohttp.ClientSession, url: str) -> None:
-        if url in self.visited_urls:
-            return
-        print(f"Processing {url}")
-        self.visited_urls.add(url)
-
-        # Fetch page content
-        html_content = await self._fetch_page(session, url)
-        if not html_content:
-            return
-
-        # Extract page content
-        page = await GitbookWebFetcherService.extract_page_content(url, html_content)
-        if not page:
-            return
-
-        # Check for duplicate content
-        if page.content_hash in self.content_hashes:
-            return
-
-        # Set page index
-        page.index = len(self.pages)
-
-        # Add page to collection
-        self.pages[url] = page
-        self.content_hashes[page.content_hash] = url
-
-        # Extract navigation links from this page
-        nav_links = await GitbookWebFetcherService.gitbook_extract_navigation(self.base_url, html_content)
-
-        # Process the discovered links
-        for link in nav_links:
-            if link not in self.visited_urls:
-                # Add delay between requests
-                await asyncio.sleep(self.delay)
-                # Process the URL
-                await self._process_url(session, link)
-
-    @staticmethod
-    async def extract_page_content(url: str, html_content: str) -> Optional[TextPage]:
-        try:
-            soup_processor = SoupProcessor(html_content)
-
-            title = soup_processor.extract_title()
-            if not title:
-                title = urlparse(url).path.split('/')[-1] or "Index"
-                title = title.replace('-', ' ').replace('_', ' ').title()
-
-            last_updated = soup_processor.extract_last_updated_refs_from_soup()
-
-            body_tag = soup_processor.find_body_content()
-            if body_tag is None:
-                return None
-            soup_processor.clean_template_usage(body_tag)
-            chapters = soup_processor.extract_chapters(content=body_tag)
-
-            return TextPage(
-                url=url,
-                title=title,
-                content=html_content,
-                last_updated=last_updated,
-                chapters=chapters,
-            )
-        except Exception as e:
-            return None
-
-    @staticmethod
-    async def gitbook_extract_navigation(base_url: str, html_content: str) -> List[str]:
-        """Extract navigation links from a page"""
-        try:
-
-            soup_processor = SoupProcessor(html_content)
-
-            nav_links = []
-            processed_urls = set()
-
-            # Extract links from modern layout
-            nav_links.extend(soup_processor.gitbook_extract_modern_nav(base_url, processed_urls))
-
-            # Extract links from traditional layout
-            nav_links.extend(soup_processor.gitbook_extract_traditional_nav(base_url, processed_urls))
-
-            # Extract links from pagination elements
-            nav_links.extend(soup_processor.gitbook_extract_pagination_links(base_url, processed_urls))
-
-            # Extract links from search for specific class patterns
-            nav_links.extend(soup_processor.gitbook_extract_class_based_nav(base_url, processed_urls))
-
-            # Remove duplicates while preserving order
-            return list(dict.fromkeys(nav_links))
-
-        except Exception as e:
-            return []
-
-    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
-        """Fetch a page with retry logic"""
-        retry_count = 0
-        current_delay = self.retry_delay
-
-        while retry_count < self.max_retries:
-            try:
-                async with session.get(url) as response:
-                    if response.status == 429:  # Rate limit
-                        retry_after = response.headers.get('Retry-After', '60')
-                        wait_time = int(retry_after)
-
-                        await asyncio.sleep(wait_time)
-                        retry_count += 1
-                        continue
-
-                    if response.status == 200:
-                        return await response.text()
-                    else:
-                        return None
-
-            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
-                if retry_count < self.max_retries - 1:
-                    await asyncio.sleep(current_delay)
-                    current_delay *= 2  # Exponential backoff
-                    retry_count += 1
-                else:
-                    return None
-        return None
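
For orientation, a hedged sketch of driving the GitbookWebFetcherService removed above: the docs URL is a placeholder, and the entry point shown is just one plausible way to run the coroutine.

# Hypothetical driver for the removed GitbookWebFetcherService (0.0.15).
import asyncio

from h_ai.domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService


async def main():
    # Crawls the GitBook site starting from the base URL, skipping duplicate
    # content by hash, and returns pages sorted by discovery index.
    fetcher = GitbookWebFetcherService("https://docs.example.com")  # placeholder URL
    pages = await fetcher.fetch()
    for page in pages or []:
        print(page.index, page.title, page.url)


asyncio.run(main())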
h_ai/domain/web_docs/web_docs_link_detector.py
DELETED
@@ -1,28 +0,0 @@
-from typing import List
-
-from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
-from ...domain.web_docs.doc_link_scorer_service import DocLinkScorerService
-from ...domain.web_docs.web_link import WebLink
-from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
-
-
-class WebDocsLinkDetector:
-    def __init__(self, doc_link_scorer: DocLinkScorerService, web_fetcher: WebFetcherRepository, confidence_threshold: float = 0.5):
-        self.doc_link_scorer = doc_link_scorer
-        self.web_fetcher = web_fetcher
-        self.confidence_threshold = confidence_threshold
-
-    async def find_docs_links(self, website_url: str) -> List[WebLink]:
-        doc_links = []
-
-        web_content = await self.web_fetcher.fetch(website_url)
-        if not web_content:
-            return doc_links
-
-        soup_processor = SoupProcessor(web_content)
-        web_links = soup_processor.extract_links(website_url)
-        for web_link in web_links:
-            score = self.doc_link_scorer.score(web_link.url, web_link.title)
-            if score >= self.confidence_threshold:
-                doc_links.append(web_link)
-        return doc_links
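
A sketch of wiring the removed WebDocsLinkDetector to a fetcher and a scorer. Since DocLinkScorerService's constructor is not shown in this diff, KeywordScorer below is a hypothetical duck-typed stand-in that only provides the score() call the detector uses; the site URL is a placeholder.

# Hypothetical wiring of the removed WebDocsLinkDetector (0.0.15).
import asyncio

from h_ai.domain.web_docs.web_docs_link_detector import WebDocsLinkDetector
from h_ai.infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher


class KeywordScorer:
    """Stand-in for DocLinkScorerService; the detector only calls score(url, title)."""

    def score(self, url: str, title: str) -> float:
        text = f"{url} {title}".lower()
        return 1.0 if "docs" in text or "documentation" in text else 0.0


async def main():
    detector = WebDocsLinkDetector(
        doc_link_scorer=KeywordScorer(),            # duck-typed stand-in
        web_fetcher=PlayWrightWebContentFetcher(),  # removed Playwright fetcher, shown later in this diff
        confidence_threshold=0.5,
    )
    for link in await detector.find_docs_links("https://example.com"):  # placeholder URL
        print(link.title, link.url)


asyncio.run(main())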
h_ai/domain/web_docs/web_link.py
DELETED
h_ai/domain/webpages/__init__.py
DELETED
File without changes
h_ai/domain/webpages/web_text_fetcher_repository.py
DELETED
@@ -1,12 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import List, Optional
-
-from ...domain.web_docs.gitbook.text_page import TextPage
-
-
-class WebTextFetcherRepository(ABC):
-
-    @abstractmethod
-    async def fetch(self) -> Optional[List[TextPage]]:
-        """Fetch all content"""
-        pass
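
The GitbookWebFetcherService shown earlier is the package's own implementer of this interface. As a sketch of the contract, a hypothetical in-memory implementer could look like this; StaticPageFetcher is not part of the package.

# Hypothetical implementer of the removed WebTextFetcherRepository interface.
from typing import List, Optional

from h_ai.domain.web_docs.gitbook.text_page import TextPage
from h_ai.domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository


class StaticPageFetcher(WebTextFetcherRepository):
    """Returns a fixed list of pages instead of crawling anything."""

    def __init__(self, pages: List[TextPage]):
        self._pages = pages

    async def fetch(self) -> Optional[List[TextPage]]:
        return self._pages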
h_ai/infrastructure/beautifulsoup/soup_processor.py
DELETED
@@ -1,240 +0,0 @@
-import logging
-import re
-from typing import List, Optional
-from urllib.parse import urljoin
-
-from bs4 import BeautifulSoup, Tag
-
-from ...domain.web_docs.gitbook.text_chapter import TextChapter
-from ...domain.web_docs.web_link import WebLink
-
-logger = logging.getLogger(__name__)
-
-class SoupProcessor:
-    def __init__(self, html_content:str):
-        self.soup = BeautifulSoup(html_content, 'html.parser')
-
-    def extract_links(self, base_url: str) -> List[WebLink]:
-        """Extract links from a page"""
-        web_links = []
-        links = self.soup.find_all('a', href=True)
-        for link in links:
-            href = link.get('href', '').strip()
-            if not href or href.startswith('#') or href.startswith('javascript:'):
-                continue
-
-            full_url = urljoin(base_url, href)
-            link_text = link.get_text().strip()
-            web_link = WebLink(url=full_url, title=link_text)
-            web_links.append(web_link)
-        return web_links
-
-    def normalize_url(self, href, base_url) -> Optional[str]:
-        """Normalize URL to absolute form and filter out non-content URLs"""
-        # Skip fragment-only URLs
-        if href.startswith('#'):
-            return None
-
-        # Skip external links
-        if href.startswith(('http://', 'https://')) and not href.startswith(base_url):
-            return None
-
-        # Skip resource URLs
-        if href.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf', '.zip', '.js', '.css')):
-            return None
-
-        # Convert to absolute URL if needed
-        full_url = href
-        if not href.startswith(('http://', 'https://')):
-            full_url = urljoin(base_url, href)
-
-        # Make sure URL belongs to the same domain
-        if not full_url.startswith(base_url):
-            return None
-
-        return full_url
-
-    def extract_last_updated_refs_from_soup(self) -> str:
-        datetime_value = ""
-
-        # Find and remove elements containing "Last updated" text
-        for element in self.soup.find_all(string=lambda text: text and "Last updated" in text):
-            # Get the parent element and remove it
-            parent = element.parent
-            if parent:
-                parent.decompose()
-
-        return datetime_value
-
-    def extract_title(self) -> Optional[str]:
-        """Extract the title of the page using multiple strategies"""
-        # Strategy 1: Look for h1
-        h1 = self.soup.find('h1')
-        if h1:
-            return h1.get_text(strip=True)
-
-        # Strategy 2: Look for title tag
-        title_tag = self.soup.find('title')
-        if title_tag:
-            title_text = title_tag.get_text(strip=True)
-            title_parts = re.split(r'[|\-–]', title_text)
-            return title_parts[0].strip()
-
-        # Strategy 3: Try to find GitBook-specific title elements
-        gitbook_title = self.soup.find('span', {'data-testid': 'page.title'})
-        if gitbook_title:
-            return gitbook_title.get_text(strip=True)
-
-        return None
-
-    def find_body_content(self) -> Optional[Tag]:
-        body_content = self.soup.find('body')
-        if body_content:
-            return body_content
-        return None
-
-    def gitbook_extract_modern_nav(self, base_url, processed_urls):
-        """Extract navigation from modern GitBook layout"""
-        nav_links = []
-
-        # Look for navigation sidebar
-        sidebar = self.soup.select_one('div[data-testid="page.desktopTableOfContents"]')
-        if sidebar:
-            for link in sidebar.find_all('a', href=True):
-                full_url = self.normalize_url(link['href'], base_url)
-                if full_url and full_url not in processed_urls:
-                    nav_links.append(full_url)
-                    processed_urls.add(full_url)
-
-        return nav_links
-
-    def gitbook_extract_traditional_nav(self, base_url, processed_urls):
-        """Extract navigation from traditional GitBook layout"""
-        nav_links = []
-
-        # Find GitBook navigation elements
-        nav_elements = self.soup.find_all(['nav', 'aside'])
-        for nav in nav_elements:
-            # Look for lists that typically contain the navigation
-            nav_lists = nav.find_all(['ol', 'ul'])
-            for nav_list in nav_lists:
-                for li in nav_list.find_all('li'):
-                    link = li.find('a', href=True)
-                    if link:
-                        full_url = self.normalize_url(link['href'], base_url)
-                        if full_url and full_url not in processed_urls:
-                            nav_links.append(full_url)
-                            processed_urls.add(full_url)
-
-        # Try summary element which is common in GitBook
-        summary = self.soup.find('ul', {'class': 'summary'})
-        if summary:
-            for link in summary.find_all('a', href=True):
-                full_url = self.normalize_url(link['href'], base_url)
-                if full_url and full_url not in processed_urls:
-                    nav_links.append(full_url)
-                    processed_urls.add(full_url)
-
-        return nav_links
-
-    def gitbook_extract_pagination_links(self, base_url, processed_urls):
-        """Extract navigation from pagination elements"""
-        nav_links = []
-
-        # Find pagination links (next/prev)
-        selectors = [
-            'a[aria-label="Next"]',
-            'a[aria-label="Previous"]',
-            'a.navigation-next',
-            'a.navigation-prev',
-            'a:has(svg[data-icon="arrow-right"])',
-            'a:has(svg[data-icon="arrow-left"])'
-        ]
-
-        for selector in selectors:
-            try:
-                for link in self.soup.select(selector):
-                    if link.has_attr('href'):
-                        full_url = self.normalize_url(link['href'], base_url)
-                        if full_url and full_url not in processed_urls:
-                            nav_links.append(full_url)
-                            processed_urls.add(full_url)
-            except Exception:
-                continue
-
-        return nav_links
-
-    def gitbook_extract_class_based_nav(self, base_url, processed_urls):
-        """Extract navigation based on common GitBook class patterns"""
-        nav_links = []
-
-        # Common class patterns for navigation in GitBook
-        class_patterns = [
-            'nav-', 'menu-', 'sidebar-', 'toc-', '-nav', '-menu', '-sidebar', '-toc'
-        ]
-
-        # Look for elements with these class patterns
-        for pattern in class_patterns:
-            elements = self.soup.find_all(class_=lambda c: c and pattern in c)
-            for element in elements:
-                for link in element.find_all('a', href=True):
-                    full_url = self.normalize_url(link['href'], base_url)
-                    if full_url and full_url not in processed_urls:
-                        nav_links.append(full_url)
-                        processed_urls.add(full_url)
-
-        return nav_links
-
-    @staticmethod
-    def clean_template_usage(content: Tag):
-        if not content or not isinstance(content, Tag):
-            return None
-        # Step 1: Build a mapping of template IDs to hidden div content
-        template_map = {}
-        # Find all hidden divs with IDs like S:*
-        for hidden_div in content.find_all('div', {'hidden': True}, id=re.compile(r'S:\d+')):
-            div_id = hidden_div.get('id')
-            # Store the first child (e.g., <a> tag) or the entire content
-            if hidden_div.contents:
-                template_map[div_id] = hidden_div.contents[0] if len(hidden_div.contents) == 1 else hidden_div
-
-        # Step 2: Replace <template> tags with content from hidden divs based on $RS logic
-        for template in content.find_all('template', id=re.compile(r'P:\d+')):
-            template_id = template.get('id')  # e.g., P:2
-            # Convert P:* to S:* to match the hidden div (assuming $RS("S:2", "P:2") pattern)
-            source_id = f"S:{template_id.split(':')[1]}"  # e.g., S:2
-            if source_id in template_map:
-                # Replace the template with the content from the hidden div
-                replacement = template_map[source_id]
-                # If it's a Tag, use it directly; if it's a div, extract its contents
-                if isinstance(replacement, Tag):
-                    template.replace_with(replacement)
-                else:
-                    template.replace_with(replacement.contents[0])
-
-    @staticmethod
-    def extract_chapters(content: Tag) -> List[TextChapter]:
-        chapters = []
-
-        # Create a default chapter for content before any heading
-        default_chapter = TextChapter(heading="Introduction", level=0)
-        current_chapter = default_chapter
-        chapters.append(default_chapter)
-
-        for element in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
-            if element.name.startswith('h'):
-                # Extract heading level (h1=1, h2=2, etc.)
-                level = int(element.name[1])
-                heading_text = element.get_text(strip=True)
-
-                # Create a new chapter
-                current_chapter = TextChapter(heading=heading_text, level=level)
-                chapters.append(current_chapter)
-
-            elif element.name == 'p' and current_chapter is not None:
-                paragraph_text = element.get_text(strip=True)
-                if paragraph_text:
-                    current_chapter.paragraphs.append(paragraph_text)
-
-        # Remove any chapters without content if they're not top-level
-        return [ch for ch in chapters if ch.paragraphs or ch.level <= 2]
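
A short, hedged example of using the removed SoupProcessor directly; the HTML snippet is invented for illustration and the base URL is a placeholder.

# Hypothetical direct use of the removed SoupProcessor (0.0.15).
from h_ai.infrastructure.beautifulsoup.soup_processor import SoupProcessor

html = """
<html><head><title>Guide | Example Docs</title></head>
<body>
  <h1>Guide</h1>
  <p>Welcome to the guide.</p>
  <a href="/getting-started">Getting started</a>
</body></html>
"""  # invented sample HTML

processor = SoupProcessor(html)
print(processor.extract_title())  # "Guide": the h1 strategy wins before <title>
print([link.url for link in processor.extract_links("https://docs.example.com")])

body = processor.find_body_content()
chapters = SoupProcessor.extract_chapters(content=body)
print([(chapter.heading, chapter.paragraphs) for chapter in chapters])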
h_ai/infrastructure/playwright/playwright_web_content_fetcher.py
DELETED
@@ -1,64 +0,0 @@
-import logging
-from typing import Optional
-
-from playwright.async_api import async_playwright
-
-from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
-
-logger = logging.getLogger(__name__)
-
-class PlayWrightWebContentFetcher(WebFetcherRepository):
-    def __init__(self):
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5'
-        }
-        self.page_load_timeout = 60
-        self.wait_for_idle = True
-
-    async def fetch(self, url: str) -> Optional[str]:
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True, args=[
-                '--disable-dev-shm-usage',  # Required for Docker
-                '--no-sandbox',  # Required for Docker non-root user
-                '--disable-setuid-sandbox',  # Required for Docker security
-                '--disable-gpu',  # Reduces resource usage
-            ])
-
-            logger.debug(
-                f"Launching headless browser with user agent: {self.headers.get('User-Agent')}"
-            )
-            try:
-                context = await browser.new_context(
-                    user_agent=self.headers.get('User-Agent')
-                )
-                page = await context.new_page()
-
-                # Set timeout
-                page.set_default_timeout(self.page_load_timeout * 1000)  # Convert to ms
-
-                page.on("console", lambda msg: logger.debug(f"Browser console {url}: {msg.text}"))
-
-                # Navigate to the URL
-                await page.goto(url, timeout=self.page_load_timeout * 1000, wait_until='domcontentloaded')
-
-                # Wait for network to be idle if requested
-                if self.wait_for_idle:
-                    logger.debug(f"Waiting for network idle on {url}")
-
-                    await page.wait_for_load_state("networkidle", timeout=self.page_load_timeout * 1000)
-                    logger.debug(
-                        f"Network idle on {url} after {self.page_load_timeout} seconds"
-                    )
-
-                logger.debug(f"Successfully fetched {url} with headless browser")
-
-                # Get the rendered HTML
-                return await page.content()
-
-            except Exception as e:
-                logger.error(f"Error fetching {url} with headless browser: {str(e)}")
-                return None
-            finally:
-                await browser.close()
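
Finally, a minimal sketch of calling the removed Playwright-based fetcher; it assumes playwright and its Chromium build are installed, and the URL is a placeholder.

# Hypothetical call into the removed PlayWrightWebContentFetcher (0.0.15).
import asyncio

from h_ai.infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher


async def main():
    fetcher = PlayWrightWebContentFetcher()
    # Renders the page in headless Chromium and returns the final HTML, or None on error.
    html = await fetcher.fetch("https://example.com")  # placeholder URL
    if html:
        print(html[:200])


asyncio.run(main())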