h_ai_brain-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. h_ai/__init__.py +5 -0
  2. h_ai/application/__init__.py +0 -0
  3. h_ai/application/hai_service.py +48 -0
  4. h_ai/application/system_prompts/__init__.py +0 -0
  5. h_ai/application/system_prompts/roles/__init__.py +0 -0
  6. h_ai/application/web_docs_service.py +35 -0
  7. h_ai/domain/__init__.py +0 -0
  8. h_ai/domain/reasoning/__init__.py +0 -0
  9. h_ai/domain/reasoning/llm_chat_repository.py +9 -0
  10. h_ai/domain/reasoning/llm_generate_respository.py +6 -0
  11. h_ai/domain/reasoning/llm_tool_repository.py +14 -0
  12. h_ai/domain/reasoning/text_analysis.py +149 -0
  13. h_ai/domain/reasoning/tool_message.py +4 -0
  14. h_ai/domain/web_docs/__init__.py +0 -0
  15. h_ai/domain/web_docs/doc_link_scorer_service.py +45 -0
  16. h_ai/domain/web_docs/documentation_pattern_repository.py +44 -0
  17. h_ai/domain/web_docs/gitbook/__init__.py +0 -0
  18. h_ai/domain/web_docs/gitbook/text_chapter.py +18 -0
  19. h_ai/domain/web_docs/gitbook/text_page.py +46 -0
  20. h_ai/domain/web_docs/gitbook_web_fetcher_service.py +172 -0
  21. h_ai/domain/web_docs/web_docs_link_detector.py +26 -0
  22. h_ai/domain/web_docs/web_link.py +11 -0
  23. h_ai/domain/webpages/__init__.py +0 -0
  24. h_ai/domain/webpages/web_fetcher_repository.py +10 -0
  25. h_ai/domain/webpages/web_text_fetcher_repository.py +12 -0
  26. h_ai/infrastructure/__init__.py +0 -0
  27. h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
  28. h_ai/infrastructure/beautifulsoup/soup_processor.py +240 -0
  29. h_ai/infrastructure/llm/__init__.py +0 -0
  30. h_ai/infrastructure/llm/data_handler.py +30 -0
  31. h_ai/infrastructure/llm/llm_response_cleaner.py +21 -0
  32. h_ai/infrastructure/llm/ollama/__init__.py +0 -0
  33. h_ai/infrastructure/llm/ollama/models/__init__.py +0 -0
  34. h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py +13 -0
  35. h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py +12 -0
  36. h_ai/infrastructure/llm/ollama/ollama_chat_repository.py +56 -0
  37. h_ai/infrastructure/llm/ollama/ollama_generate_repository.py +53 -0
  38. h_ai/infrastructure/llm/ollama/ollama_tool_repository.py +138 -0
  39. h_ai/infrastructure/llm/prompt_helper.py +7 -0
  40. h_ai/infrastructure/llm/prompt_loader.py +18 -0
  41. h_ai/infrastructure/playwright/__init__.py +0 -0
  42. h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +48 -0
  43. h_ai_brain-0.0.1.dist-info/METADATA +22 -0
  44. h_ai_brain-0.0.1.dist-info/RECORD +48 -0
  45. h_ai_brain-0.0.1.dist-info/WHEEL +5 -0
  46. h_ai_brain-0.0.1.dist-info/licenses/LICENSE +202 -0
  47. h_ai_brain-0.0.1.dist-info/licenses/NOTICE.txt +19 -0
  48. h_ai_brain-0.0.1.dist-info/top_level.txt +1 -0
h_ai/__init__.py ADDED
@@ -0,0 +1,5 @@
+ __all__ = ['HaiService', 'WebDocsService']
+
+ from .application.web_docs_service import WebDocsService
+
+ from .application.hai_service import HaiService
h_ai/application/__init__.py ADDED (empty file)
h_ai/application/hai_service.py ADDED
@@ -0,0 +1,48 @@
+ import datetime
+
+ from h_message_bus import NatsPublisherAdapter
+ from h_message_bus.domain.twitter_get_user_request_message import TwitterGetUserRequestMessage
+ from h_message_bus.domain.twitter_get_user_response_message import TwitterGetUserResponseMessage
+ from h_message_bus.domain.vector_save_request_message import VectorSaveRequestMessage
+
+ from ..application.web_docs_service import WebDocsService
+
+
+ class HaiService:
+     def __init__(self, nats_publisher_adapter: NatsPublisherAdapter):
+         self.nats_publisher_adapter = nats_publisher_adapter
+         self.web_docs_service = WebDocsService()
+
+     async def detect_and_store_documentation(self, twitter_screen_name: str):
+         req_message = TwitterGetUserRequestMessage.create_message(twitter_screen_name)
+         response = await self.nats_publisher_adapter.request(req_message)
+         twitter_user = TwitterGetUserResponseMessage.from_hai_message(response)
+
+         if twitter_user.url is not None:
+             print(f"Documentation found for {twitter_user.screen_name}: {twitter_user.url}")
+             docs = await self.web_docs_service.discover_documentation(twitter_user.url)
+
+             for doc in docs:
+                 collection_name = f"{twitter_user.screen_name}_docs"
+                 chapters = doc.chapters
+                 for chapter in chapters:
+                     i = 0
+                     for text in chapter.paragraphs:
+                         document_id = f"{doc.title}_{chapter.heading}_{i}"
+
+                         req_metadata = {
+                             "source": doc.url,
+                             "updated": datetime.datetime.utcnow()
+                         }
+                         i = i + 1
+
+                         request = VectorSaveRequestMessage.create_message(
+                             collection_name=collection_name,
+                             document_id=document_id,
+                             content=text,
+                             metadata=req_metadata)
+
+                         await self.nats_publisher_adapter.publish(request)
+
+         else:
+             print(f"No documentation found for {twitter_user.screen_name}")
h_ai/application/system_prompts/__init__.py ADDED (empty file)
h_ai/application/system_prompts/roles/__init__.py ADDED (empty file)
h_ai/application/web_docs_service.py ADDED
@@ -0,0 +1,35 @@
+ from typing import List, Optional
+
+ from ..domain.web_docs.doc_link_scorer_service import DocLinkScorerService
+ from ..domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
+ from ..domain.web_docs.gitbook.text_page import TextPage
+ from ..domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService
+ from ..domain.web_docs.web_docs_link_detector import WebDocsLinkDetector
+ from ..domain.web_docs.web_link import WebLink
+ from ..infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher
+
+
+ class WebDocsService:
+
+     def __init__(self):
+         self.pattern_repo = DocumentationPatternRepository()
+         self.scorer = DocLinkScorerService(self.pattern_repo)
+         self.headless_browser = PlayWrightWebContentFetcher()
+         self.web_link_detector = WebDocsLinkDetector(
+             self.scorer,
+             self.headless_browser)
+
+
+     async def discover_documentation(self, website_url: str) -> Optional[List[TextPage]]:
+         detected_links = await self.detect_documentation_links(website_url)
+         for link in detected_links:
+             gitbook_fetcher = GitbookWebFetcherService(link.url)
+             gitbook_pages = await gitbook_fetcher.fetch()
+             return gitbook_pages
+
+     async def detect_documentation_links(self, website_url: str) -> List[WebLink]:
+         """
+         Function to detect documentation links from a website
+         Returns a list of potential documentation root URLs
+         """
+         return await self.web_link_detector.find_docs_links(website_url)
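A usage sketch for WebDocsService on its own, without the message bus; the example URL is hypothetical. Note that discover_documentation returns inside its loop, so it yields the pages of the first detected documentation link only, or None when no link clears the scoring threshold.

    import asyncio

    from h_ai import WebDocsService

    async def main():
        service = WebDocsService()

        # Score candidate links first (returns WebLink objects above the 0.5 threshold)
        links = await service.detect_documentation_links("https://example.com")
        for link in links:
            print(link.title, link.url)

        # Fetch the GitBook pages behind the first detected link, if any
        pages = await service.discover_documentation("https://example.com")
        if pages:
            for page in pages:
                print(page.title, len(page.chapters))

    asyncio.run(main())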
h_ai/domain/__init__.py ADDED (empty file)
h_ai/domain/reasoning/__init__.py ADDED (empty file)
h_ai/domain/reasoning/llm_chat_repository.py ADDED
@@ -0,0 +1,9 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+
+ class LlmChatRepository(ABC):
+
+     @abstractmethod
+     def chat(self, user_message: str, session_id: str) -> Optional[str]:
+         ...
h_ai/domain/reasoning/llm_generate_respository.py ADDED
@@ -0,0 +1,6 @@
+ from typing import Protocol
+
+
+ class LlmGenerateRepository(Protocol):
+     def generate(self, user_prompt: str, system_prompt: str, session_id: str = None) -> str:
+         ...
h_ai/domain/reasoning/llm_tool_repository.py ADDED
@@ -0,0 +1,14 @@
+ from abc import ABC, abstractmethod
+ from typing import List
+
+ from ...domain.reasoning.tool_message import ToolMessage
+
+
+ class LlmToolRepository(ABC):
+     @abstractmethod
+     def find_tools_in_message(self, message: str) -> List[ToolMessage] | None:
+         ...
+
+     @abstractmethod
+     def build_tool_response_prompt(self, question: str, tool_results: list[str]) -> str | None:
+         ...
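The three repository types above are only interfaces; the package's concrete implementations live under h_ai/infrastructure/llm/ollama/ (files 36-38 in the list, not shown in this section). As an illustration of the contracts, here is a hypothetical in-memory stub, not the Ollama-backed implementation:

    from typing import Optional

    from h_ai.domain.reasoning.llm_chat_repository import LlmChatRepository

    class EchoChatRepository(LlmChatRepository):
        """Toy implementation that echoes the user message back."""

        def chat(self, user_message: str, session_id: str) -> Optional[str]:
            return f"[{session_id}] {user_message}"

    # LlmGenerateRepository is a Protocol, so any class with a matching
    # generate() signature satisfies it without inheriting from it.
    class CannedGenerateRepository:
        def generate(self, user_prompt: str, system_prompt: str, session_id: str = None) -> str:
            return f"{system_prompt}\n{user_prompt}"

    repo = EchoChatRepository()
    print(repo.chat("hello", session_id="abc123"))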
h_ai/domain/reasoning/text_analysis.py ADDED
@@ -0,0 +1,149 @@
+ import uuid
+ from typing import List, Dict, Union, Optional
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class Sentiment:
+     """Domain object representing the sentiment of analyzed text"""
+     score: float
+     label: str  # 'negative', 'neutral', or 'positive'
+
+
+ @dataclass
+ class DetectedEntity:
+     """Domain object representing an entity detected in analyzed text"""
+     name: str
+     type: str
+     mentions: List[str] = None
+
+     def __post_init__(self):
+         if self.mentions is None:
+             self.mentions = []
+
+
+ @dataclass
+ class DetectedRelationship:
+     """Domain object representing a relationship between entities detected in text"""
+     source_entity_name: str
+     relation_type: str
+     target_entity_name: str
+     confidence: float = 0.0  # Confidence score of this relationship detection
+
+
+ @dataclass
+ class ExtractedFact:
+     """Domain object representing a factual statement extracted from text"""
+     statement: str
+     referenced_entities: List[str] = None  # Names of entities referenced in this fact
+
+     def __post_init__(self):
+         if self.referenced_entities is None:
+             self.referenced_entities = []
+
+
+ @dataclass
+ class TextAnalysisResult:
+     """Root aggregate for the result of analyzing a piece of text"""
+     text_id: str
+     category: str
+     topic: str
+     sentiment: Sentiment
+     entities: List[DetectedEntity]
+     relationships: List[DetectedRelationship]
+     facts: List[ExtractedFact]
+
+     def __init__(self, text_id: str, category: str, topic: str,
+                  sentiment_score: float = 0.0,
+                  sentiment_label: str = "neutral",
+                  entities: Optional[List[Dict[str, Union[str, List[str]]]]] = None,
+                  relationships: Optional[List[Dict[str, Union[str, float]]]] = None,
+                  facts: Optional[List[Dict[str, Union[str, List[str]]]]] = None):
+
+         self.text_id = text_id
+         self.category = category
+         self.topic = topic
+         self.sentiment = Sentiment(sentiment_score, sentiment_label)
+
+         # Process entities
+         self.entities = []
+         if entities:
+             for entity_data in entities:
+                 entity = DetectedEntity(
+                     name=entity_data["name"],
+                     type=entity_data["type"],
+                     mentions=entity_data.get("mentions", [])
+                 )
+                 self.entities.append(entity)
+
+         # Process relationships
+         self.relationships = []
+         if relationships:
+             for rel_data in relationships:
+                 rel = DetectedRelationship(
+                     source_entity_name=rel_data["source_entity_name"],
+                     relation_type=rel_data["relation_type"],
+                     target_entity_name=rel_data["target_entity_name"],
+                     confidence=rel_data.get("confidence", 0.0)
+                 )
+                 self.relationships.append(rel)
+
+         # Process facts
+         self.facts = []
+         if facts:
+             for fact_data in facts:
+                 fact = ExtractedFact(
+                     statement=fact_data["statement"],
+                     referenced_entities=fact_data.get("referenced_entities", [])
+                 )
+                 self.facts.append(fact)
+
+     def to_dict(self) -> Dict[str, Union[str, Dict, List]]:
+         """Convert the TextAnalysisResult to a dictionary"""
+         return {
+             "text_id": self.text_id,
+             "category": self.category,
+             "topic": self.topic,
+             "sentiment": {
+                 "score": self.sentiment.score,
+                 "label": self.sentiment.label
+             },
+             "entities": [
+                 {
+                     "name": entity.name,
+                     "type": entity.type,
+                     "mentions": entity.mentions
+                 } for entity in self.entities
+             ],
+             "relationships": [
+                 {
+                     "source_entity_name": rel.source_entity_name,
+                     "relation_type": rel.relation_type,
+                     "target_entity_name": rel.target_entity_name,
+                     "confidence": rel.confidence
+                 } for rel in self.relationships
+             ],
+             "facts": [
+                 {
+                     "statement": fact.statement,
+                     "referenced_entities": fact.referenced_entities
+                 } for fact in self.facts
+             ]
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict) -> 'TextAnalysisResult':
+         """Create a TextAnalysisResult from a dictionary"""
+         random_guid = uuid.uuid4()
+         random_guid_str = str(random_guid)
+
+         return cls(
+             text_id=random_guid_str,
+             category=data["category"],
+             topic=data["topic"],
+             sentiment_score=data["sentiment"]["score"],
+             sentiment_label=data["sentiment"]["label"],
+             entities=data.get("entities", []),
+             relationships=data.get("relationships", []),
+             facts=data.get("facts", [])
+         )
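A short round-trip sketch for TextAnalysisResult. The payload values are made up; the shape follows the keys that the constructor and from_dict() read above. Note that from_dict() ignores any text_id in the input and always assigns a fresh UUID.

    from h_ai.domain.reasoning.text_analysis import TextAnalysisResult

    payload = {
        "category": "technology",
        "topic": "documentation tooling",
        "sentiment": {"score": 0.6, "label": "positive"},
        "entities": [{"name": "GitBook", "type": "product", "mentions": ["GitBook"]}],
        "relationships": [{
            "source_entity_name": "GitBook",
            "relation_type": "hosts",
            "target_entity_name": "documentation",
            "confidence": 0.8,
        }],
        "facts": [{"statement": "GitBook hosts project documentation.",
                   "referenced_entities": ["GitBook"]}],
    }

    result = TextAnalysisResult.from_dict(payload)
    print(result.sentiment.label)                               # "positive"
    print(result.entities[0].name)                              # "GitBook"
    print(result.to_dict()["relationships"][0]["confidence"])   # 0.8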
h_ai/domain/reasoning/tool_message.py ADDED
@@ -0,0 +1,4 @@
+
+ class ToolMessage:
+     method_name: str
+     method_params: dict
h_ai/domain/web_docs/__init__.py ADDED (empty file)
h_ai/domain/web_docs/doc_link_scorer_service.py ADDED
@@ -0,0 +1,45 @@
+ import re
+ from urllib.parse import urlparse
+
+ from ...domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
+
+
+ class DocLinkScorerService:
+     """Service for scoring potential documentation links"""
+
+     def __init__(self, pattern_repo: DocumentationPatternRepository):
+         self.pattern_repo = pattern_repo
+
+     def score(self, full_url: str, link_text: str) -> float:
+         """
+         Score a link based on how likely it is to be documentation
+         Returns a value between 0.0 and 1.0
+         """
+         score = 0.0
+         max_score = 3.0  # Maximum possible score
+
+         # Parse the URL
+         parsed_url = urlparse(full_url)
+         domain = parsed_url.netloc
+         path = parsed_url.path
+
+         # Check domain patterns
+         for doc_domain in self.pattern_repo.doc_domains:
+             if doc_domain in domain:
+                 score += 1.0
+                 break
+
+         # Check path patterns
+         for path_pattern in self.pattern_repo.doc_path_patterns:
+             if re.search(path_pattern, path):
+                 score += 1.0
+                 break
+
+         # Check link text patterns
+         for text_pattern in self.pattern_repo.doc_text_patterns:
+             if re.search(text_pattern, link_text):
+                 score += 1.0
+                 break
+
+         # Normalize score to 0.0-1.0 range
+         return min(score / max_score, 1.0)
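A quick sketch of how the scorer combines its three checks, using the pattern repository defined in the next file; the URLs are hypothetical. A link that matches a documentation domain, a documentation path, and documentation link text scores 1.0; a link matching only one of the three groups scores roughly 0.33.

    from h_ai.domain.web_docs.doc_link_scorer_service import DocLinkScorerService
    from h_ai.domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository

    scorer = DocLinkScorerService(DocumentationPatternRepository())

    print(scorer.score("https://myproject.gitbook.io/docs/", "Documentation"))  # 1.0
    print(scorer.score("https://example.com/blog/post", "Read our blog"))       # 0.0
    print(scorer.score("https://example.com/guide/start", "Click here"))        # ~0.33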
h_ai/domain/web_docs/documentation_pattern_repository.py ADDED
@@ -0,0 +1,44 @@
+ class DocumentationPatternRepository:
+     """Repository of patterns that indicate documentation links"""
+
+     def __init__(self):
+         # Domain patterns that commonly host documentation
+         self.doc_domains = [
+             "gitbook.io",
+             "readthedocs.io",
+             "docs.github.com",
+             "developer.mozilla.org",
+             "confluence.",
+             "zendesk.com",
+             "help.",
+             "support.",
+             "wiki.",
+         ]
+
+         # URL path patterns that commonly indicate documentation
+         self.doc_path_patterns = [
+             r"/docs/",
+             r"/documentation/",
+             r"/guide/",
+             r"/manual/",
+             r"/help/",
+             r"/knowledge/",
+             r"/support/",
+             r"/api/",
+             r"/reference/",
+             r"/wiki/",
+         ]
+
+         # Link text patterns that suggest documentation
+         self.doc_text_patterns = [
+             r"(?i)documentation",
+             r"(?i)docs",
+             r"(?i)developer guide",
+             r"(?i)user guide",
+             r"(?i)knowledge base",
+             r"(?i)help center",
+             r"(?i)manual",
+             r"(?i)api reference",
+             r"(?i)getting started",
+             r"(?i)learn more",
+         ]
h_ai/domain/web_docs/gitbook/__init__.py ADDED (empty file)
h_ai/domain/web_docs/gitbook/text_chapter.py ADDED
@@ -0,0 +1,18 @@
+ from dataclasses import dataclass, field
+ from typing import List
+
+
+ @dataclass
+ class TextChapter:
+     """Represents a chapter/section in a page defined by a heading."""
+     heading: str
+     level: int  # h1=1, h2=2, etc.
+     paragraphs: List[str] = field(default_factory=list)
+
+     def to_dict(self):
+         """Convert this TextChapter instance to a serializable dictionary"""
+         return {
+             'heading': self.heading,
+             'level': self.level,
+             'paragraphs': self.paragraphs
+         }
h_ai/domain/web_docs/gitbook/text_page.py ADDED
@@ -0,0 +1,46 @@
+ import hashlib
+ from dataclasses import dataclass, field
+ from typing import Dict, List
+
+ from .text_chapter import TextChapter
+
+
+ @dataclass
+ class TextPage:
+     """Represents text on a page from a web document"""
+     url: str = ""
+     title: str = ""
+     content: str = ""
+     last_updated: str = ""
+
+     index: int = 0
+     toc_level: int = 0
+     parent_id: str = ""
+
+     chapters: List[TextChapter] = field(default_factory=list)
+     links: Dict[str, str] = field(default_factory=dict)  # Text -> URL
+
+     id: str = field(init=False, default="")
+     content_hash: str = field(init=False, default="")
+
+     def __post_init__(self):
+         self.id = hashlib.md5(self.url.encode()).hexdigest()
+         # Generate a content hash for deduplication
+         self.content_hash = hashlib.md5(self.content.encode()).hexdigest() if self.content else ""
+
+     def to_dict(self):
+         """Convert this TextPage instance to a serializable dictionary"""
+         result = {
+             'url': self.url,
+             'title': self.title,
+             # 'content': self.content,
+             'last_updated': self.last_updated,
+             'index': self.index,
+             'toc_level': self.toc_level,
+             'parent_id': self.parent_id,
+             'id': self.id,
+             'content_hash': self.content_hash,
+             'links': self.links,
+             'chapters': [chapter.to_dict() for chapter in self.chapters]
+         }
+         return result
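A small sketch constructing the two dataclasses above by hand; the URL and text are made up. A real TextPage is normally built by GitbookWebFetcherService (next file) from fetched HTML.

    from h_ai.domain.web_docs.gitbook.text_chapter import TextChapter
    from h_ai.domain.web_docs.gitbook.text_page import TextPage

    chapter = TextChapter(heading="Getting Started", level=2,
                          paragraphs=["Install the package.", "Run the service."])

    page = TextPage(url="https://myproject.gitbook.io/docs/getting-started",
                    title="Getting Started",
                    content="<html>...</html>",
                    chapters=[chapter])

    # id and content_hash are derived in __post_init__ from the URL and content
    print(page.id, page.content_hash)
    print(page.to_dict()["chapters"][0]["heading"])  # "Getting Started"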
h_ai/domain/web_docs/gitbook_web_fetcher_service.py ADDED
@@ -0,0 +1,172 @@
+ import asyncio
+ import json
+ from typing import Optional, Set, Dict, List
+ from urllib.parse import urlparse
+
+ import aiohttp
+
+ from ...domain.web_docs.gitbook.text_page import TextPage
+ from ...domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository
+ from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
+
+
+ class GitbookWebFetcherService(WebTextFetcherRepository):
+
+     def __init__(self, url: str):
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5'
+         }
+         self.base_url = url.rstrip('/')
+         self.base_domain = urlparse(self.base_url).netloc
+
+         self.delay = 1.0  # Delay between requests in seconds
+         self.max_retries = 3
+         self.retry_delay = 2  # Initial retry delay in seconds
+         self.concurrent_requests = 3  # Number of concurrent requests
+
+         self.visited_urls: Set[str] = set()
+         self.content_hashes: Dict[str, str] = {}  # Hash -> URL
+         self.pages: Dict[str, TextPage] = {}
+
+     async def fetch(self) -> Optional[List[TextPage]]:
+         timeout = aiohttp.ClientTimeout(total=180)  # 3 minutes total timeout
+         async with aiohttp.ClientSession(headers=self.headers, timeout=timeout) as session:
+             # Start with main page
+
+             await self._process_url(session, self.base_url)
+
+             # Wait for all tasks to complete
+             await asyncio.sleep(0)
+
+             # Sort pages by index
+             sorted_pages = sorted(
+                 self.pages.values(),
+                 key=lambda p: p.index
+             )
+             return sorted_pages
+
+     async def _process_url(self, session: aiohttp.ClientSession, url: str) -> None:
+         if url in self.visited_urls:
+             return
+         print(f"Processing {url}")
+         self.visited_urls.add(url)
+
+         # Fetch page content
+         html_content = await self._fetch_page(session, url)
+         if not html_content:
+             return
+
+         # Extract page content
+         page = await GitbookWebFetcherService.extract_page_content(url, html_content)
+         if not page:
+             return
+
+         # Check for duplicate content
+         if page.content_hash in self.content_hashes:
+             return
+
+         # Set page index
+         page.index = len(self.pages)
+
+         # Add page to collection
+         self.pages[url] = page
+         self.content_hashes[page.content_hash] = url
+
+         # Extract navigation links from this page
+         nav_links = await GitbookWebFetcherService.gitbook_extract_navigation(self.base_url, html_content)
+
+         # Process the discovered links
+         for link in nav_links:
+             if link not in self.visited_urls:
+                 # Add delay between requests
+                 await asyncio.sleep(self.delay)
+                 # Process the URL
+                 await self._process_url(session, link)
+
+     @staticmethod
+     async def extract_page_content(url: str, html_content: str) -> Optional[TextPage]:
+         try:
+             soup_processor = SoupProcessor(html_content)
+
+             title = soup_processor.extract_title()
+             if not title:
+                 title = urlparse(url).path.split('/')[-1] or "Index"
+                 title = title.replace('-', ' ').replace('_', ' ').title()
+
+             last_updated = soup_processor.extract_last_updated_refs_from_soup()
+
+             body_tag = soup_processor.find_body_content()
+             if body_tag is None:
+                 return None
+             soup_processor.clean_template_usage(body_tag)
+             chapters = soup_processor.extract_chapters(content=body_tag)
+
+             return TextPage(
+                 url=url,
+                 title=title,
+                 content=html_content,
+                 last_updated=last_updated,
+                 chapters=chapters,
+             )
+         except Exception as e:
+             return None
+
+     @staticmethod
+     async def gitbook_extract_navigation(base_url: str, html_content: str) -> List[str]:
+         """Extract navigation links from a page"""
+         try:
+
+             soup_processor = SoupProcessor(html_content)
+
+             nav_links = []
+             processed_urls = set()
+
+             # Extract links from modern layout
+             nav_links.extend(soup_processor.gitbook_extract_modern_nav(base_url, processed_urls))
+
+             # Extract links from traditional layout
+             nav_links.extend(soup_processor.gitbook_extract_traditional_nav(base_url, processed_urls))
+
+             # Extract links from pagination elements
+             nav_links.extend(soup_processor.gitbook_extract_pagination_links(base_url, processed_urls))
+
+             # Extract links found by searching for specific class patterns
+             nav_links.extend(soup_processor.gitbook_extract_class_based_nav(base_url, processed_urls))
+
+             # Remove duplicates while preserving order
+             return list(dict.fromkeys(nav_links))
+
+         except Exception as e:
+             return []
+
+     async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
+         """Fetch a page with retry logic"""
+         retry_count = 0
+         current_delay = self.retry_delay
+
+         while retry_count < self.max_retries:
+             try:
+                 async with session.get(url) as response:
+                     if response.status == 429:  # Rate limit
+                         retry_after = response.headers.get('Retry-After', '60')
+                         wait_time = int(retry_after)
+
+                         await asyncio.sleep(wait_time)
+                         retry_count += 1
+                         continue
+
+                     if response.status == 200:
+                         return await response.text()
+                     else:
+                         return None
+
+             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                 if retry_count < self.max_retries - 1:
+                     await asyncio.sleep(current_delay)
+                     current_delay *= 2  # Exponential backoff
+                     retry_count += 1
+                 else:
+                     return None
+         return None
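A sketch of running the fetcher directly against a GitBook site; the URL is hypothetical. The crawl is sequential: it fetches the root page, extracts navigation links via SoupProcessor, and recurses over unvisited links with a 1-second delay between requests, plus retry/backoff handling for 429 responses and transient errors.

    import asyncio

    from h_ai.domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService

    async def main():
        fetcher = GitbookWebFetcherService("https://myproject.gitbook.io/docs")
        pages = await fetcher.fetch()
        for page in pages or []:
            print(page.index, page.title, page.url)

    asyncio.run(main())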
h_ai/domain/web_docs/web_docs_link_detector.py ADDED
@@ -0,0 +1,26 @@
+ from typing import List
+
+ from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
+ from ...domain.web_docs.doc_link_scorer_service import DocLinkScorerService
+ from ...domain.web_docs.web_link import WebLink
+ from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
+
+
+ class WebDocsLinkDetector:
+     def __init__(self, doc_link_scorer: DocLinkScorerService, web_fetcher: WebFetcherRepository, confidence_threshold: float = 0.5):
+         self.doc_link_scorer = doc_link_scorer
+         self.web_fetcher = web_fetcher
+         self.confidence_threshold = confidence_threshold
+
+     async def find_docs_links(self, website_url: str) -> List[WebLink]:
+         doc_links = []
+
+         web_content = await self.web_fetcher.fetch(website_url)
+
+         soup_processor = SoupProcessor(web_content)
+         web_links = soup_processor.extract_links(website_url)
+         for web_link in web_links:
+             score = self.doc_link_scorer.score(web_link.url, web_link.title)
+             if score >= self.confidence_threshold:
+                 doc_links.append(web_link)
+         return doc_links
h_ai/domain/web_docs/web_link.py ADDED
@@ -0,0 +1,11 @@
+ class WebLink:
+     def __init__(self, url: str, title: str):
+         self.url = url
+         self.title = title
+
+     def to_dict(self):
+         """Convert this WebLink instance to a serializable dictionary"""
+         return {
+             'url': self.url,
+             'title': self.title
+         }
h_ai/domain/webpages/__init__.py ADDED (empty file)
h_ai/domain/webpages/web_fetcher_repository.py ADDED
@@ -0,0 +1,10 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+
+ class WebFetcherRepository(ABC):
+
+     @abstractmethod
+     async def fetch(self, url: str) -> Optional[str]:
+         """Fetch the content of the given URL."""
+         pass
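The package's own implementation of this interface is PlayWrightWebContentFetcher (file 42 in the list, not shown in this section). As an illustration of the contract only, here is a hypothetical aiohttp-based fetcher, not the Playwright implementation shipped in the wheel:

    from typing import Optional

    import aiohttp

    from h_ai.domain.webpages.web_fetcher_repository import WebFetcherRepository

    class AiohttpWebFetcher(WebFetcherRepository):
        """Simple fetcher that returns the raw HTML of a page, or None on failure."""

        async def fetch(self, url: str) -> Optional[str]:
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 200:
                            return await response.text()
                        return None
            except aiohttp.ClientError:
                return None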