h-ai-brain 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h_ai/__init__.py +5 -0
- h_ai/application/__init__.py +0 -0
- h_ai/application/hai_service.py +48 -0
- h_ai/application/system_prompts/__init__.py +0 -0
- h_ai/application/system_prompts/roles/__init__.py +0 -0
- h_ai/application/web_docs_service.py +35 -0
- h_ai/domain/__init__.py +0 -0
- h_ai/domain/reasoning/__init__.py +0 -0
- h_ai/domain/reasoning/llm_chat_repository.py +9 -0
- h_ai/domain/reasoning/llm_generate_respository.py +6 -0
- h_ai/domain/reasoning/llm_tool_repository.py +14 -0
- h_ai/domain/reasoning/text_analysis.py +149 -0
- h_ai/domain/reasoning/tool_message.py +4 -0
- h_ai/domain/web_docs/__init__.py +0 -0
- h_ai/domain/web_docs/doc_link_scorer_service.py +45 -0
- h_ai/domain/web_docs/documentation_pattern_repository.py +44 -0
- h_ai/domain/web_docs/gitbook/__init__.py +0 -0
- h_ai/domain/web_docs/gitbook/text_chapter.py +18 -0
- h_ai/domain/web_docs/gitbook/text_page.py +46 -0
- h_ai/domain/web_docs/gitbook_web_fetcher_service.py +172 -0
- h_ai/domain/web_docs/web_docs_link_detector.py +26 -0
- h_ai/domain/web_docs/web_link.py +11 -0
- h_ai/domain/webpages/__init__.py +0 -0
- h_ai/domain/webpages/web_fetcher_repository.py +10 -0
- h_ai/domain/webpages/web_text_fetcher_repository.py +12 -0
- h_ai/infrastructure/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/soup_processor.py +240 -0
- h_ai/infrastructure/llm/__init__.py +0 -0
- h_ai/infrastructure/llm/data_handler.py +30 -0
- h_ai/infrastructure/llm/llm_response_cleaner.py +21 -0
- h_ai/infrastructure/llm/ollama/__init__.py +0 -0
- h_ai/infrastructure/llm/ollama/models/__init__.py +0 -0
- h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py +13 -0
- h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py +12 -0
- h_ai/infrastructure/llm/ollama/ollama_chat_repository.py +56 -0
- h_ai/infrastructure/llm/ollama/ollama_generate_repository.py +53 -0
- h_ai/infrastructure/llm/ollama/ollama_tool_repository.py +138 -0
- h_ai/infrastructure/llm/prompt_helper.py +7 -0
- h_ai/infrastructure/llm/prompt_loader.py +18 -0
- h_ai/infrastructure/playwright/__init__.py +0 -0
- h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +48 -0
- h_ai_brain-0.0.1.dist-info/METADATA +22 -0
- h_ai_brain-0.0.1.dist-info/RECORD +48 -0
- h_ai_brain-0.0.1.dist-info/WHEEL +5 -0
- h_ai_brain-0.0.1.dist-info/licenses/LICENSE +202 -0
- h_ai_brain-0.0.1.dist-info/licenses/NOTICE.txt +19 -0
- h_ai_brain-0.0.1.dist-info/top_level.txt +1 -0
h_ai/__init__.py
ADDED
h_ai/application/__init__.py
ADDED
File without changes

h_ai/application/hai_service.py
ADDED
@@ -0,0 +1,48 @@
+import datetime
+
+from h_message_bus import NatsPublisherAdapter
+from h_message_bus.domain.twitter_get_user_request_message import TwitterGetUserRequestMessage
+from h_message_bus.domain.twitter_get_user_response_message import TwitterGetUserResponseMessage
+from h_message_bus.domain.vector_save_request_message import VectorSaveRequestMessage
+
+from ..application.web_docs_service import WebDocsService
+
+
+class HaiService:
+    def __init__(self, nats_publisher_adapter: NatsPublisherAdapter):
+        self.nats_publisher_adapter = nats_publisher_adapter
+        self.web_docs_service = WebDocsService()
+
+    async def detect_and_store_documentation(self, twitter_screen_name: str):
+        req_message = TwitterGetUserRequestMessage.create_message(twitter_screen_name)
+        response = await self.nats_publisher_adapter.request(req_message)
+        twitter_user = TwitterGetUserResponseMessage.from_hai_message(response)
+
+        if twitter_user.url is not None:
+            print(f"Documentation found for {twitter_user.screen_name}: {twitter_user.url}")
+            docs = await self.web_docs_service.discover_documentation(twitter_user.url)
+
+            for doc in docs:
+                collection_name = f"{twitter_user.screen_name}_docs"
+                chapters = doc.chapters
+                for chapter in chapters:
+                    i = 0
+                    for text in chapter.paragraphs:
+                        document_id = f"{doc.title}_{chapter.heading}_{i}"
+
+                        req_metadata = {
+                            "source": doc.url,
+                            "updated": datetime.datetime.utcnow()
+                        }
+                        i = i + 1
+
+                        request = VectorSaveRequestMessage.create_message(
+                            collection_name=collection_name,
+                            document_id=document_id,
+                            content=text,
+                            metadata=req_metadata)
+
+                        await self.nats_publisher_adapter.publish(request)
+
+        else:
+            print(f"No documentation found for {twitter_user.screen_name}")
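
For reference, the loop above writes one vector record per chapter paragraph, with the document ID built from the page title, the chapter heading, and a running index. A minimal standalone sketch of that ID/metadata scheme (plain Python, hypothetical sample data, no message bus involved):

    import datetime

    # Hypothetical stand-ins for doc.title and chapter.paragraphs from the service above
    doc_title = "Example Docs"
    chapters = {"Getting Started": ["Install the SDK.", "Configure your API key."]}

    records = []
    for heading, paragraphs in chapters.items():
        for i, text in enumerate(paragraphs):
            records.append({
                "document_id": f"{doc_title}_{heading}_{i}",  # same scheme as hai_service.py
                "content": text,
                "metadata": {"source": "https://example.com/docs",
                             "updated": datetime.datetime.utcnow()},
            })

    print(len(records))  # 2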

h_ai/application/system_prompts/__init__.py
ADDED
File without changes

h_ai/application/system_prompts/roles/__init__.py
ADDED
File without changes

h_ai/application/web_docs_service.py
ADDED
@@ -0,0 +1,35 @@
+from typing import List, Optional
+
+from ..domain.web_docs.doc_link_scorer_service import DocLinkScorerService
+from ..domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
+from ..domain.web_docs.gitbook.text_page import TextPage
+from ..domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService
+from ..domain.web_docs.web_docs_link_detector import WebDocsLinkDetector
+from ..domain.web_docs.web_link import WebLink
+from ..infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher
+
+
+class WebDocsService:
+
+    def __init__(self):
+        self.pattern_repo = DocumentationPatternRepository()
+        self.scorer = DocLinkScorerService(self.pattern_repo)
+        self.headless_browser = PlayWrightWebContentFetcher()
+        self.web_link_detector = WebDocsLinkDetector(
+            self.scorer,
+            self.headless_browser)
+
+
+    async def discover_documentation(self, website_url: str) -> Optional[List[TextPage]]:
+        detected_links = await self.detect_documentation_links(website_url)
+        for link in detected_links:
+            gitbook_fetcher = GitbookWebFetcherService(link.url)
+            gitbook_pages = await gitbook_fetcher.fetch()
+            return gitbook_pages
+
+    async def detect_documentation_links(self, website_url: str) -> List[WebLink]:
+        """
+        Function to detect documentation links from a website
+        Returns a list of potential documentation root URLs
+        """
+        return await self.web_link_detector.find_docs_links(website_url)
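
A possible way to drive WebDocsService from a script, assuming the installed package is importable as h_ai and its Playwright/aiohttp dependencies (and internal imports) resolve; the URL is a placeholder:

    import asyncio

    from h_ai.application.web_docs_service import WebDocsService

    async def main() -> None:
        service = WebDocsService()
        # Score outbound links first, then crawl the best candidate as a GitBook site
        links = await service.detect_documentation_links("https://example.com")
        print([link.url for link in links])

        pages = await service.discover_documentation("https://example.com")
        for page in pages or []:  # discover_documentation may return None
            print(page.title, len(page.chapters))

    asyncio.run(main())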
h_ai/domain/__init__.py
ADDED
File without changes

h_ai/domain/reasoning/__init__.py
ADDED
File without changes

h_ai/domain/reasoning/llm_tool_repository.py
ADDED
@@ -0,0 +1,14 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from ...domain.reasoning.tool_message import ToolMessage
+
+
+class LlmToolRepository(ABC):
+    @abstractmethod
+    def find_tools_in_message(self, message: str) -> List[ToolMessage] | None:
+        ...
+
+    @abstractmethod
+    def build_tool_response_prompt(self, question: str, tool_results: list[str])-> str|None:
+        ...
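
LlmToolRepository defines an abstract port. A minimal hypothetical implementation, shown only to illustrate the contract (assuming h_ai and its ToolMessage type are importable):

    from typing import List

    from h_ai.domain.reasoning.llm_tool_repository import LlmToolRepository
    from h_ai.domain.reasoning.tool_message import ToolMessage


    class NoOpToolRepository(LlmToolRepository):
        """Hypothetical adapter that never detects tool calls."""

        def find_tools_in_message(self, message: str) -> List[ToolMessage] | None:
            # A real adapter would parse the LLM output for tool invocations here.
            return None

        def build_tool_response_prompt(self, question: str, tool_results: list[str]) -> str | None:
            if not tool_results:
                return None
            joined = "\n".join(tool_results)
            return f"Question: {question}\nTool results:\n{joined}\nAnswer using the results above."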

h_ai/domain/reasoning/text_analysis.py
ADDED
@@ -0,0 +1,149 @@
+import uuid
+from typing import List, Dict, Union, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class Sentiment:
+    """Domain object representing the sentiment of analyzed text"""
+    score: float
+    label: str  # 'negative', 'neutral', or 'positive'
+
+
+@dataclass
+class DetectedEntity:
+    """Domain object representing an entity detected in analyzed text"""
+    name: str
+    type: str
+    mentions: List[str] = None
+
+    def __post_init__(self):
+        if self.mentions is None:
+            self.mentions = []
+
+
+@dataclass
+class DetectedRelationship:
+    """Domain object representing a relationship between entities detected in text"""
+    source_entity_name: str
+    relation_type: str
+    target_entity_name: str
+    confidence: float = 0.0  # Confidence score of this relationship detection
+
+
+@dataclass
+class ExtractedFact:
+    """Domain object representing a factual statement extracted from text"""
+    statement: str
+    referenced_entities: List[str] = None  # Names of entities referenced in this fact
+
+    def __post_init__(self):
+        if self.referenced_entities is None:
+            self.referenced_entities = []
+
+
+@dataclass
+class TextAnalysisResult:
+    """Root aggregate for the result of analyzing a piece of text"""
+    text_id: str
+    category: str
+    topic: str
+    sentiment: Sentiment
+    entities: List[DetectedEntity]
+    relationships: List[DetectedRelationship]
+    facts: List[ExtractedFact]
+
+    def __init__(self, text_id: str, category: str, topic: str,
+                 sentiment_score: float = 0.0,
+                 sentiment_label: str = "neutral",
+                 entities: Optional[List[Dict[str, Union[str, List[str]]]]] = None,
+                 relationships: Optional[List[Dict[str, Union[str, float]]]] = None,
+                 facts: Optional[List[Dict[str, Union[str, List[str]]]]] = None):
+
+        self.text_id = text_id
+        self.category = category
+        self.topic = topic
+        self.sentiment = Sentiment(sentiment_score, sentiment_label)
+
+        # Process entities
+        self.entities = []
+        if entities:
+            for entity_data in entities:
+                entity = DetectedEntity(
+                    name=entity_data["name"],
+                    type=entity_data["type"],
+                    mentions=entity_data.get("mentions", [])
+                )
+                self.entities.append(entity)
+
+        # Process relationships
+        self.relationships = []
+        if relationships:
+            for rel_data in relationships:
+                rel = DetectedRelationship(
+                    source_entity_name=rel_data["source_entity_name"],
+                    relation_type=rel_data["relation_type"],
+                    target_entity_name=rel_data["target_entity_name"],
+                    confidence=rel_data.get("confidence", 0.0)
+                )
+                self.relationships.append(rel)
+
+        # Process facts
+        self.facts = []
+        if facts:
+            for fact_data in facts:
+                fact = ExtractedFact(
+                    statement=fact_data["statement"],
+                    referenced_entities=fact_data.get("referenced_entities", [])
+                )
+                self.facts.append(fact)
+
+    def to_dict(self) -> Dict[str, Union[str, Dict, List]]:
+        """Convert the TextAnalysisResult to a dictionary"""
+        return {
+            "text_id": self.text_id,
+            "category": self.category,
+            "topic": self.topic,
+            "sentiment": {
+                "score": self.sentiment.score,
+                "label": self.sentiment.label
+            },
+            "entities": [
+                {
+                    "name": entity.name,
+                    "type": entity.type,
+                    "mentions": entity.mentions
+                } for entity in self.entities
+            ],
+            "relationships": [
+                {
+                    "source_entity_name": rel.source_entity_name,
+                    "relation_type": rel.relation_type,
+                    "target_entity_name": rel.target_entity_name,
+                    "confidence": rel.confidence
+                } for rel in self.relationships
+            ],
+            "facts": [
+                {
+                    "statement": fact.statement,
+                    "referenced_entities": fact.referenced_entities
+                } for fact in self.facts
+            ]
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict) -> 'TextAnalysisResult':
+        """Create a TextAnalysisResult from a dictionary"""
+        random_guid = uuid.uuid4()
+        random_guid_str = str(uuid.uuid4())
+
+        return cls(
+            text_id=random_guid_str,
+            category=data["category"],
+            topic=data["topic"],
+            sentiment_score=data["sentiment"]["score"],
+            sentiment_label=data["sentiment"]["label"],
+            entities=data.get("entities", []),
+            relationships=data.get("relationships", []),
+            facts=data.get("facts", [])
+        )
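
A round trip through from_dict and to_dict with a hypothetical payload (assuming h_ai is importable). Note that from_dict assigns a freshly generated UUID as text_id rather than reading one from the input dictionary:

    from h_ai.domain.reasoning.text_analysis import TextAnalysisResult

    payload = {
        "category": "announcement",
        "topic": "product launch",
        "sentiment": {"score": 0.8, "label": "positive"},
        "entities": [{"name": "Acme", "type": "organization", "mentions": ["Acme Corp"]}],
        "relationships": [{"source_entity_name": "Acme", "relation_type": "launches",
                           "target_entity_name": "Widget", "confidence": 0.9}],
        "facts": [{"statement": "Acme launched Widget.", "referenced_entities": ["Acme", "Widget"]}],
    }

    result = TextAnalysisResult.from_dict(payload)
    print(result.text_id)                 # random UUID string
    print(result.to_dict()["sentiment"])  # {'score': 0.8, 'label': 'positive'}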

h_ai/domain/web_docs/__init__.py
ADDED
File without changes

h_ai/domain/web_docs/doc_link_scorer_service.py
ADDED
@@ -0,0 +1,45 @@
+import re
+from urllib.parse import urlparse
+
+from ...domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
+
+
+class DocLinkScorerService:
+    """Service for scoring potential documentation links"""
+
+    def __init__(self, pattern_repo: DocumentationPatternRepository):
+        self.pattern_repo = pattern_repo
+
+    def score(self, full_url: str, link_text: str) -> float:
+        """
+        Score a link based on how likely it is to be documentation
+        Returns a value between 0.0 and 1.0
+        """
+        score = 0.0
+        max_score = 3.0  # Maximum possible score
+
+        # Parse the URL
+        parsed_url = urlparse(full_url)
+        domain = parsed_url.netloc
+        path = parsed_url.path
+
+        # Check domain patterns
+        for doc_domain in self.pattern_repo.doc_domains:
+            if doc_domain in domain:
+                score += 1.0
+                break
+
+        # Check path patterns
+        for path_pattern in self.pattern_repo.doc_path_patterns:
+            if re.search(path_pattern, path):
+                score += 1.0
+                break
+
+        # Check link text patterns
+        for text_pattern in self.pattern_repo.doc_text_patterns:
+            if re.search(text_pattern, link_text):
+                score += 1.0
+                break
+
+        # Normalize score to 0.0-1.0 range
+        return min(score / max_score, 1.0)
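
A quick check of the scorer against a hypothetical link, assuming h_ai is importable. Here the path pattern (/docs/) and the link text ("Documentation") each contribute 1.0 of the 3.0 maximum, while the domain contributes nothing:

    from h_ai.domain.web_docs.doc_link_scorer_service import DocLinkScorerService
    from h_ai.domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository

    scorer = DocLinkScorerService(DocumentationPatternRepository())

    score = scorer.score("https://example.com/docs/getting-started", "Documentation")
    print(round(score, 2))  # 0.67: two of the three pattern groups matched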

h_ai/domain/web_docs/documentation_pattern_repository.py
ADDED
@@ -0,0 +1,44 @@
+class DocumentationPatternRepository:
+    """Repository of patterns that indicate documentation links"""
+
+    def __init__(self):
+        # Domain patterns that commonly host documentation
+        self.doc_domains = [
+            "gitbook.io",
+            "readthedocs.io",
+            "docs.github.com",
+            "developer.mozilla.org",
+            "confluence.",
+            "zendesk.com",
+            "help.",
+            "support.",
+            "wiki.",
+        ]
+
+        # URL path patterns that commonly indicate documentation
+        self.doc_path_patterns = [
+            r"/docs/",
+            r"/documentation/",
+            r"/guide/",
+            r"/manual/",
+            r"/help/",
+            r"/knowledge/",
+            r"/support/",
+            r"/api/",
+            r"/reference/",
+            r"/wiki/",
+        ]
+
+        # Link text patterns that suggest documentation
+        self.doc_text_patterns = [
+            r"(?i)documentation",
+            r"(?i)docs",
+            r"(?i)developer guide",
+            r"(?i)user guide",
+            r"(?i)knowledge base",
+            r"(?i)help center",
+            r"(?i)manual",
+            r"(?i)api reference",
+            r"(?i)getting started",
+            r"(?i)learn more",
+        ]
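
These patterns are applied with re.search, so the path patterns only fire when the keyword is bounded by slashes. A small standard-library illustration of that matching rule (the pattern subset is copied from the repository above):

    import re

    doc_path_patterns = [r"/docs/", r"/documentation/", r"/guide/"]

    def matches_doc_path(path: str) -> bool:
        return any(re.search(pattern, path) for pattern in doc_path_patterns)

    print(matches_doc_path("/docs/intro"))  # True
    print(matches_doc_path("/docsearch"))   # False: "docs" is not followed by a slash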

h_ai/domain/web_docs/gitbook/__init__.py
ADDED
File without changes

h_ai/domain/web_docs/gitbook/text_chapter.py
ADDED
@@ -0,0 +1,18 @@
+from dataclasses import dataclass, field
+from typing import List
+
+
+@dataclass
+class TextChapter:
+    """Represents a chapter/section in a page defined by a heading."""
+    heading: str
+    level: int  # h1=1, h2=2, etc.
+    paragraphs: List[str] = field(default_factory=list)
+
+    def to_dict(self):
+        """Convert this TextChapter instance to a serializable dictionary"""
+        return {
+            'heading': self.heading,
+            'level': self.level,
+            'paragraphs': self.paragraphs
+        }
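
TextChapter is a plain dataclass; constructing one and serializing it (assuming h_ai is importable):

    from h_ai.domain.web_docs.gitbook.text_chapter import TextChapter

    chapter = TextChapter(heading="Getting Started", level=2,
                          paragraphs=["Install the SDK.", "Configure your API key."])
    print(chapter.to_dict())
    # {'heading': 'Getting Started', 'level': 2, 'paragraphs': ['Install the SDK.', 'Configure your API key.']}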

h_ai/domain/web_docs/gitbook/text_page.py
ADDED
@@ -0,0 +1,46 @@
+import hashlib
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+from src.h_ai.domain.web_docs.gitbook.text_chapter import TextChapter
+
+
+@dataclass
+class TextPage:
+    """Represents text on a page from a web document"""
+    url: str = ""
+    title: str = ""
+    content: str = ""
+    last_updated: str = ""
+
+    index: int = 0
+    toc_level: int = 0
+    parent_id: str = ""
+
+    chapters: List[TextChapter] = field(default_factory=list)
+    links: Dict[str, str] = field(default_factory=dict)  # Text -> URL
+
+    id: str = field(init=False, default="")
+    content_hash: str = field(init=False, default="")
+
+    def __post_init__(self):
+        self.id = hashlib.md5(self.url.encode()).hexdigest()
+        # Generate a content hash for deduplication
+        self.content_hash = hashlib.md5(self.content.encode()).hexdigest() if self.content else ""
+
+    def to_dict(self):
+        """Convert this TextPage instance to a serializable dictionary"""
+        result = {
+            'url': self.url,
+            'title': self.title,
+            #'content': self.content,
+            'last_updated': self.last_updated,
+            'index': self.index,
+            'toc_level': self.toc_level,
+            'parent_id': self.parent_id,
+            'id': self.id,
+            'content_hash': self.content_hash,
+            'links': self.links,
+            'chapters': [chapter.to_dict() for chapter in self.chapters]
+        }
+        return result
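
TextPage derives id from the URL and content_hash from the raw content in __post_init__, which is what the GitBook fetcher uses for deduplication. A sketch of that behaviour, assuming the module's src.h_ai import of TextChapter resolves in your environment:

    import hashlib

    from h_ai.domain.web_docs.gitbook.text_page import TextPage

    a = TextPage(url="https://example.com/docs/a", content="<p>same body</p>")
    b = TextPage(url="https://example.com/docs/b", content="<p>same body</p>")

    print(a.id == b.id)                      # False: ids are derived from the URL
    print(a.content_hash == b.content_hash)  # True: identical content hashes the same
    print(a.content_hash == hashlib.md5(b"<p>same body</p>").hexdigest())  # True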

h_ai/domain/web_docs/gitbook_web_fetcher_service.py
ADDED
@@ -0,0 +1,172 @@
+import asyncio
+import json
+from typing import Optional, Set, Dict, List
+from urllib.parse import urlparse
+
+import aiohttp
+
+from ...domain.web_docs.gitbook.text_page import TextPage
+from ...domain.webpages.web_text_fetcher_repository import WebTextFetcherRepository
+from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
+
+
+class GitbookWebFetcherService(WebTextFetcherRepository):
+
+    def __init__(self, url: str):
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5'
+        }
+        self.base_url = url.rstrip('/')
+        self.base_domain = urlparse(self.base_url).netloc
+
+        self.delay = 1.0  # Delay between requests in seconds
+        self.max_retries = 3
+        self.retry_delay = 2  # Initial retry delay in seconds
+        self.concurrent_requests = 3  # Number of concurrent requests
+
+        self.visited_urls: Set[str] = set()
+        self.content_hashes: Dict[str, str] = {}  # Hash -> URL
+        self.pages: Dict[str, TextPage] = {}
+
+    async def fetch(self) -> Optional[List[TextPage]]:
+        timeout = aiohttp.ClientTimeout(total=180)  # 3 minutes total timeout
+        async with aiohttp.ClientSession(headers=self.headers, timeout=timeout) as session:
+            # Start with main page
+
+            await self._process_url(session, self.base_url)
+
+            # Wait for all tasks to complete
+            await asyncio.sleep(0)
+
+            # Sort pages by index
+            sorted_pages = sorted(
+                self.pages.values(),
+                key=lambda p: p.index
+            )
+            return sorted_pages
+
+    async def _process_url(self, session: aiohttp.ClientSession, url: str) -> None:
+        if url in self.visited_urls:
+            return
+        print(f"Processing {url}")
+        self.visited_urls.add(url)
+
+        # Fetch page content
+        html_content = await self._fetch_page(session, url)
+        if not html_content:
+            return
+
+        # Extract page content
+        page = await GitbookWebFetcherService.extract_page_content(url, html_content)
+        if not page:
+            return
+
+        # Check for duplicate content
+        if page.content_hash in self.content_hashes:
+            return
+
+        # Set page index
+        page.index = len(self.pages)
+
+        # Add page to collection
+        self.pages[url] = page
+        self.content_hashes[page.content_hash] = url
+
+        # Extract navigation links from this page
+        nav_links = await GitbookWebFetcherService.gitbook_extract_navigation(self.base_url, html_content)
+
+        #Process the discovered links
+        for link in nav_links:
+            if link not in self.visited_urls:
+                # Add delay between requests
+                await asyncio.sleep(self.delay)
+                # Process the URL
+                await self._process_url(session, link)
+
+    @staticmethod
+    async def extract_page_content(url: str, html_content: str) -> Optional[TextPage]:
+        try:
+            soup_processor = SoupProcessor(html_content)
+
+            title = soup_processor.extract_title()
+            if not title:
+                title = urlparse(url).path.split('/')[-1] or "Index"
+                title = title.replace('-', ' ').replace('_', ' ').title()
+
+            last_updated = soup_processor.extract_last_updated_refs_from_soup()
+
+            body_tag = soup_processor.find_body_content()
+            if body_tag is None:
+                return None
+            soup_processor.clean_template_usage(body_tag)
+            chapters = soup_processor.extract_chapters(content=body_tag)
+
+            return TextPage(
+                url=url,
+                title=title,
+                content=html_content,
+                last_updated=last_updated,
+                chapters=chapters,
+            )
+        except Exception as e:
+            return None
+
+    @staticmethod
+    async def gitbook_extract_navigation(base_url: str, html_content: str) -> List[str]:
+        """Extract navigation links from a page"""
+        try:
+
+            soup_processor = SoupProcessor(html_content)
+
+            nav_links = []
+            processed_urls = set()
+
+            # Extract links from modern layout
+            nav_links.extend(soup_processor.gitbook_extract_modern_nav(base_url, processed_urls))
+
+            # Extract links from traditional layout
+            nav_links.extend(soup_processor.gitbook_extract_traditional_nav(base_url, processed_urls))
+
+            # Extract links from pagination elements
+            nav_links.extend(soup_processor.gitbook_extract_pagination_links(base_url, processed_urls))
+
+            # Extract links from search for specific class patterns
+            nav_links.extend(soup_processor.gitbook_extract_class_based_nav(base_url, processed_urls))
+
+            # Remove duplicates while preserving order
+            return list(dict.fromkeys(nav_links))
+
+        except Exception as e:
+            return []
+
+    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
+        """Fetch a page with retry logic"""
+        retry_count = 0
+        current_delay = self.retry_delay
+
+        while retry_count < self.max_retries:
+            try:
+                async with session.get(url) as response:
+                    if response.status == 429:  # Rate limit
+                        retry_after = response.headers.get('Retry-After', '60')
+                        wait_time = int(retry_after)
+
+                        await asyncio.sleep(wait_time)
+                        retry_count += 1
+                        continue
+
+                    if response.status == 200:
+                        return await response.text()
+                    else:
+                        return None
+
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                if retry_count < self.max_retries - 1:
+                    await asyncio.sleep(current_delay)
+                    current_delay *= 2  # Exponential backoff
+                    retry_count += 1
+                else:
+                    return None
+        return None
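
Driving the fetcher directly, assuming the package plus aiohttp and BeautifulSoup are installed and its internal imports resolve; the URL is a placeholder for a GitBook-hosted documentation site:

    import asyncio

    from h_ai.domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService

    async def main() -> None:
        fetcher = GitbookWebFetcherService("https://docs.example.com")
        pages = await fetcher.fetch()  # follows navigation links recursively from the base URL
        for page in pages or []:
            print(page.index, page.title, page.url)

    asyncio.run(main())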

h_ai/domain/web_docs/web_docs_link_detector.py
ADDED
@@ -0,0 +1,26 @@
+from typing import List
+
+from ...domain.webpages.web_fetcher_repository import WebFetcherRepository
+from ...domain.web_docs.doc_link_scorer_service import DocLinkScorerService
+from ...domain.web_docs.web_link import WebLink
+from ...infrastructure.beautifulsoup.soup_processor import SoupProcessor
+
+
+class WebDocsLinkDetector:
+    def __init__(self, doc_link_scorer: DocLinkScorerService, web_fetcher: WebFetcherRepository, confidence_threshold: float = 0.5):
+        self.doc_link_scorer = doc_link_scorer
+        self.web_fetcher = web_fetcher
+        self.confidence_threshold = confidence_threshold
+
+    async def find_docs_links(self, website_url: str) -> List[WebLink]:
+        doc_links = []
+
+        web_content = await self.web_fetcher.fetch(website_url)
+
+        soup_processor = SoupProcessor(web_content)
+        web_links = soup_processor.extract_links(website_url)
+        for web_link in web_links:
+            score = self.doc_link_scorer.score(web_link.url, web_link.title)
+            if score >= self.confidence_threshold:
+                doc_links.append(web_link)
+        return doc_links
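
WebDocsLinkDetector only calls fetch(url) on the injected fetcher, so it can be exercised with a hypothetical stub that returns canned HTML (assuming h_ai and BeautifulSoup are available for SoupProcessor):

    import asyncio

    from h_ai.domain.web_docs.doc_link_scorer_service import DocLinkScorerService
    from h_ai.domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
    from h_ai.domain.web_docs.web_docs_link_detector import WebDocsLinkDetector

    class StaticFetcher:
        """Hypothetical stand-in for a WebFetcherRepository implementation."""

        async def fetch(self, url: str) -> str:
            return '<html><body><a href="/docs/">Documentation</a></body></html>'

    async def main() -> None:
        detector = WebDocsLinkDetector(
            DocLinkScorerService(DocumentationPatternRepository()),
            StaticFetcher())
        links = await detector.find_docs_links("https://example.com")
        print([link.url for link in links])

    asyncio.run(main())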

h_ai/domain/webpages/__init__.py
ADDED
File without changes