h-ai-brain 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h_ai/__init__.py +1 -3
- h_ai/application/hai_service.py +0 -51
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/METADATA +2 -8
- h_ai_brain-0.0.16.dist-info/RECORD +31 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/WHEEL +1 -1
- h_ai/application/priority_queue_service.py +0 -30
- h_ai/application/web_docs_service.py +0 -36
- h_ai/domain/priorityqueue/__init__.py +0 -0
- h_ai/domain/priorityqueue/priority_queue_repository.py +0 -34
- h_ai/domain/priorityqueue/queue_item.py +0 -43
- h_ai/domain/web_docs/__init__.py +0 -0
- h_ai/domain/web_docs/doc_link_scorer_service.py +0 -45
- h_ai/domain/web_docs/documentation_pattern_repository.py +0 -44
- h_ai/domain/web_docs/ecosystem_link_scorer_service.py +0 -83
- h_ai/domain/web_docs/ecosystem_pattern_repository.py +0 -182
- h_ai/domain/web_docs/gitbook/__init__.py +0 -0
- h_ai/domain/web_docs/gitbook/text_chapter.py +0 -18
- h_ai/domain/web_docs/gitbook/text_page.py +0 -46
- h_ai/domain/web_docs/gitbook_web_fetcher_service.py +0 -171
- h_ai/domain/web_docs/web_docs_link_detector.py +0 -28
- h_ai/domain/web_docs/web_link.py +0 -11
- h_ai/domain/webpages/__init__.py +0 -0
- h_ai/domain/webpages/web_fetcher_repository.py +0 -10
- h_ai/domain/webpages/web_text_fetcher_repository.py +0 -12
- h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
- h_ai/infrastructure/beautifulsoup/soup_processor.py +0 -240
- h_ai/infrastructure/playwright/__init__.py +0 -0
- h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +0 -64
- h_ai/infrastructure/priorityqueue/__init__.py +0 -0
- h_ai/infrastructure/priorityqueue/in_memory_priority_queue_repository.py +0 -98
- h_ai_brain-0.0.15.dist-info/RECORD +0 -56
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/LICENSE +0 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/licenses/NOTICE.txt +0 -0
- {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/top_level.txt +0 -0
h_ai/__init__.py
CHANGED
h_ai/application/hai_service.py
CHANGED
@@ -1,58 +1,7 @@
 from h_message_bus import NatsPublisherAdapter
-from h_message_bus.domain.request_messages.twitter_get_user_request_message import TwitterGetUserRequestMessage
-from h_message_bus.domain.request_messages.twitter_get_user_response_message import TwitterGetUserResponseMessage
-from h_message_bus.domain.request_messages.vector_read_metadata_request_message import VectorReadMetaDataRequestMessage
-from h_message_bus.domain.request_messages.vector_read_metadata_response_message import \
-    VectorReadMetaDataResponseMessage
-from h_message_bus.domain.request_messages.vector_save_request_message import VectorSaveRequestMessage
-
-from .priority_queue_service import PriorityQueueService
-from ..application.web_docs_service import WebDocsService
-from ..infrastructure.priorityqueue.in_memory_priority_queue_repository import InMemoryPriorityQueueRepository
-
 
 class HaiService:
     def __init__(self, nats_publisher_adapter: NatsPublisherAdapter):
         self.nats_publisher_adapter = nats_publisher_adapter
-        self.web_docs_service = WebDocsService()
-        queue = InMemoryPriorityQueueRepository()
-        self.queue_service = PriorityQueueService(queue)
-
-    async def detect_and_store_documentation(self, twitter_screen_name: str):
-        req_message = TwitterGetUserRequestMessage.create_message(twitter_screen_name)
-        response = await self.nats_publisher_adapter.request(req_message)
-        twitter_user = TwitterGetUserResponseMessage.from_hai_message(response)
-
-        if twitter_user.url is not None:
-            print(f"Documentation found for {twitter_user.screen_name}: {twitter_user.url}")
-            docs = await self.web_docs_service.discover_documentation(twitter_user.url)
-
-            for doc in docs:
-                collection_name = f"{twitter_user.screen_name}_docs"
-                chapters = doc.chapters
-                for chapter in chapters:
-                    i = 0
-                    for text in chapter.paragraphs:
-                        document_id = f"{doc.title}_{chapter.heading}_{i}"
-
-                        req_metadata = {
-                            "source": doc.url
-                        }
-                        i = i + 1
-
-                        request = VectorSaveRequestMessage.create_message(
-                            collection_name=collection_name,
-                            document_id=document_id,
-                            content=text,
-                            metadata=req_metadata)
-
-                        await self.nats_publisher_adapter.publish(request)
 
-        else:
-            print(f"No documentation found for {twitter_user.screen_name}")
 
-    async def load_current_knowledge_base_metadata(self) -> VectorReadMetaDataResponseMessage:
-        message = VectorReadMetaDataRequestMessage.create_message()
-        response = await self.nats_publisher_adapter.request(message)
-        metadata_result = VectorReadMetaDataResponseMessage.from_hai_message(response)
-        return metadata_result
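Net effect of this hunk: the Twitter-driven documentation crawl and the vector-store calls are gone, and hai_service.py in 0.0.16 reduces to constructor wiring. Assembled from the surviving context lines above (a reconstruction, not copied from the release):

from h_message_bus import NatsPublisherAdapter


class HaiService:
    def __init__(self, nats_publisher_adapter: NatsPublisherAdapter):
        self.nats_publisher_adapter = nats_publisher_adapter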
{h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.16.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: h_ai_brain
-Version: 0.0.15
+Version: 0.0.16
 Summary: AI Research agent API
 Author-email: shoebill <shoebill.hai@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -10,13 +10,7 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE.txt
-Requires-Dist:
-Requires-Dist: h_message_bus~=0.0.19
-Requires-Dist: aiohttp~=3.11.16
-Requires-Dist: bs4~=0.0.2
-Requires-Dist: beautifulsoup4~=4.13.3
-Requires-Dist: tldextract~=5.2.0
-Requires-Dist: playwright~=1.51.0
+Requires-Dist: h_message_bus~=0.0.21
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
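The dependency trim is the headline change: aiohttp, bs4, beautifulsoup4, tldextract, and playwright are all dropped, and h_message_bus is bumped from ~=0.0.19 to ~=0.0.21, leaving a single runtime dependency. One way to confirm the installed metadata matches this diff (a sketch, run after installing 0.0.16; exact output formatting may vary):

from importlib.metadata import requires, version

print(version("h_ai_brain"))   # 0.0.16
print(requires("h_ai_brain"))  # expected per the METADATA above:
                               # ['h_message_bus~=0.0.21', 'pytest; extra == "dev"']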
h_ai_brain-0.0.16.dist-info/RECORD
ADDED
@@ -0,0 +1,31 @@
+h_ai/__init__.py,sha256=63uVFHPxXmLrZVo2ZPixL2cU4jwf3XTAuwIVGHGkqJI,75
+h_ai/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/application/hai_service.py,sha256=bGKUpOnarxQKLqo7NIFFjDKTIDmzkSTydQIUjkfuTnE,206
+h_ai/application/system_prompts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/application/system_prompts/roles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/domain/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/domain/reasoning/llm_chat_repository.py,sha256=rY2izDyaDnoyyrCRS1qc9erHB98vARj4Mp-SnPwNhyY,211
+h_ai/domain/reasoning/llm_generate_respository.py,sha256=DPiV6ldCE8YhDdVb5rj98MBudKalDQHV3CZ2ADTm_f8,178
+h_ai/domain/reasoning/llm_tool_repository.py,sha256=nFwqtrJ0Gw8XUFX0uuO7-UejtgoqNuGeT51qZPQtxas,401
+h_ai/domain/reasoning/text_analysis.py,sha256=rmCUHWzJ3muFBorVXx7HcU2Sw-UfXFOuAVXRAPkqS8E,5183
+h_ai/domain/reasoning/tool_message.py,sha256=jpbfbJXj6oqZyB3lDxGOUyFB4faHtXAaEOVBHgTgSnk,67
+h_ai/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/infrastructure/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/infrastructure/llm/data_handler.py,sha256=M2h1azkjBP9GyaffTggKQZb2CQmOvAk2yo9NrsFMYAo,987
+h_ai/infrastructure/llm/llm_response_cleaner.py,sha256=pp1K7I77hagrC1r6Ib61-iSNQnU6wlM54bRmOUa7eFk,859
+h_ai/infrastructure/llm/prompt_helper.py,sha256=QjxPbNW7hu2wBIi9GLJ7r00ELytT2Wr1JKDAA1jB2U4,238
+h_ai/infrastructure/llm/prompt_loader.py,sha256=8h6QGq_h-s-UzC9fIxpeMCrhjHd7347NaAq7uLf-ook,455
+h_ai/infrastructure/llm/ollama/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/infrastructure/llm/ollama/ollama_chat_repository.py,sha256=GALea7UWLtKyt767Frtl3uv8rvy42HrOKMIQGpqq-H0,2108
+h_ai/infrastructure/llm/ollama/ollama_generate_repository.py,sha256=jorJrsIR3WPkvls7NE3BXllEtiDePCgMX5DFADz2s8E,1712
+h_ai/infrastructure/llm/ollama/ollama_tool_repository.py,sha256=7UZ-qsgXQUcJFx1qY7SVI7p3FhIy0Drdqs7jZIp42Ag,4683
+h_ai/infrastructure/llm/ollama/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py,sha256=ZIz4PQ3869vI3xAYYufPrxXpacajRDtOI8RDl5Dm9RQ,305
+h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py,sha256=GZ_ddpbWa8iy6NZq50vokUFVZBiX0WNa81z9-r9RzTY,392
+h_ai_brain-0.0.16.dist-info/licenses/LICENSE,sha256=SbvpEU5JIU3yzMMkyzrI0dGqHDoJR_lMKGdl6GZHsy4,11558
+h_ai_brain-0.0.16.dist-info/licenses/NOTICE.txt,sha256=vxeIKUiGqAePLvDW4AVm3Xh-3BcsvMtCMn1tbsr9zsE,668
+h_ai_brain-0.0.16.dist-info/METADATA,sha256=gZ_38sf9j-XybYHejsoBrkTYJEhnq5dbT25w4mv6KrI,536
+h_ai_brain-0.0.16.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+h_ai_brain-0.0.16.dist-info/top_level.txt,sha256=3MChDBWvDJV4cEHuZhzeODxQ4ewtw-arOuyaDOc6sIo,5
+h_ai_brain-0.0.16.dist-info/RECORD,,
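Each RECORD row is path,sha256=<digest>,<size>, where the digest is the file's SHA-256 in unpadded URL-safe base64 (per the wheel spec); the 47DEQpj8... value repeated on every empty __init__.py is simply the hash of zero bytes. A sketch of how one of these digests can be recomputed for verification:

import base64
import hashlib

def record_digest(path: str) -> str:
    """Recompute a wheel RECORD-style sha256 value for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Zero bytes hash to the value seen on every empty __init__.py above:
empty = base64.urlsafe_b64encode(hashlib.sha256(b"").digest()).rstrip(b"=").decode()
assert empty == "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU"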
h_ai/application/priority_queue_service.py
DELETED
@@ -1,30 +0,0 @@
-
-from typing import Any, List, Optional
-
-from ..domain.priorityqueue.priority_queue_repository import PriorityQueueRepository
-from ..domain.priorityqueue.queue_item import QueueItem
-
-
-class PriorityQueueService:
-    """Application service to manage priority queue operations"""
-
-    def __init__(self, repository: PriorityQueueRepository):
-        self.repository = repository
-
-    def add_item(self, queue_name: str, content: Any, priority: int, metadata: Optional[dict] = None) -> QueueItem:
-        """Add an item to the specified queue"""
-        item = QueueItem.create(content, priority, metadata)
-        self.repository.add_item(queue_name, item)
-        return item
-
-    def get_next_item(self, queue_name: str, block: bool = True, timeout: float = 30.0) -> Optional[QueueItem]:
-        """Get and remove the highest priority item from the queue"""
-        return self.repository.get_highest_priority_item(queue_name, block=block, timeout=timeout)
-
-    def get_queue_length(self, queue_name: str) -> int:
-        """Get the number of items in the queue"""
-        return self.repository.queue_length(queue_name)
-
-    def get_available_queues(self) -> List[str]:
-        """Get a list of all available queue names"""
-        return self.repository.get_queue_names()
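For reference, the removed service was a thin wrapper over the repository interface shown further below. Typical usage before 0.0.16 would have looked roughly like this (reconstructed from the deleted code; some_hai_message is a placeholder for a HaiMessage instance):

from h_ai.application.priority_queue_service import PriorityQueueService
from h_ai.infrastructure.priorityqueue.in_memory_priority_queue_repository import InMemoryPriorityQueueRepository

service = PriorityQueueService(InMemoryPriorityQueueRepository())
item = service.add_item("inbox", content=some_hai_message, priority=5)
next_item = service.get_next_item("inbox", block=False)  # None if the queue is empty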
h_ai/application/web_docs_service.py
DELETED
@@ -1,36 +0,0 @@
-from typing import List, Optional
-
-from ..domain.web_docs.doc_link_scorer_service import DocLinkScorerService
-from ..domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
-from ..domain.web_docs.gitbook.text_page import TextPage
-from ..domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService
-from ..domain.web_docs.web_docs_link_detector import WebDocsLinkDetector
-from ..domain.web_docs.web_link import WebLink
-from ..infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher
-
-
-class WebDocsService:
-
-    def __init__(self):
-        self.pattern_repo = DocumentationPatternRepository()
-        self.scorer = DocLinkScorerService(self.pattern_repo)
-        self.headless_browser = PlayWrightWebContentFetcher()
-        self.web_link_detector = WebDocsLinkDetector(
-            self.scorer,
-            self.headless_browser)
-
-
-    async def discover_documentation(self, website_url: str) -> Optional[List[TextPage]]:
-        detected_links = await self.detect_documentation_links(website_url)
-        for link in detected_links:
-            print(f"Fetching {link.url}")
-            gitbook_fetcher = GitbookWebFetcherService(link.url)
-            gitbook_pages = await gitbook_fetcher.fetch()
-            return gitbook_pages
-
-    async def detect_documentation_links(self, website_url: str) -> List[WebLink]:
-        """
-        Function to detect documentation links from a website
-        Returns a list of potential documentation root URLs
-        """
-        return await self.web_link_detector.find_docs_links(website_url)
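One detail worth flagging in the deleted discover_documentation above: the return statement sits inside the for loop, so at most the first detected link was ever fetched. A version that visits every link would have had to accumulate results, roughly like this (a hypothetical rewrite of the method, never part of any release):

    async def discover_documentation(self, website_url: str) -> Optional[List[TextPage]]:
        # Hypothetical variant: collect pages from every detected link
        # instead of returning after the first iteration.
        pages: List[TextPage] = []
        for link in await self.detect_documentation_links(website_url):
            gitbook_fetcher = GitbookWebFetcherService(link.url)
            pages.extend(await gitbook_fetcher.fetch() or [])
        return pages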
h_ai/domain/priorityqueue/__init__.py
DELETED
File without changes
h_ai/domain/priorityqueue/priority_queue_repository.py
DELETED
@@ -1,34 +0,0 @@
-
-from abc import ABC, abstractmethod
-from typing import List, Optional
-
-from .queue_item import QueueItem
-
-
-class PriorityQueueRepository(ABC):
-    """Repository interface for priority queue operations"""
-
-    @abstractmethod
-    def add_item(self, queue_name: str, item: QueueItem) -> None:
-        """Add an item to the specified queue"""
-        pass
-
-    @abstractmethod
-    def get_highest_priority_item(self, queue_name: str, block: bool = False, timeout: Optional[float] = None) -> Optional[QueueItem]:
-        """Get and remove the highest priority item from the queue"""
-        pass
-
-    # @abstractmethod
-    # def get_items(self, queue_name: str, limit: int = 10) -> List[QueueItem]:
-    #     """Get multiple items from the queue in priority order without removing them"""
-    #     pass
-
-    @abstractmethod
-    def queue_length(self, queue_name: str) -> int:
-        """Get the number of items in the queue"""
-        pass
-
-    @abstractmethod
-    def get_queue_names(self) -> List[str]:
-        """Get a list of all available queue names"""
-        pass
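The deleted InMemoryPriorityQueueRepository (98 lines, not shown in this diff) was the package's only implementation of this interface. Because QueueItem defines __lt__ (next hunk), a minimal implementation can lean on heapq directly; here is an illustrative sketch using the pre-0.0.16 module paths, ignoring the block/timeout semantics and thread safety:

import heapq
from typing import Dict, List, Optional

from h_ai.domain.priorityqueue.priority_queue_repository import PriorityQueueRepository
from h_ai.domain.priorityqueue.queue_item import QueueItem

class SimplePriorityQueueRepository(PriorityQueueRepository):
    """Toy in-memory implementation; no blocking, no thread safety."""

    def __init__(self) -> None:
        self._queues: Dict[str, List[QueueItem]] = {}

    def add_item(self, queue_name: str, item: QueueItem) -> None:
        heapq.heappush(self._queues.setdefault(queue_name, []), item)

    def get_highest_priority_item(self, queue_name: str, block: bool = False,
                                  timeout: Optional[float] = None) -> Optional[QueueItem]:
        queue = self._queues.get(queue_name)
        return heapq.heappop(queue) if queue else None

    def queue_length(self, queue_name: str) -> int:
        return len(self._queues.get(queue_name, []))

    def get_queue_names(self) -> List[str]:
        return list(self._queues.keys())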
h_ai/domain/priorityqueue/queue_item.py
DELETED
@@ -1,43 +0,0 @@
-
-import uuid
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Optional
-
-from h_message_bus import HaiMessage
-
-
-@dataclass(frozen=True)
-class QueueItem:
-    """Value object representing an item in the priority queue"""
-    id: str
-    content: HaiMessage
-    priority: int
-    created_at: datetime
-    metadata: Optional[dict] = None
-
-    @classmethod
-    def create(cls, content: HaiMessage, priority: int, metadata: Optional[dict] = None) -> "QueueItem":
-        """Factory method to create a new QueueItem"""
-        return cls(
-            id=str(uuid.uuid4()),
-            content=content,
-            priority=priority,
-            created_at=datetime.utcnow(),
-            metadata=metadata
-        )
-
-    def __lt__(self, other):
-        """Comparison method for priority queue ordering
-        - Primary sort by priority: Higher number = higher priority
-        - Secondary sort by timestamp: Earlier timestamp = higher priority (FIFO)
-        """
-        if not isinstance(other, QueueItem):
-            return NotImplemented
-
-        # First, compare by priority (higher priority value comes first)
-        if self.priority != other.priority:
-            return self.priority > other.priority
-
-        # If priorities are equal, compare by timestamp (older timestamp comes first)
-        return self.created_at < other.created_at
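QueueItem inverts __lt__ so that a larger priority number compares as "smaller", which lets it drop straight into Python's min-heap heapq: heappop returns the highest-priority item, with creation time breaking ties FIFO. A quick demonstration (strings stand in for HaiMessage payloads, distinct creation timestamps assumed, pre-0.0.16 import path):

import heapq

from h_ai.domain.priorityqueue.queue_item import QueueItem

low = QueueItem.create("low", priority=1)
high = QueueItem.create("high", priority=10)
later = QueueItem.create("also high", priority=10)  # same priority, created later

heap = []
for item in (low, high, later):
    heapq.heappush(heap, item)

while heap:
    print(heapq.heappop(heap).content)  # high, also high, low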
h_ai/domain/web_docs/__init__.py
DELETED
File without changes
h_ai/domain/web_docs/doc_link_scorer_service.py
DELETED
@@ -1,45 +0,0 @@
-import re
-from urllib.parse import urlparse
-
-from ...domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
-
-
-class DocLinkScorerService:
-    """Service for scoring potential documentation links"""
-
-    def __init__(self, pattern_repo: DocumentationPatternRepository):
-        self.pattern_repo = pattern_repo
-
-    def score(self, full_url: str, link_text: str) -> float:
-        """
-        Score a link based on how likely it is to be documentation
-        Returns a value between 0.0 and 1.0
-        """
-        score = 0.0
-        max_score = 3.0  # Maximum possible score
-
-        # Parse the URL
-        parsed_url = urlparse(full_url)
-        domain = parsed_url.netloc
-        path = parsed_url.path
-
-        # Check domain patterns
-        for doc_domain in self.pattern_repo.doc_domains:
-            if doc_domain in domain:
-                score += 1.0
-                break
-
-        # Check path patterns
-        for path_pattern in self.pattern_repo.doc_path_patterns:
-            if re.search(path_pattern, path):
-                score += 1.0
-                break
-
-        # Check link text patterns
-        for text_pattern in self.pattern_repo.doc_text_patterns:
-            if re.search(text_pattern, link_text):
-                score += 1.0
-                break
-
-        # Normalize score to 0.0-1.0 range
-        return min(score / max_score, 1.0)
h_ai/domain/web_docs/documentation_pattern_repository.py
DELETED
@@ -1,44 +0,0 @@
-class DocumentationPatternRepository:
-    """Repository of patterns that indicate documentation links"""
-
-    def __init__(self):
-        # Domain patterns that commonly host documentation
-        self.doc_domains = [
-            "gitbook.io",
-            "readthedocs.io",
-            "docs.github.com",
-            "developer.mozilla.org",
-            "confluence.",
-            "zendesk.com",
-            "help.",
-            "support.",
-            "wiki.",
-        ]
-
-        # URL path patterns that commonly indicate documentation
-        self.doc_path_patterns = [
-            r"/docs/",
-            r"/documentation/",
-            r"/guide/",
-            r"/manual/",
-            r"/help/",
-            r"/knowledge/",
-            r"/support/",
-            r"/api/",
-            r"/reference/",
-            r"/wiki/",
-        ]
-
-        # Link text patterns that suggest documentation
-        self.doc_text_patterns = [
-            r"(?i)documentation",
-            r"(?i)docs",
-            r"(?i)developer guide",
-            r"(?i)user guide",
-            r"(?i)knowledge base",
-            r"(?i)help center",
-            r"(?i)manual",
-            r"(?i)api reference",
-            r"(?i)getting started",
-            r"(?i)learn more",
-        ]
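Taken together with DocLinkScorerService above, these lists make the scoring concrete: a link such as https://docs.example.gitbook.io/guide/ with anchor text "Documentation" (a hypothetical URL) hits one domain pattern (gitbook.io), one path pattern (/guide/), and one text pattern ((?i)documentation), so it scores 3.0 / 3.0 = 1.0, while a link matching none of the lists scores 0.0:

from h_ai.domain.web_docs.doc_link_scorer_service import DocLinkScorerService  # pre-0.0.16 paths
from h_ai.domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository

scorer = DocLinkScorerService(DocumentationPatternRepository())

# All three pattern groups hit: domain, path, and link text.
print(scorer.score("https://docs.example.gitbook.io/guide/", "Documentation"))  # 1.0

# Nothing hits: ordinary blog link.
print(scorer.score("https://example.com/blog/post-1", "Read our blog"))  # 0.0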
h_ai/domain/web_docs/ecosystem_link_scorer_service.py
DELETED
@@ -1,83 +0,0 @@
-import re
-from urllib.parse import urlparse
-
-from ...domain.web_docs.ecosystem_pattern_repository import EcosystemPatternRepository
-
-
-class EcosystemLinkScorerService:
-    """Service for scoring potential ecosystem-related links and content"""
-
-    def __init__(self, pattern_repo: EcosystemPatternRepository):
-        self.pattern_repo = pattern_repo
-
-    def score(self, full_url: str, link_text: str) -> float:
-        """
-        Score a link based on how likely it is to be ecosystem-related
-        Returns a value between 0.0 and 1.0
-        """
-        score = 0.0
-        max_score = 3.0  # Maximum possible score
-
-        # Parse the URL
-        parsed_url = urlparse(full_url)
-        domain = parsed_url.netloc
-        path = parsed_url.path
-
-        # Check domain patterns
-        for eco_domain in self.pattern_repo.ecosystem_domains:
-            if eco_domain in domain:
-                score += 1.0
-                break
-
-        # Check path patterns
-        for path_pattern in self.pattern_repo.ecosystem_path_patterns:
-            if re.search(path_pattern, path):
-                score += 1.0
-                break
-
-        # Check link text patterns
-        for text_pattern in self.pattern_repo.ecosystem_text_patterns:
-            if re.search(text_pattern, link_text):
-                score += 1.0
-                break
-
-        # Normalize score to 0.0-1.0 range
-        return min(score / max_score, 1.0)
-
-    def score_page(self, page_url: str, page_title: str, page_content: str) -> float:
-        """
-        Score an entire page based on how likely it is to contain ecosystem information
-        Returns a value between 0.0 and 1.0
-
-        Args:
-            page_url: The URL of the page
-            page_title: The title of the page
-            page_content: The full text content of the page
-        """
-        # Start with the URL and title scoring
-        url_score = self.score(page_url, page_title)
-
-        # Content-based scoring
-        content_score = 0.0
-        max_content_score = 2.0
-
-        # Check content patterns
-        content_matches = 0
-        for content_pattern in self.pattern_repo.ecosystem_content_patterns:
-            if re.search(content_pattern, page_content):
-                content_matches += 1
-
-        # Score based on number of content matches
-        if content_matches >= 3:
-            content_score += 1.0
-        elif content_matches > 0:
-            content_score += 0.5
-
-        # Check for header patterns
-        for header_pattern in self.pattern_repo.ecosystem_header_patterns:
-            if re.search(header_pattern, page_content):
-                content_score += 1.0
-                break
-
-        # Combined score with higher weight on content
-        return min((url_score + (content_score / max_content_score) * 2) / 3, 1.0)
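The combined formula on the last line weights page content twice as heavily as the URL/title signal. Worked through: a page with url_score = 1.0, at least three content-pattern matches (content_score += 1.0) and a header match (content_score += 1.0) reaches min((1.0 + (2.0 / 2.0) * 2) / 3, 1.0) = 1.0, whereas a perfect URL with no content signals tops out at (1.0 + 0) / 3 ≈ 0.33:

url_score, content_score, max_content_score = 1.0, 2.0, 2.0
print(min((url_score + (content_score / max_content_score) * 2) / 3, 1.0))  # 1.0

url_score, content_score = 1.0, 0.0
print(min((url_score + (content_score / max_content_score) * 2) / 3, 1.0))  # 0.333...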
h_ai/domain/web_docs/ecosystem_pattern_repository.py
DELETED
@@ -1,182 +0,0 @@
-class EcosystemPatternRepository:
-    """Repository of patterns that identify ecosystem relationships, builders, and collaboration"""
-
-    def __init__(self):
-        # Domains commonly associated with ecosystem and project showcases
-        self.ecosystem_domains = [
-            "showcase.",
-            "ecosystem.",
-            "community.",
-            "gallery.",
-            "partners.",
-            "developers.",
-            "marketplace.",
-            "expo.",
-            "apps.",
-            "extensions.",
-            "plugins.",
-        ]
-
-        # URL path patterns indicating ecosystem/builder content
-        self.ecosystem_path_patterns = [
-            r"/ecosystem/",
-            r"/showcase/",
-            r"/community/",
-            r"/built-with/",
-            r"/case-studies/",
-            r"/customers/",
-            r"/partners/",
-            r"/users/",
-            r"/success-stories/",
-            r"/integrations/",
-            r"/extensions/",
-            r"/marketplace/",
-            r"/plugins/",
-            r"/addons/",
-            r"/gallery/",
-            r"/examples/",
-            r"/projects/",
-            r"/contributors/",
-            r"/whos-using/",
-        ]
-
-        # Link text patterns suggesting ecosystem content
-        self.ecosystem_text_patterns = [
-            r"(?i)ecosystem",
-            r"(?i)showcase",
-            r"(?i)built with",
-            r"(?i)powered by",
-            r"(?i)case stud(y|ies)",
-            r"(?i)success stor(y|ies)",
-            r"(?i)who('s| is) using",
-            r"(?i)our users",
-            r"(?i)our customers",
-            r"(?i)integrations?",
-            r"(?i)extensions?",
-            r"(?i)plugins?",
-            r"(?i)addons?",
-            r"(?i)community projects",
-            r"(?i)community contributions",
-            r"(?i)user contributions",
-            r"(?i)featured projects",
-            r"(?i)gallery",
-        ]
-
-        # Header/title patterns suggesting ecosystem sections
-        self.ecosystem_header_patterns = [
-            r"(?i)ecosystem",
-            r"(?i)who('s| is) using",
-            r"(?i)built (on|with)",
-            r"(?i)powered by",
-            r"(?i)trusted by",
-            r"(?i)customer(s| success)",
-            r"(?i)case stud(y|ies)",
-            r"(?i)success stor(y|ies)",
-            r"(?i)showcase",
-            r"(?i)featured (users|customers|projects)",
-            r"(?i)community (projects|showcase)",
-            r"(?i)partner(s| program)",
-            r"(?i)(our|notable) users",
-            r"(?i)companies using",
-            r"(?i)in production",
-            r"(?i)contributor(s| showcase)",
-            r"(?i)extension (gallery|showcase)",
-            r"(?i)plugin (directory|marketplace)",
-            r"(?i)apps? (built|marketplace|gallery)",
-        ]
-
-        # Content phrases that suggest ecosystem descriptions
-        self.ecosystem_content_patterns = [
-            r"(?i)built (on|with) (our|this)",
-            r"(?i)(companies|organizations|projects) (using|powered by)",
-            r"(?i)(is|are) using (our|this)",
-            r"(?i)powered by (our|this)",
-            r"(?i)extend(s|ing)? (the|our) (platform|ecosystem)",
-            r"(?i)integrated with",
-            r"(?i)build(s|ing)? (on top of|with)",
-            r"(?i)leverage(s|ing)? (our|this)",
-            r"(?i)extend(s|ing)? (the|our) (functionality|capabilities)",
-            r"(?i)based on (our|this)",
-            r"(?i)implemented (with|using)",
-            r"(?i)developed (with|using)",
-            r"(?i)(join|be part of) (our|the) ecosystem",
-        ]
-
-        # Builder and contribution-specific patterns
-        self.builder_patterns = [
-            r"(?i)how to (build|contribute)",
-            r"(?i)build(ing)? (with|on)",
-            r"(?i)develop(ing)? (with|on)",
-            r"(?i)contribute to",
-            r"(?i)contributor guide",
-            r"(?i)developer program",
-            r"(?i)join (our|the) (ecosystem|community)",
-            r"(?i)become a (contributor|partner)",
-            r"(?i)extend (our|the) (platform|ecosystem)",
-            r"(?i)create (your own|an?) (plugin|extension|integration)",
-            r"(?i)developer (resources|portal)",
-            r"(?i)sdk",
-            r"(?i)api (access|integration)",
-            r"(?i)partner (program|portal)",
-        ]
-
-        # Visual cues that often indicate ecosystem showcases
-        self.visual_indicators = [
-            r"logo grid",
-            r"logo carousel",
-            r"client logos",
-            r"partner logos",
-            r"customer logos",
-            r"company logos",
-            r"card gallery",
-            r"project cards",
-            r"showcase gallery",
-            r"case study cards",
-            r"testimonials",
-            r"user testimonials",
-        ]
-
-        # Collaboration-specific patterns
-        self.collaboration_patterns = [
-            r"(?i)how to collaborate",
-            r"(?i)collaboration (guide|opportunities)",
-            r"(?i)working together",
-            r"(?i)partner(ship|ing) (opportunities|program)",
-            r"(?i)join (our|the) (community|ecosystem)",
-            r"(?i)community (contribution|participation)",
-            r"(?i)open (source|collaboration)",
-            r"(?i)contribute (code|documentation|resources)",
-            r"(?i)become a (partner|contributor|maintainer)",
-            r"(?i)collaboration (framework|model)",
-            r"(?i)(business|technical) partnership",
-            r"(?i)developer relations",
-            r"(?i)community (engagement|involvement)",
-        ]
-
-        # Key meta tags that might indicate ecosystem content
-        self.meta_tag_patterns = [
-            r"(?i)ecosystem",
-            r"(?i)showcase",
-            r"(?i)community",
-            r"(?i)partner program",
-            r"(?i)integration",
-            r"(?i)extension",
-            r"(?i)plugin",
-            r"(?i)marketplace",
-            r"(?i)collaboration",
-            r"(?i)use cases",
-            r"(?i)case studies",
-            r"(?i)success stories",
-        ]
-
-        # Schema.org types that often indicate ecosystem relationships
-        self.schema_types = [
-            "Product",
-            "SoftwareApplication",
-            "Organization",
-            "BusinessPartner",
-            "ProgramMembership",
-            "CreativeWork",
-            "SoftwareSourceCode",
-            "WebApplication",
-        ]
h_ai/domain/web_docs/gitbook/__init__.py
DELETED
File without changes
h_ai/domain/web_docs/gitbook/text_chapter.py
DELETED
@@ -1,18 +0,0 @@
-from dataclasses import dataclass, field
-from typing import List
-
-
-@dataclass
-class TextChapter:
-    """Represents a chapter/section in a page defined by a heading."""
-    heading: str
-    level: int  # h1=1, h2=2, etc.
-    paragraphs: List[str] = field(default_factory=list)
-
-    def to_dict(self):
-        """Convert this TextChapter instance to a serializable dictionary"""
-        return {
-            'heading': self.heading,
-            'level': self.level,
-            'paragraphs': self.paragraphs
-        }
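For context, TextChapter was the unit the deleted HaiService iterated over when saving paragraphs, and to_dict keeps it JSON-serializable. A small illustrative use (invented values, pre-0.0.16 import path):

import json

from h_ai.domain.web_docs.gitbook.text_chapter import TextChapter

chapter = TextChapter(heading="Getting started", level=2,
                      paragraphs=["Install the package.", "Run the agent."])
print(json.dumps(chapter.to_dict()))
# {"heading": "Getting started", "level": 2, "paragraphs": ["Install the package.", "Run the agent."]}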