h-ai-brain 0.0.15__py3-none-any.whl → 0.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. h_ai/__init__.py +1 -3
  2. h_ai/application/hai_service.py +2 -47
  3. {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/METADATA +2 -8
  4. h_ai_brain-0.0.17.dist-info/RECORD +31 -0
  5. {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/WHEEL +1 -1
  6. h_ai/application/priority_queue_service.py +0 -30
  7. h_ai/application/web_docs_service.py +0 -36
  8. h_ai/domain/priorityqueue/__init__.py +0 -0
  9. h_ai/domain/priorityqueue/priority_queue_repository.py +0 -34
  10. h_ai/domain/priorityqueue/queue_item.py +0 -43
  11. h_ai/domain/web_docs/__init__.py +0 -0
  12. h_ai/domain/web_docs/doc_link_scorer_service.py +0 -45
  13. h_ai/domain/web_docs/documentation_pattern_repository.py +0 -44
  14. h_ai/domain/web_docs/ecosystem_link_scorer_service.py +0 -83
  15. h_ai/domain/web_docs/ecosystem_pattern_repository.py +0 -182
  16. h_ai/domain/web_docs/gitbook/__init__.py +0 -0
  17. h_ai/domain/web_docs/gitbook/text_chapter.py +0 -18
  18. h_ai/domain/web_docs/gitbook/text_page.py +0 -46
  19. h_ai/domain/web_docs/gitbook_web_fetcher_service.py +0 -171
  20. h_ai/domain/web_docs/web_docs_link_detector.py +0 -28
  21. h_ai/domain/web_docs/web_link.py +0 -11
  22. h_ai/domain/webpages/__init__.py +0 -0
  23. h_ai/domain/webpages/web_fetcher_repository.py +0 -10
  24. h_ai/domain/webpages/web_text_fetcher_repository.py +0 -12
  25. h_ai/infrastructure/beautifulsoup/__init__.py +0 -0
  26. h_ai/infrastructure/beautifulsoup/soup_processor.py +0 -240
  27. h_ai/infrastructure/playwright/__init__.py +0 -0
  28. h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +0 -64
  29. h_ai/infrastructure/priorityqueue/__init__.py +0 -0
  30. h_ai/infrastructure/priorityqueue/in_memory_priority_queue_repository.py +0 -98
  31. h_ai_brain-0.0.15.dist-info/RECORD +0 -56
  32. {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/licenses/LICENSE +0 -0
  33. {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/licenses/NOTICE.txt +0 -0
  34. {h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/top_level.txt +0 -0
h_ai/__init__.py CHANGED
@@ -1,5 +1,3 @@
- __all__ = ['HaiService', 'WebDocsService']
-
- from .application.web_docs_service import WebDocsService
+ __all__ = ['HaiService']
 
  from .application.hai_service import HaiService
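The root package now re-exports only `HaiService`. A minimal sketch of what this means for downstream imports (the 0.0.15 form is shown only for contrast):

```python
# Works on both 0.0.15 and 0.0.17:
from h_ai import HaiService

# Works only on 0.0.15; raises ImportError on 0.0.17,
# since web_docs_service.py was removed from the wheel:
# from h_ai import WebDocsService
```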
h_ai/application/hai_service.py CHANGED
@@ -1,57 +1,12 @@
  from h_message_bus import NatsPublisherAdapter
- from h_message_bus.domain.request_messages.twitter_get_user_request_message import TwitterGetUserRequestMessage
- from h_message_bus.domain.request_messages.twitter_get_user_response_message import TwitterGetUserResponseMessage
  from h_message_bus.domain.request_messages.vector_read_metadata_request_message import VectorReadMetaDataRequestMessage
- from h_message_bus.domain.request_messages.vector_read_metadata_response_message import \
-     VectorReadMetaDataResponseMessage
- from h_message_bus.domain.request_messages.vector_save_request_message import VectorSaveRequestMessage
-
- from .priority_queue_service import PriorityQueueService
- from ..application.web_docs_service import WebDocsService
- from ..infrastructure.priorityqueue.in_memory_priority_queue_repository import InMemoryPriorityQueueRepository
-
+ from h_message_bus.domain.request_messages.vector_read_metadata_response_message import VectorReadMetaDataResponseMessage
 
  class HaiService:
      def __init__(self, nats_publisher_adapter: NatsPublisherAdapter):
          self.nats_publisher_adapter = nats_publisher_adapter
-         self.web_docs_service = WebDocsService()
-         queue = InMemoryPriorityQueueRepository()
-         self.queue_service = PriorityQueueService(queue)
-
-     async def detect_and_store_documentation(self, twitter_screen_name: str):
-         req_message = TwitterGetUserRequestMessage.create_message(twitter_screen_name)
-         response = await self.nats_publisher_adapter.request(req_message)
-         twitter_user = TwitterGetUserResponseMessage.from_hai_message(response)
-
-         if twitter_user.url is not None:
-             print(f"Documentation found for {twitter_user.screen_name}: {twitter_user.url}")
-             docs = await self.web_docs_service.discover_documentation(twitter_user.url)
-
-             for doc in docs:
-                 collection_name = f"{twitter_user.screen_name}_docs"
-                 chapters = doc.chapters
-                 for chapter in chapters:
-                     i = 0
-                     for text in chapter.paragraphs:
-                         document_id = f"{doc.title}_{chapter.heading}_{i}"
-
-                         req_metadata = {
-                             "source": doc.url
-                         }
-                         i = i + 1
-
-                         request = VectorSaveRequestMessage.create_message(
-                             collection_name=collection_name,
-                             document_id=document_id,
-                             content=text,
-                             metadata=req_metadata)
-
-                         await self.nats_publisher_adapter.publish(request)
-
-         else:
-             print(f"No documentation found for {twitter_user.screen_name}")
 
-     async def load_current_knowledge_base_metadata(self) -> VectorReadMetaDataResponseMessage:
+     async def get_knowledgebase_metadata(self) -> VectorReadMetaDataResponseMessage:
          message = VectorReadMetaDataRequestMessage.create_message()
          response = await self.nats_publisher_adapter.request(message)
          metadata_result = VectorReadMetaDataResponseMessage.from_hai_message(response)
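After this change `HaiService` is a thin request/response wrapper around the message bus. A minimal usage sketch of the 0.0.17 surface, assuming an already-connected `NatsPublisherAdapter` (adapter construction belongs to h_message_bus and is not shown in this diff):

```python
from h_ai import HaiService
from h_message_bus import NatsPublisherAdapter


async def print_kb_metadata(adapter: NatsPublisherAdapter) -> None:
    service = HaiService(adapter)
    # Sends a VectorReadMetaDataRequestMessage over the bus and decodes the reply.
    metadata = await service.get_knowledgebase_metadata()
    print(metadata)
```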
{h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: h_ai_brain
- Version: 0.0.15
+ Version: 0.0.17
  Summary: AI Research agent API
  Author-email: shoebill <shoebill.hai@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -10,13 +10,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: NOTICE.txt
- Requires-Dist: requests~=2.32.3
- Requires-Dist: h_message_bus~=0.0.19
- Requires-Dist: aiohttp~=3.11.16
- Requires-Dist: bs4~=0.0.2
- Requires-Dist: beautifulsoup4~=4.13.3
- Requires-Dist: tldextract~=5.2.0
- Requires-Dist: playwright~=1.51.0
+ Requires-Dist: h_message_bus~=0.0.21
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
  Dynamic: license-file
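Dropping the scraping stack (requests, aiohttp, bs4/beautifulsoup4, tldextract, playwright) leaves `h_message_bus` as the only runtime dependency. All pins use the compatible-release operator; a quick sketch with the `packaging` library (not a dependency of this project) of what `~=0.0.21` accepts:

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=0.0.21")  # equivalent to >=0.0.21, ==0.0.*
print("0.0.21" in spec)  # True
print("0.0.25" in spec)  # True  (later patch releases allowed)
print("0.1.0" in spec)   # False (next minor series excluded)
```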
h_ai_brain-0.0.17.dist-info/RECORD ADDED
@@ -0,0 +1,31 @@
+ h_ai/__init__.py,sha256=63uVFHPxXmLrZVo2ZPixL2cU4jwf3XTAuwIVGHGkqJI,75
+ h_ai/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/application/hai_service.py,sha256=UwzM-w2zpSNiCsoba-uClUBRFhMq74uuznkLpt08f5g,794
+ h_ai/application/system_prompts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/application/system_prompts/roles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/domain/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/domain/reasoning/llm_chat_repository.py,sha256=rY2izDyaDnoyyrCRS1qc9erHB98vARj4Mp-SnPwNhyY,211
+ h_ai/domain/reasoning/llm_generate_respository.py,sha256=DPiV6ldCE8YhDdVb5rj98MBudKalDQHV3CZ2ADTm_f8,178
+ h_ai/domain/reasoning/llm_tool_repository.py,sha256=nFwqtrJ0Gw8XUFX0uuO7-UejtgoqNuGeT51qZPQtxas,401
+ h_ai/domain/reasoning/text_analysis.py,sha256=rmCUHWzJ3muFBorVXx7HcU2Sw-UfXFOuAVXRAPkqS8E,5183
+ h_ai/domain/reasoning/tool_message.py,sha256=jpbfbJXj6oqZyB3lDxGOUyFB4faHtXAaEOVBHgTgSnk,67
+ h_ai/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/infrastructure/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/infrastructure/llm/data_handler.py,sha256=M2h1azkjBP9GyaffTggKQZb2CQmOvAk2yo9NrsFMYAo,987
+ h_ai/infrastructure/llm/llm_response_cleaner.py,sha256=pp1K7I77hagrC1r6Ib61-iSNQnU6wlM54bRmOUa7eFk,859
+ h_ai/infrastructure/llm/prompt_helper.py,sha256=QjxPbNW7hu2wBIi9GLJ7r00ELytT2Wr1JKDAA1jB2U4,238
+ h_ai/infrastructure/llm/prompt_loader.py,sha256=8h6QGq_h-s-UzC9fIxpeMCrhjHd7347NaAq7uLf-ook,455
+ h_ai/infrastructure/llm/ollama/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/infrastructure/llm/ollama/ollama_chat_repository.py,sha256=GALea7UWLtKyt767Frtl3uv8rvy42HrOKMIQGpqq-H0,2108
+ h_ai/infrastructure/llm/ollama/ollama_generate_repository.py,sha256=jorJrsIR3WPkvls7NE3BXllEtiDePCgMX5DFADz2s8E,1712
+ h_ai/infrastructure/llm/ollama/ollama_tool_repository.py,sha256=7UZ-qsgXQUcJFx1qY7SVI7p3FhIy0Drdqs7jZIp42Ag,4683
+ h_ai/infrastructure/llm/ollama/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py,sha256=ZIz4PQ3869vI3xAYYufPrxXpacajRDtOI8RDl5Dm9RQ,305
+ h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py,sha256=GZ_ddpbWa8iy6NZq50vokUFVZBiX0WNa81z9-r9RzTY,392
+ h_ai_brain-0.0.17.dist-info/licenses/LICENSE,sha256=SbvpEU5JIU3yzMMkyzrI0dGqHDoJR_lMKGdl6GZHsy4,11558
+ h_ai_brain-0.0.17.dist-info/licenses/NOTICE.txt,sha256=vxeIKUiGqAePLvDW4AVm3Xh-3BcsvMtCMn1tbsr9zsE,668
+ h_ai_brain-0.0.17.dist-info/METADATA,sha256=Dx3oqE6H3-RC4LkAPuZGDjMzttB6pSadgiXv4dNdPLk,536
+ h_ai_brain-0.0.17.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ h_ai_brain-0.0.17.dist-info/top_level.txt,sha256=3MChDBWvDJV4cEHuZhzeODxQ4ewtw-arOuyaDOc6sIo,5
+ h_ai_brain-0.0.17.dist-info/RECORD,,
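Each RECORD row is `path,sha256=<digest>,<size>`, where the digest is an unpadded urlsafe-base64 SHA-256 of the file (PEP 376/PEP 427). A short sketch of how such an entry can be recomputed to verify an installed file:

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Rebuild a wheel RECORD line for one file (a sketch, not the packaging tooling itself)."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"


# e.g. record_entry("h_ai/__init__.py") should reproduce the first line above
```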
{h_ai_brain-0.0.15.dist-info → h_ai_brain-0.0.17.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (79.0.0)
+ Generator: setuptools (79.0.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
h_ai/application/priority_queue_service.py DELETED
@@ -1,30 +0,0 @@
-
- from typing import Any, List, Optional
-
- from ..domain.priorityqueue.priority_queue_repository import PriorityQueueRepository
- from ..domain.priorityqueue.queue_item import QueueItem
-
-
- class PriorityQueueService:
-     """Application service to manage priority queue operations"""
-
-     def __init__(self, repository: PriorityQueueRepository):
-         self.repository = repository
-
-     def add_item(self, queue_name: str, content: Any, priority: int, metadata: Optional[dict] = None) -> QueueItem:
-         """Add an item to the specified queue"""
-         item = QueueItem.create(content, priority, metadata)
-         self.repository.add_item(queue_name, item)
-         return item
-
-     def get_next_item(self, queue_name: str, block: bool = True, timeout: float = 30.0) -> Optional[QueueItem]:
-         """Get and remove the highest priority item from the queue"""
-         return self.repository.get_highest_priority_item(queue_name, block=block, timeout=timeout)
-
-     def get_queue_length(self, queue_name: str) -> int:
-         """Get the number of items in the queue"""
-         return self.repository.queue_length(queue_name)
-
-     def get_available_queues(self) -> List[str]:
-         """Get a list of all available queue names"""
-         return self.repository.get_queue_names()
h_ai/application/web_docs_service.py DELETED
@@ -1,36 +0,0 @@
- from typing import List, Optional
-
- from ..domain.web_docs.doc_link_scorer_service import DocLinkScorerService
- from ..domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
- from ..domain.web_docs.gitbook.text_page import TextPage
- from ..domain.web_docs.gitbook_web_fetcher_service import GitbookWebFetcherService
- from ..domain.web_docs.web_docs_link_detector import WebDocsLinkDetector
- from ..domain.web_docs.web_link import WebLink
- from ..infrastructure.playwright.playwright_web_content_fetcher import PlayWrightWebContentFetcher
-
-
- class WebDocsService:
-
-     def __init__(self):
-         self.pattern_repo = DocumentationPatternRepository()
-         self.scorer = DocLinkScorerService(self.pattern_repo)
-         self.headless_browser = PlayWrightWebContentFetcher()
-         self.web_link_detector = WebDocsLinkDetector(
-             self.scorer,
-             self.headless_browser)
-
-
-     async def discover_documentation(self, website_url: str) -> Optional[List[TextPage]]:
-         detected_links = await self.detect_documentation_links(website_url)
-         for link in detected_links:
-             print(f"Fetching {link.url}")
-             gitbook_fetcher = GitbookWebFetcherService(link.url)
-             gitbook_pages = await gitbook_fetcher.fetch()
-             return gitbook_pages
-
-     async def detect_documentation_links(self, website_url: str) -> List[WebLink]:
-         """
-         Function to detect documentation links from a website
-         Returns a list of potential documentation root URLs
-         """
-         return await self.web_link_detector.find_docs_links(website_url)
h_ai/domain/priorityqueue/__init__.py DELETED
File without changes
h_ai/domain/priorityqueue/priority_queue_repository.py DELETED
@@ -1,34 +0,0 @@
-
- from abc import ABC, abstractmethod
- from typing import List, Optional
-
- from .queue_item import QueueItem
-
-
- class PriorityQueueRepository(ABC):
-     """Repository interface for priority queue operations"""
-
-     @abstractmethod
-     def add_item(self, queue_name: str, item: QueueItem) -> None:
-         """Add an item to the specified queue"""
-         pass
-
-     @abstractmethod
-     def get_highest_priority_item(self, queue_name: str, block: bool = False, timeout: Optional[float] = None) -> Optional[QueueItem]:
-         """Get and remove the highest priority item from the queue"""
-         pass
-
-     # @abstractmethod
-     # def get_items(self, queue_name: str, limit: int = 10) -> List[QueueItem]:
-     #     """Get multiple items from the queue in priority order without removing them"""
-     #     pass
-
-     @abstractmethod
-     def queue_length(self, queue_name: str) -> int:
-         """Get the number of items in the queue"""
-         pass
-
-     @abstractmethod
-     def get_queue_names(self) -> List[str]:
-         """Get a list of all available queue names"""
-         pass
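The wheel's actual implementation of this interface (`in_memory_priority_queue_repository.py`, also deleted, 98 lines) is not shown in this diff. As a purely hypothetical illustration of the contract, a minimal in-memory version could look like the sketch below, relying on `QueueItem.__lt__` (next hunk) for heap ordering:

```python
import heapq
import threading
from typing import Dict, List, Optional


class SketchInMemoryQueue(PriorityQueueRepository):
    """Hypothetical minimal implementation; not the package's removed class."""

    def __init__(self) -> None:
        self._queues: Dict[str, List[QueueItem]] = {}
        self._cond = threading.Condition()

    def add_item(self, queue_name: str, item: QueueItem) -> None:
        with self._cond:
            heapq.heappush(self._queues.setdefault(queue_name, []), item)
            self._cond.notify_all()  # wake any blocked consumers

    def get_highest_priority_item(self, queue_name: str, block: bool = False,
                                  timeout: Optional[float] = None) -> Optional[QueueItem]:
        with self._cond:
            if block:
                self._cond.wait_for(lambda: self._queues.get(queue_name), timeout)
            heap = self._queues.get(queue_name)
            return heapq.heappop(heap) if heap else None

    def queue_length(self, queue_name: str) -> int:
        with self._cond:
            return len(self._queues.get(queue_name, []))

    def get_queue_names(self) -> List[str]:
        with self._cond:
            return list(self._queues)
```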
h_ai/domain/priorityqueue/queue_item.py DELETED
@@ -1,43 +0,0 @@
-
- import uuid
- from dataclasses import dataclass
- from datetime import datetime
- from typing import Optional
-
- from h_message_bus import HaiMessage
-
-
- @dataclass(frozen=True)
- class QueueItem:
-     """Value object representing an item in the priority queue"""
-     id: str
-     content: HaiMessage
-     priority: int
-     created_at: datetime
-     metadata: Optional[dict] = None
-
-     @classmethod
-     def create(cls, content: HaiMessage, priority: int, metadata: Optional[dict] = None) -> "QueueItem":
-         """Factory method to create a new QueueItem"""
-         return cls(
-             id=str(uuid.uuid4()),
-             content=content,
-             priority=priority,
-             created_at=datetime.utcnow(),
-             metadata=metadata
-         )
-
-     def __lt__(self, other):
-         """Comparison method for priority queue ordering
-         - Primary sort by priority: Higher number = higher priority
-         - Secondary sort by timestamp: Earlier timestamp = higher priority (FIFO)
-         """
-         if not isinstance(other, QueueItem):
-             return NotImplemented
-
-         # First, compare by priority (higher priority value comes first)
-         if self.priority != other.priority:
-             return self.priority > other.priority
-
-         # If priorities are equal, compare by timestamp (older timestamp comes first)
-         return self.created_at < other.created_at
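Because `__lt__` reports a higher `priority` value as "smaller", these items slot directly into Python's min-heap `heapq`, which then pops the highest-priority item first, with `created_at` breaking ties FIFO. A sketch (the `HaiMessage` payloads are placeholders):

```python
import heapq

heap = []
heapq.heappush(heap, QueueItem.create(content=low_msg, priority=1))      # low_msg / urgent_msg:
heapq.heappush(heap, QueueItem.create(content=urgent_msg, priority=10))  # placeholder HaiMessages

first = heapq.heappop(heap)
assert first.priority == 10  # the inverted __lt__ makes the min-heap pop the max priority
```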
h_ai/domain/web_docs/__init__.py DELETED
File without changes
h_ai/domain/web_docs/doc_link_scorer_service.py DELETED
@@ -1,45 +0,0 @@
- import re
- from urllib.parse import urlparse
-
- from ...domain.web_docs.documentation_pattern_repository import DocumentationPatternRepository
-
-
- class DocLinkScorerService:
-     """Service for scoring potential documentation links"""
-
-     def __init__(self, pattern_repo: DocumentationPatternRepository):
-         self.pattern_repo = pattern_repo
-
-     def score(self, full_url: str, link_text: str) -> float:
-         """
-         Score a link based on how likely it is to be documentation
-         Returns a value between 0.0 and 1.0
-         """
-         score = 0.0
-         max_score = 3.0  # Maximum possible score
-
-         # Parse the URL
-         parsed_url = urlparse(full_url)
-         domain = parsed_url.netloc
-         path = parsed_url.path
-
-         # Check domain patterns
-         for doc_domain in self.pattern_repo.doc_domains:
-             if doc_domain in domain:
-                 score += 1.0
-                 break
-
-         # Check path patterns
-         for path_pattern in self.pattern_repo.doc_path_patterns:
-             if re.search(path_pattern, path):
-                 score += 1.0
-                 break
-
-         # Check link text patterns
-         for text_pattern in self.pattern_repo.doc_text_patterns:
-             if re.search(text_pattern, link_text):
-                 score += 1.0
-                 break
-
-         # Normalize score to 0.0-1.0 range
-         return min(score / max_score, 1.0)
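Each of the three pattern groups (domain, path, link text) contributes at most 1.0, so the score is effectively the fraction of groups matched. A worked sketch against the patterns in the next hunk (runnable only against 0.0.15, since both classes are removed in 0.0.17):

```python
repo = DocumentationPatternRepository()
scorer = DocLinkScorerService(repo)

# Domain ("gitbook.io"), path ("/docs/") and text ("documentation") all match: 3/3
print(scorer.score("https://example.gitbook.io/docs/intro", "Documentation"))  # 1.0

# Only the link text matches: 1/3
print(scorer.score("https://example.com/blog/post", "docs"))  # ~0.33
```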
h_ai/domain/web_docs/documentation_pattern_repository.py DELETED
@@ -1,44 +0,0 @@
- class DocumentationPatternRepository:
-     """Repository of patterns that indicate documentation links"""
-
-     def __init__(self):
-         # Domain patterns that commonly host documentation
-         self.doc_domains = [
-             "gitbook.io",
-             "readthedocs.io",
-             "docs.github.com",
-             "developer.mozilla.org",
-             "confluence.",
-             "zendesk.com",
-             "help.",
-             "support.",
-             "wiki.",
-         ]
-
-         # URL path patterns that commonly indicate documentation
-         self.doc_path_patterns = [
-             r"/docs/",
-             r"/documentation/",
-             r"/guide/",
-             r"/manual/",
-             r"/help/",
-             r"/knowledge/",
-             r"/support/",
-             r"/api/",
-             r"/reference/",
-             r"/wiki/",
-         ]
-
-         # Link text patterns that suggest documentation
-         self.doc_text_patterns = [
-             r"(?i)documentation",
-             r"(?i)docs",
-             r"(?i)developer guide",
-             r"(?i)user guide",
-             r"(?i)knowledge base",
-             r"(?i)help center",
-             r"(?i)manual",
-             r"(?i)api reference",
-             r"(?i)getting started",
-             r"(?i)learn more",
-         ]
h_ai/domain/web_docs/ecosystem_link_scorer_service.py DELETED
@@ -1,83 +0,0 @@
- import re
- from urllib.parse import urlparse
-
- from ...domain.web_docs.ecosystem_pattern_repository import EcosystemPatternRepository
-
-
- class EcosystemLinkScorerService:
-     """Service for scoring potential ecosystem-related links and content"""
-
-     def __init__(self, pattern_repo: EcosystemPatternRepository):
-         self.pattern_repo = pattern_repo
-
-     def score(self, full_url: str, link_text: str) -> float:
-         """
-         Score a link based on how likely it is to be ecosystem-related
-         Returns a value between 0.0 and 1.0
-         """
-         score = 0.0
-         max_score = 3.0  # Maximum possible score
-
-         # Parse the URL
-         parsed_url = urlparse(full_url)
-         domain = parsed_url.netloc
-         path = parsed_url.path
-
-         # Check domain patterns
-         for eco_domain in self.pattern_repo.ecosystem_domains:
-             if eco_domain in domain:
-                 score += 1.0
-                 break
-
-         # Check path patterns
-         for path_pattern in self.pattern_repo.ecosystem_path_patterns:
-             if re.search(path_pattern, path):
-                 score += 1.0
-                 break
-
-         # Check link text patterns
-         for text_pattern in self.pattern_repo.ecosystem_text_patterns:
-             if re.search(text_pattern, link_text):
-                 score += 1.0
-                 break
-
-         # Normalize score to 0.0-1.0 range
-         return min(score / max_score, 1.0)
-
-     def score_page(self, page_url: str, page_title: str, page_content: str) -> float:
-         """
-         Score an entire page based on how likely it is to contain ecosystem information
-         Returns a value between 0.0 and 1.0
-
-         Args:
-             page_url: The URL of the page
-             page_title: The title of the page
-             page_content: The full text content of the page
-         """
-         # Start with the URL and title scoring
-         url_score = self.score(page_url, page_title)
-
-         # Content-based scoring
-         content_score = 0.0
-         max_content_score = 2.0
-
-         # Check content patterns
-         content_matches = 0
-         for content_pattern in self.pattern_repo.ecosystem_content_patterns:
-             if re.search(content_pattern, page_content):
-                 content_matches += 1
-
-         # Score based on number of content matches
-         if content_matches >= 3:
-             content_score += 1.0
-         elif content_matches > 0:
-             content_score += 0.5
-
-         # Check for header patterns
-         for header_pattern in self.pattern_repo.ecosystem_header_patterns:
-             if re.search(header_pattern, page_content):
-                 content_score += 1.0
-                 break
-
-         # Combined score with higher weight on content
-         return min((url_score + (content_score / max_content_score) * 2) / 3, 1.0)
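The final line weights the normalized content evidence twice as heavily as the URL/title score. A worked example with hypothetical inputs:

```python
# Hypothetical page: URL/title hit 2 of the 3 link-pattern groups,
# the body had 2 content-pattern matches (+0.5) and one header match (+1.0).
url_score = 2 / 3
content_score = 0.5 + 1.0
combined = min((url_score + (content_score / 2.0) * 2) / 3, 1.0)
print(round(combined, 3))  # 0.722 — content evidence dominates the URL signal
```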
h_ai/domain/web_docs/ecosystem_pattern_repository.py DELETED
@@ -1,182 +0,0 @@
- class EcosystemPatternRepository:
-     """Repository of patterns that identify ecosystem relationships, builders, and collaboration"""
-
-     def __init__(self):
-         # Domains commonly associated with ecosystem and project showcases
-         self.ecosystem_domains = [
-             "showcase.",
-             "ecosystem.",
-             "community.",
-             "gallery.",
-             "partners.",
-             "developers.",
-             "marketplace.",
-             "expo.",
-             "apps.",
-             "extensions.",
-             "plugins.",
-         ]
-
-         # URL path patterns indicating ecosystem/builder content
-         self.ecosystem_path_patterns = [
-             r"/ecosystem/",
-             r"/showcase/",
-             r"/community/",
-             r"/built-with/",
-             r"/case-studies/",
-             r"/customers/",
-             r"/partners/",
-             r"/users/",
-             r"/success-stories/",
-             r"/integrations/",
-             r"/extensions/",
-             r"/marketplace/",
-             r"/plugins/",
-             r"/addons/",
-             r"/gallery/",
-             r"/examples/",
-             r"/projects/",
-             r"/contributors/",
-             r"/whos-using/",
-         ]
-
-         # Link text patterns suggesting ecosystem content
-         self.ecosystem_text_patterns = [
-             r"(?i)ecosystem",
-             r"(?i)showcase",
-             r"(?i)built with",
-             r"(?i)powered by",
-             r"(?i)case stud(y|ies)",
-             r"(?i)success stor(y|ies)",
-             r"(?i)who('s| is) using",
-             r"(?i)our users",
-             r"(?i)our customers",
-             r"(?i)integrations?",
-             r"(?i)extensions?",
-             r"(?i)plugins?",
-             r"(?i)addons?",
-             r"(?i)community projects",
-             r"(?i)community contributions",
-             r"(?i)user contributions",
-             r"(?i)featured projects",
-             r"(?i)gallery",
-         ]
-
-         # Header/title patterns suggesting ecosystem sections
-         self.ecosystem_header_patterns = [
-             r"(?i)ecosystem",
-             r"(?i)who('s| is) using",
-             r"(?i)built (on|with)",
-             r"(?i)powered by",
-             r"(?i)trusted by",
-             r"(?i)customer(s| success)",
-             r"(?i)case stud(y|ies)",
-             r"(?i)success stor(y|ies)",
-             r"(?i)showcase",
-             r"(?i)featured (users|customers|projects)",
-             r"(?i)community (projects|showcase)",
-             r"(?i)partner(s| program)",
-             r"(?i)(our|notable) users",
-             r"(?i)companies using",
-             r"(?i)in production",
-             r"(?i)contributor(s| showcase)",
-             r"(?i)extension (gallery|showcase)",
-             r"(?i)plugin (directory|marketplace)",
-             r"(?i)apps? (built|marketplace|gallery)",
-         ]
-
-         # Content phrases that suggest ecosystem descriptions
-         self.ecosystem_content_patterns = [
-             r"(?i)built (on|with) (our|this)",
-             r"(?i)(companies|organizations|projects) (using|powered by)",
-             r"(?i)(is|are) using (our|this)",
-             r"(?i)powered by (our|this)",
-             r"(?i)extend(s|ing)? (the|our) (platform|ecosystem)",
-             r"(?i)integrated with",
-             r"(?i)build(s|ing)? (on top of|with)",
-             r"(?i)leverage(s|ing)? (our|this)",
-             r"(?i)extend(s|ing)? (the|our) (functionality|capabilities)",
-             r"(?i)based on (our|this)",
-             r"(?i)implemented (with|using)",
-             r"(?i)developed (with|using)",
-             r"(?i)(join|be part of) (our|the) ecosystem",
-         ]
-
-         # Builder and contribution-specific patterns
-         self.builder_patterns = [
-             r"(?i)how to (build|contribute)",
-             r"(?i)build(ing)? (with|on)",
-             r"(?i)develop(ing)? (with|on)",
-             r"(?i)contribute to",
-             r"(?i)contributor guide",
-             r"(?i)developer program",
-             r"(?i)join (our|the) (ecosystem|community)",
-             r"(?i)become a (contributor|partner)",
-             r"(?i)extend (our|the) (platform|ecosystem)",
-             r"(?i)create (your own|an?) (plugin|extension|integration)",
-             r"(?i)developer (resources|portal)",
-             r"(?i)sdk",
-             r"(?i)api (access|integration)",
-             r"(?i)partner (program|portal)",
-         ]
-
-         # Visual cues that often indicate ecosystem showcases
-         self.visual_indicators = [
-             r"logo grid",
-             r"logo carousel",
-             r"client logos",
-             r"partner logos",
-             r"customer logos",
-             r"company logos",
-             r"card gallery",
-             r"project cards",
-             r"showcase gallery",
-             r"case study cards",
-             r"testimonials",
-             r"user testimonials",
-         ]
-
-         # Collaboration-specific patterns
-         self.collaboration_patterns = [
-             r"(?i)how to collaborate",
-             r"(?i)collaboration (guide|opportunities)",
-             r"(?i)working together",
-             r"(?i)partner(ship|ing) (opportunities|program)",
-             r"(?i)join (our|the) (community|ecosystem)",
-             r"(?i)community (contribution|participation)",
-             r"(?i)open (source|collaboration)",
-             r"(?i)contribute (code|documentation|resources)",
-             r"(?i)become a (partner|contributor|maintainer)",
-             r"(?i)collaboration (framework|model)",
-             r"(?i)(business|technical) partnership",
-             r"(?i)developer relations",
-             r"(?i)community (engagement|involvement)",
-         ]
-
-         # Key meta tags that might indicate ecosystem content
-         self.meta_tag_patterns = [
-             r"(?i)ecosystem",
-             r"(?i)showcase",
-             r"(?i)community",
-             r"(?i)partner program",
-             r"(?i)integration",
-             r"(?i)extension",
-             r"(?i)plugin",
-             r"(?i)marketplace",
-             r"(?i)collaboration",
-             r"(?i)use cases",
-             r"(?i)case studies",
-             r"(?i)success stories",
-         ]
-
-         # Schema.org types that often indicate ecosystem relationships
-         self.schema_types = [
-             "Product",
-             "SoftwareApplication",
-             "Organization",
-             "BusinessPartner",
-             "ProgramMembership",
-             "CreativeWork",
-             "SoftwareSourceCode",
-             "WebApplication",
-         ]
h_ai/domain/web_docs/gitbook/__init__.py DELETED
File without changes
h_ai/domain/web_docs/gitbook/text_chapter.py DELETED
@@ -1,18 +0,0 @@
- from dataclasses import dataclass, field
- from typing import List
-
-
- @dataclass
- class TextChapter:
-     """Represents a chapter/section in a page defined by a heading."""
-     heading: str
-     level: int  # h1=1, h2=2, etc.
-     paragraphs: List[str] = field(default_factory=list)
-
-     def to_dict(self):
-         """Convert this TextChapter instance to a serializable dictionary"""
-         return {
-             'heading': self.heading,
-             'level': self.level,
-             'paragraphs': self.paragraphs
-         }
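`to_dict` yields plain JSON-serializable data; a quick sketch with a hypothetical chapter:

```python
import json

chapter = TextChapter(heading="Getting Started", level=2,
                      paragraphs=["Install the package.", "Run the agent."])
print(json.dumps(chapter.to_dict()))
# {"heading": "Getting Started", "level": 2, "paragraphs": ["Install the package.", "Run the agent."]}
```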