h-ai-brain 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h_ai/domain/web_docs/ecosystem_link_scorer_service.py +83 -0
- h_ai/domain/web_docs/ecosystem_pattern_repository.py +182 -0
- h_ai/infrastructure/playwright/playwright_web_content_fetcher.py +7 -1
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/METADATA +1 -1
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/RECORD +9 -7
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/WHEEL +0 -0
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/licenses/LICENSE +0 -0
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/licenses/NOTICE.txt +0 -0
- {h_ai_brain-0.0.5.dist-info → h_ai_brain-0.0.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,83 @@
|
|
1
|
+
import re
|
2
|
+
from urllib.parse import urlparse
|
3
|
+
|
4
|
+
from ...domain.web_docs.ecosystem_pattern_repository import EcosystemPatternRepository
|
5
|
+
|
6
|
+
|
7
|
+
class EcosystemLinkScorerService:
    """Score links and pages by how likely they are to be ecosystem-related.

    The pattern repository passed to the constructor supplies the lists that
    are read here: ``ecosystem_domains`` (plain string markers) and
    ``ecosystem_path_patterns``, ``ecosystem_text_patterns``,
    ``ecosystem_content_patterns`` and ``ecosystem_header_patterns`` (regular
    expressions handed to ``re.search``).
    """

    # The annotation is a string so the class can be defined without the
    # repository class being resolvable at definition time.
    def __init__(self, pattern_repo: "EcosystemPatternRepository"):
        self.pattern_repo = pattern_repo

    @staticmethod
    def _matches_any(patterns, text: str) -> bool:
        """Return True if at least one regex in *patterns* is found in *text*."""
        return any(re.search(pattern, text) for pattern in patterns)

    @staticmethod
    def _host_matches(host: str, eco_domain: str) -> bool:
        """Return True if *eco_domain* occurs in *host* at a label boundary.

        A marker such as ``"apps."`` matches ``apps.example.com`` and
        ``docs.apps.example.com`` but not ``webapps.example.com``, which a
        plain substring test would accept.
        """
        marker = eco_domain.lower()
        return host.startswith(marker) or f".{marker}" in host

    def score(self, full_url: str, link_text: str) -> float:
        """Score a link based on how likely it is to be ecosystem-related.

        One point each is awarded for a matching host name, a matching URL
        path and a matching link text; the sum is divided by three.

        Args:
            full_url: Absolute URL of the link. A missing or malformed URL
                contributes no host or path points instead of raising.
            link_text: Visible text of the link; ``None`` is treated as empty.

        Returns:
            A value between 0.0 and 1.0.
        """
        score = 0.0
        max_score = 3.0  # One point per signal: host, path, link text

        try:
            parsed_url = urlparse(full_url or "")
            # ``hostname`` is lower-cased and excludes port and credentials,
            # unlike ``netloc``.
            host = parsed_url.hostname or ""
            path = parsed_url.path
        except ValueError:
            # urlparse rejects e.g. unbalanced IPv6 brackets; the link text
            # can still be scored.
            host, path = "", ""

        text = link_text or ""

        if any(self._host_matches(host, eco_domain)
               for eco_domain in self.pattern_repo.ecosystem_domains):
            score += 1.0

        if self._matches_any(self.pattern_repo.ecosystem_path_patterns, path):
            score += 1.0

        if self._matches_any(self.pattern_repo.ecosystem_text_patterns, text):
            score += 1.0

        # Normalize score to 0.0-1.0 range
        return min(score / max_score, 1.0)

    def score_page(self, page_url: str, page_title: str, page_content: str) -> float:
        """Score a whole page by how likely it is to contain ecosystem information.

        The URL/title score from :meth:`score` is combined with a content
        score; the content part carries twice the weight of the URL part.

        Args:
            page_url: The URL of the page.
            page_title: The title of the page; ``None`` is treated as empty.
            page_content: The full text content of the page; ``None`` is
                treated as empty.

        Returns:
            A value between 0.0 and 1.0.
        """
        url_score = self.score(page_url, page_title)
        content = page_content or ""

        content_score = 0.0
        max_content_score = 2.0

        # Count how many distinct content patterns occur on the page.
        content_matches = sum(
            1
            for content_pattern in self.pattern_repo.ecosystem_content_patterns
            if re.search(content_pattern, content)
        )
        if content_matches >= 3:
            content_score += 1.0
        elif content_matches > 0:
            content_score += 0.5

        # A single header-style phrase anywhere in the content is worth a
        # full point.
        if self._matches_any(self.pattern_repo.ecosystem_header_patterns, content):
            content_score += 1.0

        # Combined score with higher weight on content
        return min((url_score + (content_score / max_content_score) * 2) / 3, 1.0)
|
@@ -0,0 +1,182 @@
|
|
1
|
+
class EcosystemPatternRepository:
    """Repository of patterns that identify ecosystem relationships, builders, and collaboration.

    Every attribute is a plain list created per instance. Entries of
    ``ecosystem_domains`` and ``schema_types`` are literal strings; the other
    lists hold regular-expression source strings.
    """

    def __init__(self):
        # Domains commonly associated with ecosystem and project showcases
        self.ecosystem_domains = [
            "showcase.",
            "ecosystem.",
            "community.",
            "gallery.",
            "partners.",
            "developers.",
            "marketplace.",
            "expo.",
            "apps.",
            "extensions.",
            "plugins.",
        ]

        # URL path patterns indicating ecosystem/builder content.
        # Each segment must be followed by "/" or the end of the path, so
        # both "/ecosystem/" and "/ecosystem" match while "/ecosystems" does
        # not.
        path_segments = (
            "ecosystem",
            "showcase",
            "community",
            "built-with",
            "case-studies",
            "customers",
            "partners",
            "users",
            "success-stories",
            "integrations",
            "extensions",
            "marketplace",
            "plugins",
            "addons",
            "gallery",
            "examples",
            "projects",
            "contributors",
            "whos-using",
        )
        self.ecosystem_path_patterns = [
            rf"/{segment}(?:/|$)" for segment in path_segments
        ]

        # Link text patterns suggesting ecosystem content
        self.ecosystem_text_patterns = [
            r"(?i)ecosystem",
            r"(?i)showcase",
            r"(?i)built with",
            r"(?i)powered by",
            r"(?i)case stud(y|ies)",
            r"(?i)success stor(y|ies)",
            r"(?i)who('s| is) using",
            r"(?i)our users",
            r"(?i)our customers",
            r"(?i)integrations?",
            r"(?i)extensions?",
            r"(?i)plugins?",
            r"(?i)addons?",
            r"(?i)community projects",
            r"(?i)community contributions",
            r"(?i)user contributions",
            r"(?i)featured projects",
            r"(?i)gallery",
        ]

        # Header/title patterns suggesting ecosystem sections
        self.ecosystem_header_patterns = [
            r"(?i)ecosystem",
            r"(?i)who('s| is) using",
            r"(?i)built (on|with)",
            r"(?i)powered by",
            r"(?i)trusted by",
            r"(?i)customer(s| success)",
            r"(?i)case stud(y|ies)",
            r"(?i)success stor(y|ies)",
            r"(?i)showcase",
            r"(?i)featured (users|customers|projects)",
            r"(?i)community (projects|showcase)",
            r"(?i)partner(s| program)",
            r"(?i)(our|notable) users",
            r"(?i)companies using",
            r"(?i)in production",
            r"(?i)contributor(s| showcase)",
            r"(?i)extension (gallery|showcase)",
            r"(?i)plugin (directory|marketplace)",
            r"(?i)apps? (built|marketplace|gallery)",
        ]

        # Content phrases that suggest ecosystem descriptions
        self.ecosystem_content_patterns = [
            r"(?i)built (on|with) (our|this)",
            r"(?i)(companies|organizations|projects) (using|powered by)",
            r"(?i)(is|are) using (our|this)",
            r"(?i)powered by (our|this)",
            r"(?i)extend(s|ing)? (the|our) (platform|ecosystem)",
            r"(?i)integrated with",
            r"(?i)build(s|ing)? (on top of|with)",
            r"(?i)leverage(s|ing)? (our|this)",
            r"(?i)extend(s|ing)? (the|our) (functionality|capabilities)",
            r"(?i)based on (our|this)",
            r"(?i)implemented (with|using)",
            r"(?i)developed (with|using)",
            r"(?i)(join|be part of) (our|the) ecosystem",
        ]

        # Builder and contribution-specific patterns
        self.builder_patterns = [
            r"(?i)how to (build|contribute)",
            r"(?i)build(ing)? (with|on)",
            r"(?i)develop(ing)? (with|on)",
            r"(?i)contribute to",
            r"(?i)contributor guide",
            r"(?i)developer program",
            r"(?i)join (our|the) (ecosystem|community)",
            r"(?i)become a (contributor|partner)",
            r"(?i)extend (our|the) (platform|ecosystem)",
            r"(?i)create (your own|an?) (plugin|extension|integration)",
            r"(?i)developer (resources|portal)",
            r"(?i)sdk",
            r"(?i)api (access|integration)",
            r"(?i)partner (program|portal)",
        ]

        # Visual cues that often indicate ecosystem showcases
        # NOTE(review): unlike the other lists these carry no (?i) flag;
        # confirm whether callers lower-case the text before matching.
        self.visual_indicators = [
            r"logo grid",
            r"logo carousel",
            r"client logos",
            r"partner logos",
            r"customer logos",
            r"company logos",
            r"card gallery",
            r"project cards",
            r"showcase gallery",
            r"case study cards",
            r"testimonials",
            r"user testimonials",
        ]

        # Collaboration-specific patterns
        self.collaboration_patterns = [
            r"(?i)how to collaborate",
            r"(?i)collaboration (guide|opportunities)",
            r"(?i)working together",
            r"(?i)partner(ship|ing) (opportunities|program)",
            r"(?i)join (our|the) (community|ecosystem)",
            r"(?i)community (contribution|participation)",
            r"(?i)open (source|collaboration)",
            r"(?i)contribute (code|documentation|resources)",
            r"(?i)become a (partner|contributor|maintainer)",
            r"(?i)collaboration (framework|model)",
            r"(?i)(business|technical) partnership",
            r"(?i)developer relations",
            r"(?i)community (engagement|involvement)",
        ]

        # Key meta tags that might indicate ecosystem content
        self.meta_tag_patterns = [
            r"(?i)ecosystem",
            r"(?i)showcase",
            r"(?i)community",
            r"(?i)partner program",
            r"(?i)integration",
            r"(?i)extension",
            r"(?i)plugin",
            r"(?i)marketplace",
            r"(?i)collaboration",
            r"(?i)use cases",
            r"(?i)case studies",
            r"(?i)success stories",
        ]

        # Schema.org types that often indicate ecosystem relationships
        self.schema_types = [
            "Product",
            "SoftwareApplication",
            "Organization",
            "BusinessPartner",
            "ProgramMembership",
            "CreativeWork",
            "SoftwareSourceCode",
            "WebApplication",
        ]
|
@@ -19,7 +19,13 @@ class PlayWrightWebContentFetcher(WebFetcherRepository):
|
|
19
19
|
|
20
20
|
async def fetch(self, url: str) -> Optional[str]:
|
21
21
|
async with async_playwright() as p:
|
22
|
-
browser = await p.chromium.launch(headless=True
|
22
|
+
browser = await p.chromium.launch(headless=True, args=[
|
23
|
+
'--disable-dev-shm-usage', # Required for Docker
|
24
|
+
'--no-sandbox', # Required for Docker non-root user
|
25
|
+
'--disable-setuid-sandbox', # Required for Docker security
|
26
|
+
'--disable-gpu', # Reduces resource usage
|
27
|
+
])
|
28
|
+
|
23
29
|
logger.debug(
|
24
30
|
f"Launching headless browser with user agent: {self.headers.get('User-Agent')}"
|
25
31
|
)
|
@@ -14,6 +14,8 @@ h_ai/domain/reasoning/tool_message.py,sha256=jpbfbJXj6oqZyB3lDxGOUyFB4faHtXAaEOV
|
|
14
14
|
h_ai/domain/web_docs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
h_ai/domain/web_docs/doc_link_scorer_service.py,sha256=EmLSOaX7BCUQcKHZquaUt-Ps_DssZrRqpch5MgbUhAc,1444
|
16
16
|
h_ai/domain/web_docs/documentation_pattern_repository.py,sha256=VhNzP3PUqgg9MaWhBVefj13XNxRBh6ZPUt-KH70ww2w,1302
|
17
|
+
h_ai/domain/web_docs/ecosystem_link_scorer_service.py,sha256=Slin3ZAdJ3o3CxTvJtfD-vd4R4f-MINd3PY2G3bCCQg,2899
|
18
|
+
h_ai/domain/web_docs/ecosystem_pattern_repository.py,sha256=uHBhEvz3HmhXRvFJ6BzJddZmngPSAQw-q39TgRLJiPg,6609
|
17
19
|
h_ai/domain/web_docs/gitbook_web_fetcher_service.py,sha256=Ye-TcuwgW1fhIY8x6v9_-pmPN9pVFWzlOpwRt-4teaA,6490
|
18
20
|
h_ai/domain/web_docs/web_docs_link_detector.py,sha256=NyMKFNs-41bqrxx6u-9GqIufy7pkDF_-_f1h8HECBK8,1192
|
19
21
|
h_ai/domain/web_docs/web_link.py,sha256=J4KC3MmjkvWlAPDdEdjcqAZCvuSnJMahudCohiBk3wk,307
|
@@ -39,10 +41,10 @@ h_ai/infrastructure/llm/ollama/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
|
|
39
41
|
h_ai/infrastructure/llm/ollama/models/ollama_chat_message.py,sha256=ZIz4PQ3869vI3xAYYufPrxXpacajRDtOI8RDl5Dm9RQ,305
|
40
42
|
h_ai/infrastructure/llm/ollama/models/ollama_chat_session.py,sha256=GZ_ddpbWa8iy6NZq50vokUFVZBiX0WNa81z9-r9RzTY,392
|
41
43
|
h_ai/infrastructure/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
h_ai/infrastructure/playwright/playwright_web_content_fetcher.py,sha256=
|
43
|
-
h_ai_brain-0.0.
|
44
|
-
h_ai_brain-0.0.
|
45
|
-
h_ai_brain-0.0.
|
46
|
-
h_ai_brain-0.0.
|
47
|
-
h_ai_brain-0.0.
|
48
|
-
h_ai_brain-0.0.
|
44
|
+
h_ai/infrastructure/playwright/playwright_web_content_fetcher.py,sha256=FVwcK6hv_6aE4fYlJapLHyxNHsztQkKaulklHabyrEc,2684
|
45
|
+
h_ai_brain-0.0.6.dist-info/licenses/LICENSE,sha256=SbvpEU5JIU3yzMMkyzrI0dGqHDoJR_lMKGdl6GZHsy4,11558
|
46
|
+
h_ai_brain-0.0.6.dist-info/licenses/NOTICE.txt,sha256=vxeIKUiGqAePLvDW4AVm3Xh-3BcsvMtCMn1tbsr9zsE,668
|
47
|
+
h_ai_brain-0.0.6.dist-info/METADATA,sha256=taZvGg8xXlBSu13t5DKFWXBPybB8k6Zi0Izjk3qpLv8,735
|
48
|
+
h_ai_brain-0.0.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
49
|
+
h_ai_brain-0.0.6.dist-info/top_level.txt,sha256=3MChDBWvDJV4cEHuZhzeODxQ4ewtw-arOuyaDOc6sIo,5
|
50
|
+
h_ai_brain-0.0.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|