entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

@@ -0,0 +1,88 @@
+"""
+Crawling Data Models
+
+Core data structures for the intelligent crawling system.
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Set
+
+
+@dataclass
+class CrawlConfig:
+    """
+    Crawling configuration with Crawl4AI-style parameters.
+
+    Focuses on page completeness and image loading guarantees.
+    """
+    # Wait Strategy (Priority: Image Loading)
+    wait_for_images: bool = True
+    wait_until: str = "networkidle"  # domcontentloaded | networkidle | load
+    delay_before_return: float = 0.1
+    page_timeout: float = 30.0
+
+    # Image Loading Specific
+    image_load_timeout: float = 10.0  # Max wait for images
+    image_stability_checks: int = 3  # Consecutive stable checks needed
+    image_check_interval: float = 0.2  # Interval between checks
+    min_image_size: int = 50  # Ignore images smaller than this
+
+    # Scroll for lazy loading
+    scan_full_page: bool = True
+    scroll_step: int = 800
+    scroll_delay: float = 0.5
+    scroll_timeout: float = 15.0
+
+    # Height Stability
+    height_stability_checks: int = 3
+    height_stability_threshold: int = 10  # pixels
+
+    # Future: Adaptive Stop Logic
+    confidence_threshold: float = 0.75
+    min_gain_threshold: float = 0.1
+    max_pages: int = 20
+
+
+@dataclass
+class CompletenessResult:
+    """Result from completeness check."""
+    is_complete: bool
+    total_images: int
+    loaded_images: int
+    failed_images: int
+    placeholder_images: int
+    height: int
+    height_stable: bool
+    network_idle: bool
+    check_duration: float
+
+    @property
+    def image_load_ratio(self) -> float:
+        if self.total_images == 0:
+            return 1.0
+        return self.loaded_images / self.total_images
+
+
+@dataclass
+class PageResult:
+    """Result from fetching a single page."""
+    url: str
+    final_url: str
+    title: str
+    html: str
+    content: str  # Extracted markdown
+    images: List[str] = field(default_factory=list)  # base64 images
+    screenshot: Optional[str] = None
+
+    # Quality Signals
+    load_time: float = 0.0
+    completeness: Optional[CompletenessResult] = None
+
+    # Error handling
+    error: Optional[str] = None
+
+    @property
+    def is_complete(self) -> bool:
+        if self.completeness is None:
+            return False
+        return self.completeness.is_complete
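
For orientation, a minimal usage sketch of the new data models (the crawler code that populates these objects is not part of this diff, so the field values below are purely illustrative):

config = CrawlConfig(wait_until="networkidle", image_load_timeout=5.0)  # override selected defaults

completeness = CompletenessResult(
    is_complete=True,
    total_images=12,
    loaded_images=11,
    failed_images=1,
    placeholder_images=0,
    height=4800,
    height_stable=True,
    network_idle=True,
    check_duration=1.3,
)

page = PageResult(
    url="https://example.com",
    final_url="https://example.com/",
    title="Example",
    html="<html>...</html>",
    content="# Example",
    completeness=completeness,
)

print(page.is_complete)                        # True (delegates to completeness.is_complete)
print(f"{completeness.image_load_ratio:.0%}")  # 92% (11 of 12 images loaded)
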
hyw_core/search.py CHANGED
@@ -8,7 +8,6 @@ from loguru import logger
 from .browser_control.service import get_screenshot_service
 # Search engines from browser_control subpackage
 from .browser_control.engines.duckduckgo import DuckDuckGoEngine
-from .browser_control.engines.google import GoogleEngine
 from .browser_control.engines.default import DefaultEngine
 
 class SearchService:
@@ -21,15 +20,14 @@ class SearchService:
         # Domain blocking
         self._blocked_domains = getattr(config, "blocked_domains", []) or []
 
-        # Select Engine - DefaultEngine when not specified
+        # Select Engine - DuckDuckGo is the default and only engine
        self._engine_name = getattr(config, "search_engine", None)
         if self._engine_name:
             self._engine_name = self._engine_name.lower()
 
-        if self._engine_name == "google":
-            self._engine = GoogleEngine()
-        elif self._engine_name == "default_address_bar":  # Explicitly requested address bar capability if needed
-            self._engine = DefaultEngine()
+        if self._engine_name == "default_address_bar":
+            # Explicitly requested address bar capability if needed
+            self._engine = DefaultEngine()
         else:
             # Default: use DuckDuckGo
             self._engine = DuckDuckGoEngine()
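
After this change, engine selection collapses to two cases: an explicit search_engine = "default_address_bar" selects DefaultEngine, and any other value (including "google" or an unset option) now falls back to DuckDuckGoEngine. A rough sketch of the resulting behaviour, assuming SearchService is constructed with a config object exposing the attributes read via getattr above (the _Cfg class here is hypothetical):

class _Cfg:
    blocked_domains = []
    search_engine = "google"  # previously selected GoogleEngine; now falls through to the default

service = SearchService(_Cfg())  # constructor signature assumed from the hunk above
# service._engine is a DuckDuckGoEngine instance after this release
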
@@ -1,155 +0,0 @@
1
-
2
- import urllib.parse
3
- import re
4
- from typing import List, Dict, Any
5
- from loguru import logger
6
- from .base import SearchEngine
7
-
8
-
9
- class GoogleEngine(SearchEngine):
10
- """
11
- Search engine implementation for Google.
12
- Parses Google Search HTML results.
13
- """
14
-
15
- def build_url(self, query: str, limit: int = 10) -> str:
16
- encoded_query = urllib.parse.quote(query)
17
- return f"https://www.google.com/search?q={encoded_query}&udm=14"
18
-
19
- def parse(self, content: str) -> List[Dict[str, Any]]:
20
- results = []
21
- seen_urls = set()
22
-
23
- # Google search results are in blocks with class="MjjYud" or similar containers
24
- # Split by result blocks first for more accurate extraction
25
-
26
- # Method 1: Split by common result block classes
27
- block_patterns = [
28
- r'<div class="MjjYud"[^>]*>',
29
- r'<div class="tF2Cxc"[^>]*>',
30
- r'<div class="g Ww4FFb"[^>]*>',
31
- ]
32
-
33
- blocks = [content]
34
- for bp in block_patterns:
35
- new_blocks = []
36
- for block in blocks:
37
- parts = re.split(bp, block)
38
- new_blocks.extend(parts)
39
- blocks = new_blocks
40
-
41
- for block in blocks:
42
- if len(block) < 100:
43
- continue
44
-
45
- # Find URL in this block - prefer links with h3 nearby
46
- url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
47
- if not url_match:
48
- continue
49
-
50
- url = url_match.group(1)
51
- if url in seen_urls or self._should_skip_url(url):
52
- continue
53
-
54
- # Find h3 title in this block
55
- h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
56
- if not h3_match:
57
- continue
58
-
59
- title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
60
- if not title or len(title) < 2:
61
- continue
62
-
63
- seen_urls.add(url)
64
-
65
- # Extract snippet from VwiC3b class (Google's snippet container)
66
- snippet = ""
67
- snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
68
- if snippet_match:
69
- snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
70
- snippet = re.sub(r'\s+', ' ', snippet).strip()
71
-
72
- # Fallback: look for any text after h3
73
- if not snippet:
74
- # Try other common snippet patterns
75
- alt_patterns = [
76
- r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
77
- r'<div[^>]*data-snc[^>]*>(.*?)</div>',
78
- ]
79
- for ap in alt_patterns:
80
- am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
81
- if am:
82
- snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
83
- snippet = re.sub(r'\s+', ' ', snippet).strip()
84
- break
85
-
86
- # Extract images from this block
87
- images = []
88
- # Pattern 1: Regular img src (excluding data: and tracking pixels)
89
- # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
90
- img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
91
- for img_url in img_matches:
92
- # Decode HTML entities
93
- img_url = img_url.replace('&amp;', '&')
94
- # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
95
- if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
96
- continue
97
- if img_url not in images:
98
- images.append(img_url)
99
-
100
- # Pattern 2: data-src (lazy loaded images)
101
- data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
102
- for img_url in data_src_matches:
103
- img_url = img_url.replace('&amp;', '&')
104
- if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
105
- continue
106
- if img_url not in images:
107
- images.append(img_url)
108
-
109
- results.append({
110
- "title": title,
111
- "url": url,
112
- "domain": urllib.parse.urlparse(url).hostname or "",
113
- "content": snippet[:1000],
114
- "images": images[:3] # Limit to 3 images per result
115
- })
116
-
117
- if len(results) >= 15:
118
- break
119
-
120
- total_images = sum(len(r.get("images", [])) for r in results)
121
- logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
122
- return results
123
-
124
- def _should_skip_url(self, url: str) -> bool:
125
- """Check if URL should be skipped."""
126
- skip_patterns = [
127
- "google.com",
128
- "googleusercontent.com",
129
- "gstatic.com",
130
- "youtube.com/watch", # Keep channel/playlist but skip individual videos
131
- "maps.google",
132
- "translate.google",
133
- "accounts.google",
134
- "support.google",
135
- "policies.google",
136
- "schema.org",
137
- "javascript:",
138
- "data:",
139
- "#",
140
- ]
141
-
142
- for pattern in skip_patterns:
143
- if pattern in url.lower():
144
- return True
145
-
146
- # Skip very short URLs (likely invalid)
147
- if len(url) < 20:
148
- return True
149
-
150
- # Skip URLs that are just root domains without path
151
- parsed = urllib.parse.urlparse(url)
152
- if not parsed.path or parsed.path == "/":
153
- return True
154
-
155
- return False