entari-plugin-hyw 4.0.0rc8__py3-none-any.whl → 4.0.0rc10__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +62 -34
- {entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/RECORD +15 -13
- hyw_core/browser_control/__init__.py +1 -3
- hyw_core/browser_control/assets/card-dist/index.html +48 -32
- hyw_core/browser_control/engines/__init__.py +0 -2
- hyw_core/browser_control/manager.py +18 -5
- hyw_core/browser_control/renderer.py +36 -6
- hyw_core/browser_control/service.py +119 -52
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +348 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/search.py +4 -6
- hyw_core/browser_control/engines/google.py +0 -155
- {entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/top_level.txt +0 -0
hyw_core/crawling/completeness.py
ADDED
@@ -0,0 +1,348 @@
+"""
+Page Completeness Checker
+
+Multi-signal page readiness detection with focus on image loading guarantees.
+Implements Crawl4AI's wait_for_images and networkidle concepts.
+"""
+
+import time
+from typing import Any, Optional, Dict
+from dataclasses import dataclass
+from loguru import logger
+
+from .models import CrawlConfig, CompletenessResult
+
+
+# CSP-compliant JavaScript for image loading verification
+# Based on Crawl4AI's wait_for_images logic
+IMAGE_CHECK_JS = """
+(() => {
+    const results = {
+        total: 0,
+        loaded: 0,
+        failed: 0,
+        placeholders: 0,
+        details: []
+    };
+
+    const imgs = Array.from(document.querySelectorAll('img'));
+    const viewportHeight = window.innerHeight;
+    const minSize = %d; // Injected from Python
+
+    for (const img of imgs) {
+        // Skip tiny images (icons, tracking pixels)
+        if (img.clientWidth < minSize && img.clientHeight < minSize) {
+            continue;
+        }
+
+        // Skip images outside viewport (unless scan_full_page)
+        const rect = img.getBoundingClientRect();
+        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;
+
+        if (!inViewport && !%s) { // scan_full_page injected
+            continue;
+        }
+
+        results.total++;
+
+        const src = img.src || img.getAttribute('data-src') || '';
+        const isPlaceholder = (
+            (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
+            (img.naturalWidth < 50 && img.clientWidth > 100) ||
+            src.includes('placeholder') ||
+            src.includes('loading') ||
+            src.startsWith('data:image/svg+xml') ||
+            (img.naturalWidth === 1 && img.naturalHeight === 1)
+        );
+
+        if (isPlaceholder) {
+            results.placeholders++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'placeholder',
+                natural: [img.naturalWidth, img.naturalHeight],
+                display: [img.clientWidth, img.clientHeight]
+            });
+            continue;
+        }
+
+        // Check if fully loaded
+        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
+            results.loaded++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'loaded',
+                natural: [img.naturalWidth, img.naturalHeight]
+            });
+        } else {
+            results.failed++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'pending',
+                complete: img.complete,
+                natural: [img.naturalWidth, img.naturalHeight]
+            });
+        }
+    }
+
+    return results;
+})()
+"""
+
+# JavaScript for height stability check
+HEIGHT_CHECK_JS = """
+(() => {
+    return {
+        height: Math.max(
+            document.body.scrollHeight || 0,
+            document.documentElement.scrollHeight || 0,
+            document.body.offsetHeight || 0,
+            document.documentElement.offsetHeight || 0
+        ),
+        ready: document.readyState === 'complete'
+    };
+})()
+"""
+
+
+class CompletenessChecker:
+    """
+    Multi-signal page completeness verification.
+
+    Signals checked:
+    1. Image Loading (naturalWidth > 0, not placeholder)
+    2. Height Stability (no layout shifts)
+    3. DOM Ready State
+
+    Usage:
+        checker = CompletenessChecker(CrawlConfig())
+        result = checker.check(tab)
+
+        # Or wait until complete
+        result = checker.wait_for_complete(tab)
+    """
+
+    def __init__(self, config: Optional[CrawlConfig] = None):
+        self._config = config or CrawlConfig()
+
+    def check(self, tab: Any) -> CompletenessResult:
+        """
+        Perform a single completeness check.
+
+        Args:
+            tab: DrissionPage tab object
+
+        Returns:
+            CompletenessResult with all signals
+        """
+        start = time.time()
+
+        # Check images
+        img_result = self._check_images(tab)
+
+        # Check height
+        height_result = self._check_height(tab)
+
+        # Determine overall completeness
+        # Complete if: all real images loaded AND height is stable
+        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
+        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0
+
+        return CompletenessResult(
+            is_complete=all_images_loaded and height_result.get('ready', False),
+            total_images=img_result.get('total', 0),
+            loaded_images=img_result.get('loaded', 0),
+            failed_images=img_result.get('failed', 0),
+            placeholder_images=img_result.get('placeholders', 0),
+            height=height_result.get('height', 0),
+            height_stable=True,  # Single check can't determine stability
+            network_idle=True,  # TODO: network monitoring
+            check_duration=time.time() - start
+        )
+
+    def wait_for_complete(
+        self,
+        tab: Any,
+        timeout: Optional[float] = None
+    ) -> CompletenessResult:
+        """
+        Wait for page to be fully loaded with image guarantees.
+
+        Uses multi-signal approach:
+        1. Poll image loading status
+        2. Track height stability
+        3. Respect timeout
+
+        Args:
+            tab: DrissionPage tab object
+            timeout: Max wait time (default from config)
+
+        Returns:
+            CompletenessResult (may not be complete if timeout)
+        """
+        timeout = timeout or self._config.image_load_timeout
+        start = time.time()
+
+        height_history = []
+        stable_count = 0
+        last_result = None
+
+        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")
+
+        while time.time() - start < timeout:
+            result = self.check(tab)
+            last_result = result
+
+            # Track height stability
+            height_history.append(result.height)
+            if len(height_history) > self._config.height_stability_checks:
+                height_history.pop(0)
+
+            # Check if height is stable
+            height_stable = False
+            if len(height_history) >= self._config.height_stability_checks:
+                max_h = max(height_history)
+                min_h = min(height_history)
+                height_stable = (max_h - min_h) <= self._config.height_stability_threshold
+
+            # Log progress
+            elapsed = time.time() - start
+            logger.debug(
+                f"CompletenessChecker: [{elapsed:.1f}s] "
+                f"images={result.loaded_images}/{result.total_images} "
+                f"pending={result.failed_images} "
+                f"placeholders={result.placeholder_images} "
+                f"height={result.height} stable={height_stable}"
+            )
+
+            # Check if all images loaded AND height stable
+            all_loaded = (
+                result.failed_images == 0 and
+                result.placeholder_images == 0
+            )
+
+            if all_loaded and height_stable:
+                stable_count += 1
+                if stable_count >= self._config.image_stability_checks:
+                    logger.info(
+                        f"CompletenessChecker: Page complete! "
+                        f"{result.loaded_images} images loaded, "
+                        f"height={result.height}, "
+                        f"took {elapsed:.2f}s"
+                    )
+                    return CompletenessResult(
+                        is_complete=True,
+                        total_images=result.total_images,
+                        loaded_images=result.loaded_images,
+                        failed_images=0,
+                        placeholder_images=0,
+                        height=result.height,
+                        height_stable=True,
+                        network_idle=True,
+                        check_duration=elapsed
+                    )
+            else:
+                stable_count = 0
+
+            # Wait before next check
+            time.sleep(self._config.image_check_interval)
+
+        # Timeout reached
+        elapsed = time.time() - start
+        logger.warning(
+            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
+            f"images={last_result.loaded_images}/{last_result.total_images} "
+            f"pending={last_result.failed_images}"
+        )
+
+        # Return last result with is_complete=False
+        if last_result:
+            return CompletenessResult(
+                is_complete=False,
+                total_images=last_result.total_images,
+                loaded_images=last_result.loaded_images,
+                failed_images=last_result.failed_images,
+                placeholder_images=last_result.placeholder_images,
+                height=last_result.height,
+                height_stable=len(set(height_history)) == 1,
+                network_idle=True,
+                check_duration=elapsed
+            )
+
+        return CompletenessResult(
+            is_complete=False,
+            total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
+            height=0, height_stable=False, network_idle=False,
+            check_duration=elapsed
+        )
+
+    def _check_images(self, tab: Any) -> Dict[str, Any]:
+        """Run image check JavaScript and return results."""
+        try:
+            js = IMAGE_CHECK_JS % (
+                self._config.min_image_size,
+                'true' if self._config.scan_full_page else 'false'
+            )
+            result = tab.run_js(js, as_expr=True)
+            return result or {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
+        except Exception as e:
+            logger.warning(f"CompletenessChecker: Image check failed: {e}")
+            return {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
+
+    def _check_height(self, tab: Any) -> Dict[str, Any]:
+        """Run height check JavaScript and return results."""
+        try:
+            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
+            return result or {'height': 0, 'ready': False}
+        except Exception as e:
+            logger.warning(f"CompletenessChecker: Height check failed: {e}")
+            return {'height': 0, 'ready': False}
+
+
+def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
+    """
+    Scroll through page to trigger lazy-loaded images.
+
+    Implements Crawl4AI's scan_full_page behavior.
+
+    Args:
+        tab: DrissionPage tab object
+        config: Crawl configuration
+    """
+    config = config or CrawlConfig()
+    if not config.scan_full_page:
+        return
+
+    start = time.time()
+    current_pos = 0
+
+    logger.debug("CompletenessChecker: Starting lazy load trigger scroll")
+
+    try:
+        while time.time() - start < config.scroll_timeout:
+            # Scroll down
+            current_pos += config.scroll_step
+            tab.run_js(f"window.scrollTo(0, {current_pos});")
+
+            # Wait for lazy load triggers
+            time.sleep(config.scroll_delay)
+
+            # Check if reached bottom
+            height = tab.run_js("""
+                return Math.max(
+                    document.body.scrollHeight || 0,
+                    document.documentElement.scrollHeight || 0
+                );
+            """, as_expr=True) or 0
+
+            if current_pos >= height:
+                break
+
+        # Scroll back to top
+        tab.run_js("window.scrollTo(0, 0);")
+
+        elapsed = time.time() - start
+        logger.debug(f"CompletenessChecker: Lazy load scroll complete in {elapsed:.1f}s")
+
+    except Exception as e:
+        logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")
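For orientation, a minimal usage sketch of the new checker, assuming a DrissionPage ChromiumPage as the tab object and importing straight from the module paths in this diff (the re-exports added in hyw_core/crawling/__init__.py are not shown above, so the exact public import path is an assumption):

from DrissionPage import ChromiumPage

from hyw_core.crawling.completeness import CompletenessChecker, trigger_lazy_load
from hyw_core.crawling.models import CrawlConfig

page = ChromiumPage()                     # any object exposing run_js() should work
page.get("https://example.com/gallery")   # hypothetical URL

config = CrawlConfig(image_load_timeout=15.0, scan_full_page=True)
trigger_lazy_load(page, config)           # scroll down to fire lazy loaders, then back to top

checker = CompletenessChecker(config)
result = checker.wait_for_complete(page)  # polls check() until images load and height settles
print(result.is_complete, f"{result.image_load_ratio:.0%} images loaded")

Note that wait_for_complete returns the last observed (incomplete) result on timeout rather than raising.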
hyw_core/crawling/models.py
ADDED
@@ -0,0 +1,88 @@
+"""
+Crawling Data Models
+
+Core data structures for the intelligent crawling system.
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Set
+
+
+@dataclass
+class CrawlConfig:
+    """
+    Crawling configuration with Crawl4AI-style parameters.
+
+    Focuses on page completeness and image loading guarantees.
+    """
+    # Wait Strategy (Priority: Image Loading)
+    wait_for_images: bool = True
+    wait_until: str = "networkidle"  # domcontentloaded | networkidle | load
+    delay_before_return: float = 0.1
+    page_timeout: float = 30.0
+
+    # Image Loading Specific
+    image_load_timeout: float = 10.0   # Max wait for images
+    image_stability_checks: int = 3    # Consecutive stable checks needed
+    image_check_interval: float = 0.2  # Interval between checks
+    min_image_size: int = 50           # Ignore images smaller than this
+
+    # Scroll for lazy loading
+    scan_full_page: bool = True
+    scroll_step: int = 800
+    scroll_delay: float = 0.5
+    scroll_timeout: float = 15.0
+
+    # Height Stability
+    height_stability_checks: int = 3
+    height_stability_threshold: int = 10  # pixels
+
+    # Future: Adaptive Stop Logic
+    confidence_threshold: float = 0.75
+    min_gain_threshold: float = 0.1
+    max_pages: int = 20
+
+
+@dataclass
+class CompletenessResult:
+    """Result from completeness check."""
+    is_complete: bool
+    total_images: int
+    loaded_images: int
+    failed_images: int
+    placeholder_images: int
+    height: int
+    height_stable: bool
+    network_idle: bool
+    check_duration: float
+
+    @property
+    def image_load_ratio(self) -> float:
+        if self.total_images == 0:
+            return 1.0
+        return self.loaded_images / self.total_images
+
+
+@dataclass
+class PageResult:
+    """Result from fetching a single page."""
+    url: str
+    final_url: str
+    title: str
+    html: str
+    content: str  # Extracted markdown
+    images: List[str] = field(default_factory=list)  # base64 images
+    screenshot: Optional[str] = None
+
+    # Quality Signals
+    load_time: float = 0.0
+    completeness: Optional[CompletenessResult] = None
+
+    # Error handling
+    error: Optional[str] = None
+
+    @property
+    def is_complete(self) -> bool:
+        if self.completeness is None:
+            return False
+        return self.completeness.is_complete
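A small sketch of how these models compose, with made-up numbers to show the image_load_ratio arithmetic:

from hyw_core.crawling.models import CrawlConfig, CompletenessResult

cfg = CrawlConfig(min_image_size=80, scroll_step=600)  # override two defaults

res = CompletenessResult(
    is_complete=False,
    total_images=10, loaded_images=7, failed_images=2, placeholder_images=1,
    height=4200, height_stable=True, network_idle=True, check_duration=1.8,
)
assert res.image_load_ratio == 0.7  # 7 of 10 counted images finished loading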
hyw_core/search.py
CHANGED
@@ -8,7 +8,6 @@ from loguru import logger
 from .browser_control.service import get_screenshot_service
 # Search engines from browser_control subpackage
 from .browser_control.engines.duckduckgo import DuckDuckGoEngine
-from .browser_control.engines.google import GoogleEngine
 from .browser_control.engines.default import DefaultEngine
 
 class SearchService:
@@ -21,15 +20,14 @@ class SearchService:
         # Domain blocking
         self._blocked_domains = getattr(config, "blocked_domains", []) or []
 
-        # Select Engine -
+        # Select Engine - DuckDuckGo is the default and only engine
         self._engine_name = getattr(config, "search_engine", None)
         if self._engine_name:
             self._engine_name = self._engine_name.lower()
 
-        if self._engine_name == "
-
-
-            self._engine = DefaultEngine()
+        if self._engine_name == "default_address_bar":
+            # Explicitly requested address bar capability if needed
+            self._engine = DefaultEngine()
         else:
             # Default: use DuckDuckGo
             self._engine = DuckDuckGoEngine()
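After this change, engine selection reduces to a two-way branch. A sketch of just that branch, restated from the hunk above (the full SearchService constructor reads other config fields not shown here):

name = getattr(config, "search_engine", None)
if name and name.lower() == "default_address_bar":
    engine = DefaultEngine()     # explicit opt-in to address-bar search
else:
    engine = DuckDuckGoEngine()  # the default for None, "duckduckgo", or anything else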
hyw_core/browser_control/engines/google.py
DELETED
@@ -1,155 +0,0 @@
-
-import urllib.parse
-import re
-from typing import List, Dict, Any
-from loguru import logger
-from .base import SearchEngine
-
-
-class GoogleEngine(SearchEngine):
-    """
-    Search engine implementation for Google.
-    Parses Google Search HTML results.
-    """
-
-    def build_url(self, query: str, limit: int = 10) -> str:
-        encoded_query = urllib.parse.quote(query)
-        return f"https://www.google.com/search?q={encoded_query}&udm=14"
-
-    def parse(self, content: str) -> List[Dict[str, Any]]:
-        results = []
-        seen_urls = set()
-
-        # Google search results are in blocks with class="MjjYud" or similar containers
-        # Split by result blocks first for more accurate extraction
-
-        # Method 1: Split by common result block classes
-        block_patterns = [
-            r'<div class="MjjYud"[^>]*>',
-            r'<div class="tF2Cxc"[^>]*>',
-            r'<div class="g Ww4FFb"[^>]*>',
-        ]
-
-        blocks = [content]
-        for bp in block_patterns:
-            new_blocks = []
-            for block in blocks:
-                parts = re.split(bp, block)
-                new_blocks.extend(parts)
-            blocks = new_blocks
-
-        for block in blocks:
-            if len(block) < 100:
-                continue
-
-            # Find URL in this block - prefer links with h3 nearby
-            url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
-            if not url_match:
-                continue
-
-            url = url_match.group(1)
-            if url in seen_urls or self._should_skip_url(url):
-                continue
-
-            # Find h3 title in this block
-            h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
-            if not h3_match:
-                continue
-
-            title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
-            if not title or len(title) < 2:
-                continue
-
-            seen_urls.add(url)
-
-            # Extract snippet from VwiC3b class (Google's snippet container)
-            snippet = ""
-            snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
-            if snippet_match:
-                snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
-                snippet = re.sub(r'\s+', ' ', snippet).strip()
-
-            # Fallback: look for any text after h3
-            if not snippet:
-                # Try other common snippet patterns
-                alt_patterns = [
-                    r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
-                    r'<div[^>]*data-snc[^>]*>(.*?)</div>',
-                ]
-                for ap in alt_patterns:
-                    am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
-                    if am:
-                        snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
-                        snippet = re.sub(r'\s+', ' ', snippet).strip()
-                        break
-
-            # Extract images from this block
-            images = []
-            # Pattern 1: Regular img src (excluding data: and tracking pixels)
-            # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
-            img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
-            for img_url in img_matches:
-                # Decode HTML entities
-                img_url = img_url.replace('&amp;', '&')
-                # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
-                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
-                    continue
-                if img_url not in images:
-                    images.append(img_url)
-
-            # Pattern 2: data-src (lazy loaded images)
-            data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
-            for img_url in data_src_matches:
-                img_url = img_url.replace('&amp;', '&')
-                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
-                    continue
-                if img_url not in images:
-                    images.append(img_url)
-
-            results.append({
-                "title": title,
-                "url": url,
-                "domain": urllib.parse.urlparse(url).hostname or "",
-                "content": snippet[:1000],
-                "images": images[:3]  # Limit to 3 images per result
-            })
-
-            if len(results) >= 15:
-                break
-
-        total_images = sum(len(r.get("images", [])) for r in results)
-        logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
-        return results
-
-    def _should_skip_url(self, url: str) -> bool:
-        """Check if URL should be skipped."""
-        skip_patterns = [
-            "google.com",
-            "googleusercontent.com",
-            "gstatic.com",
-            "youtube.com/watch",  # Keep channel/playlist but skip individual videos
-            "maps.google",
-            "translate.google",
-            "accounts.google",
-            "support.google",
-            "policies.google",
-            "schema.org",
-            "javascript:",
-            "data:",
-            "#",
-        ]
-
-        for pattern in skip_patterns:
-            if pattern in url.lower():
-                return True
-
-        # Skip very short URLs (likely invalid)
-        if len(url) < 20:
-            return True
-
-        # Skip URLs that are just root domains without path
-        parsed = urllib.parse.urlparse(url)
-        if not parsed.path or parsed.path == "/":
-            return True
-
-        return False
{entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/WHEEL
RENAMED
File without changes

{entari_plugin_hyw-4.0.0rc8.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/top_level.txt
RENAMED
File without changes