thordata-mcp-server 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata_mcp/__init__.py +1 -1
- thordata_mcp/browser_session.py +157 -12
- thordata_mcp/config.py +14 -3
- thordata_mcp/context.py +1 -1
- thordata_mcp/tools/data/browser.py +124 -18
- thordata_mcp/tools/debug.py +125 -0
- thordata_mcp/tools/params_utils.py +107 -0
- thordata_mcp/tools/product.py +83 -5
- thordata_mcp/tools/product_compact.py +2108 -962
- thordata_mcp/tools/utils.py +2 -0
- thordata_mcp/utils.py +393 -322
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/METADATA +29 -54
- thordata_mcp_server-0.5.0.dist-info/RECORD +26 -0
- thordata_mcp_server-0.4.4.dist-info/RECORD +0 -24
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/entry_points.txt +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/top_level.txt +0 -0
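In 0.5.0 the compact tool surface registered by thordata_mcp/tools/product_compact.py standardizes every tool on a single `params` dictionary, plus an `action` string for multi-action tools such as `serp`, `unlocker`, and `web_scraper`. As an illustration only (not part of the package), a minimal client-side sketch using the official Python MCP SDK, assuming an already-initialized `ClientSession` named `session` and omitting transport setup and credentials; the tool names and params mirror the docstring examples in the diff below:

    from mcp import ClientSession  # assumed: official MCP Python SDK client

    async def demo(session: ClientSession) -> None:
        # Minimal web search via the compact alias around the low-level serp tool.
        search = await session.call_tool(
            "search_engine",
            {"params": {"q": "Python web scraping", "num": 10, "engine": "google"}},
        )
        # Universal scrape of one URL; the server converts the fetched HTML to Markdown.
        page = await session.call_tool(
            "unlocker",
            {"action": "fetch", "params": {"url": "https://example.com", "output_format": "markdown"}},
        )
        print(search.content, page.content)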
@@ -1,962 +1,2108 @@
-from __future__ import annotations
-
-import asyncio
-import json
-from typing import Any, Optional
[... the remaining removed lines of the 0.4.4 product_compact.py (962 lines in total) were not captured in this rendering ...]
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from thordata_mcp.tools.params_utils import create_params_error, normalize_params
|
|
8
|
+
from thordata_mcp.tools.debug import register as register_debug
|
|
9
|
+
from thordata_mcp.config import get_settings
|
|
10
|
+
|
|
11
|
+
from mcp.server.fastmcp import Context, FastMCP
|
|
12
|
+
|
|
13
|
+
from thordata_mcp.config import settings
|
|
14
|
+
from thordata_mcp.context import ServerContext
|
|
15
|
+
from thordata_mcp.monitoring import PerformanceTimer
|
|
16
|
+
from thordata_mcp.utils import (
|
|
17
|
+
error_response,
|
|
18
|
+
handle_mcp_errors,
|
|
19
|
+
html_to_markdown_clean,
|
|
20
|
+
ok_response,
|
|
21
|
+
safe_ctx_info,
|
|
22
|
+
truncate_content,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Tool schema helper (for catalog)
|
|
26
|
+
from .utils import tool_schema # noqa: E402
|
|
27
|
+
|
|
28
|
+
# Reuse battle-tested helpers from the full product module
|
|
29
|
+
from .product import ( # noqa: E402
|
|
30
|
+
_catalog,
|
|
31
|
+
_candidate_tools_for_url,
|
|
32
|
+
_extract_structured_from_html,
|
|
33
|
+
_fetch_json_preview,
|
|
34
|
+
_guess_tool_for_url,
|
|
35
|
+
_hostname,
|
|
36
|
+
_normalize_extracted,
|
|
37
|
+
_normalize_record,
|
|
38
|
+
_run_web_scraper_tool,
|
|
39
|
+
_to_light_json,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
def _build_params_template(schema: dict[str, Any]) -> dict[str, Any]:
|
|
43
|
+
"""Build a minimal runnable params template from a tool_schema() dict.
|
|
44
|
+
|
|
45
|
+
We do NOT include URL examples; we only provide placeholders and defaults.
|
|
46
|
+
"""
|
|
47
|
+
fields = schema.get("fields") if isinstance(schema, dict) else None
|
|
48
|
+
if not isinstance(fields, dict):
|
|
49
|
+
return {}
|
|
50
|
+
|
|
51
|
+
template: dict[str, Any] = {}
|
|
52
|
+
for k, meta in fields.items():
|
|
53
|
+
if k in {"SPIDER_ID", "SPIDER_NAME"}:
|
|
54
|
+
continue
|
|
55
|
+
if not isinstance(meta, dict):
|
|
56
|
+
continue
|
|
57
|
+
required = bool(meta.get("required"))
|
|
58
|
+
default = meta.get("default")
|
|
59
|
+
typ = str(meta.get("type") or "")
|
|
60
|
+
|
|
61
|
+
# Always special-case common_settings for video tools, regardless of required/optional.
|
|
62
|
+
if k == "common_settings":
|
|
63
|
+
try:
|
|
64
|
+
from thordata.types.common import CommonSettings
|
|
65
|
+
|
|
66
|
+
cs_fields = getattr(CommonSettings, "__dataclass_fields__", {}) # type: ignore[attr-defined]
|
|
67
|
+
cs_template: dict[str, Any] = {}
|
|
68
|
+
for ck, cf in cs_fields.items():
|
|
69
|
+
# Keep all optional keys visible; user fills what they need.
|
|
70
|
+
if ck.startswith("_"):
|
|
71
|
+
continue
|
|
72
|
+
# default is always None in SDK, keep placeholder to make schema explicit
|
|
73
|
+
cs_template[ck] = f"<{ck}>"
|
|
74
|
+
template[k] = cs_template
|
|
75
|
+
except Exception:
|
|
76
|
+
# Fall back to a generic dict placeholder if SDK shape changes.
|
|
77
|
+
template[k] = {}
|
|
78
|
+
continue
|
|
79
|
+
|
|
80
|
+
# For required fields without defaults, provide a clear placeholder.
|
|
81
|
+
if required and default is None:
|
|
82
|
+
template[k] = f"<{k}>"
|
|
83
|
+
continue
|
|
84
|
+
|
|
85
|
+
# For optional fields, include default only if it's not None.
|
|
86
|
+
if default is not None:
|
|
87
|
+
template[k] = default
|
|
88
|
+
continue
|
|
89
|
+
|
|
90
|
+
# For some known shapes, provide a sensible empty structure.
|
|
91
|
+
if "dict" in typ:
|
|
92
|
+
template[k] = {}
|
|
93
|
+
elif "list" in typ:
|
|
94
|
+
template[k] = []
|
|
95
|
+
# else: omit
|
|
96
|
+
|
|
97
|
+
return template
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def register(mcp: FastMCP) -> None:
|
|
101
|
+
"""Register the compact product surface (competitor-style).
|
|
102
|
+
|
|
103
|
+
Core tools are exposed:
|
|
104
|
+
- serp
|
|
105
|
+
- search_engine / search_engine_batch (minimal web search)
|
|
106
|
+
- unlocker
|
|
107
|
+
- web_scraper
|
|
108
|
+
- browser
|
|
109
|
+
- smart_scrape
|
|
110
|
+
|
|
111
|
+
Plus optional debug helper:
|
|
112
|
+
- debug.status
|
|
113
|
+
|
|
114
|
+
Tool exposure can be controlled via environment variables:
|
|
115
|
+
- THORDATA_TOOLS: comma-separated tool names to explicitly enable (optional)
|
|
116
|
+
- THORDATA_MODE / THORDATA_GROUPS: legacy knobs (kept for backward-compat)
|
|
117
|
+
"""
|
|
118
|
+
|
|
119
|
+
cfg = get_settings()
|
|
120
|
+
mode = str(getattr(cfg, "THORDATA_MODE", "rapid")).strip().lower()
|
|
121
|
+
groups = [g.strip().lower() for g in (getattr(cfg, "THORDATA_GROUPS", "") or "").split(",") if g.strip()]
|
|
122
|
+
tools = [t.strip().lower() for t in (getattr(cfg, "THORDATA_TOOLS", "") or "").split(",") if t.strip()]
|
|
123
|
+
|
|
124
|
+
# Register debug helper tools (read-only) only when enabled
|
|
125
|
+
if getattr(cfg, "THORDATA_DEBUG_TOOLS", False):
|
|
126
|
+
register_debug(mcp)
|
|
127
|
+
|
|
128
|
+
# Decide which tools to register.
|
|
129
|
+
# Competitor-style defaults: keep tool surface small for LLMs.
|
|
130
|
+
# We always expose a small base set; advanced tools require explicit allowlisting via THORDATA_TOOLS.
|
|
131
|
+
all_tools = {
|
|
132
|
+
"search_engine",
|
|
133
|
+
"search_engine_batch",
|
|
134
|
+
"serp",
|
|
135
|
+
"unlocker",
|
|
136
|
+
"web_scraper",
|
|
137
|
+
"web_scraper.help",
|
|
138
|
+
"browser",
|
|
139
|
+
"smart_scrape",
|
|
140
|
+
}
|
|
141
|
+
base_tools = {"search_engine", "unlocker", "browser", "smart_scrape"}
|
|
142
|
+
|
|
143
|
+
# Legacy note:
|
|
144
|
+
# We keep THORDATA_MODE/THORDATA_GROUPS for backward-compat, but avoid relying on multi-tier modes.
|
|
145
|
+
# If someone explicitly sets THORDATA_MODE=pro, we still honor it for now.
|
|
146
|
+
if mode == "pro":
|
|
147
|
+
allowed_tools = set(all_tools)
|
|
148
|
+
else:
|
|
149
|
+
allowed_tools = set(base_tools)
|
|
150
|
+
allowed_tools |= {t for t in tools if t in all_tools}
|
|
151
|
+
|
|
152
|
+
def _allow(name: str) -> bool:
|
|
153
|
+
return name.lower() in allowed_tools
|
|
154
|
+
|
|
155
|
+
# -------------------------
|
|
156
|
+
# SERP (compact)
|
|
157
|
+
# -------------------------
|
|
158
|
+
# Web search aliases
|
|
159
|
+
# - search_engine: single query web search
|
|
160
|
+
# - search_engine_batch: batch web search
|
|
161
|
+
if _allow("search_engine"):
|
|
162
|
+
@mcp.tool(
|
|
163
|
+
name="search_engine",
|
|
164
|
+
description=(
|
|
165
|
+
"Web search with AI-optimized results. "
|
|
166
|
+
'Params example: {"q": "Python", "num": 10, "engine": "google", "format": "light_json"}. '
|
|
167
|
+
"Returns a minimal, LLM-friendly subset: title/link/description."
|
|
168
|
+
),
|
|
169
|
+
)
|
|
170
|
+
@handle_mcp_errors
|
|
171
|
+
async def search_engine(
|
|
172
|
+
*,
|
|
173
|
+
params: Any = None,
|
|
174
|
+
ctx: Optional[Context] = None,
|
|
175
|
+
) -> dict[str, Any]:
|
|
176
|
+
# Schema-friendly normalization: accept q/query, set sensible defaults.
|
|
177
|
+
try:
|
|
178
|
+
p = normalize_params(params, "search_engine", "search")
|
|
179
|
+
except ValueError as e:
|
|
180
|
+
return create_params_error("search_engine", "search", params, str(e))
|
|
181
|
+
|
|
182
|
+
q = str(p.get("q", "") or p.get("query", "")).strip()
|
|
183
|
+
if not q:
|
|
184
|
+
return error_response(
|
|
185
|
+
tool="search_engine",
|
|
186
|
+
input={"params": p},
|
|
187
|
+
error_type="validation_error",
|
|
188
|
+
code="E4001",
|
|
189
|
+
message="Missing q (provide params.q or params.query)",
|
|
190
|
+
details={"params_example": {"q": "Python web scraping", "num": 10, "engine": "google"}},
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
# Normalize basic options with defaults (schema-style).
|
|
194
|
+
engine = str(p.get("engine", "google") or "google").strip()
|
|
195
|
+
num = int(p.get("num", 10) or 10)
|
|
196
|
+
start = int(p.get("start", 0) or 0)
|
|
197
|
+
fmt = str(p.get("format", "light_json") or "light_json").strip().lower()
|
|
198
|
+
if num <= 0 or num > 50:
|
|
199
|
+
return error_response(
|
|
200
|
+
tool="search_engine",
|
|
201
|
+
input={"params": p},
|
|
202
|
+
error_type="validation_error",
|
|
203
|
+
code="E4001",
|
|
204
|
+
message="num must be between 1 and 50",
|
|
205
|
+
details={"num": num},
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
# Delegate to serp.search
|
|
209
|
+
await safe_ctx_info(ctx, f"search_engine q={q!r} engine={engine!r} num={num} start={start}")
|
|
210
|
+
out = await serp(
|
|
211
|
+
action="search",
|
|
212
|
+
params={"q": q, "engine": engine, "num": num, "start": start, "format": fmt, **{k: v for k, v in p.items() if k not in {"q", "query", "engine", "num", "start", "format"}}},
|
|
213
|
+
ctx=ctx,
|
|
214
|
+
)
|
|
215
|
+
if out.get("ok") is not True:
|
|
216
|
+
return out
|
|
217
|
+
|
|
218
|
+
data = out.get("output")
|
|
219
|
+
organic = data.get("organic") if isinstance(data, dict) else None
|
|
220
|
+
results = []
|
|
221
|
+
if isinstance(organic, list):
|
|
222
|
+
for r in organic[:num]:
|
|
223
|
+
if not isinstance(r, dict):
|
|
224
|
+
continue
|
|
225
|
+
results.append(
|
|
226
|
+
{
|
|
227
|
+
"title": r.get("title"),
|
|
228
|
+
"link": r.get("link"),
|
|
229
|
+
"description": r.get("description"),
|
|
230
|
+
}
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
return ok_response(
|
|
234
|
+
tool="search_engine",
|
|
235
|
+
input={"params": p},
|
|
236
|
+
output={
|
|
237
|
+
"query": q,
|
|
238
|
+
"engine": engine,
|
|
239
|
+
"results": results,
|
|
240
|
+
"_meta": data.get("_meta") if isinstance(data, dict) else None,
|
|
241
|
+
},
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
if _allow("search_engine_batch"):
|
|
245
|
+
@mcp.tool(
|
|
246
|
+
name="search_engine_batch",
|
|
247
|
+
description=(
|
|
248
|
+
"Batch web search. "
|
|
249
|
+
'Params example: {"requests": [{"q": "q1"}, {"q": "q2"}], "concurrency": 5, "engine": "google"}.'
|
|
250
|
+
),
|
|
251
|
+
)
|
|
252
|
+
@handle_mcp_errors
|
|
253
|
+
async def search_engine_batch(
|
|
254
|
+
*,
|
|
255
|
+
params: Any = None,
|
|
256
|
+
ctx: Optional[Context] = None,
|
|
257
|
+
) -> dict[str, Any]:
|
|
258
|
+
try:
|
|
259
|
+
p = normalize_params(params, "search_engine_batch", "batch_search")
|
|
260
|
+
except ValueError as e:
|
|
261
|
+
return create_params_error("search_engine_batch", "batch_search", params, str(e))
|
|
262
|
+
|
|
263
|
+
reqs = p.get("requests")
|
|
264
|
+
if not isinstance(reqs, list) or not reqs:
|
|
265
|
+
return error_response(
|
|
266
|
+
tool="search_engine_batch",
|
|
267
|
+
input={"params": p},
|
|
268
|
+
error_type="validation_error",
|
|
269
|
+
code="E4001",
|
|
270
|
+
message="Missing requests[] (array of {q,...} objects)",
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
# Optional shared defaults for engine/num/start
|
|
274
|
+
default_engine = str(p.get("engine", "google") or "google").strip()
|
|
275
|
+
default_num = int(p.get("num", 10) or 10)
|
|
276
|
+
if default_num <= 0 or default_num > 50:
|
|
277
|
+
return error_response(
|
|
278
|
+
tool="search_engine_batch",
|
|
279
|
+
input={"params": p},
|
|
280
|
+
error_type="validation_error",
|
|
281
|
+
code="E4001",
|
|
282
|
+
message="num must be between 1 and 50",
|
|
283
|
+
details={"num": default_num},
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
# Delegate to serp.batch_search
|
|
287
|
+
await safe_ctx_info(ctx, f"search_engine_batch count={len(reqs)}")
|
|
288
|
+
out = await serp(
|
|
289
|
+
action="batch_search",
|
|
290
|
+
params={
|
|
291
|
+
**p,
|
|
292
|
+
"requests": [
|
|
293
|
+
{
|
|
294
|
+
**r,
|
|
295
|
+
"q": str((r.get("q") if isinstance(r, dict) else "") or (r.get("query") if isinstance(r, dict) else "")).strip(),
|
|
296
|
+
"engine": str((r.get("engine") if isinstance(r, dict) else "") or default_engine),
|
|
297
|
+
"num": int((r.get("num") if isinstance(r, dict) else 0) or default_num),
|
|
298
|
+
}
|
|
299
|
+
for r in reqs if isinstance(r, dict)
|
|
300
|
+
],
|
|
301
|
+
},
|
|
302
|
+
ctx=ctx,
|
|
303
|
+
)
|
|
304
|
+
if out.get("ok") is not True:
|
|
305
|
+
return out
|
|
306
|
+
|
|
307
|
+
data = out.get("output")
|
|
308
|
+
results = []
|
|
309
|
+
if isinstance(data, dict):
|
|
310
|
+
for item in data.get("results", []) if isinstance(data.get("results"), list) else []:
|
|
311
|
+
if not isinstance(item, dict):
|
|
312
|
+
continue
|
|
313
|
+
o = item.get("output")
|
|
314
|
+
organic = o.get("organic") if isinstance(o, dict) else None
|
|
315
|
+
mapped = []
|
|
316
|
+
if isinstance(organic, list):
|
|
317
|
+
for r in organic:
|
|
318
|
+
if not isinstance(r, dict):
|
|
319
|
+
continue
|
|
320
|
+
mapped.append({"title": r.get("title"), "link": r.get("link"), "description": r.get("description")})
|
|
321
|
+
results.append(
|
|
322
|
+
{
|
|
323
|
+
"index": item.get("index"),
|
|
324
|
+
"ok": bool(item.get("ok")),
|
|
325
|
+
"input": {"q": item.get("q"), "engine": item.get("engine"), "num": item.get("num")},
|
|
326
|
+
"results": mapped if item.get("ok") else None,
|
|
327
|
+
"error": item.get("error") if not item.get("ok") else None,
|
|
328
|
+
}
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
return ok_response(tool="search_engine_batch", input={"params": p}, output={"results": results})
|
|
332
|
+
|
|
333
|
+
# -------------------------
|
|
334
|
+
# Low-level SERP (advanced users; not exposed by default)
|
|
335
|
+
@handle_mcp_errors
|
|
336
|
+
async def serp(
|
|
337
|
+
action: str,
|
|
338
|
+
*,
|
|
339
|
+
params: Any = None,
|
|
340
|
+
ctx: Optional[Context] = None,
|
|
341
|
+
) -> dict[str, Any]:
|
|
342
|
+
"""SERP SCRAPER: action in {search, batch_search}.
|
|
343
|
+
|
|
344
|
+
Args:
|
|
345
|
+
action: Action to perform - "search" or "batch_search"
|
|
346
|
+
params: Parameters dictionary. For "search": {"q": "query", "num": 10, "engine": "google", ...}
|
|
347
|
+
For "batch_search": {"requests": [{"q": "query1"}, ...], "concurrency": 5}
|
|
348
|
+
|
|
349
|
+
Examples:
|
|
350
|
+
serp(action="search", params={"q": "Python programming", "num": 10})
|
|
351
|
+
serp(action="batch_search", params={"requests": [{"q": "query1"}, {"q": "query2"}], "concurrency": 5})
|
|
352
|
+
"""
|
|
353
|
+
# Normalize params with enhanced error messages
|
|
354
|
+
try:
|
|
355
|
+
p = normalize_params(params, "serp", action)
|
|
356
|
+
except ValueError as e:
|
|
357
|
+
if "JSON" in str(e):
|
|
358
|
+
return create_params_error("serp", action, params, str(e))
|
|
359
|
+
else:
|
|
360
|
+
return create_params_error("serp", action, params, str(e))
|
|
361
|
+
|
|
362
|
+
a = (action or "").strip().lower()
|
|
363
|
+
if not a:
|
|
364
|
+
return error_response(
|
|
365
|
+
tool="serp",
|
|
366
|
+
input={"action": action, "params": p},
|
|
367
|
+
error_type="validation_error",
|
|
368
|
+
code="E4001",
|
|
369
|
+
message="action is required",
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
client = await ServerContext.get_client()
|
|
373
|
+
|
|
374
|
+
if a == "search":
|
|
375
|
+
# Mirror serp.search product contract
|
|
376
|
+
q = str(p.get("q", ""))
|
|
377
|
+
if not q:
|
|
378
|
+
return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing q")
|
|
379
|
+
engine_in = str(p.get("engine", "google")).strip() or "google"
|
|
380
|
+
num = int(p.get("num", 10))
|
|
381
|
+
start = int(p.get("start", 0))
|
|
382
|
+
fmt = str(p.get("format", "json")).strip().lower()
|
|
383
|
+
# Backend contract nuance:
|
|
384
|
+
# - Some engines support "mode" via engine name (google_images/news/videos/shopping/ai_mode)
|
|
385
|
+
# - For engine=google, passing tbm often breaks on some backends. We route to a specific engine when possible.
|
|
386
|
+
tbm_raw = p.get("tbm")
|
|
387
|
+
tbm_lower = tbm_raw.strip().lower() if isinstance(tbm_raw, str) else None
|
|
388
|
+
engine = engine_in
|
|
389
|
+
if engine_in.lower() == "google" and tbm_lower in {"images", "news", "videos", "shops", "shopping"}:
|
|
390
|
+
# Map tbm-style mode to dedicated engine.
|
|
391
|
+
engine_map = {
|
|
392
|
+
"images": "google_images",
|
|
393
|
+
"news": "google_news",
|
|
394
|
+
"videos": "google_videos",
|
|
395
|
+
"shops": "google_shopping",
|
|
396
|
+
"shopping": "google_shopping",
|
|
397
|
+
}
|
|
398
|
+
engine = engine_map[tbm_lower]
|
|
399
|
+
|
|
400
|
+
# For engines that explicitly support tbm modes, keep tbm as-is but normalize common aliases
|
|
401
|
+
# (do NOT convert to isch/nws/vid/shop here; those are Google UI tbm values and may differ from backend contract).
|
|
402
|
+
if isinstance(tbm_raw, str):
|
|
403
|
+
tbm_alias = {"image": "images", "video": "videos", "shop": "shops"}
|
|
404
|
+
tbm_norm = tbm_alias.get(tbm_lower)
|
|
405
|
+
if tbm_norm:
|
|
406
|
+
p = dict(p)
|
|
407
|
+
p["tbm"] = tbm_norm
|
|
408
|
+
# Leverage SerpRequest mapping via SDK by calling full tool through request object
|
|
409
|
+
from thordata.types import SerpRequest
|
|
410
|
+
|
|
411
|
+
sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
|
|
412
|
+
extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
|
|
413
|
+
if p.get("ai_overview") is not None:
|
|
414
|
+
extra_params = dict(extra_params)
|
|
415
|
+
extra_params["ai_overview"] = p.get("ai_overview")
|
|
416
|
+
# Dashboard-style passthrough parameters (kept in extra_params)
|
|
417
|
+
for k in ("safe", "nfpr", "filter", "tbs", "ibp", "lsig", "si", "uds"):
|
|
418
|
+
if p.get(k) is not None:
|
|
419
|
+
extra_params = dict(extra_params)
|
|
420
|
+
extra_params[k] = p.get(k)
|
|
421
|
+
req = SerpRequest(
|
|
422
|
+
query=q,
|
|
423
|
+
engine=engine,
|
|
424
|
+
num=num,
|
|
425
|
+
start=start,
|
|
426
|
+
device=p.get("device"),
|
|
427
|
+
output_format=sdk_fmt,
|
|
428
|
+
render_js=p.get("render_js"),
|
|
429
|
+
no_cache=p.get("no_cache"),
|
|
430
|
+
google_domain=p.get("google_domain"),
|
|
431
|
+
country=p.get("gl"),
|
|
432
|
+
language=p.get("hl"),
|
|
433
|
+
countries_filter=p.get("cr"),
|
|
434
|
+
languages_filter=p.get("lr"),
|
|
435
|
+
location=p.get("location"),
|
|
436
|
+
uule=p.get("uule"),
|
|
437
|
+
search_type=p.get("tbm"),
|
|
438
|
+
ludocid=p.get("ludocid"),
|
|
439
|
+
kgmid=p.get("kgmid"),
|
|
440
|
+
extra_params=extra_params,
|
|
441
|
+
)
|
|
442
|
+
await safe_ctx_info(ctx, f"serp.search q={q!r} engine={engine} (input={engine_in}) num={num} start={start} format={fmt}")
|
|
443
|
+
try:
|
|
444
|
+
data = await client.serp_search_advanced(req)
|
|
445
|
+
except Exception as e:
|
|
446
|
+
msg = str(e)
|
|
447
|
+
if "Invalid tbm parameter" in msg or "invalid tbm parameter" in msg:
|
|
448
|
+
return error_response(
|
|
449
|
+
tool="serp",
|
|
450
|
+
input={"action": "search", "params": p},
|
|
451
|
+
error_type="validation_error",
|
|
452
|
+
code="E4001",
|
|
453
|
+
message="Invalid tbm (search type) parameter for SERP.",
|
|
454
|
+
details={
|
|
455
|
+
"tbm": p.get("tbm"),
|
|
456
|
+
"engine": engine,
|
|
457
|
+
"engine_input": engine_in,
|
|
458
|
+
"hint": "The upstream SERP endpoint rejected 'tbm'. Try removing tbm/search_type, or use engine-specific modes (e.g. google_images/google_news/google_videos/google_shopping).",
|
|
459
|
+
"examples": {"engine": ["google", "google_images", "google_news", "google_videos", "google_shopping"], "tbm": ["images", "news", "videos", "shops", "local", "patents"]},
|
|
460
|
+
},
|
|
461
|
+
)
|
|
462
|
+
raise
|
|
463
|
+
if fmt in {"light_json", "light"}:
|
|
464
|
+
data = _to_light_json(data)
|
|
465
|
+
|
|
466
|
+
# Add diagnostics for empty/no-result responses (common UX issue)
|
|
467
|
+
organic = None
|
|
468
|
+
if isinstance(data, dict):
|
|
469
|
+
organic = data.get("organic")
|
|
470
|
+
meta = {
|
|
471
|
+
"engine": engine,
|
|
472
|
+
"q": q,
|
|
473
|
+
"num": num,
|
|
474
|
+
"start": start,
|
|
475
|
+
"format": fmt,
|
|
476
|
+
"has_organic": isinstance(organic, list) and len(organic) > 0,
|
|
477
|
+
"organic_count": len(organic) if isinstance(organic, list) else None,
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
if isinstance(organic, list) and len(organic) == 0:
|
|
481
|
+
return ok_response(
|
|
482
|
+
tool="serp",
|
|
483
|
+
input={"action": "search", "params": p},
|
|
484
|
+
output={"_meta": meta, **data},
|
|
485
|
+
)
|
|
486
|
+
if isinstance(data, dict):
|
|
487
|
+
return ok_response(
|
|
488
|
+
tool="serp",
|
|
489
|
+
input={"action": "search", "params": p},
|
|
490
|
+
output={"_meta": meta, **data},
|
|
491
|
+
)
|
|
492
|
+
return ok_response(tool="serp", input={"action": "search", "params": p}, output={"_meta": meta, "data": data})
|
|
493
|
+
|
|
494
|
+
if a == "batch_search":
|
|
495
|
+
reqs = p.get("requests")
|
|
496
|
+
if not isinstance(reqs, list) or not reqs:
|
|
497
|
+
return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
|
|
498
|
+
concurrency = int(p.get("concurrency", 5))
|
|
499
|
+
concurrency = max(1, min(concurrency, 20))
|
|
500
|
+
fmt = str(p.get("format", "json")).strip().lower()
|
|
501
|
+
sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
|
|
502
|
+
from thordata.types import SerpRequest
|
|
503
|
+
|
|
504
|
+
sem = asyncio.Semaphore(concurrency)
|
|
505
|
+
|
|
506
|
+
async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
|
|
507
|
+
q = str(r.get("q", r.get("query", "")))
|
|
508
|
+
if not q:
|
|
509
|
+
return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing q"}}
|
|
510
|
+
try:
|
|
511
|
+
engine_in = str(r.get("engine", "google")).strip() or "google"
|
|
512
|
+
num = int(r.get("num", 10))
|
|
513
|
+
start = int(r.get("start", 0))
|
|
514
|
+
tbm_raw = r.get("tbm")
|
|
515
|
+
tbm_lower = tbm_raw.strip().lower() if isinstance(tbm_raw, str) else None
|
|
516
|
+
engine = engine_in
|
|
517
|
+
if engine_in.lower() == "google" and tbm_lower in {"images", "news", "videos", "shops", "shopping"}:
|
|
518
|
+
engine_map = {
|
|
519
|
+
"images": "google_images",
|
|
520
|
+
"news": "google_news",
|
|
521
|
+
"videos": "google_videos",
|
|
522
|
+
"shops": "google_shopping",
|
|
523
|
+
"shopping": "google_shopping",
|
|
524
|
+
}
|
|
525
|
+
engine = engine_map[tbm_lower]
|
|
526
|
+
if isinstance(tbm_raw, str):
|
|
527
|
+
tbm_alias = {"image": "images", "video": "videos", "shop": "shops"}
|
|
528
|
+
tbm_norm = tbm_alias.get(tbm_lower)
|
|
529
|
+
if tbm_norm:
|
|
530
|
+
r = dict(r)
|
|
531
|
+
r["tbm"] = tbm_norm
|
|
532
|
+
extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
|
|
533
|
+
if r.get("ai_overview") is not None:
|
|
534
|
+
extra_params = dict(extra_params)
|
|
535
|
+
extra_params["ai_overview"] = r.get("ai_overview")
|
|
536
|
+
for k in ("safe", "nfpr", "filter", "tbs", "ibp", "lsig", "si", "uds"):
|
|
537
|
+
if r.get(k) is not None:
|
|
538
|
+
extra_params = dict(extra_params)
|
|
539
|
+
extra_params[k] = r.get(k)
|
|
540
|
+
async with sem:
|
|
541
|
+
req = SerpRequest(
|
|
542
|
+
query=q,
|
|
543
|
+
engine=engine,
|
|
544
|
+
num=num,
|
|
545
|
+
start=start,
|
|
546
|
+
device=r.get("device"),
|
|
547
|
+
output_format=sdk_fmt,
|
|
548
|
+
render_js=r.get("render_js"),
|
|
549
|
+
no_cache=r.get("no_cache"),
|
|
550
|
+
google_domain=r.get("google_domain"),
|
|
551
|
+
country=r.get("gl"),
|
|
552
|
+
language=r.get("hl"),
|
|
553
|
+
countries_filter=r.get("cr"),
|
|
554
|
+
languages_filter=r.get("lr"),
|
|
555
|
+
location=r.get("location"),
|
|
556
|
+
uule=r.get("uule"),
|
|
557
|
+
search_type=r.get("tbm"),
|
|
558
|
+
ludocid=r.get("ludocid"),
|
|
559
|
+
kgmid=r.get("kgmid"),
|
|
560
|
+
extra_params=extra_params,
|
|
561
|
+
)
|
|
562
|
+
try:
|
|
563
|
+
data = await client.serp_search_advanced(req)
|
|
564
|
+
except Exception as e:
|
|
565
|
+
msg = str(e)
|
|
566
|
+
if "Invalid tbm parameter" in msg or "invalid tbm parameter" in msg:
|
|
567
|
+
return {
|
|
568
|
+
"index": i,
|
|
569
|
+
"ok": False,
|
|
570
|
+
"q": q,
|
|
571
|
+
"error": {
|
|
572
|
+
"type": "validation_error",
|
|
573
|
+
"message": "Invalid tbm (search type) parameter for SERP.",
|
|
574
|
+
"details": {"tbm": r.get("tbm")},
|
|
575
|
+
},
|
|
576
|
+
}
|
|
577
|
+
raise
|
|
578
|
+
if fmt in {"light_json", "light"}:
|
|
579
|
+
data = _to_light_json(data)
|
|
580
|
+
return {"index": i, "ok": True, "q": q, "output": data}
|
|
581
|
+
except Exception as e:
|
|
582
|
+
return {"index": i, "ok": False, "q": q, "error": str(e)}
|
|
583
|
+
|
|
584
|
+
await safe_ctx_info(ctx, f"serp.batch_search count={len(reqs)} concurrency={concurrency} format={fmt}")
|
|
585
|
+
results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)], return_exceptions=False)
|
|
586
|
+
return ok_response(tool="serp", input={"action": "batch_search", "params": p}, output={"results": results})
|
|
587
|
+
|
|
588
|
+
return error_response(
|
|
589
|
+
tool="serp",
|
|
590
|
+
input={"action": action, "params": p},
|
|
591
|
+
error_type="validation_error",
|
|
592
|
+
code="E4001",
|
|
593
|
+
message=f"Unknown action '{action}'. Supported actions: 'search', 'batch_search'",
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
if _allow("serp"):
|
|
597
|
+
mcp.tool(
|
|
598
|
+
name="serp",
|
|
599
|
+
description=(
|
|
600
|
+
"Low-level SERP scraper with full parameter control. "
|
|
601
|
+
'Action in {search, batch_search}. Example: {"q": "Python", "num": 10, "engine": "google", "format": "light_json"}. '
|
|
602
|
+
"Prefer search_engine for minimal, LLM-friendly output."
|
|
603
|
+
),
|
|
604
|
+
)(serp)
|
|
605
|
+
|
|
606
|
+
# -------------------------
|
|
607
|
+
# WEB UNLOCKER (compact)
|
|
608
|
+
# -------------------------
|
|
609
|
+
@mcp.tool(
|
|
610
|
+
name="unlocker",
|
|
611
|
+
description=(
|
|
612
|
+
"WEB UNLOCKER (Universal Scrape): action in {fetch, batch_fetch}. "
|
|
613
|
+
'Use fetch for a single URL: {"url": "https://example.com", "output_format": "markdown", "js_render": true}. '
|
|
614
|
+
'Use batch_fetch for multiple URLs: {"requests": [{"url": "..."}, ...], "concurrency": 5}.'
|
|
615
|
+
),
|
|
616
|
+
)
|
|
617
|
+
@handle_mcp_errors
|
|
618
|
+
async def unlocker(
|
|
619
|
+
action: str,
|
|
620
|
+
*,
|
|
621
|
+
params: Any = None,
|
|
622
|
+
ctx: Optional[Context] = None,
|
|
623
|
+
) -> dict[str, Any]:
|
|
624
|
+
"""WEB UNLOCKER: action in {fetch, batch_fetch}.
|
|
625
|
+
|
|
626
|
+
Args:
|
|
627
|
+
action: Action to perform - "fetch" or "batch_fetch"
|
|
628
|
+
params: Parameters dictionary. For "fetch": {"url": "https://...", "js_render": true, "output_format": "html", ...}
|
|
629
|
+
For "batch_fetch": {"requests": [{"url": "https://..."}, ...], "concurrency": 5}
|
|
630
|
+
|
|
631
|
+
Examples:
|
|
632
|
+
unlocker(action="fetch", params={"url": "https://www.google.com", "js_render": true})
|
|
633
|
+
unlocker(action="batch_fetch", params={"requests": [{"url": "https://example.com"}], "concurrency": 5})
|
|
634
|
+
"""
|
|
635
|
+
# Normalize params with enhanced error messages
|
|
636
|
+
try:
|
|
637
|
+
p = normalize_params(params, "unlocker", action)
|
|
638
|
+
except ValueError as e:
|
|
639
|
+
if "JSON" in str(e):
|
|
640
|
+
return create_params_error("unlocker", action, params, str(e))
|
|
641
|
+
else:
|
|
642
|
+
return create_params_error("unlocker", action, params, str(e))
|
|
643
|
+
|
|
644
|
+
a = (action or "").strip().lower()
|
|
645
|
+
if not a:
|
|
646
|
+
return error_response(
|
|
647
|
+
tool="unlocker",
|
|
648
|
+
input={"action": action, "params": p},
|
|
649
|
+
error_type="validation_error",
|
|
650
|
+
code="E4001",
|
|
651
|
+
message="action is required",
|
|
652
|
+
)
|
|
653
|
+
|
|
654
|
+
client = await ServerContext.get_client()
|
|
655
|
+
|
|
656
|
+
if a == "fetch":
|
|
657
|
+
url = str(p.get("url", "")).strip()
|
|
658
|
+
if not url:
|
|
659
|
+
return error_response(
|
|
660
|
+
tool="unlocker",
|
|
661
|
+
input={"action": action, "params": p},
|
|
662
|
+
error_type="validation_error",
|
|
663
|
+
code="E4001",
|
|
664
|
+
message="Missing url",
|
|
665
|
+
details={"params_example": {"url": "https://example.com", "output_format": "markdown", "js_render": True}},
|
|
666
|
+
)
|
|
667
|
+
fmt = str(p.get("output_format", "html") or "html").strip().lower()
|
|
668
|
+
js_render = bool(p.get("js_render", True))
|
|
669
|
+
wait_ms = p.get("wait_ms")
|
|
670
|
+
wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
|
|
671
|
+
country = p.get("country")
|
|
672
|
+
# Validate block_resources (allowed: script, image, video)
|
|
673
|
+
block_resources_raw = p.get("block_resources")
|
|
674
|
+
block_resources = None
|
|
675
|
+
if block_resources_raw is not None:
|
|
676
|
+
if isinstance(block_resources_raw, str):
|
|
677
|
+
items = [x.strip() for x in block_resources_raw.split(",") if x.strip()]
|
|
678
|
+
elif isinstance(block_resources_raw, list):
|
|
679
|
+
items = [str(x).strip() for x in block_resources_raw]
|
|
680
|
+
else:
|
|
681
|
+
items = []
|
|
682
|
+
allowed = {"script", "image", "video"}
|
|
683
|
+
invalid = [x for x in items if x not in allowed]
|
|
684
|
+
if invalid:
|
|
685
|
+
return error_response(
|
|
686
|
+
tool="unlocker",
|
|
687
|
+
input={"action": action, "params": p},
|
|
688
|
+
error_type="validation_error",
|
|
689
|
+
code="E4001",
|
|
690
|
+
message="Invalid block_resources values.",
|
|
691
|
+
details={
|
|
692
|
+
"allowed": ["script", "image", "video"],
|
|
693
|
+
"invalid": invalid,
|
|
694
|
+
},
|
|
695
|
+
)
|
|
696
|
+
block_resources = ",".join(items) if items else None
|
|
697
|
+
|
|
698
|
+
# Validate clean_content (allowed: js, css)
|
|
699
|
+
clean_content_raw = p.get("clean_content")
|
|
700
|
+
clean_content = None
|
|
701
|
+
if clean_content_raw is not None:
|
|
702
|
+
if isinstance(clean_content_raw, str):
|
|
703
|
+
items = [x.strip() for x in clean_content_raw.split(",") if x.strip()]
|
|
704
|
+
elif isinstance(clean_content_raw, list):
|
|
705
|
+
items = [str(x).strip() for x in clean_content_raw]
|
|
706
|
+
else:
|
|
707
|
+
items = []
|
|
708
|
+
allowed = {"js", "css"}
|
|
709
|
+
invalid = [x for x in items if x not in allowed]
|
|
710
|
+
if invalid:
|
|
711
|
+
return error_response(
|
|
712
|
+
tool="unlocker",
|
|
713
|
+
input={"action": action, "params": p},
|
|
714
|
+
error_type="validation_error",
|
|
715
|
+
code="E4001",
|
|
716
|
+
message="Invalid clean_content values.",
|
|
717
|
+
details={
|
|
718
|
+
"allowed": ["js", "css"],
|
|
719
|
+
"invalid": invalid,
|
|
720
|
+
},
|
|
721
|
+
)
|
|
722
|
+
clean_content = ",".join(items) if items else None
|
|
723
|
+
|
|
724
|
+
# Default wait_for to .content if not provided
|
|
725
|
+
wait_for = p.get("wait_for") or ".content"
|
|
726
|
+
max_chars = int(p.get("max_chars", 20_000))
|
|
727
|
+
headers = p.get("headers") # Custom headers (list[{'name','value'}] or dict)
|
|
728
|
+
cookies = p.get("cookies") # Custom cookies (list[{'name','value'}])
|
|
729
|
+
extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
|
|
730
|
+
|
|
731
|
+
# Apply validated clean_content (allowed: js, css)
|
|
732
|
+
if clean_content:
|
|
733
|
+
extra_params["clean_content"] = clean_content
|
|
734
|
+
|
|
735
|
+
# Headers: accept list[{name,value}] or dict
|
|
736
|
+
if headers is not None:
|
|
737
|
+
if isinstance(headers, list):
|
|
738
|
+
bad = [h for h in headers if not (isinstance(h, dict) and "name" in h and "value" in h)]
|
|
739
|
+
if bad:
|
|
740
|
+
return error_response(
|
|
741
|
+
tool="unlocker",
|
|
742
|
+
input={"action": action, "params": p},
|
|
743
|
+
error_type="validation_error",
|
|
744
|
+
code="E4001",
|
|
745
|
+
message="Invalid headers format.",
|
|
746
|
+
details={"expected": "list[{name,value}] or dict", "example": [{"name": "User-Agent", "value": "..."}]},
|
|
747
|
+
)
|
|
748
|
+
extra_params["headers"] = headers
|
|
749
|
+
elif isinstance(headers, dict):
|
|
750
|
+
extra_params["headers"] = [{"name": k, "value": v} for k, v in headers.items()]
|
|
751
|
+
else:
|
|
752
|
+
return error_response(
|
|
753
|
+
tool="unlocker",
|
|
754
|
+
input={"action": action, "params": p},
|
|
755
|
+
error_type="validation_error",
|
|
756
|
+
code="E4001",
|
|
757
|
+
message="Invalid headers type.",
|
|
758
|
+
details={"expected": "list or dict"},
|
|
759
|
+
)
|
|
760
|
+
|
|
761
|
+
# Cookies: accept list[{name,value}] only (panel format)
|
|
762
|
+
if cookies is not None:
|
|
763
|
+
if isinstance(cookies, list):
|
|
764
|
+
bad = [c for c in cookies if not (isinstance(c, dict) and "name" in c and "value" in c)]
|
|
765
|
+
if bad:
|
|
766
|
+
return error_response(
|
|
767
|
+
tool="unlocker",
|
|
768
|
+
input={"action": action, "params": p},
|
|
769
|
+
error_type="validation_error",
|
|
770
|
+
code="E4001",
|
|
771
|
+
message="Invalid cookies format.",
|
|
772
|
+
details={"expected": "list[{name,value}]", "example": [{"name": "__csrf_token", "value": "..."}]},
|
|
773
|
+
)
|
|
774
|
+
extra_params["cookies"] = cookies
|
|
775
|
+
elif isinstance(cookies, dict):
|
|
776
|
+
extra_params["cookies"] = [{"name": k, "value": v} for k, v in cookies.items()]
|
|
777
|
+
else:
|
|
778
|
+
return error_response(
|
|
779
|
+
tool="unlocker",
|
|
780
|
+
input={"action": action, "params": p},
|
|
781
|
+
error_type="validation_error",
|
|
782
|
+
code="E4001",
|
|
783
|
+
message="Invalid cookies type.",
|
|
784
|
+
details={"expected": "list or dict"},
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
fetch_format = "html" if fmt in {"markdown", "md"} else fmt
|
|
788
|
+
|
|
789
|
+
# If the user asked for Markdown, we still fetch HTML from Unlocker and convert locally.
|
|
790
|
+
# Default: strip JS/CSS in the same request (avoid double network calls).
|
|
791
|
+
raw_markdown = bool(p.get("raw_markdown", False)) if fmt in {"markdown", "md"} else False
|
|
792
|
+
if fmt in {"markdown", "md"} and not raw_markdown:
|
|
793
|
+
cc = extra_params.get("clean_content")
|
|
794
|
+
if isinstance(cc, str) and cc.strip():
|
|
795
|
+
parts = [x.strip() for x in cc.split(",") if x.strip()]
|
|
796
|
+
else:
|
|
797
|
+
parts = []
|
|
798
|
+
for x in ("js", "css"):
|
|
799
|
+
if x not in parts:
|
|
800
|
+
parts.append(x)
|
|
801
|
+
extra_params["clean_content"] = ",".join(parts)
|
|
802
|
+
|
|
803
|
+
await safe_ctx_info(ctx, f"unlocker.fetch url={url!r} format={fmt} js_render={js_render} raw_markdown={raw_markdown}")
|
|
804
|
+
with PerformanceTimer(tool="unlocker.fetch", url=url):
|
|
805
|
+
try:
|
|
806
|
+
data = await client.universal_scrape(
|
|
807
|
+
url=url,
|
|
808
|
+
js_render=js_render,
|
|
809
|
+
output_format=fetch_format,
|
|
810
|
+
country=country,
|
|
811
|
+
block_resources=block_resources,
|
|
812
|
+
wait=wait_seconds,
|
|
813
|
+
wait_for=wait_for,
|
|
814
|
+
**extra_params,
|
|
815
|
+
)
|
|
816
|
+
except Exception as e:
|
|
817
|
+
msg = str(e)
|
|
818
|
+
# Some upstream failures return HTML (e.g. gateway errors) which can trigger JSON decode errors in the SDK.
|
|
819
|
+
if "Attempt to decode JSON" in msg or "unexpected mimetype: text/html" in msg:
|
|
820
|
+
return error_response(
|
|
821
|
+
tool="unlocker",
|
|
822
|
+
input={"action": action, "params": p},
|
|
823
|
+
error_type="upstream_internal_error",
|
|
824
|
+
code="E2106",
|
|
825
|
+
message="Universal API returned a non-JSON error page (likely gateway/upstream failure).",
|
|
826
|
+
details={"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
|
|
827
|
+
)
|
|
828
|
+
raise
|
|
829
|
+
if fetch_format == "png":
|
|
830
|
+
import base64
|
|
831
|
+
|
|
832
|
+
if isinstance(data, (bytes, bytearray)):
|
|
833
|
+
png_base64 = base64.b64encode(data).decode("utf-8")
|
|
834
|
+
size = len(data)
|
|
835
|
+
else:
|
|
836
|
+
png_base64 = str(data)
|
|
837
|
+
size = None
|
|
838
|
+
return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"png_base64": png_base64, "size": size, "format": "png"})
|
|
839
|
+
html = str(data) if not isinstance(data, str) else data
|
|
840
|
+
if fmt in {"markdown", "md"}:
|
|
841
|
+
raw_markdown = bool(p.get("raw_markdown", False))
|
|
842
|
+
|
|
843
|
+
# Default behavior: clean Markdown by stripping common noise (style/script).
|
|
844
|
+
# IMPORTANT: do this with a single universal_scrape request by injecting clean_content into extra_params.
|
|
845
|
+
if not raw_markdown:
|
|
846
|
+
cc = extra_params.get("clean_content")
|
|
847
|
+
if isinstance(cc, str) and cc.strip():
|
|
848
|
+
parts = [x.strip() for x in cc.split(",") if x.strip()]
|
|
849
|
+
else:
|
|
850
|
+
parts = []
|
|
851
|
+
for x in ("js", "css"):
|
|
852
|
+
if x not in parts:
|
|
853
|
+
parts.append(x)
|
|
854
|
+
extra_params["clean_content"] = ",".join(parts)
|
|
855
|
+
|
|
856
|
+
md = html_to_markdown_clean(html)
|
|
857
|
+
md = truncate_content(md, max_length=max_chars)
|
|
858
|
+
return ok_response(
|
|
859
|
+
tool="unlocker",
|
|
860
|
+
input={"action": "fetch", "params": p},
|
|
861
|
+
output={"markdown": md, "_meta": {"raw_markdown": raw_markdown}},
|
|
862
|
+
)
|
|
863
|
+
return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"html": html})
|
|
864
|
+
|
|
865
|
+
if a == "batch_fetch":
|
|
866
|
+
reqs = p.get("requests")
|
|
867
|
+
if not isinstance(reqs, list) or not reqs:
|
|
868
|
+
return error_response(
|
|
869
|
+
tool="unlocker",
|
|
870
|
+
input={"action": action, "params": p},
|
|
871
|
+
error_type="validation_error",
|
|
872
|
+
code="E4001",
|
|
873
|
+
message="Missing requests[] (array of {url,...} objects)",
|
|
874
|
+
)
|
|
875
|
+
concurrency = int(p.get("concurrency", 5))
|
|
876
|
+
concurrency = max(1, min(concurrency, 20))
|
|
877
|
+
sem = asyncio.Semaphore(concurrency)
|
|
878
|
+
|
|
879
|
+
async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
|
|
880
|
+
url = str(r.get("url", ""))
|
|
881
|
+
if not url:
|
|
882
|
+
return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing url"}}
|
|
883
|
+
fmt = str(r.get("output_format", "html")).strip().lower()
|
|
884
|
+
fetch_format = "html" if fmt in {"markdown", "md"} else fmt
|
|
885
|
+
js_render = bool(r.get("js_render", True))
|
|
886
|
+
wait_ms = r.get("wait_ms")
|
|
887
|
+
wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
|
|
888
|
+
# Per-request params normalization to match unlocker.fetch
|
|
889
|
+
country = r.get("country")
|
|
890
|
+
|
|
891
|
+
# Validate block_resources (allowed: script, image, video)
|
|
892
|
+
block_resources_raw = r.get("block_resources")
|
|
893
|
+
block_resources = None
|
|
894
|
+
if block_resources_raw is not None:
|
|
895
|
+
if isinstance(block_resources_raw, str):
|
|
896
|
+
items = [x.strip() for x in block_resources_raw.split(",") if x.strip()]
|
|
897
|
+
elif isinstance(block_resources_raw, list):
|
|
898
|
+
items = [str(x).strip() for x in block_resources_raw]
|
|
899
|
+
else:
|
|
900
|
+
items = []
|
|
901
|
+
allowed = {"script", "image", "video"}
|
|
902
|
+
invalid = [x for x in items if x not in allowed]
|
|
903
|
+
if invalid:
|
|
904
|
+
return {
|
|
905
|
+
"index": i,
|
|
906
|
+
"ok": False,
|
|
907
|
+
"url": url,
|
|
908
|
+
"error": {
|
|
909
|
+
"type": "validation_error",
|
|
910
|
+
"message": "Invalid block_resources values.",
|
|
911
|
+
"details": {"allowed": ["script", "image", "video"], "invalid": invalid},
|
|
912
|
+
},
|
|
913
|
+
}
|
|
914
|
+
block_resources = ",".join(items) if items else None
|
|
915
|
+
|
|
916
|
+
# Validate clean_content (allowed: js, css)
|
|
917
|
+
clean_content_raw = r.get("clean_content")
|
|
918
|
+
clean_content = None
|
|
919
|
+
if clean_content_raw is not None:
|
|
920
|
+
if isinstance(clean_content_raw, str):
|
|
921
|
+
items = [x.strip() for x in clean_content_raw.split(",") if x.strip()]
|
|
922
|
+
elif isinstance(clean_content_raw, list):
|
|
923
|
+
items = [str(x).strip() for x in clean_content_raw]
|
|
924
|
+
else:
|
|
925
|
+
items = []
|
|
926
|
+
allowed = {"js", "css"}
|
|
927
|
+
invalid = [x for x in items if x not in allowed]
|
|
928
|
+
if invalid:
|
|
929
|
+
return {
|
|
930
|
+
"index": i,
|
|
931
|
+
"ok": False,
|
|
932
|
+
"url": url,
|
|
933
|
+
"error": {
|
|
934
|
+
"type": "validation_error",
|
|
935
|
+
"message": "Invalid clean_content values.",
|
|
936
|
+
"details": {"allowed": ["js", "css"], "invalid": invalid},
|
|
937
|
+
},
|
|
938
|
+
}
|
|
939
|
+
clean_content = ",".join(items) if items else None
|
|
940
|
+
|
|
941
|
+
# Default wait_for to .content if not provided
|
|
942
|
+
wait_for = r.get("wait_for") or ".content"
|
|
943
|
+
|
|
944
|
+
headers = r.get("headers")
|
|
945
|
+
cookies = r.get("cookies")
|
|
946
|
+
extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
|
|
947
|
+
|
|
948
|
+
# Apply validated clean_content
|
|
949
|
+
if clean_content:
|
|
950
|
+
extra_params["clean_content"] = clean_content
|
|
951
|
+
|
|
952
|
+
# Headers: accept list[{name,value}] or dict
|
|
953
|
+
if headers is not None:
|
|
954
|
+
if isinstance(headers, list):
|
|
955
|
+
bad = [h for h in headers if not (isinstance(h, dict) and "name" in h and "value" in h)]
|
|
956
|
+
if bad:
|
|
957
|
+
return {
|
|
958
|
+
"index": i,
|
|
959
|
+
"ok": False,
|
|
960
|
+
"url": url,
|
|
961
|
+
"error": {
|
|
962
|
+
"type": "validation_error",
|
|
963
|
+
"message": "Invalid headers format.",
|
|
964
|
+
"details": {"expected": "list[{name,value}] or dict", "example": [{"name": "User-Agent", "value": "..."}]},
|
|
965
|
+
},
|
|
966
|
+
}
|
|
967
|
+
extra_params["headers"] = headers
|
|
968
|
+
elif isinstance(headers, dict):
|
|
969
|
+
extra_params["headers"] = [{"name": k, "value": v} for k, v in headers.items()]
|
|
970
|
+
else:
|
|
971
|
+
return {
|
|
972
|
+
"index": i,
|
|
973
|
+
"ok": False,
|
|
974
|
+
"url": url,
|
|
975
|
+
"error": {"type": "validation_error", "message": "Invalid headers type.", "details": {"expected": "list or dict"}},
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
# Cookies: accept list[{name,value}] or dict
|
|
979
|
+
if cookies is not None:
|
|
980
|
+
if isinstance(cookies, list):
|
|
981
|
+
bad = [c for c in cookies if not (isinstance(c, dict) and "name" in c and "value" in c)]
|
|
982
|
+
if bad:
|
|
983
|
+
return {
|
|
984
|
+
"index": i,
|
|
985
|
+
"ok": False,
|
|
986
|
+
"url": url,
|
|
987
|
+
"error": {
|
|
988
|
+
"type": "validation_error",
|
|
989
|
+
"message": "Invalid cookies format.",
|
|
990
|
+
"details": {"expected": "list[{name,value}]", "example": [{"name": "__csrf_token", "value": "..."}]},
|
|
991
|
+
},
|
|
992
|
+
}
|
|
993
|
+
extra_params["cookies"] = cookies
|
|
994
|
+
elif isinstance(cookies, dict):
|
|
995
|
+
extra_params["cookies"] = [{"name": k, "value": v} for k, v in cookies.items()]
|
|
996
|
+
else:
|
|
997
|
+
return {
|
|
998
|
+
"index": i,
|
|
999
|
+
"ok": False,
|
|
1000
|
+
"url": url,
|
|
1001
|
+
"error": {"type": "validation_error", "message": "Invalid cookies type.", "details": {"expected": "list or dict"}},
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
# If the user asked for Markdown, we still fetch HTML from Unlocker and convert locally.
|
|
1005
|
+
raw_markdown = bool(r.get("raw_markdown", False)) if fmt in {"markdown", "md"} else False
|
|
1006
|
+
if fmt in {"markdown", "md"} and not raw_markdown:
|
|
1007
|
+
cc = extra_params.get("clean_content")
|
|
1008
|
+
if isinstance(cc, str) and cc.strip():
|
|
1009
|
+
parts = [x.strip() for x in cc.split(",") if x.strip()]
|
|
1010
|
+
else:
|
|
1011
|
+
parts = []
|
|
1012
|
+
for x in ("js", "css"):
|
|
1013
|
+
if x not in parts:
|
|
1014
|
+
parts.append(x)
|
|
1015
|
+
extra_params["clean_content"] = ",".join(parts)
|
|
1016
|
+
async with sem:
|
|
1017
|
+
with PerformanceTimer(tool="unlocker.batch_fetch", url=url):
|
|
1018
|
+
try:
|
|
1019
|
+
data = await client.universal_scrape(
|
|
1020
|
+
url=url,
|
|
1021
|
+
js_render=js_render,
|
|
1022
|
+
output_format=fetch_format,
|
|
1023
|
+
country=country,
|
|
1024
|
+
block_resources=block_resources,
|
|
1025
|
+
wait=wait_seconds,
|
|
1026
|
+
wait_for=wait_for,
|
|
1027
|
+
**extra_params,
|
|
1028
|
+
)
|
|
1029
|
+
except Exception as e:
|
|
1030
|
+
msg = str(e)
|
|
1031
|
+
if "Attempt to decode JSON" in msg or "unexpected mimetype: text/html" in msg:
|
|
1032
|
+
return {
|
|
1033
|
+
"index": i,
|
|
1034
|
+
"ok": False,
|
|
1035
|
+
"url": url,
|
|
1036
|
+
"error": {
|
|
1037
|
+
"type": "upstream_internal_error",
|
|
1038
|
+
"code": "E2106",
|
|
1039
|
+
"message": "Universal API returned a non-JSON error page (likely gateway/upstream failure).",
|
|
1040
|
+
"details": {"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
|
|
1041
|
+
},
|
|
1042
|
+
}
|
|
1043
|
+
# Ensure batch_fetch never fails the whole batch on a single upstream error.
|
|
1044
|
+
return {
|
|
1045
|
+
"index": i,
|
|
1046
|
+
"ok": False,
|
|
1047
|
+
"url": url,
|
|
1048
|
+
"error": {
|
|
1049
|
+
"type": "upstream_internal_error",
|
|
1050
|
+
"code": "E2106",
|
|
1051
|
+
"message": "Universal API request failed.",
|
|
1052
|
+
"details": {"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
|
|
1053
|
+
},
|
|
1054
|
+
}
|
|
1055
|
+
if fetch_format == "png":
|
|
1056
|
+
import base64
|
|
1057
|
+
|
|
1058
|
+
if isinstance(data, (bytes, bytearray)):
|
|
1059
|
+
png_base64 = base64.b64encode(data).decode("utf-8")
|
|
1060
|
+
size = len(data)
|
|
1061
|
+
else:
|
|
1062
|
+
png_base64 = str(data)
|
|
1063
|
+
size = None
|
|
1064
|
+
return {"index": i, "ok": True, "url": url, "output": {"png_base64": png_base64, "size": size, "format": "png"}}
|
|
1065
|
+
html = str(data) if not isinstance(data, str) else data
|
|
1066
|
+
if fmt in {"markdown", "md"}:
|
|
1067
|
+
md = html_to_markdown_clean(html)
|
|
1068
|
+
md = truncate_content(md, max_length=int(r.get("max_chars", 20_000)))
|
|
1069
|
+
return {"index": i, "ok": True, "url": url, "output": {"markdown": md}}
|
|
1070
|
+
return {"index": i, "ok": True, "url": url, "output": {"html": html}}
|
|
1071
|
+
|
|
1072
|
+
await safe_ctx_info(ctx, f"unlocker.batch_fetch count={len(reqs)} concurrency={concurrency}")
|
|
1073
|
+
results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
|
|
1074
|
+
return ok_response(tool="unlocker", input={"action": "batch_fetch", "params": p}, output={"results": results})
|
|
1075
|
+
|
|
1076
|
+
return error_response(
|
|
1077
|
+
tool="unlocker",
|
|
1078
|
+
input={"action": action, "params": p},
|
|
1079
|
+
error_type="validation_error",
|
|
1080
|
+
code="E4001",
|
|
1081
|
+
message=f"Unknown action '{action}'. Supported actions: 'fetch', 'batch_fetch'",
|
|
1082
|
+
)
|
|
1083
|
+
|
|
1084
|
+
# -------------------------
|
|
1085
|
+
# WEB SCRAPER (compact)
|
|
1086
|
+
# -------------------------
|
|
1087
|
+
async def web_scraper(
|
|
1088
|
+
action: str,
|
|
1089
|
+
*,
|
|
1090
|
+
params: Any = None,
|
|
1091
|
+
ctx: Optional[Context] = None,
|
|
1092
|
+
) -> dict[str, Any]:
|
|
1093
|
+
"""WEB SCRAPER: action covers catalog/groups/run/batch_run/status/wait/result/list_tasks and batch helpers.
|
|
1094
|
+
|
|
1095
|
+
Args:
|
|
1096
|
+
action: Action to perform - "catalog", "groups", "run", "batch_run", "status", "wait", "result", "list_tasks", etc.
|
|
1097
|
+
params: Parameters dictionary. Varies by action:
|
|
1098
|
+
- "catalog": {"group": "...", "keyword": "...", "limit": 100, "offset": 0}
|
|
1099
|
+
- "run": {"tool": "tool_key", "params": {...}, "wait": true, "file_type": "json"}
|
|
1100
|
+
- "status": {"task_id": "..."}
|
|
1101
|
+
- etc.
|
|
1102
|
+
|
|
1103
|
+
Examples:
|
|
1104
|
+
web_scraper(action="catalog", params={"limit": 20})
|
|
1105
|
+
web_scraper(action="run", params={"tool": "thordata.tools.ecommerce.Amazon.ProductByUrl", "params": {"url": "https://amazon.com/..."}})
|
|
1106
|
+
"""
|
|
1107
|
+
# Normalize params with enhanced error messages
|
|
1108
|
+
try:
|
|
1109
|
+
p = normalize_params(params, "web_scraper", action)
|
|
1110
|
+
except ValueError as e:
|
|
1111
|
+
if "JSON" in str(e):
|
|
1112
|
+
return create_params_error("web_scraper", action, params, str(e))
|
|
1113
|
+
else:
|
|
1114
|
+
return create_params_error("web_scraper", action, params, str(e))
|
|
1115
|
+
|
|
1116
|
+
a = (action or "").strip().lower()
|
|
1117
|
+
if not a:
|
|
1118
|
+
return error_response(
|
|
1119
|
+
tool="web_scraper",
|
|
1120
|
+
input={"action": action, "params": p},
|
|
1121
|
+
error_type="validation_error",
|
|
1122
|
+
code="E4001",
|
|
1123
|
+
message="action is required",
|
|
1124
|
+
)
|
|
1125
|
+
|
|
1126
|
+
client = await ServerContext.get_client()
|
|
1127
|
+
|
|
1128
|
+
if a == "groups":
|
|
1129
|
+
# Derive the group list from the catalog helper instead of a dedicated endpoint.
|
|
1130
|
+
# _catalog returns (page, meta); meta already carries "groups" and "total".
|
|
1131
|
+
page, meta = _catalog(group=None, keyword=None, limit=1, offset=0)
|
|
1132
|
+
return ok_response(tool="web_scraper", input={"action": "groups", "params": p}, output={"groups": meta.get("groups"), "total": meta.get("total")})
|
|
1133
|
+
|
|
1134
|
+
if a in {"spiders", "spider_ids", "ids"}:
|
|
1135
|
+
# Convenience: return the full list of spider_id mappings without huge field schemas.
|
|
1136
|
+
limit = max(1, min(int(p.get("limit", 500)), 2000))
|
|
1137
|
+
offset = max(0, int(p.get("offset", 0)))
|
|
1138
|
+
page, meta = _catalog(group=p.get("group"), keyword=p.get("keyword"), limit=limit, offset=offset)
|
|
1139
|
+
items = []
|
|
1140
|
+
for t in page:
|
|
1141
|
+
s = tool_schema(t)
|
|
1142
|
+
items.append(
|
|
1143
|
+
{
|
|
1144
|
+
"tool_key": s.get("tool_key"),
|
|
1145
|
+
"spider_id": s.get("spider_id"),
|
|
1146
|
+
"spider_name": s.get("spider_name"),
|
|
1147
|
+
"group": s.get("group"),
|
|
1148
|
+
}
|
|
1149
|
+
)
|
|
1150
|
+
return ok_response(tool="web_scraper", input={"action": a, "params": p}, output={"items": items, "meta": meta})
|
|
1151
|
+
|
|
1152
|
+
if a == "catalog":
|
|
1153
|
+
# Tool discovery is configurable to reduce LLM tool selection noise.
|
|
1154
|
+
# - mode=curated: only allow groups from THORDATA_TASKS_GROUPS
|
|
1155
|
+
# - mode=all: list everything
|
|
1156
|
+
cfg = get_settings()
|
|
1157
|
+
mode = str(getattr(cfg, "THORDATA_TASKS_LIST_MODE", "curated") or "curated").strip().lower()
|
|
1158
|
+
groups_allow = [g.strip().lower() for g in (getattr(cfg, "THORDATA_TASKS_GROUPS", "") or "").split(",") if g.strip()]
|
|
1159
|
+
|
|
1160
|
+
# Respect explicit group filter provided by user
|
|
1161
|
+
group_in = p.get("group")
|
|
1162
|
+
group = str(group_in).strip() if group_in is not None else None
|
|
1163
|
+
group = group or None
|
|
1164
|
+
|
|
1165
|
+
# If curated, and no group provided, default to first allowed group to keep list small.
|
|
1166
|
+
# Users can still browse other groups by passing params.group.
|
|
1167
|
+
if mode == "curated" and not group and groups_allow:
|
|
1168
|
+
group = groups_allow[0]
|
|
1169
|
+
|
|
1170
|
+
# If curated + group provided but not allowed, return helpful error
|
|
1171
|
+
if mode == "curated" and group and groups_allow and group.lower() not in groups_allow:
|
|
1172
|
+
return error_response(
|
|
1173
|
+
tool="web_scraper",
|
|
1174
|
+
input={"action": "catalog", "params": p},
|
|
1175
|
+
error_type="not_allowed",
|
|
1176
|
+
code="E4010",
|
|
1177
|
+
message="Group not allowed in curated mode.",
|
|
1178
|
+
details={
|
|
1179
|
+
"mode": mode,
|
|
1180
|
+
"allowed_groups": groups_allow,
|
|
1181
|
+
"requested_group": group,
|
|
1182
|
+
"tip": "Set THORDATA_TASKS_LIST_MODE=all to browse all groups, or update THORDATA_TASKS_GROUPS.",
|
|
1183
|
+
},
|
|
1184
|
+
)
|
|
1185
|
+
|
|
1186
|
+
limit_default = int(getattr(cfg, "THORDATA_TASKS_LIST_DEFAULT_LIMIT", 60) or 60)
|
|
1187
|
+
limit = max(1, min(int(p.get("limit", limit_default)), 500))
|
|
1188
|
+
offset = max(0, int(p.get("offset", 0)))
|
|
1189
|
+
page, meta = _catalog(group=group, keyword=p.get("keyword"), limit=limit, offset=offset)
|
|
1190
|
+
|
|
1191
|
+
meta = dict(meta)
|
|
1192
|
+
meta.update(
|
|
1193
|
+
{
|
|
1194
|
+
"mode": mode,
|
|
1195
|
+
"allowed_groups": groups_allow,
|
|
1196
|
+
"effective_group": group,
|
|
1197
|
+
"how_to_show_all": "Set THORDATA_TASKS_LIST_MODE=all",
|
|
1198
|
+
}
|
|
1199
|
+
)
|
|
1200
|
+
|
|
1201
|
+
return ok_response(
|
|
1202
|
+
tool="web_scraper",
|
|
1203
|
+
input={"action": "catalog", "params": {**p, "group": group} if group else p},
|
|
1204
|
+
output={"tools": [tool_schema(t) for t in page], "meta": meta},
|
|
1205
|
+
)
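Catalog discovery is deliberately narrow by default: in curated mode the effective group falls back to the first entry of the allow-list when the caller gives none, and a disallowed group is rejected with a hint rather than silently widened. The decision boils down to something like this (a sketch, not the module's actual helper):

```python
from typing import List, Optional


def resolve_group(mode: str, requested: Optional[str], allowed: List[str]) -> Optional[str]:
    """Pick the group to list, mirroring the curated/all behaviour described above."""
    group = (str(requested).strip() if requested is not None else "") or None
    if mode == "curated" and not group and allowed:
        return allowed[0]  # keep the default listing small
    if mode == "curated" and group and allowed and group.lower() not in allowed:
        raise ValueError(f"group {group!r} is not in the allowed groups {allowed}")
    return group


print(resolve_group("curated", None, ["ecommerce", "social"]))  # -> 'ecommerce'
print(resolve_group("all", None, ["ecommerce", "social"]))      # -> None (list everything)
```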
|
|
1206
|
+
|
|
1207
|
+
if a in {"example", "template"}:
|
|
1208
|
+
tool = str(p.get("tool", "")) or str(p.get("tool_key", ""))
|
|
1209
|
+
if not tool:
|
|
1210
|
+
return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="Missing tool (tool_key)")
|
|
1211
|
+
# Ensure tool exists and produce its schema + minimal params template.
|
|
1212
|
+
from .product import _ensure_tools as _ensure # local import to avoid cycles
|
|
1213
|
+
_, tools_map = _ensure()
|
|
1214
|
+
t = tools_map.get(tool)
|
|
1215
|
+
if not t:
|
|
1216
|
+
return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="invalid_tool", code="E4003", message="Unknown tool key. Use web_scraper.catalog to discover valid keys.")
|
|
1217
|
+
schema = tool_schema(t)
|
|
1218
|
+
params_template = _build_params_template(schema)
|
|
1219
|
+
spider_id = schema.get("spider_id")
|
|
1220
|
+
spider_name = schema.get("spider_name")
|
|
1221
|
+
|
|
1222
|
+
# LLM-oriented notes: explain the two main calling styles.
|
|
1223
|
+
notes: list[str] = [
|
|
1224
|
+
"Step 1: Use web_scraper.catalog to discover tools (filter by keyword/group).",
|
|
1225
|
+
"Step 2: Use web_scraper.example to get this params_template, then fill placeholders like <field> with real values.",
|
|
1226
|
+
"Step 3: Call web_scraper.run with {'tool': tool_key, 'params': {...}, 'wait': true} for a single task, or web_scraper.batch_run for many.",
|
|
1227
|
+
]
|
|
1228
|
+
if spider_id and spider_name:
|
|
1229
|
+
# Many dashboard examples in documentation use builder/video_builder + spider_id.
|
|
1230
|
+
notes.append(
|
|
1231
|
+
"Alternative: For full Dashboard parity, you can call web_scraper.raw_run with "
|
|
1232
|
+
"{'builder': 'builder' or 'video_builder', 'spider_name': spider_name, "
|
|
1233
|
+
"'spider_id': spider_id, 'spider_parameters': [...]}. Use this to mirror curl examples "
|
|
1234
|
+
"from the official web scraper tasks documentation."
|
|
1235
|
+
)
|
|
1236
|
+
|
|
1237
|
+
return ok_response(
|
|
1238
|
+
tool="web_scraper",
|
|
1239
|
+
input={"action": a, "params": {"tool": tool}},
|
|
1240
|
+
output={
|
|
1241
|
+
"tool": tool,
|
|
1242
|
+
"spider_id": spider_id,
|
|
1243
|
+
"spider_name": spider_name,
|
|
1244
|
+
"group": schema.get("group"),
|
|
1245
|
+
"params_template": params_template,
|
|
1246
|
+
"notes": notes,
|
|
1247
|
+
},
|
|
1248
|
+
)
|
|
1249
|
+
|
|
1250
|
+
if a in {"raw_run", "raw_batch_run"}:
|
|
1251
|
+
# Ultimate fallback for 100% Dashboard parity: run by spider_id/spider_name directly,
|
|
1252
|
+
# even if SDK doesn't provide a ToolRequest class for it.
|
|
1253
|
+
client = await ServerContext.get_client()
|
|
1254
|
+
|
|
1255
|
+
async def _one(raw: dict[str, Any]) -> dict[str, Any]:
|
|
1256
|
+
spider_name = str(raw.get("spider_name", "") or raw.get("name", ""))
|
|
1257
|
+
spider_id = str(raw.get("spider_id", "") or raw.get("id", ""))
|
|
1258
|
+
if not spider_name or not spider_id:
|
|
1259
|
+
return {"ok": False, "error": {"type": "validation_error", "message": "Missing spider_name or spider_id"}}
|
|
1260
|
+
|
|
1261
|
+
builder = str(raw.get("builder", "builder")).strip().lower()
|
|
1262
|
+
wait = bool(raw.get("wait", True))
|
|
1263
|
+
max_wait_seconds = int(raw.get("max_wait_seconds", 300))
|
|
1264
|
+
file_type = str(raw.get("file_type", "json"))
|
|
1265
|
+
include_errors = bool(raw.get("include_errors", True))
|
|
1266
|
+
file_name = raw.get("file_name")
|
|
1267
|
+
|
|
1268
|
+
# spider_parameters can be dict/list or JSON string
|
|
1269
|
+
sp = raw.get("spider_parameters", raw.get("parameters"))
|
|
1270
|
+
if isinstance(sp, str):
|
|
1271
|
+
try:
|
|
1272
|
+
sp = json.loads(sp) if sp else {}
|
|
1273
|
+
except Exception:
|
|
1274
|
+
sp = {"raw": sp}
|
|
1275
|
+
if isinstance(sp, dict):
|
|
1276
|
+
sp_list: list[dict[str, Any]] = [sp]
|
|
1277
|
+
elif isinstance(sp, list):
|
|
1278
|
+
sp_list = [x for x in sp if isinstance(x, dict)]
|
|
1279
|
+
if not sp_list:
|
|
1280
|
+
sp_list = [{}]
|
|
1281
|
+
else:
|
|
1282
|
+
sp_list = [{}]
|
|
1283
|
+
|
|
1284
|
+
# spider_universal: for builder universal params or video common_settings
|
|
1285
|
+
su = raw.get("spider_universal") or raw.get("universal_params") or raw.get("common_settings")
|
|
1286
|
+
if isinstance(su, str):
|
|
1287
|
+
try:
|
|
1288
|
+
su = json.loads(su) if su else None
|
|
1289
|
+
except Exception:
|
|
1290
|
+
su = None
|
|
1291
|
+
su_dict = su if isinstance(su, dict) else None
|
|
1292
|
+
|
|
1293
|
+
# Lazy import types from SDK
|
|
1294
|
+
from thordata.types.task import ScraperTaskConfig, VideoTaskConfig
|
|
1295
|
+
from thordata.types.common import CommonSettings
|
|
1296
|
+
|
|
1297
|
+
# Generate file_name if missing (mirror SDK behavior)
|
|
1298
|
+
if not file_name:
|
|
1299
|
+
import uuid
|
|
1300
|
+
short_id = uuid.uuid4().hex[:8]
|
|
1301
|
+
file_name = f"{spider_id}_{short_id}"
|
|
1302
|
+
|
|
1303
|
+
await safe_ctx_info(ctx, f"web_scraper.{a} spider_id={spider_id} builder={builder} wait={wait}")
|
|
1304
|
+
|
|
1305
|
+
# Create task via correct builder endpoint
|
|
1306
|
+
if builder in {"video_builder", "video"}:
|
|
1307
|
+
# Defensive filtering: CommonSettings in the SDK may not include every
|
|
1308
|
+
# key shown in external documentation (e.g. some newer fields like
|
|
1309
|
+
# "kilohertz" / "bitrate" may not yet exist in this SDK version).
|
|
1310
|
+
# Passing unknown keys would raise "unexpected keyword argument" errors,
|
|
1311
|
+
# so we restrict to the dataclass' declared fields.
|
|
1312
|
+
cs_input: dict[str, Any] = {}
|
|
1313
|
+
if su_dict:
|
|
1314
|
+
allowed_keys = getattr(CommonSettings, "__dataclass_fields__", {}).keys()
|
|
1315
|
+
cs_input = {k: v for k, v in su_dict.items() if k in allowed_keys}
|
|
1316
|
+
cs = CommonSettings(**cs_input)
|
|
1317
|
+
config = VideoTaskConfig(
|
|
1318
|
+
file_name=str(file_name),
|
|
1319
|
+
spider_id=spider_id,
|
|
1320
|
+
spider_name=spider_name,
|
|
1321
|
+
parameters=sp_list if len(sp_list) > 1 else sp_list[0],
|
|
1322
|
+
common_settings=cs,
|
|
1323
|
+
include_errors=include_errors,
|
|
1324
|
+
)
|
|
1325
|
+
task_id = await client.create_video_task_advanced(config)
|
|
1326
|
+
else:
|
|
1327
|
+
config = ScraperTaskConfig(
|
|
1328
|
+
file_name=str(file_name),
|
|
1329
|
+
spider_id=spider_id,
|
|
1330
|
+
spider_name=spider_name,
|
|
1331
|
+
parameters=sp_list if len(sp_list) > 1 else sp_list[0],
|
|
1332
|
+
universal_params=su_dict,
|
|
1333
|
+
include_errors=include_errors,
|
|
1334
|
+
)
|
|
1335
|
+
task_id = await client.create_scraper_task_advanced(config)
|
|
1336
|
+
|
|
1337
|
+
result: dict[str, Any] = {"task_id": task_id, "spider_id": spider_id, "spider_name": spider_name}
|
|
1338
|
+
if wait:
|
|
1339
|
+
status = await client.wait_for_task(task_id, max_wait=max_wait_seconds)
|
|
1340
|
+
status_s = str(status)
|
|
1341
|
+
result["status"] = status_s
|
|
1342
|
+
if status_s.strip().lower() in {"ready", "success", "finished", "succeeded", "task succeeded", "task_succeeded"}:
|
|
1343
|
+
dl = await client.get_task_result(task_id, file_type=file_type)
|
|
1344
|
+
from thordata_mcp.utils import enrich_download_url
|
|
1345
|
+
result["download_url"] = enrich_download_url(dl, task_id=task_id, file_type=file_type)
|
|
1346
|
+
return {"ok": True, "output": result}
|
|
1347
|
+
|
|
1348
|
+
if a == "raw_run":
|
|
1349
|
+
out = await _one(p)
|
|
1350
|
+
if out.get("ok") is True:
|
|
1351
|
+
return ok_response(tool="web_scraper", input={"action": a, "params": p}, output=out["output"])
|
|
1352
|
+
return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="raw_run failed", details=out.get("error"))
|
|
1353
|
+
|
|
1354
|
+
reqs = p.get("requests")
|
|
1355
|
+
if not isinstance(reqs, list) or not reqs:
|
|
1356
|
+
return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
|
|
1357
|
+
concurrency = max(1, min(int(p.get("concurrency", 5)), 20))
|
|
1358
|
+
sem = asyncio.Semaphore(concurrency)
|
|
1359
|
+
|
|
1360
|
+
async def _wrap(i: int, r: Any) -> dict[str, Any]:
|
|
1361
|
+
raw = r if isinstance(r, dict) else {}
|
|
1362
|
+
async with sem:
|
|
1363
|
+
one = await _one(raw)
|
|
1364
|
+
return {"index": i, **one}
|
|
1365
|
+
|
|
1366
|
+
results = await asyncio.gather(*[_wrap(i, r) for i, r in enumerate(reqs)], return_exceptions=False)
|
|
1367
|
+
return ok_response(tool="web_scraper", input={"action": a, "params": {"count": len(reqs), "concurrency": concurrency}}, output={"results": results})
|
|
1368
|
+
|
|
1369
|
+
if a == "run":
|
|
1370
|
+
tool = str(p.get("tool", ""))
|
|
1371
|
+
if not tool:
|
|
1372
|
+
return error_response(
|
|
1373
|
+
tool="web_scraper",
|
|
1374
|
+
input={"action": action, "params": p},
|
|
1375
|
+
error_type="validation_error",
|
|
1376
|
+
code="E4001",
|
|
1377
|
+
message="Missing tool",
|
|
1378
|
+
details={
|
|
1379
|
+
"missing_fields": ["tool"],
|
|
1380
|
+
"next_step": "Call web_scraper(action='catalog', params={'keyword': '...'}) to discover tool_key",
|
|
1381
|
+
},
|
|
1382
|
+
)
|
|
1383
|
+
params_dict = p.get("params") if isinstance(p.get("params"), dict) else None
|
|
1384
|
+
param_json = p.get("param_json")
|
|
1385
|
+
if params_dict is None:
|
|
1386
|
+
if isinstance(param_json, str) and param_json:
|
|
1387
|
+
try:
|
|
1388
|
+
params_dict = json.loads(param_json)
|
|
1389
|
+
except json.JSONDecodeError as e:
|
|
1390
|
+
return error_response(
|
|
1391
|
+
tool="web_scraper",
|
|
1392
|
+
input={"action": action, "params": p},
|
|
1393
|
+
error_type="json_error",
|
|
1394
|
+
code="E4002",
|
|
1395
|
+
message=str(e),
|
|
1396
|
+
)
|
|
1397
|
+
else:
|
|
1398
|
+
params_dict = {}
|
|
1399
|
+
wait = bool(p.get("wait", True))
|
|
1400
|
+
|
|
1401
|
+
# Validate required fields based on tool schema
|
|
1402
|
+
from .product import _ensure_tools as _ensure
|
|
1403
|
+
_, tools_map = _ensure()
|
|
1404
|
+
t = tools_map.get(tool)
|
|
1405
|
+
if not t:
|
|
1406
|
+
return error_response(
|
|
1407
|
+
tool="web_scraper",
|
|
1408
|
+
input={"action": action, "params": p},
|
|
1409
|
+
error_type="invalid_tool",
|
|
1410
|
+
code="E4003",
|
|
1411
|
+
message="Unknown tool key. Use web_scraper.catalog to discover valid keys.",
|
|
1412
|
+
)
|
|
1413
|
+
schema = tool_schema(t)
|
|
1414
|
+
fields = schema.get("fields", {})
|
|
1415
|
+
missing_fields = []
|
|
1416
|
+
params_template = {}
|
|
1417
|
+
for key, meta in fields.items():
|
|
1418
|
+
required = bool(meta.get("required"))
|
|
1419
|
+
if required and (params_dict is None or key not in params_dict or params_dict.get(key) in (None, "", [])):
|
|
1420
|
+
missing_fields.append(key)
|
|
1421
|
+
# Build minimal template for missing fields
|
|
1422
|
+
if required and key not in (params_dict or {}):
|
|
1423
|
+
default = meta.get("default")
|
|
1424
|
+
typ = str(meta.get("type", "")).lower()
|
|
1425
|
+
if "dict" in typ:
|
|
1426
|
+
params_template[key] = {}
|
|
1427
|
+
elif "list" in typ:
|
|
1428
|
+
params_template[key] = []
|
|
1429
|
+
elif default is not None:
|
|
1430
|
+
params_template[key] = default
|
|
1431
|
+
else:
|
|
1432
|
+
params_template[key] = f"<{key}>"
|
|
1433
|
+
|
|
1434
|
+
if missing_fields:
|
|
1435
|
+
return error_response(
|
|
1436
|
+
tool="web_scraper",
|
|
1437
|
+
input={"action": action, "params": p},
|
|
1438
|
+
error_type="validation_error",
|
|
1439
|
+
code="E4001",
|
|
1440
|
+
message="Missing required fields for tool params",
|
|
1441
|
+
details={
|
|
1442
|
+
"tool": tool,
|
|
1443
|
+
"missing_fields": missing_fields,
|
|
1444
|
+
"params_template": params_template,
|
|
1445
|
+
"tip": f"Run web_scraper(action='example', params={{'tool': '{tool}'}}) to see full template",
|
|
1446
|
+
},
|
|
1447
|
+
)
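The required-field check above does double duty: it collects the missing keys and, for each one, synthesizes a placeholder whose shape follows the field's declared type, so the error can show the caller exactly what to send next. A stand-alone sketch of that logic, assuming a fields mapping shaped like the one tool_schema produces:

```python
from typing import Any


def missing_and_template(fields: dict[str, dict[str, Any]], provided: dict[str, Any]) -> tuple[list[str], dict[str, Any]]:
    missing: list[str] = []
    template: dict[str, Any] = {}
    for key, meta in fields.items():
        if not meta.get("required"):
            continue
        if key not in provided or provided.get(key) in (None, "", []):
            missing.append(key)
        if key not in provided:
            typ = str(meta.get("type", "")).lower()
            if "dict" in typ:
                template[key] = {}
            elif "list" in typ:
                template[key] = []
            elif meta.get("default") is not None:
                template[key] = meta["default"]
            else:
                template[key] = f"<{key}>"
    return missing, template


# Hypothetical schema fragment for illustration only.
fields = {"url": {"required": True, "type": "str"}, "pages": {"required": True, "type": "list"}}
print(missing_and_template(fields, {"url": ""}))  # -> (['url', 'pages'], {'pages': []})
```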
|
|
1449
|
+
|
|
1450
|
+
# Execution-layer allowlist (optional safety)
|
|
1451
|
+
allowlist = getattr(settings, "THORDATA_TASKS_ALLOWLIST", "")
|
|
1452
|
+
if allowlist and allowlist.strip():
|
|
1453
|
+
allowed_prefixes = [prefix.strip().lower() for prefix in allowlist.split(",") if prefix.strip()]
|
|
1454
|
+
allowed_exact = [exact.strip() for exact in allowlist.split(",") if exact.strip()]
|
|
1455
|
+
tool_lower = tool.lower()
|
|
1456
|
+
if not any(tool_lower.startswith(p) for p in allowed_prefixes) and tool_lower not in allowed_exact:
|
|
1457
|
+
return error_response(
|
|
1458
|
+
tool="web_scraper",
|
|
1459
|
+
input={"action": action, "params": p},
|
|
1460
|
+
error_type="not_allowed",
|
|
1461
|
+
code="E4011",
|
|
1462
|
+
message="Tool not allowed by allowlist.",
|
|
1463
|
+
details={
|
|
1464
|
+
"tool": tool,
|
|
1465
|
+
"allowlist": allowlist,
|
|
1466
|
+
"tip": "Update THORDATA_TASKS_ALLOWLIST or set THORDATA_TASKS_LIST_MODE=all to bypass.",
|
|
1467
|
+
},
|
|
1468
|
+
)
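The allowlist gate reads THORDATA_TASKS_ALLOWLIST as a comma-separated list and lets an entry act both as a case-insensitive prefix and as an exact key, so one entry can open up a whole tool family or a single tool. Roughly (the second tool key below is made up for the demo):

```python
def is_allowed(tool: str, allowlist: str) -> bool:
    entries = [e.strip() for e in allowlist.split(",") if e.strip()]
    if not entries:
        return True  # an empty allowlist means no restriction
    tool_lower = tool.lower()
    return any(tool_lower.startswith(e.lower()) for e in entries) or tool in entries


print(is_allowed("thordata.tools.ecommerce.Amazon.ProductByUrl", "thordata.tools.ecommerce"))  # True
print(is_allowed("thordata.tools.social.SomeSpider", "thordata.tools.ecommerce"))              # False
```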
|
|
1470
|
+
max_wait_seconds = int(p.get("max_wait_seconds", 300))
|
|
1471
|
+
file_type = str(p.get("file_type", "json"))
|
|
1472
|
+
return await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
|
|
1473
|
+
|
|
1474
|
+
if a == "batch_run":
|
|
1475
|
+
reqs = p.get("requests")
|
|
1476
|
+
if not isinstance(reqs, list) or not reqs:
|
|
1477
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
|
|
1478
|
+
concurrency = max(1, min(int(p.get("concurrency", 5)), 20))
|
|
1479
|
+
wait = bool(p.get("wait", True))
|
|
1480
|
+
max_wait_seconds = int(p.get("max_wait_seconds", 300))
|
|
1481
|
+
file_type = str(p.get("file_type", "json"))
|
|
1482
|
+
sem = asyncio.Semaphore(concurrency)
|
|
1483
|
+
|
|
1484
|
+
async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
|
|
1485
|
+
tool = str(r.get("tool", ""))
|
|
1486
|
+
if not tool:
|
|
1487
|
+
return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing tool"}}
|
|
1488
|
+
params_dict = r.get("params") if isinstance(r.get("params"), dict) else {}
|
|
1489
|
+
async with sem:
|
|
1490
|
+
out = await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
|
|
1491
|
+
# compact per-item
|
|
1492
|
+
if out.get("ok") is True and isinstance(out.get("output"), dict):
|
|
1493
|
+
o = out["output"]
|
|
1494
|
+
out["output"] = {k: o.get(k) for k in ("task_id", "spider_id", "spider_name", "status", "download_url") if k in o}
|
|
1495
|
+
return {"index": i, **out}
|
|
1496
|
+
|
|
1497
|
+
await safe_ctx_info(ctx, f"web_scraper.batch_run count={len(reqs)} concurrency={concurrency}")
|
|
1498
|
+
results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
|
|
1499
|
+
return ok_response(tool="web_scraper", input={"action": "batch_run", "params": p}, output={"results": results})
|
|
1500
|
+
|
|
1501
|
+
if a == "list_tasks":
|
|
1502
|
+
page = max(1, int(p.get("page", 1)))
|
|
1503
|
+
size = max(1, min(int(p.get("size", 20)), 200))
|
|
1504
|
+
data = await client.list_tasks(page=page, size=size)
|
|
1505
|
+
return ok_response(tool="web_scraper", input={"action": "list_tasks", "params": p}, output=data)
|
|
1506
|
+
|
|
1507
|
+
if a == "status":
|
|
1508
|
+
tid = str(p.get("task_id", ""))
|
|
1509
|
+
if not tid:
|
|
1510
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
|
|
1511
|
+
s = await client.get_task_status(tid)
|
|
1512
|
+
return ok_response(tool="web_scraper", input={"action": "status", "params": p}, output={"task_id": tid, "status": str(s)})
|
|
1513
|
+
|
|
1514
|
+
if a == "status_batch":
|
|
1515
|
+
tids = p.get("task_ids")
|
|
1516
|
+
if not isinstance(tids, list) or not tids:
|
|
1517
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
|
|
1518
|
+
results = []
|
|
1519
|
+
for tid in [str(x) for x in tids[:200]]:
|
|
1520
|
+
try:
|
|
1521
|
+
s = await client.get_task_status(tid)
|
|
1522
|
+
results.append({"task_id": tid, "ok": True, "status": str(s)})
|
|
1523
|
+
except Exception as e:
|
|
1524
|
+
results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
|
|
1525
|
+
return ok_response(tool="web_scraper", input={"action": "status_batch", "params": {"count": len(tids)}}, output={"results": results})
|
|
1526
|
+
|
|
1527
|
+
if a == "wait":
|
|
1528
|
+
tid = str(p.get("task_id", ""))
|
|
1529
|
+
if not tid:
|
|
1530
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
|
|
1531
|
+
poll = float(p.get("poll_interval_seconds", 5.0))
|
|
1532
|
+
max_wait = float(p.get("max_wait_seconds", 600.0))
|
|
1533
|
+
s = await client.wait_for_task(tid, poll_interval=poll, max_wait=max_wait)
|
|
1534
|
+
return ok_response(tool="web_scraper", input={"action": "wait", "params": p}, output={"task_id": tid, "status": str(s)})
|
|
1535
|
+
|
|
1536
|
+
if a == "result":
|
|
1537
|
+
tid = str(p.get("task_id", ""))
|
|
1538
|
+
if not tid:
|
|
1539
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
|
|
1540
|
+
file_type = str(p.get("file_type", "json"))
|
|
1541
|
+
preview = bool(p.get("preview", True))
|
|
1542
|
+
preview_max_chars = int(p.get("preview_max_chars", 20_000))
|
|
1543
|
+
dl = await client.get_task_result(tid, file_type=file_type)
|
|
1544
|
+
from thordata_mcp.utils import enrich_download_url
|
|
1545
|
+
|
|
1546
|
+
dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
|
|
1547
|
+
preview_obj = None
|
|
1548
|
+
structured = None
|
|
1549
|
+
if preview and file_type.lower() == "json":
|
|
1550
|
+
preview_obj = await _fetch_json_preview(dl, max_chars=preview_max_chars)
|
|
1551
|
+
if preview_obj.get("ok") is True:
|
|
1552
|
+
data = preview_obj.get("data")
|
|
1553
|
+
if isinstance(data, list) and data:
|
|
1554
|
+
structured = _normalize_record(data[0])
|
|
1555
|
+
elif isinstance(data, dict):
|
|
1556
|
+
structured = _normalize_record(data)
|
|
1557
|
+
return ok_response(tool="web_scraper", input={"action": "result", "params": p}, output={"task_id": tid, "download_url": dl, "preview": preview_obj, "structured": structured})
|
|
1558
|
+
|
|
1559
|
+
if a == "result_batch":
|
|
1560
|
+
tids = p.get("task_ids")
|
|
1561
|
+
if not isinstance(tids, list) or not tids:
|
|
1562
|
+
return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
|
|
1563
|
+
file_type = str(p.get("file_type", "json"))
|
|
1564
|
+
preview = bool(p.get("preview", False))
|
|
1565
|
+
preview_max_chars = int(p.get("preview_max_chars", 20_000))
|
|
1566
|
+
from thordata_mcp.utils import enrich_download_url
|
|
1567
|
+
|
|
1568
|
+
results = []
|
|
1569
|
+
for tid in [str(x) for x in tids[:100]]:
|
|
1570
|
+
try:
|
|
1571
|
+
dl = await client.get_task_result(tid, file_type=file_type)
|
|
1572
|
+
dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
|
|
1573
|
+
prev = None
|
|
1574
|
+
structured = None
|
|
1575
|
+
if preview and file_type.lower() == "json":
|
|
1576
|
+
prev = await _fetch_json_preview(dl, max_chars=preview_max_chars)
|
|
1577
|
+
if prev.get("ok") is True:
|
|
1578
|
+
data = prev.get("data")
|
|
1579
|
+
if isinstance(data, list) and data:
|
|
1580
|
+
structured = _normalize_record(data[0])
|
|
1581
|
+
elif isinstance(data, dict):
|
|
1582
|
+
structured = _normalize_record(data)
|
|
1583
|
+
results.append({"task_id": tid, "ok": True, "download_url": dl, "preview": prev, "structured": structured})
|
|
1584
|
+
except Exception as e:
|
|
1585
|
+
results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
|
|
1586
|
+
return ok_response(tool="web_scraper", input={"action": "result_batch", "params": {"count": len(tids)}}, output={"results": results})
|
|
1587
|
+
|
|
1588
|
+
if a == "cancel":
|
|
1589
|
+
# Public spec currently doesn't provide cancel; keep clear error
|
|
1590
|
+
tid = str(p.get("task_id", ""))
|
|
1591
|
+
return error_response(tool="web_scraper", input={"action": "cancel", "params": p}, error_type="not_supported", code="E4005", message="Cancel endpoint not available in public Tasks API.", details={"task_id": tid})
|
|
1592
|
+
|
|
1593
|
+
return error_response(
|
|
1594
|
+
tool="web_scraper",
|
|
1595
|
+
input={"action": action, "params": p},
|
|
1596
|
+
error_type="validation_error",
|
|
1597
|
+
code="E4001",
|
|
1598
|
+
message=(
|
|
1599
|
+
f"Unknown action '{action}'. Supported actions: "
|
|
1600
|
+
"'catalog', 'groups', 'spiders', 'spider_ids', 'ids', "
|
|
1601
|
+
"'example', 'template', "
|
|
1602
|
+
"'run', 'batch_run', "
|
|
1603
|
+
"'raw_run', 'raw_batch_run', "
|
|
1604
|
+
"'list_tasks', 'status', 'status_batch', 'wait', "
|
|
1605
|
+
"'result', 'result_batch', 'cancel'"
|
|
1606
|
+
),
|
|
1607
|
+
)
|
|
1608
|
+
|
|
1609
|
+
# Conditionally register WEB SCRAPER tools (kept out of default rapid mode to reduce surface area).
|
|
1610
|
+
if _allow("web_scraper"):
|
|
1611
|
+
mcp.tool(
|
|
1612
|
+
name="web_scraper",
|
|
1613
|
+
description=(
|
|
1614
|
+
"WEB SCRAPER TASKS: action in {catalog, groups, spiders, example, run, batch_run, "
|
|
1615
|
+
"raw_run, raw_batch_run, list_tasks, status, status_batch, wait, result, result_batch, cancel}. "
|
|
1616
|
+
"Typical flow: catalog → example (params_template) → run / batch_run, or use raw_run for direct "
|
|
1617
|
+
"builder/video_builder spider_id calls that mirror Dashboard curl examples."
|
|
1618
|
+
),
|
|
1619
|
+
)(handle_mcp_errors(web_scraper))
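With the tool registered, the intended client-side flow is catalog → example → run, exactly as the docstring sketches. The three payloads below illustrate it; the tool_key and the product URL are placeholders you would take from the catalog and example responses rather than values guaranteed to exist in a given build:

```python
# 1) Discover tools matching a keyword.
catalog_call = {"action": "catalog", "params": {"keyword": "amazon", "limit": 5}}

# 2) Ask for the params_template of one tool_key returned by the catalog.
example_call = {"action": "example", "params": {"tool": "<tool_key_from_catalog>"}}

# 3) Run it, waiting for completion and a JSON download link.
run_call = {
    "action": "run",
    "params": {
        "tool": "<tool_key_from_catalog>",
        "params": {"url": "https://www.amazon.com/dp/<asin>"},
        "wait": True,
        "file_type": "json",
    },
}
```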
|
|
1620
|
+
|
|
1621
|
+
# -------------------------
|
|
1622
|
+
# WEB SCRAPER HELP (UX helper)
|
|
1623
|
+
# -------------------------
|
|
1624
|
+
if _allow("web_scraper.help"):
|
|
1625
|
+
@mcp.tool(
|
|
1626
|
+
name="web_scraper.help",
|
|
1627
|
+
description=(
|
|
1628
|
+
"Explain how to use web_scraper actions (catalog/example/run/batch_run/raw_run/...). "
|
|
1629
|
+
"Use this as a quick reference for LLMs and users."
|
|
1630
|
+
),
|
|
1631
|
+
)
@handle_mcp_errors
|
|
1632
|
+
async def web_scraper_help() -> dict[str, Any]:
|
|
1633
|
+
"""Return a high-level usage guide for web_scraper.* actions."""
|
|
1634
|
+
guide = {
|
|
1635
|
+
"recommended_flow": [
|
|
1636
|
+
"1. Discover tools: call web_scraper with action='catalog' (and optional group/keyword/limit/offset).",
|
|
1637
|
+
"2. Inspect a tool: call web_scraper with action='example' to get params_template and metadata.",
|
|
1638
|
+
"3. Run a single task: call web_scraper with action='run' and provide tool + params.",
|
|
1639
|
+
"4. Run many tasks: call web_scraper with action='batch_run' and a list of {tool, params}.",
|
|
1640
|
+
"5. Get status/result: call web_scraper with action='status'/'wait'/'result' (or their *_batch variants).",
|
|
1641
|
+
],
|
|
1642
|
+
"quick_example": {
|
|
1643
|
+
"catalog": {"action": "catalog", "params": {"keyword": "amazon_product_by-url", "limit": 5}},
|
|
1644
|
+
"example": {"action": "example", "params": {"tool": "<tool_key_from_catalog>"}},
|
|
1645
|
+
"run": {
|
|
1646
|
+
"action": "run",
|
|
1647
|
+
"params": {
|
|
1648
|
+
"tool": "<tool_key_from_catalog>",
|
|
1649
|
+
"params": {"<field>": "<value>"},
|
|
1650
|
+
"wait": True,
|
|
1651
|
+
"file_type": "json",
|
|
1652
|
+
},
|
|
1653
|
+
},
|
|
1654
|
+
"result": {"action": "result", "params": {"task_id": "<task_id>", "file_type": "json", "preview": True}},
|
|
1655
|
+
},
|
|
1656
|
+
"when_to_use_raw_run": [
|
|
1657
|
+
"Use action='raw_run' or 'raw_batch_run' when you only know spider_name/spider_id from Dashboard docs, "
|
|
1658
|
+
"or when a spider does not yet have a dedicated SDK ToolRequest.",
|
|
1659
|
+
"These actions mirror the 'builder' / 'video_builder' curl examples: you pass spider_id, spider_name, "
|
|
1660
|
+
"spider_parameters and optional spider_universal/common_settings directly.",
|
|
1661
|
+
],
|
|
1662
|
+
"raw_run_cheatsheet": {
|
|
1663
|
+
"builder": {
|
|
1664
|
+
"action": "raw_run",
|
|
1665
|
+
"params": {
|
|
1666
|
+
"builder": "builder",
|
|
1667
|
+
"spider_name": "<spider_name>",
|
|
1668
|
+
"spider_id": "<spider_id>",
|
|
1669
|
+
"spider_parameters": [{"<param>": "<value>"}],
|
|
1670
|
+
"spider_universal": {"<universal_param>": "<value>"},
|
|
1671
|
+
"wait": True,
|
|
1672
|
+
"file_type": "json",
|
|
1673
|
+
"include_errors": True,
|
|
1674
|
+
},
|
|
1675
|
+
},
|
|
1676
|
+
"video_builder": {
|
|
1677
|
+
"action": "raw_run",
|
|
1678
|
+
"params": {
|
|
1679
|
+
"builder": "video_builder",
|
|
1680
|
+
"spider_name": "<spider_name>",
|
|
1681
|
+
"spider_id": "<spider_id>",
|
|
1682
|
+
"spider_parameters": [{"<param>": "<value>"}],
|
|
1683
|
+
"common_settings": {"<common_setting>": "<value>"},
|
|
1684
|
+
"wait": True,
|
|
1685
|
+
"file_type": "json",
|
|
1686
|
+
"include_errors": True,
|
|
1687
|
+
},
|
|
1688
|
+
},
|
|
1689
|
+
"curl_mapping": [
|
|
1690
|
+
"curl builder/video_builder → params.builder",
|
|
1691
|
+
"curl spider_name → params.spider_name",
|
|
1692
|
+
"curl spider_id → params.spider_id",
|
|
1693
|
+
"curl spider_parameters → params.spider_parameters (dict or list[dict])",
|
|
1694
|
+
"curl spider_universal → params.spider_universal (builder only)",
|
|
1695
|
+
"curl common_settings → params.common_settings (video_builder only)",
|
|
1696
|
+
],
|
|
1697
|
+
},
|
|
1698
|
+
"llm_tips": [
|
|
1699
|
+
"If you know a tool_key: catalog → example → run/batch_run (best schema, safer defaults).",
|
|
1700
|
+
"If you only have a URL and you're unsure which task fits: try smart_scrape(url=...) first (structured if possible, else unlocker).",
|
|
1701
|
+
"If catalog cannot find a matching tool by keyword/group: try web_scraper.spiders with a broader keyword (e.g. domain name) to confirm whether the spider_id exists in this MCP build.",
|
|
1702
|
+
"If the spider_id is not present in catalog/spiders: treat it as NOT SUPPORTED by this MCP build. Next best action is to use unlocker.fetch (or smart_scrape with prefer_structured=false) to still get content, then extract fields from HTML/Markdown.",
|
|
1703
|
+
"When a structured task fails but unlocker succeeds: include the URL + tool_key/spider_id + error.message in your report; it usually indicates site changes or anti-bot and we can improve routing/tool defaults.",
|
|
1704
|
+
"If run/raw_run returns task_id: use web_scraper.status / wait / result to poll and fetch outputs.",
|
|
1705
|
+
],
|
|
1706
|
+
}
|
|
1707
|
+
return ok_response(tool="web_scraper.help", input={}, output=guide)
|
|
1708
|
+
|
|
1709
|
+
# -------------------------
|
|
1710
|
+
# BROWSER SCRAPER (compact)
|
|
1711
|
+
# -------------------------
|
|
1712
|
+
@mcp.tool(
|
|
1713
|
+
name="browser",
|
|
1714
|
+
description=(
|
|
1715
|
+
"BROWSER SCRAPER (Playwright): action in {navigate, snapshot}. "
|
|
1716
|
+
'Use navigate with {"url": "..."} to open a page, then snapshot with {"filtered": true} to get ARIA refs '
|
|
1717
|
+
"for click/type tools from the separate browser.* namespace."
|
|
1718
|
+
),
|
|
1719
|
+
)
|
|
1720
|
+
@handle_mcp_errors
|
|
1721
|
+
async def browser(
|
|
1722
|
+
action: str,
|
|
1723
|
+
*,
|
|
1724
|
+
params: Any = None,
|
|
1725
|
+
ctx: Optional[Context] = None,
|
|
1726
|
+
) -> dict[str, Any]:
|
|
1727
|
+
"""BROWSER SCRAPER: action in {navigate, snapshot}.
|
|
1728
|
+
|
|
1729
|
+
Args:
|
|
1730
|
+
action: Action to perform - "navigate" or "snapshot"
|
|
1731
|
+
params: Parameters dictionary. For "navigate": {"url": "https://..."}
|
|
1732
|
+
For "snapshot": {"filtered": true}
|
|
1733
|
+
|
|
1734
|
+
Examples:
|
|
1735
|
+
browser(action="navigate", params={"url": "https://www.google.com"})
|
|
1736
|
+
browser(action="snapshot", params={"filtered": true})
|
|
1737
|
+
"""
|
|
1738
|
+
# Normalize params with enhanced error messages
|
|
1739
|
+
try:
|
|
1740
|
+
p = normalize_params(params, "browser", action)
|
|
1741
|
+
except ValueError as e:
|
|
1742
|
+
if "JSON" in str(e):
|
|
1743
|
+
return create_params_error("browser", action, params, str(e))
|
|
1744
|
+
else:
|
|
1745
|
+
return create_params_error("browser", action, params, str(e))
|
|
1746
|
+
|
|
1747
|
+
a = (action or "").strip().lower()
|
|
1748
|
+
if not a:
|
|
1749
|
+
return error_response(
|
|
1750
|
+
tool="browser",
|
|
1751
|
+
input={"action": action, "params": p},
|
|
1752
|
+
error_type="validation_error",
|
|
1753
|
+
code="E4001",
|
|
1754
|
+
message="action is required",
|
|
1755
|
+
)
|
|
1756
|
+
|
|
1757
|
+
# Credentials check
|
|
1758
|
+
user = settings.THORDATA_BROWSER_USERNAME
|
|
1759
|
+
pwd = settings.THORDATA_BROWSER_PASSWORD
|
|
1760
|
+
if not user or not pwd:
|
|
1761
|
+
return error_response(
|
|
1762
|
+
tool="browser",
|
|
1763
|
+
input={"action": action, "params": p},
|
|
1764
|
+
error_type="config_error",
|
|
1765
|
+
code="E1001",
|
|
1766
|
+
message="Missing browser credentials. Set THORDATA_BROWSER_USERNAME and THORDATA_BROWSER_PASSWORD.",
|
|
1767
|
+
)
|
|
1768
|
+
session = await ServerContext.get_browser_session()
|
|
1769
|
+
if a == "navigate":
|
|
1770
|
+
url = str(p.get("url", ""))
|
|
1771
|
+
if not url:
|
|
1772
|
+
return error_response(tool="browser", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing url")
|
|
1773
|
+
page = await session.get_page(url)
|
|
1774
|
+
if page.url != url:
|
|
1775
|
+
await page.goto(url, timeout=120_000)
|
|
1776
|
+
title = await page.title()
|
|
1777
|
+
return ok_response(tool="browser", input={"action": "navigate", "params": p}, output={"url": page.url, "title": title})
|
|
1778
|
+
if a == "snapshot":
|
|
1779
|
+
filtered = bool(p.get("filtered", True))
|
|
1780
|
+
mode = str(p.get("mode", "compact") or "compact").strip().lower()
|
|
1781
|
+
max_items = int(p.get("max_items", 80) or 80)
|
|
1782
|
+
if max_items <= 0 or max_items > 500:
|
|
1783
|
+
return error_response(
|
|
1784
|
+
tool="browser",
|
|
1785
|
+
input={"action": action, "params": p},
|
|
1786
|
+
error_type="validation_error",
|
|
1787
|
+
code="E4001",
|
|
1788
|
+
message="max_items must be between 1 and 500",
|
|
1789
|
+
details={"max_items": max_items},
|
|
1790
|
+
)
|
|
1791
|
+
include_dom = bool(p.get("include_dom", False))
|
|
1792
|
+
# Optional: allow snapshot to navigate when url is provided (better UX)
|
|
1793
|
+
url = p.get("url")
|
|
1794
|
+
if isinstance(url, str) and url.strip():
|
|
1795
|
+
page = await session.get_page(url)
|
|
1796
|
+
if page.url != url:
|
|
1797
|
+
await page.goto(url, timeout=120_000)
|
|
1798
|
+
data = await session.capture_snapshot(filtered=filtered, mode=mode, max_items=max_items, include_dom=include_dom)
|
|
1799
|
+
# Apply an additional safety max_chars guard to avoid flooding context.
|
|
1800
|
+
max_chars = int(p.get("max_chars", 20_000) or 20_000)
|
|
1801
|
+
aria_snapshot = truncate_content(str(data.get("aria_snapshot", "")), max_length=max_chars)
|
|
1802
|
+
dom_snapshot = data.get("dom_snapshot")
|
|
1803
|
+
dom_snapshot = truncate_content(str(dom_snapshot), max_length=max_chars) if dom_snapshot else None
|
|
1804
|
+
meta = data.get("_meta") if isinstance(data, dict) else None
|
|
1805
|
+
return ok_response(
|
|
1806
|
+
tool="browser",
|
|
1807
|
+
input={"action": "snapshot", "params": p},
|
|
1808
|
+
output={
|
|
1809
|
+
"url": data.get("url"),
|
|
1810
|
+
"title": data.get("title"),
|
|
1811
|
+
"aria_snapshot": aria_snapshot,
|
|
1812
|
+
"dom_snapshot": dom_snapshot,
|
|
1813
|
+
"_meta": meta,
|
|
1814
|
+
},
|
|
1815
|
+
)
|
|
1816
|
+
return error_response(
|
|
1817
|
+
tool="browser",
|
|
1818
|
+
input={"action": action, "params": p},
|
|
1819
|
+
error_type="validation_error",
|
|
1820
|
+
code="E4001",
|
|
1821
|
+
message=f"Unknown action '{action}'. Supported actions: 'navigate', 'snapshot'",
|
|
1822
|
+
)
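A typical session with this tool is two calls: navigate to load the page, then snapshot to get a token-bounded ARIA tree whose refs feed the separate browser.* interaction tools. Illustrative payloads (the URL is arbitrary and the limits echo the guards above):

```python
navigate_call = {"action": "navigate", "params": {"url": "https://www.example.com"}}

snapshot_call = {
    "action": "snapshot",
    "params": {
        "filtered": True,     # keep only the filtered ARIA view
        "mode": "compact",
        "max_items": 80,      # must stay within 1..500
        "max_chars": 20_000,  # extra guard against flooding the context window
    },
}
```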
|
|
1823
|
+
|
|
1824
|
+
# -------------------------
|
|
1825
|
+
# SMART SCRAPE (compact)
|
|
1826
|
+
# -------------------------
|
|
1827
|
+
@mcp.tool(
|
|
1828
|
+
name="smart_scrape",
|
|
1829
|
+
description=(
|
|
1830
|
+
"Auto-pick a Web Scraper task for URL; fallback to Unlocker. "
|
|
1831
|
+
"Always returns a structured summary plus raw HTML/JSON preview when possible."
|
|
1832
|
+
),
|
|
1833
|
+
)
|
|
1834
|
+
@handle_mcp_errors
|
|
1835
|
+
async def smart_scrape(
|
|
1836
|
+
url: str,
|
|
1837
|
+
*,
|
|
1838
|
+
prefer_structured: bool = True,
|
|
1839
|
+
preview: bool = True,
|
|
1840
|
+
preview_max_chars: int = 20_000,
|
|
1841
|
+
max_wait_seconds: int = 300,
|
|
1842
|
+
unlocker_output: str = "markdown",
|
|
1843
|
+
ctx: Optional[Context] = None,
|
|
1844
|
+
) -> dict[str, Any]:
|
|
1845
|
+
"""Auto-pick a Web Scraper task for URL; fallback to Unlocker. Always returns structured."""
|
|
1846
|
+
# Basic schema-style guards for numeric params
|
|
1847
|
+
if preview_max_chars <= 0 or preview_max_chars > 100_000:
|
|
1848
|
+
return error_response(
|
|
1849
|
+
tool="smart_scrape",
|
|
1850
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview, "preview_max_chars": preview_max_chars},
|
|
1851
|
+
error_type="validation_error",
|
|
1852
|
+
code="E4001",
|
|
1853
|
+
message="preview_max_chars must be between 1 and 100000",
|
|
1854
|
+
details={"preview_max_chars": preview_max_chars},
|
|
1855
|
+
)
|
|
1856
|
+
if max_wait_seconds <= 0 or max_wait_seconds > 600:
|
|
1857
|
+
return error_response(
|
|
1858
|
+
tool="smart_scrape",
|
|
1859
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview, "max_wait_seconds": max_wait_seconds},
|
|
1860
|
+
error_type="validation_error",
|
|
1861
|
+
code="E4001",
|
|
1862
|
+
message="max_wait_seconds must be between 1 and 600",
|
|
1863
|
+
details={"max_wait_seconds": max_wait_seconds},
|
|
1864
|
+
)
|
|
1865
|
+
await safe_ctx_info(ctx, f"smart_scrape url={url!r} prefer_structured={prefer_structured}")
|
|
1866
|
+
host = _hostname(url)
|
|
1867
|
+
url_lower = url.lower()
|
|
1868
|
+
tried: list[dict[str, Any]] = []
|
|
1869
|
+
|
|
1870
|
+
# Special-case: Google search pages are best handled by SERP (more reliable than Unlocker).
|
|
1871
|
+
if prefer_structured:
|
|
1872
|
+
def _is_google_search_local(u: str) -> tuple[bool, str | None]:
|
|
1873
|
+
try:
|
|
1874
|
+
from urllib.parse import urlparse, parse_qs
|
|
1875
|
+
|
|
1876
|
+
p0 = urlparse(u)
|
|
1877
|
+
h0 = (p0.hostname or "").lower()
|
|
1878
|
+
if h0.startswith("www."):
|
|
1879
|
+
h0 = h0[4:]
|
|
1880
|
+
if h0 != "google.com":
|
|
1881
|
+
return (False, None)
|
|
1882
|
+
if p0.path != "/search":
|
|
1883
|
+
return (False, None)
|
|
1884
|
+
qs0 = parse_qs(p0.query or "")
|
|
1885
|
+
q0 = (qs0.get("q") or [""])[0].strip()
|
|
1886
|
+
return (bool(q0), q0 or None)
|
|
1887
|
+
except Exception:
|
|
1888
|
+
return (False, None)
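The nested helper only routes to SERP when the host normalizes to exactly google.com, the path is /search, and a non-empty q parameter is present; anything else falls through to the normal candidate logic. A small stand-alone version of the same check, to show where the boundary sits:

```python
from urllib.parse import parse_qs, urlparse


def looks_like_google_search(u: str) -> bool:
    parsed = urlparse(u)
    host = (parsed.hostname or "").lower()
    host = host[4:] if host.startswith("www.") else host
    query = (parse_qs(parsed.query or "").get("q") or [""])[0].strip()
    return host == "google.com" and parsed.path == "/search" and bool(query)


print(looks_like_google_search("https://www.google.com/search?q=mcp+servers"))  # True
print(looks_like_google_search("https://www.google.com/maps?q=cafes"))          # False (path is not /search)
print(looks_like_google_search("https://news.google.com/search?q=python"))      # False (subdomain, not google.com)
```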
|
|
1889
|
+
|
|
1890
|
+
is_g, q = _is_google_search_local(url)
|
|
1891
|
+
if is_g:
|
|
1892
|
+
await safe_ctx_info(ctx, f"smart_scrape: Google search detected, routing to SERP q={q!r}")
|
|
1893
|
+
try:
|
|
1894
|
+
from thordata.types import SerpRequest
|
|
1895
|
+
from thordata.types import Engine as EngineEnum
|
|
1896
|
+
client = await ServerContext.get_client()
|
|
1897
|
+
req = SerpRequest(
|
|
1898
|
+
query=str(q or ""),
|
|
1899
|
+
engine=EngineEnum.GOOGLE,
|
|
1900
|
+
num=10,
|
|
1901
|
+
start=0,
|
|
1902
|
+
country=None,
|
|
1903
|
+
language=None,
|
|
1904
|
+
google_domain="google.com",
|
|
1905
|
+
extra_params={},
|
|
1906
|
+
)
|
|
1907
|
+
data = await client.serp_search_advanced(req)
|
|
1908
|
+
serp_preview = None
|
|
1909
|
+
if preview:
|
|
1910
|
+
raw = truncate_content(str(data), max_length=int(preview_max_chars))
|
|
1911
|
+
serp_preview = {"format": "light_json", "raw": raw}
|
|
1912
|
+
return ok_response(
|
|
1913
|
+
tool="smart_scrape",
|
|
1914
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
|
|
1915
|
+
output={
|
|
1916
|
+
"path": "SERP",
|
|
1917
|
+
"serp": {"engine": "google", "q": q, "num": 10, "start": 0},
|
|
1918
|
+
"result": data,
|
|
1919
|
+
"structured": {"url": url, "query": q, "engine": "google"},
|
|
1920
|
+
"preview": serp_preview,
|
|
1921
|
+
"candidates": [],
|
|
1922
|
+
"tried": tried,
|
|
1923
|
+
},
|
|
1924
|
+
)
|
|
1925
|
+
except Exception as e:
|
|
1926
|
+
err_msg = str(e)
|
|
1927
|
+
tried.append({"path": "SERP", "engine": "google", "q": q, "ok": False, "error": err_msg})
|
|
1928
|
+
await safe_ctx_info(ctx, f"smart_scrape: SERP routing failed, falling back. err={e}")
|
|
1929
|
+
|
|
1930
|
+
# Match product.py behavior: for certain URLs, don't even attempt Web Scraper.
|
|
1931
|
+
# - Google search pages: prefer SERP / Unlocker
|
|
1932
|
+
# - Generic/example domains: never pick marketplace/product tools
|
|
1933
|
+
skip_web_scraper = False
|
|
1934
|
+
if host == "google.com" and "/search" in url_lower:
|
|
1935
|
+
skip_web_scraper = True
|
|
1936
|
+
generic_domains = {"example.com", "example.org", "example.net", "test.com", "localhost"}
|
|
1937
|
+
if host in generic_domains or (host and host.endswith(".example.com")):
|
|
1938
|
+
skip_web_scraper = True
|
|
1939
|
+
|
|
1940
|
+
selected_tool: str | None = None
|
|
1941
|
+
selected_params: dict[str, Any] = {}
|
|
1942
|
+
candidates: list[tuple[str, dict[str, Any]]] = []
|
|
1943
|
+
if not skip_web_scraper:
|
|
1944
|
+
selected_tool, selected_params = _guess_tool_for_url(url)
|
|
1945
|
+
# Only keep guessed tool if it exists in tool map (avoid invalid hardcode drift)
|
|
1946
|
+
from .product import _ensure_tools as _ensure # local import to avoid cycles
|
|
1947
|
+
|
|
1948
|
+
_, tools_map = _ensure()
|
|
1949
|
+
if selected_tool and selected_tool in tools_map:
|
|
1950
|
+
candidates.append((selected_tool, selected_params))
|
|
1951
|
+
|
|
1952
|
+
if not candidates:
|
|
1953
|
+
candidate_keys = _candidate_tools_for_url(url, limit=3)
|
|
1954
|
+
# Filter out obviously wrong tools (like GitHub for non-GitHub URLs)
|
|
1955
|
+
filtered_candidates: list[str] = []
|
|
1956
|
+
for k in candidate_keys:
|
|
1957
|
+
lk = k.lower()
|
|
1958
|
+
if "github" in lk and host and "github" not in host.lower():
|
|
1959
|
+
continue
|
|
1960
|
+
if "repository" in lk and host and "github" not in host.lower() and "gitlab" not in host.lower():
|
|
1961
|
+
continue
|
|
1962
|
+
if "amazon" in lk and host and "amazon" not in host.lower():
|
|
1963
|
+
continue
|
|
1964
|
+
if "walmart" in lk and host and "walmart" not in host.lower():
|
|
1965
|
+
continue
|
|
1966
|
+
if ("googleshopping" in lk or "google.shopping" in lk) and (host == "google.com" or "/search" in url_lower):
|
|
1967
|
+
continue
|
|
1968
|
+
filtered_candidates.append(k)
|
|
1969
|
+
|
|
1970
|
+
for k in filtered_candidates:
|
|
1971
|
+
candidates.append((k, {"url": url}))
|
|
1972
|
+
else:
|
|
1973
|
+
await safe_ctx_info(ctx, f"smart_scrape: skipping Web Scraper for host={host!r} url={url!r}")
|
|
1974
|
+
|
|
1975
|
+
if prefer_structured and candidates:
|
|
1976
|
+
for tool, params in candidates[:3]:
|
|
1977
|
+
r = await _run_web_scraper_tool(tool=tool, params=params, wait=True, max_wait_seconds=max_wait_seconds, file_type="json", ctx=ctx)
|
|
1978
|
+
# Check if task succeeded (status should be Ready/Success, not Failed)
|
|
1979
|
+
result_obj = r.get("output") if isinstance(r.get("output"), dict) else {}
|
|
1980
|
+
status = result_obj.get("status", "").lower() if isinstance(result_obj, dict) else ""
|
|
1981
|
+
|
|
1982
|
+
# If status is Failed, don't try more Web Scraper tools - go to Unlocker
|
|
1983
|
+
# Also check if r.get("ok") is False, which indicates the tool call itself failed
|
|
1984
|
+
if status == "failed" or r.get("ok") is False:
|
|
1985
|
+
await safe_ctx_info(ctx, f"smart_scrape: Web Scraper tool {tool} failed (status={status}, ok={r.get('ok')}), falling back to Unlocker")
|
|
1986
|
+
tried.append({
|
|
1987
|
+
"tool": tool,
|
|
1988
|
+
"ok": r.get("ok"),
|
|
1989
|
+
"status": status,
|
|
1990
|
+
"error": r.get("error"),
|
|
1991
|
+
})
|
|
1992
|
+
break # Exit loop and go to Unlocker fallback
|
|
1993
|
+
|
|
1994
|
+
# Only return success if both ok is True AND status is not failed
|
|
1995
|
+
if r.get("ok") is True and status not in {"failed", "error", "failure"}:
|
|
1996
|
+
out = r.get("output") if isinstance(r.get("output"), dict) else {}
|
|
1997
|
+
dl = out.get("download_url") if isinstance(out, dict) else None
|
|
1998
|
+
preview_obj = None
|
|
1999
|
+
structured = {"url": url}
|
|
2000
|
+
if preview and isinstance(dl, str) and dl:
|
|
2001
|
+
preview_obj = await _fetch_json_preview(dl, max_chars=int(preview_max_chars))
|
|
2002
|
+
# Try to use preview data even if JSON parsing failed but we have raw data
|
|
2003
|
+
if preview_obj.get("ok") is True:
|
|
2004
|
+
data = preview_obj.get("data")
|
|
2005
|
+
if isinstance(data, list) and data:
|
|
2006
|
+
structured = _normalize_record(data[0], url=url)
|
|
2007
|
+
elif isinstance(data, dict):
|
|
2008
|
+
structured = _normalize_record(data, url=url)
|
|
2009
|
+
elif preview_obj.get("status") == 200 and preview_obj.get("raw"):
|
|
2010
|
+
# JSON parsing failed but we have raw data - try to extract basic info
|
|
2011
|
+
raw = preview_obj.get("raw", "")
|
|
2012
|
+
if raw:
|
|
2013
|
+
# Try to extract basic fields from raw text if possible
|
|
2014
|
+
structured = {"url": url, "raw_preview": raw[:500]} # Limit raw preview size
|
|
2015
|
+
return ok_response(
|
|
2016
|
+
tool="smart_scrape",
|
|
2017
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
|
|
2018
|
+
output={
|
|
2019
|
+
"path": "WEB_SCRAPER",
|
|
2020
|
+
"selected_tool": tool,
|
|
2021
|
+
"selected_params": params,
|
|
2022
|
+
"result": out,
|
|
2023
|
+
"structured": structured,
|
|
2024
|
+
"preview": preview_obj,
|
|
2025
|
+
"candidates": [c[0] for c in candidates],
|
|
2026
|
+
"tried": tried,
|
|
2027
|
+
},
|
|
2028
|
+
)
|
|
2029
|
+
tried.append({"tool": tool, "ok": r.get("ok"), "status": status, "error": r.get("error")})
|
|
2030
|
+
|
|
2031
|
+
client = await ServerContext.get_client()
|
|
2032
|
+
try:
|
|
2033
|
+
with PerformanceTimer(tool="smart_scrape.unlocker", url=url):
|
|
2034
|
+
html = await client.universal_scrape(url=url, js_render=True, output_format="html", wait_for=".content")
|
|
2035
|
+
html_str = str(html) if not isinstance(html, str) else html
|
|
2036
|
+
extracted = _extract_structured_from_html(html_str) if html_str else {}
|
|
2037
|
+
structured = _normalize_extracted(extracted, url=url)
|
|
2038
|
+
# Token-efficient preview
|
|
2039
|
+
preview_obj: dict[str, Any] | None = None
|
|
2040
|
+
out_mode = (unlocker_output or "markdown").strip().lower()
|
|
2041
|
+
if out_mode not in {"markdown", "md", "html"}:
|
|
2042
|
+
out_mode = "markdown"
|
|
2043
|
+
if preview:
|
|
2044
|
+
if out_mode in {"markdown", "md"}:
|
|
2045
|
+
md = html_to_markdown_clean(html_str)
|
|
2046
|
+
md = truncate_content(md, max_length=int(preview_max_chars))
|
|
2047
|
+
preview_obj = {"format": "markdown", "raw": md}
|
|
2048
|
+
else:
|
|
2049
|
+
preview_obj = {"format": "html", "raw": truncate_content(html_str, max_length=int(preview_max_chars))}
|
|
2050
|
+
return ok_response(
|
|
2051
|
+
tool="smart_scrape",
|
|
2052
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
|
|
2053
|
+
output={
|
|
2054
|
+
"path": "WEB_UNLOCKER",
|
|
2055
|
+
"preview": preview_obj,
|
|
2056
|
+
"extracted": extracted,
|
|
2057
|
+
"structured": structured,
|
|
2058
|
+
"selected_tool": selected_tool,
|
|
2059
|
+
"selected_params": selected_params,
|
|
2060
|
+
"candidates": [c[0] for c in candidates],
|
|
2061
|
+
"tried": tried,
|
|
2062
|
+
},
|
|
2063
|
+
)
|
|
2064
|
+
except asyncio.TimeoutError as e:
|
|
2065
|
+
# Handle timeout specifically
|
|
2066
|
+
await safe_ctx_info(ctx, f"smart_scrape: Unlocker timed out: {e}")
|
|
2067
|
+
return error_response(
|
|
2068
|
+
tool="smart_scrape",
|
|
2069
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
|
|
2070
|
+
error_type="timeout_error",
|
|
2071
|
+
code="E2003",
|
|
2072
|
+
message=f"Unlocker request timed out. The page may be slow to load or blocked.",
|
|
2073
|
+
details={
|
|
2074
|
+
"selected_tool": selected_tool,
|
|
2075
|
+
"candidates": [c[0] for c in candidates],
|
|
2076
|
+
"tried": tried,
|
|
2077
|
+
},
|
|
2078
|
+
)
|
|
2079
|
+
except Exception as e:
|
|
2080
|
+
# If Unlocker also fails, return error with context
|
|
2081
|
+
await safe_ctx_info(ctx, f"smart_scrape: Unlocker also failed: {e}")
|
|
2082
|
+
error_msg = str(e)
|
|
2083
|
+
# Extract more useful error information
|
|
2084
|
+
if "504" in error_msg or "Gateway Timeout" in error_msg:
|
|
2085
|
+
error_type = "timeout_error"
|
|
2086
|
+
error_code = "E2003"
|
|
2087
|
+
error_message = f"Unlocker request timed out (504 Gateway Timeout). The page may be slow to load or blocked."
|
|
2088
|
+
elif "timeout" in error_msg.lower():
|
|
2089
|
+
error_type = "timeout_error"
|
|
2090
|
+
error_code = "E2003"
|
|
2091
|
+
error_message = f"Unlocker request timed out: {error_msg}"
|
|
2092
|
+
else:
|
|
2093
|
+
error_type = "network_error"
|
|
2094
|
+
error_code = "E2002"
|
|
2095
|
+
error_message = f"Both Web Scraper and Unlocker failed. Last error: {error_msg}"
|
|
2096
|
+
return error_response(
|
|
2097
|
+
tool="smart_scrape",
|
|
2098
|
+
input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
|
|
2099
|
+
error_type=error_type,
|
|
2100
|
+
code=error_code,
|
|
2101
|
+
message=error_message,
|
|
2102
|
+
details={
|
|
2103
|
+
"selected_tool": selected_tool,
|
|
2104
|
+
"candidates": [c[0] for c in candidates],
|
|
2105
|
+
"tried": tried,
|
|
2106
|
+
},
|
|
2107
|
+
)
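The final fallback buckets Unlocker failures into timeout versus generic network errors by inspecting the exception text, so callers get a stable error_type/code pair instead of a raw message. The classification reduces to (codes copied from the handler above, inputs invented for the demo):

```python
def classify_unlocker_error(msg: str) -> tuple[str, str]:
    """Map an exception message to the (error_type, code) pair used above."""
    if "504" in msg or "Gateway Timeout" in msg or "timeout" in msg.lower():
        return ("timeout_error", "E2003")
    return ("network_error", "E2002")


print(classify_unlocker_error("504 Gateway Timeout from upstream"))  # ('timeout_error', 'E2003')
print(classify_unlocker_error("connection reset by peer"))           # ('network_error', 'E2002')
```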
|
|
2108
|
+
|