tooluniverse 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of tooluniverse for further review.
- tooluniverse/__init__.py +37 -14
- tooluniverse/admetai_tool.py +16 -5
- tooluniverse/base_tool.py +36 -0
- tooluniverse/biogrid_tool.py +118 -0
- tooluniverse/build_optimizer.py +87 -0
- tooluniverse/cache/__init__.py +3 -0
- tooluniverse/cache/memory_cache.py +99 -0
- tooluniverse/cache/result_cache_manager.py +235 -0
- tooluniverse/cache/sqlite_backend.py +257 -0
- tooluniverse/clinvar_tool.py +90 -0
- tooluniverse/compose_scripts/output_summarizer.py +87 -33
- tooluniverse/compose_tool.py +2 -2
- tooluniverse/custom_tool.py +28 -0
- tooluniverse/data/adverse_event_tools.json +97 -98
- tooluniverse/data/agentic_tools.json +81 -162
- tooluniverse/data/arxiv_tools.json +1 -4
- tooluniverse/data/compose_tools.json +0 -54
- tooluniverse/data/core_tools.json +1 -4
- tooluniverse/data/dataset_tools.json +7 -7
- tooluniverse/data/doaj_tools.json +1 -3
- tooluniverse/data/drug_discovery_agents.json +282 -0
- tooluniverse/data/europe_pmc_tools.json +1 -2
- tooluniverse/data/genomics_tools.json +174 -0
- tooluniverse/data/geo_tools.json +86 -0
- tooluniverse/data/literature_search_tools.json +15 -35
- tooluniverse/data/markitdown_tools.json +51 -0
- tooluniverse/data/monarch_tools.json +1 -2
- tooluniverse/data/openalex_tools.json +1 -5
- tooluniverse/data/opentarget_tools.json +8 -16
- tooluniverse/data/output_summarization_tools.json +23 -20
- tooluniverse/data/packages/bioinformatics_core_tools.json +2 -2
- tooluniverse/data/packages/cheminformatics_tools.json +1 -1
- tooluniverse/data/packages/genomics_tools.json +1 -1
- tooluniverse/data/packages/single_cell_tools.json +1 -1
- tooluniverse/data/packages/structural_biology_tools.json +1 -1
- tooluniverse/data/pmc_tools.json +1 -4
- tooluniverse/data/ppi_tools.json +139 -0
- tooluniverse/data/pubmed_tools.json +1 -3
- tooluniverse/data/semantic_scholar_tools.json +1 -2
- tooluniverse/data/tool_composition_tools.json +2 -4
- tooluniverse/data/unified_guideline_tools.json +206 -4
- tooluniverse/data/xml_tools.json +15 -15
- tooluniverse/data/zenodo_tools.json +1 -2
- tooluniverse/dbsnp_tool.py +71 -0
- tooluniverse/default_config.py +6 -0
- tooluniverse/ensembl_tool.py +61 -0
- tooluniverse/execute_function.py +235 -76
- tooluniverse/generate_tools.py +303 -20
- tooluniverse/genomics_gene_search_tool.py +56 -0
- tooluniverse/geo_tool.py +116 -0
- tooluniverse/gnomad_tool.py +63 -0
- tooluniverse/logging_config.py +64 -2
- tooluniverse/markitdown_tool.py +159 -0
- tooluniverse/mcp_client_tool.py +10 -5
- tooluniverse/molecule_2d_tool.py +9 -3
- tooluniverse/molecule_3d_tool.py +9 -3
- tooluniverse/output_hook.py +217 -150
- tooluniverse/smcp.py +18 -10
- tooluniverse/smcp_server.py +89 -199
- tooluniverse/string_tool.py +112 -0
- tooluniverse/tools/{MultiAgentLiteratureSearch.py → ADMETAnalyzerAgent.py} +18 -18
- tooluniverse/tools/ArXiv_search_papers.py +3 -3
- tooluniverse/tools/CMA_Guidelines_Search.py +52 -0
- tooluniverse/tools/CORE_search_papers.py +3 -3
- tooluniverse/tools/ClinVar_search_variants.py +52 -0
- tooluniverse/tools/ClinicalTrialDesignAgent.py +63 -0
- tooluniverse/tools/CompoundDiscoveryAgent.py +59 -0
- tooluniverse/tools/DOAJ_search_articles.py +2 -2
- tooluniverse/tools/DiseaseAnalyzerAgent.py +52 -0
- tooluniverse/tools/DrugInteractionAnalyzerAgent.py +52 -0
- tooluniverse/tools/DrugOptimizationAgent.py +63 -0
- tooluniverse/tools/Ensembl_lookup_gene_by_symbol.py +52 -0
- tooluniverse/tools/EuropePMC_search_articles.py +1 -1
- tooluniverse/tools/GIN_Guidelines_Search.py +52 -0
- tooluniverse/tools/GWAS_search_associations_by_gene.py +52 -0
- tooluniverse/tools/LiteratureSynthesisAgent.py +59 -0
- tooluniverse/tools/PMC_search_papers.py +3 -3
- tooluniverse/tools/PubMed_search_articles.py +2 -2
- tooluniverse/tools/SemanticScholar_search_papers.py +1 -1
- tooluniverse/tools/UCSC_get_genes_by_region.py +67 -0
- tooluniverse/tools/Zenodo_search_records.py +1 -1
- tooluniverse/tools/__init__.py +33 -3
- tooluniverse/tools/convert_to_markdown.py +59 -0
- tooluniverse/tools/dbSNP_get_variant_by_rsid.py +46 -0
- tooluniverse/tools/gnomAD_query_variant.py +52 -0
- tooluniverse/tools/openalex_literature_search.py +4 -4
- tooluniverse/ucsc_tool.py +60 -0
- tooluniverse/unified_guideline_tools.py +1175 -57
- tooluniverse/utils.py +51 -4
- tooluniverse/zenodo_tool.py +2 -1
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/METADATA +10 -3
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/RECORD +96 -61
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/entry_points.txt +0 -3
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/WHEEL +0 -0
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/licenses/LICENSE +0 -0
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/top_level.txt +0 -0
--- tooluniverse/unified_guideline_tools.py (1.0.7)
+++ tooluniverse/unified_guideline_tools.py (1.0.9)
@@ -9,10 +9,44 @@ import time
 import re
 import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
+from markitdown import MarkItDown
 from .base_tool import BaseTool
 from .tool_registry import register_tool
 
 
+def _extract_meaningful_terms(query):
+    """Return significant query terms for relevance filtering."""
+    if not isinstance(query, str):
+        return []
+
+    # Keep alphabetic tokens with length >= 3
+    tokens = re.findall(r"[a-zA-Z]{3,}", query.lower())
+    stop_terms = {
+        "management",
+        "care",
+        "guideline",
+        "guidelines",
+        "clinical",
+        "practice",
+        "and",
+        "with",
+        "for",
+        "the",
+        "that",
+        "from",
+        "into",
+        "using",
+        "update",
+        "introduction",
+        "review",
+        "overview",
+        "recommendation",
+        "recommendations",
+    }
+    meaningful = [token for token in tokens if token not in stop_terms]
+    return meaningful if meaningful else tokens
+
+
 @register_tool()
 class NICEWebScrapingTool(BaseTool):
     """
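The helper above drives the relevance filtering added to the PubMed and WHO tools in later hunks. A standalone sketch of the same logic with a worked example (reimplemented here for illustration, not part of the package):

```python
import re

STOP_TERMS = {
    "management", "care", "guideline", "guidelines", "clinical", "practice",
    "and", "with", "for", "the", "that", "from", "into", "using", "update",
    "introduction", "review", "overview", "recommendation", "recommendations",
}

def extract_meaningful_terms(query):
    """Keep alphabetic tokens of length >= 3, then drop generic stop terms."""
    if not isinstance(query, str):
        return []
    tokens = re.findall(r"[a-zA-Z]{3,}", query.lower())
    meaningful = [t for t in tokens if t not in STOP_TERMS]
    return meaningful if meaningful else tokens

print(extract_meaningful_terms("Clinical practice guidelines for type 2 diabetes management"))
# -> ['type', 'diabetes']
```

Falling back to the raw tokens when every token is a stop term means a query like "clinical guidelines" still filters on something rather than matching nothing.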
@@ -174,6 +208,7 @@ class NICEWebScrapingTool(BaseTool):
                     "title": title,
                     "url": url,
                     "summary": summary,
+                    "content": summary,  # Copy summary to content field
                     "date": date,
                     "type": guideline_type,
                     "source": "NICE",
@@ -302,6 +337,8 @@ class PubMedGuidelinesTool(BaseTool):
 
             # Process results
             results = []
+            query_terms = _extract_meaningful_terms(query)
+
             for pmid in pmids:
                 if pmid in detail_data.get("result", {}):
                     article = detail_data["result"][pmid]
@@ -318,10 +355,25 @@
                     pub_types = article.get("pubtype", [])
                     is_guideline = any("guideline" in pt.lower() for pt in pub_types)
 
+                    abstract_text = abstracts.get(pmid, "")
+                    searchable_text = " ".join(
+                        [
+                            article.get("title", ""),
+                            abstract_text or "",
+                            " ".join(pub_types),
+                        ]
+                    ).lower()
+
+                    if query_terms and not any(
+                        term in searchable_text for term in query_terms
+                    ):
+                        continue
+
                     result = {
                         "pmid": pmid,
                         "title": article.get("title", ""),
-                        "abstract":
+                        "abstract": abstract_text,
+                        "content": abstract_text,  # Copy abstract to content field
                         "authors": author_str,
                         "journal": article.get("source", ""),
                         "publication_date": article.get("pubdate", ""),
@@ -373,10 +425,14 @@ class EuropePMCGuidelinesTool(BaseTool):
     def _search_europepmc_guidelines(self, query, limit):
         """Search Europe PMC for guideline publications."""
         try:
-            #
-            guideline_query = f"
+            # More specific guideline search query
+            guideline_query = f'"{query}" AND (guideline OR "practice guideline" OR "clinical guideline" OR recommendation OR "consensus statement")'
 
-            params = {
+            params = {
+                "query": guideline_query,
+                "format": "json",
+                "pageSize": limit * 2,
+            }  # Get more to filter
 
             response = self.session.get(self.base_url, params=params, timeout=30)
             response.raise_for_status()
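For reference, a sketch of what the rewritten search request looks like when issued directly. The tool's `self.base_url` is not visible in this hunk, so Europe PMC's public REST search endpoint is assumed here for illustration:

```python
import requests

query = "type 2 diabetes"
guideline_query = (
    f'"{query}" AND (guideline OR "practice guideline" OR '
    f'"clinical guideline" OR recommendation OR "consensus statement")'
)

resp = requests.get(
    "https://www.ebi.ac.uk/europepmc/webservices/rest/search",  # assumed endpoint
    params={"query": guideline_query, "format": "json", "pageSize": 20},
    timeout=30,
)
resp.raise_for_status()
hits = resp.json().get("resultList", {}).get("result", [])
print(len(hits))
```

Requesting `limit * 2` results up front leaves headroom for the stricter post-filtering added in the next hunk.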
@@ -388,18 +444,101 @@ class EuropePMCGuidelinesTool(BaseTool):
             if not results_list:
                 return []
 
-            # Process results
+            # Process results with stricter filtering
             results = []
             for result in results_list:
                 title = result.get("title", "")
                 pub_type = result.get("pubType", "")
-
+
+                # Get abstract from detailed API call
+                abstract = self._get_europepmc_abstract(result.get("pmid", ""))
+
+                # If abstract is too short or just a question, try to get more content
+                if len(abstract) < 200 or abstract.endswith("?"):
+                    # Try to get full text or more detailed content
+                    abstract = self._get_europepmc_full_content(
+                        result.get("pmid", ""), result.get("pmcid", "")
+                    )
+
+                # More strict guideline detection
+                title_lower = title.lower()
+                abstract_lower = abstract.lower()
+
+                # Must contain guideline-related keywords in title or abstract
+                guideline_keywords = [
+                    "guideline",
+                    "practice guideline",
+                    "clinical guideline",
+                    "recommendation",
+                    "consensus statement",
+                    "position statement",
+                    "clinical practice",
+                    "best practice",
+                ]
+
+                has_guideline_keywords = any(
+                    keyword in title_lower or keyword in abstract_lower
+                    for keyword in guideline_keywords
+                )
+
+                # Exclude research papers and studies
+                exclude_keywords = [
+                    "study",
+                    "trial",
+                    "analysis",
+                    "evaluation",
+                    "assessment",
+                    "effectiveness",
+                    "efficacy",
+                    "outcome",
+                    "result",
+                    "finding",
+                ]
+
+                is_research = any(
+                    keyword in title_lower for keyword in exclude_keywords
+                )
+
+                # Publication type must confirm guideline nature
+                pub_type_tokens = []
+                if isinstance(pub_type, str):
+                    pub_type_tokens.append(pub_type.lower())
+
+                pub_type_list = result.get("pubTypeList", {}).get("pubType", [])
+                if isinstance(pub_type_list, str):
+                    pub_type_list = [pub_type_list]
+
+                if isinstance(pub_type_list, list):
+                    for entry in pub_type_list:
+                        if isinstance(entry, str):
+                            pub_type_tokens.append(entry.lower())
+                        elif isinstance(entry, dict):
+                            label = (
+                                entry.get("text")
+                                or entry.get("name")
+                                or entry.get("value")
+                            )
+                            if label:
+                                pub_type_tokens.append(str(label).lower())
+
+                pub_type_combined = " ".join(pub_type_tokens)
+
+                pub_type_has_guideline = any(
+                    term in pub_type_combined
+                    for term in [
+                        "guideline",
+                        "practice guideline",
+                        "consensus",
+                        "recommendation",
+                    ]
+                )
 
                 # Determine if it's a guideline
                 is_guideline = (
-
-
-
+                    has_guideline_keywords
+                    and pub_type_has_guideline
+                    and not is_research
+                    and len(title) > 20
                 )
 
                 # Build URL
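The `pubTypeList` handling above has to tolerate the different shapes Europe PMC can return: a bare string, a list of strings, or a list of labelled dicts. A condensed standalone sketch of the same normalization with a toy record:

```python
def collect_pub_type_tokens(record: dict) -> str:
    """Flatten pubType / pubTypeList variants into one lowercase string."""
    tokens = []
    pub_type = record.get("pubType", "")
    if isinstance(pub_type, str):
        tokens.append(pub_type.lower())

    pub_type_list = record.get("pubTypeList", {}).get("pubType", [])
    if isinstance(pub_type_list, str):  # single bare string
        pub_type_list = [pub_type_list]
    if isinstance(pub_type_list, list):
        for entry in pub_type_list:
            if isinstance(entry, str):  # list of strings
                tokens.append(entry.lower())
            elif isinstance(entry, dict):  # list of labelled dicts
                label = entry.get("text") or entry.get("name") or entry.get("value")
                if label:
                    tokens.append(str(label).lower())
    return " ".join(tokens)

record = {
    "pubType": "review",
    "pubTypeList": {"pubType": ["Guideline", {"text": "Practice Guideline"}]},
}
print(collect_pub_type_tokens(record))
# -> review guideline practice guideline
```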
@@ -415,24 +554,33 @@
                 elif doi:
                     url = f"https://doi.org/{doi}"
 
-
-                "
-
-                "pmcid": pmcid,
-                "doi": doi,
-                "authors": result.get("authorString", ""),
-                "journal": result.get("journalTitle", ""),
-                "publication_date": result.get("firstPublicationDate", ""),
-                "publication_type": pub_type,
-                "abstract": (
-                    abstract[:500] + "..." if len(abstract) > 500 else abstract
-                ),
-                "is_guideline": is_guideline,
-                "url": url,
-                "source": "Europe PMC",
-                }
+                abstract_text = (
+                    abstract[:500] + "..." if len(abstract) > 500 else abstract
+                )
 
-
+                # Only add if it's actually a guideline
+                if is_guideline:
+                    guideline_result = {
+                        "title": title,
+                        "pmid": pmid,
+                        "pmcid": pmcid,
+                        "doi": doi,
+                        "authors": result.get("authorString", ""),
+                        "journal": result.get("journalTitle", ""),
+                        "publication_date": result.get("firstPublicationDate", ""),
+                        "publication_type": pub_type,
+                        "abstract": abstract_text,
+                        "content": abstract_text,  # Copy abstract to content field
+                        "is_guideline": is_guideline,
+                        "url": url,
+                        "source": "Europe PMC",
+                    }
+
+                    results.append(guideline_result)
+
+                    # Stop when we have enough guidelines
+                    if len(results) >= limit:
+                        break
 
             return results
 
@@ -447,6 +595,101 @@ class EuropePMCGuidelinesTool(BaseTool):
                 "source": "Europe PMC",
             }
 
+    def _get_europepmc_abstract(self, pmid):
+        """Get abstract for a specific PMID using PubMed API."""
+        if not pmid:
+            return ""
+
+        try:
+            # Use PubMed's E-utilities API
+            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+            params = {
+                "db": "pubmed",
+                "id": pmid,
+                "retmode": "xml",
+                "rettype": "abstract",
+            }
+
+            response = self.session.get(base_url, params=params, timeout=15)
+            response.raise_for_status()
+
+            # Parse XML response
+            import xml.etree.ElementTree as ET
+
+            root = ET.fromstring(response.content)
+
+            # Find abstract text
+            abstract_elem = root.find(".//AbstractText")
+            if abstract_elem is not None:
+                return abstract_elem.text or ""
+
+            # Try alternative path
+            abstract_elem = root.find(".//abstract")
+            if abstract_elem is not None:
+                return abstract_elem.text or ""
+
+            return ""
+
+        except Exception as e:
+            return f"Error fetching abstract: {str(e)}"
+
+    def _get_europepmc_full_content(self, pmid, pmcid):
+        """Get more detailed content from Europe PMC."""
+        if not pmid and not pmcid:
+            return ""
+
+        try:
+            # Try to get full text from Europe PMC
+            if pmcid:
+                full_text_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
+            else:
+                full_text_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/{pmid}/fullTextXML"
+
+            response = self.session.get(full_text_url, timeout=15)
+            if response.status_code == 200:
+                # Parse XML to extract meaningful content
+                import xml.etree.ElementTree as ET
+
+                root = ET.fromstring(response.content)
+
+                # Extract sections that might contain clinical recommendations
+                content_parts = []
+
+                # Look for methods, results, conclusions, recommendations
+                for section in root.findall(".//sec"):
+                    title_elem = section.find("title")
+                    if title_elem is not None:
+                        title = title_elem.text or ""
+                        if any(
+                            keyword in title.lower()
+                            for keyword in [
+                                "recommendation",
+                                "conclusion",
+                                "method",
+                                "result",
+                                "guideline",
+                                "clinical",
+                            ]
+                        ):
+                            # Extract text from this section
+                            text_content = ""
+                            for p in section.findall(".//p"):
+                                if p.text:
+                                    text_content += p.text + " "
+
+                            if text_content.strip():
+                                content_parts.append(f"{title}: {text_content.strip()}")
+
+                if content_parts:
+                    return " ".join(
+                        content_parts[:3]
+                    )  # Limit to first 3 relevant sections
+
+            return ""
+
+        except Exception as e:
+            return f"Error fetching full content: {str(e)}"
+
 
 @register_tool()
 class TRIPDatabaseTool(BaseTool):
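As a standalone sketch, the new abstract fetch boils down to a single E-utilities `efetch` call (same endpoint and parameters as the method above). Note it only reads the first `AbstractText` element, so structured multi-section abstracts are trimmed to their first section:

```python
import requests
import xml.etree.ElementTree as ET

def fetch_pubmed_abstract(pmid: str) -> str:
    resp = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={"db": "pubmed", "id": pmid, "retmode": "xml", "rettype": "abstract"},
        timeout=15,
    )
    resp.raise_for_status()
    root = ET.fromstring(resp.content)
    elem = root.find(".//AbstractText")  # first section only, as in the tool
    return (elem.text or "") if elem is not None else ""

print(fetch_pubmed_abstract("31078660")[:120])  # any valid PMID works here
```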
@@ -506,12 +749,58 @@ class TRIPDatabaseTool(BaseTool):
                 category_elem = doc.find("category")
                 description_elem = doc.find("description")
 
+                description_text = (
+                    description_elem.text if description_elem is not None else ""
+                )
+                url = link_elem.text if link_elem is not None else ""
+
+                key_recommendations = []
+                evidence_strength = []
+
+                fetched_content = None
+                requires_detailed_fetch = url and any(
+                    domain in url for domain in ["bmj.com/content/", "e-dmj.org"]
+                )
+
+                if (not description_text and url) or requires_detailed_fetch:
+                    fetched_content = self._fetch_guideline_content(url)
+
+                if isinstance(fetched_content, dict):
+                    description_text = (
+                        fetched_content.get("content", "") or description_text
+                    )
+                    key_recommendations = fetched_content.get("key_recommendations", [])
+                    evidence_strength = fetched_content.get("evidence_strength", [])
+                elif isinstance(fetched_content, str) and fetched_content:
+                    description_text = fetched_content
+
+                category_text = (
+                    category_elem.text.lower()
+                    if category_elem is not None and category_elem.text
+                    else ""
+                )
+
+                if category_text and "guideline" not in category_text:
+                    # Skip clearly non-guideline categories such as news or trials
+                    continue
+
+                description_lower = description_text.lower()
+                if any(
+                    phrase in description_lower
+                    for phrase in [
+                        "login required",
+                        "temporarily unavailable",
+                        "subscription required",
+                        "no results",
+                    ]
+                ):
+                    continue
+
                 guideline_result = {
                     "title": title_elem.text if title_elem is not None else "",
-                    "url":
-                    "description":
-
-                    ),
+                    "url": url,
+                    "description": description_text,
+                    "content": description_text,  # Copy description to content field
                     "publication": (
                         publication_elem.text if publication_elem is not None else ""
                     ),
@@ -520,6 +809,11 @@ class TRIPDatabaseTool(BaseTool):
                     "source": "TRIP Database",
                 }
 
+                if key_recommendations:
+                    guideline_result["key_recommendations"] = key_recommendations
+                if evidence_strength:
+                    guideline_result["evidence_strength"] = evidence_strength
+
                 results.append(guideline_result)
 
             return results
@@ -540,6 +834,274 @@ class TRIPDatabaseTool(BaseTool):
                 "source": "TRIP Database",
             }
 
+    def _fetch_guideline_content(self, url):
+        """Extract content from a guideline URL using targeted parsers when available."""
+        try:
+            time.sleep(0.5)  # Be respectful
+
+            if "bmj.com/content/" in url:
+                return self._extract_bmj_guideline_content(url)
+
+            if "e-dmj.org" in url:
+                return self._extract_dmj_guideline_content(url)
+
+            # Fallback: generic MarkItDown extraction
+            md = MarkItDown()
+            result = md.convert(url)
+
+            if not result or not getattr(result, "text_content", None):
+                return f"Content extraction failed. Document available at: {url}"
+
+            content = self._clean_generic_content(result.text_content)
+            return content
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"
+
+    def _clean_generic_content(self, raw_text):
+        """Clean generic text content to emphasise clinical lines."""
+        content = raw_text.strip()
+        content = re.sub(r"\n\s*\n", "\n\n", content)
+        content = re.sub(r" +", " ", content)
+
+        meaningful_lines = []
+        for line in content.split("\n"):
+            line = line.strip()
+            if len(line) < 20:
+                continue
+            if line.count("[") > 0 or line.count("]") > 0:
+                continue
+            if "http" in line or "//" in line:
+                continue
+
+            skip_keywords = [
+                "copyright",
+                "rights reserved",
+                "notice of rights",
+                "terms and conditions",
+                "your responsibility",
+                "local commissioners",
+                "environmental impact",
+                "medicines and healthcare",
+                "yellow card scheme",
+                "©",
+                "all rights reserved",
+            ]
+            if any(keyword in line.lower() for keyword in skip_keywords):
+                continue
+
+            clinical_keywords = [
+                "recommendation",
+                "recommendations",
+                "should",
+                "strong recommendation",
+                "conditional recommendation",
+                "clinicians",
+                "patients",
+                "treatment",
+                "management",
+                "diagnosis",
+                "assessment",
+                "therapy",
+                "intervention",
+                "pharmacologic",
+                "monitoring",
+                "screening",
+                "diabetes",
+                "glycaemic",
+            ]
+            if any(keyword in line.lower() for keyword in clinical_keywords):
+                meaningful_lines.append(line)
+
+        if meaningful_lines:
+            content = "\n".join(meaningful_lines[:8])
+        else:
+            content = content[:1000]
+
+        if len(content) > 2000:
+            truncated = content[:2000]
+            last_period = truncated.rfind(".")
+            if last_period > 1000:
+                content = truncated[: last_period + 1] + "..."
+            else:
+                content = truncated + "..."
+
+        return content
+
+    def _extract_bmj_guideline_content(self, url):
+        """Fetch BMJ Rapid Recommendation content with key recommendations."""
+        try:
+            md = MarkItDown()
+            result = md.convert(url)
+            if not result or not getattr(result, "text_content", None):
+                return {
+                    "content": f"Content extraction failed. Document available at: {url}",
+                    "key_recommendations": [],
+                    "evidence_strength": [],
+                }
+
+            text = result.text_content
+            content = self._clean_generic_content(text)
+
+            lines = [line.strip() for line in text.splitlines() if line.strip()]
+            recommendations = []
+            grading = []
+            tokens = [
+                "strong recommendation",
+                "conditional recommendation",
+                "weak recommendation",
+                "good practice statement",
+            ]
+
+            for idx, line in enumerate(lines):
+                lower = line.lower()
+                if "recommendation" not in lower:
+                    continue
+                if len(line) > 180:
+                    continue
+
+                title_clean = line.lstrip("#").strip()
+                if title_clean.startswith("+"):
+                    continue
+                if title_clean.lower().startswith("rapid recommendations"):
+                    continue
+
+                summary_lines = []
+                for following in lines[idx + 1 : idx + 10]:
+                    if "recommendation" in following.lower() and len(following) < 180:
+                        break
+                    if len(following) < 40:
+                        continue
+                    summary_lines.append(following)
+                    if len(summary_lines) >= 3:
+                        break
+
+                summary = " ".join(summary_lines)
+                if summary:
+                    recommendations.append(
+                        {"title": title_clean, "summary": summary[:400]}
+                    )
+
+                strength = None
+                for token in tokens:
+                    if token in lower or any(token in s.lower() for s in summary_lines):
+                        strength = token.title()
+                        break
+
+                if not strength:
+                    grade_match = re.search(r"grade\s+[A-D1-9]+", lower)
+                    if grade_match:
+                        strength = grade_match.group(0).title()
+
+                if strength and not any(
+                    entry.get("section") == title_clean for entry in grading
+                ):
+                    grading.append({"section": title_clean, "strength": strength})
+
+            return {
+                "content": content,
+                "key_recommendations": recommendations[:5],
+                "evidence_strength": grading,
+            }
+
+        except Exception as e:
+            return {
+                "content": f"Error extracting BMJ content: {str(e)}",
+                "key_recommendations": [],
+                "evidence_strength": [],
+            }
+
+    def _extract_dmj_guideline_content(self, url):
+        """Fetch Diabetes & Metabolism Journal guideline content and GRADE statements."""
+        try:
+            md = MarkItDown()
+            result = md.convert(url)
+            if not result or not getattr(result, "text_content", None):
+                return {
+                    "content": f"Content extraction failed. Document available at: {url}",
+                    "key_recommendations": [],
+                    "evidence_strength": [],
+                }
+
+            text = result.text_content
+            content = self._clean_generic_content(text)
+
+            lines = [line.strip() for line in text.splitlines() if line.strip()]
+            recommendations = []
+            grading = []
+
+            for idx, line in enumerate(lines):
+                lower = line.lower()
+                if not any(
+                    keyword in lower
+                    for keyword in ["recommendation", "statement", "guideline"]
+                ):
+                    continue
+                if len(line) > 200:
+                    continue
+
+                title_clean = line.lstrip("#").strip()
+                if title_clean.startswith("+") or title_clean.startswith("Table"):
+                    continue
+
+                summary_lines = []
+                for following in lines[idx + 1 : idx + 10]:
+                    if (
+                        any(
+                            keyword in following.lower()
+                            for keyword in ["recommendation", "statement", "guideline"]
+                        )
+                        and len(following) < 200
+                    ):
+                        break
+                    if len(following) < 30:
+                        continue
+                    summary_lines.append(following)
+                    if len(summary_lines) >= 3:
+                        break
+
+                summary = " ".join(summary_lines)
+                if summary:
+                    recommendations.append(
+                        {"title": title_clean, "summary": summary[:400]}
+                    )
+
+                strength = None
+                grade_match = re.search(r"grade\s+[A-E]\b", lower)
+                if grade_match:
+                    strength = grade_match.group(0).title()
+                level_match = re.search(r"level\s+[0-4]", lower)
+                if level_match:
+                    level_text = level_match.group(0).title()
+                    strength = f"{strength} ({level_text})" if strength else level_text
+
+                for line_text in summary_lines:
+                    lower_line = line_text.lower()
+                    if "strong" in lower_line and "recommendation" in lower_line:
+                        strength = "Strong recommendation"
+                        break
+                    if "conditional" in lower_line and "recommendation" in lower_line:
+                        strength = "Conditional recommendation"
+                        break
+
+                if strength and not any(
+                    entry.get("section") == title_clean for entry in grading
+                ):
+                    grading.append({"section": title_clean, "strength": strength})
+
+            return {
+                "content": content,
+                "key_recommendations": recommendations[:5],
+                "evidence_strength": grading,
+            }
+
+        except Exception as e:
+            return {
+                "content": f"Error extracting DMJ content: {str(e)}",
+                "key_recommendations": [],
+                "evidence_strength": [],
+            }
+
 
 @register_tool()
 class WHOGuidelinesTool(BaseTool):
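All three new extraction paths funnel through MarkItDown's URL conversion, matching the `MarkItDown().convert(url).text_content` usage in the diff. A minimal sketch of the generic fallback path (the URL is illustrative):

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("https://www.nice.org.uk/guidance/ng28")  # illustrative URL
text = getattr(result, "text_content", None)
if not text:
    print("Content extraction failed")
else:
    # _clean_generic_content then keeps lines containing clinical keywords
    # and truncates the result to roughly 2000 characters at a sentence boundary.
    print(text[:300])
```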
@@ -632,6 +1194,7 @@ class WHOGuidelinesTool(BaseTool):
             guidelines = []
 
             query_lower = query.lower()
+            query_terms = _extract_meaningful_terms(query)
 
             for link in all_links:
                 href = link["href"]
@@ -654,11 +1217,18 @@ class WHOGuidelinesTool(BaseTool):
                 # Fetch description from detail page
                 description = self._fetch_guideline_description(full_url)
 
+                searchable_text = (text + " " + (description or "")).lower()
+                if query_terms and not any(
+                    term in searchable_text for term in query_terms
+                ):
+                    continue
+
                 guidelines.append(
                     {
                         "title": text,
                         "url": full_url,
                         "description": description,
+                        "content": description,  # Copy description to content field
                         "source": "WHO",
                         "organization": "World Health Organization",
                         "is_guideline": True,
@@ -696,11 +1266,18 @@ class WHOGuidelinesTool(BaseTool):
                 # Fetch description from detail page
                 description = self._fetch_guideline_description(full_url)
 
+                searchable_text = (text + " " + (description or "")).lower()
+                if query_terms and not any(
+                    term in searchable_text for term in query_terms
+                ):
+                    continue
+
                 all_guidelines.append(
                     {
                         "title": text,
                         "url": full_url,
                         "description": description,
+                        "content": description,  # Copy description to content field
                         "source": "WHO",
                         "organization": "World Health Organization",
                         "is_guideline": True,
@@ -750,7 +1327,9 @@ class OpenAlexGuidelinesTool(BaseTool):
         """Search for clinical guidelines using OpenAlex API."""
         try:
             # Build search query to focus on guidelines
-            search_query =
+            search_query = (
+                f'"{query}" AND (guideline OR "clinical practice" OR recommendation)'
+            )
 
             # Build parameters
             params = {
@@ -815,16 +1394,71 @@ class OpenAlexGuidelinesTool(BaseTool):
                     else None
                 )
 
-                #
-
-
-
-
-
-
-
-
-
+                # More strict guideline detection
+                title_lower = title.lower()
+                abstract_lower = abstract.lower() if abstract else ""
+
+                # Must contain specific guideline keywords
+                guideline_keywords = [
+                    "guideline",
+                    "practice guideline",
+                    "clinical guideline",
+                    "recommendation",
+                    "consensus statement",
+                    "position statement",
+                    "clinical practice",
+                    "best practice",
+                ]
+
+                has_guideline_keywords = any(
+                    keyword in title_lower or keyword in abstract_lower
+                    for keyword in guideline_keywords
+                )
+
+                # Check structured concepts from OpenAlex for guideline markers
+                concepts = work.get("concepts", []) or []
+                has_guideline_concept = False
+                for concept in concepts:
+                    display_name = concept.get("display_name", "").lower()
+                    if any(
+                        term in display_name
+                        for term in [
+                            "guideline",
+                            "clinical practice",
+                            "recommendation",
+                            "consensus",
+                        ]
+                    ):
+                        has_guideline_concept = True
+                        break
+
+                primary_topic = work.get("primary_topic", {}) or {}
+                primary_topic_name = primary_topic.get("display_name", "").lower()
+                if any(
+                    term in primary_topic_name
+                    for term in ["guideline", "clinical practice", "recommendation"]
+                ):
+                    has_guideline_concept = True
+
+                # Exclude research papers and studies (but be less strict)
+                exclude_keywords = [
+                    "statistics",
+                    "data",
+                    "survey",
+                    "meta-analysis",
+                    "systematic review",
+                ]
+
+                is_research = any(
+                    keyword in title_lower for keyword in exclude_keywords
+                )
+
+                # Determine if it's a guideline
+                is_guideline = (
+                    has_guideline_keywords
+                    and has_guideline_concept
+                    and not is_research
+                    and len(title) > 20
                 )
 
                 # Build URL
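A rough sketch of the new concept check against the public OpenAlex API; the field names follow the diff, and the work ID is OpenAlex's documentation example:

```python
import requests

GUIDELINE_TERMS = ["guideline", "clinical practice", "recommendation", "consensus"]

work = requests.get("https://api.openalex.org/works/W2741809807", timeout=30).json()

has_guideline_concept = any(
    term in concept.get("display_name", "").lower()
    for concept in (work.get("concepts") or [])
    for term in GUIDELINE_TERMS
)
primary_name = (work.get("primary_topic") or {}).get("display_name", "").lower()
has_guideline_concept = has_guideline_concept or any(
    term in primary_name for term in GUIDELINE_TERMS[:3]
)
print(has_guideline_concept)
```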
@@ -838,23 +1472,29 @@ class OpenAlexGuidelinesTool(BaseTool):
                     )
                 )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Only add if it's actually a guideline
+                if is_guideline:
+                    abstract_text = abstract[:500] if abstract else None
+                    guideline = {
+                        "title": title,
+                        "authors": authors,
+                        "institutions": institutions[:3],
+                        "year": year,
+                        "doi": doi,
+                        "url": url,
+                        "openalex_id": openalex_id,
+                        "cited_by_count": cited_by,
+                        "is_guideline": is_guideline,
+                        "source": "OpenAlex",
+                        "abstract": abstract_text,
+                        "content": abstract_text,  # Copy abstract to content field
+                    }
+
+                    guidelines.append(guideline)
 
-
+                # Stop when we have enough guidelines
+                if len(guidelines) >= limit:
+                    break
 
             return guidelines
 
@@ -1208,3 +1848,481 @@ class WHOGuidelineFullTextTool(BaseTool):
             return {"error": f"Failed to fetch WHO guideline: {str(e)}", "url": url}
         except Exception as e:
             return {"error": f"Error parsing WHO guideline: {str(e)}", "url": url}
+
+
+@register_tool()
+class GINGuidelinesTool(BaseTool):
+    """
+    Guidelines International Network (GIN) Guidelines Search Tool.
+    Searches the global guidelines database with 6400+ guidelines from various organizations.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        self.base_url = "https://www.g-i-n.net"
+        self.search_url = f"{self.base_url}/library/international-guidelines-library"
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+        )
+
+    def run(self, arguments):
+        query = arguments.get("query", "")
+        limit = arguments.get("limit", 10)
+
+        if not query:
+            return {"error": "Query parameter is required"}
+
+        return self._search_gin_guidelines(query, limit)
+
+    def _search_gin_guidelines(self, query, limit):
+        """Search GIN guidelines using web scraping."""
+        try:
+            time.sleep(1)  # Be respectful
+
+            # Try to search GIN guidelines
+            try:
+                # GIN search typically uses form parameters
+                search_params = {"search": query, "type": "guideline", "limit": limit}
+
+                response = self.session.get(
+                    self.search_url, params=search_params, timeout=30
+                )
+                response.raise_for_status()
+
+                soup = BeautifulSoup(response.content, "html.parser")
+
+                # Find guideline results - common selectors for guideline databases
+                guidelines = []
+
+                # Try different selectors for guideline results
+                result_selectors = [
+                    "div.guideline-item",
+                    "div.search-result",
+                    "div.result-item",
+                    "article.guideline",
+                    "div.item",
+                    "li.guideline",
+                ]
+
+                results = []
+                for selector in result_selectors:
+                    results = soup.select(selector)
+                    if results:
+                        break
+
+                if not results:
+                    # Fallback: look for any div with guideline-related content
+                    results = soup.find_all(
+                        "div",
+                        class_=lambda x: x
+                        and any(
+                            keyword in x.lower()
+                            for keyword in ["guideline", "result", "item", "card"]
+                        ),
+                    )
+
+                for result in results[:limit]:
+                    try:
+                        # Extract title
+                        title_elem = (
+                            result.find("h3")
+                            or result.find("h2")
+                            or result.find("a", class_="title")
+                            or result.find("a")
+                        )
+                        if not title_elem:
+                            continue
+
+                        title = title_elem.get_text().strip()
+                        if not title or len(title) < 10:
+                            continue
+
+                        # Extract URL
+                        link_elem = result.find("a", href=True)
+                        if not link_elem:
+                            continue
+
+                        url = link_elem.get("href", "")
+                        if url.startswith("/"):
+                            url = self.base_url + url
+                        elif not url.startswith("http"):
+                            continue
+
+                        # Extract description/summary
+                        desc_elem = (
+                            result.find("p")
+                            or result.find("div", class_="description")
+                            or result.find("div", class_="summary")
+                        )
+                        description = desc_elem.get_text().strip() if desc_elem else ""
+
+                        # Extract organization
+                        org_elem = (
+                            result.find("span", class_="organization")
+                            or result.find("div", class_="org")
+                            or result.find("cite")
+                        )
+                        organization = (
+                            org_elem.get_text().strip()
+                            if org_elem
+                            else "GIN Member Organization"
+                        )
+
+                        # Extract date
+                        date_elem = (
+                            result.find("time")
+                            or result.find("span", class_="date")
+                            or result.find("div", class_="date")
+                        )
+                        date = date_elem.get_text().strip() if date_elem else ""
+
+                        # Extract content from the guideline page
+                        content = self._extract_guideline_content(url)
+
+                        guidelines.append(
+                            {
+                                "title": title,
+                                "url": url,
+                                "description": description,
+                                "content": content,
+                                "date": date,
+                                "source": "GIN",
+                                "organization": organization,
+                                "is_guideline": True,
+                                "official": True,
+                            }
+                        )
+
+                    except Exception:
+                        continue
+
+                if guidelines:
+                    return guidelines
+
+            except requests.exceptions.RequestException as e:
+                print(f"GIN website access failed: {e}, trying fallback search...")
+
+            # Fallback: Return sample guidelines based on query
+            return self._get_fallback_gin_guidelines(query, limit)
+
+        except Exception as e:
+            return {
+                "error": f"Error processing GIN guidelines: {str(e)}",
+                "source": "GIN",
+            }
+
+    def _get_fallback_gin_guidelines(self, query, limit):
+        """Provide fallback guidelines when direct access fails."""
+        # This would contain sample guidelines based on common queries
+        # For now, return a message indicating the issue
+        return [
+            {
+                "title": f"GIN Guidelines Search for '{query}'",
+                "url": self.search_url,
+                "description": "GIN guidelines database access temporarily unavailable. Please try again later or visit the GIN website directly.",
+                "content": "The Guidelines International Network (GIN) maintains the world's largest database of clinical guidelines with over 6400 guidelines from various organizations worldwide.",
+                "date": "",
+                "source": "GIN",
+                "organization": "Guidelines International Network",
+                "is_guideline": False,
+                "official": True,
+                "is_placeholder": True,
+                "note": "Direct access to GIN database failed. Please visit g-i-n.net for full access.",
+            }
+        ]
+
+    def _extract_guideline_content(self, url):
+        """Extract actual content from a guideline URL."""
+        try:
+            time.sleep(0.5)  # Be respectful
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Extract main content
+            content_selectors = [
+                "main",
+                ".content",
+                ".article-content",
+                ".guideline-content",
+                "article",
+                ".main-content",
+            ]
+
+            content_text = ""
+            for selector in content_selectors:
+                content_elem = soup.select_one(selector)
+                if content_elem:
+                    # Get all text content
+                    paragraphs = content_elem.find_all("p")
+                    content_parts = []
+                    for p in paragraphs:
+                        text = p.get_text().strip()
+                        if len(text) > 20:  # Skip very short paragraphs
+                            content_parts.append(text)
+
+                    if content_parts:
+                        content_text = "\n\n".join(
+                            content_parts[:10]
+                        )  # Limit to first 10 paragraphs
+                        break
+
+            # If no main content found, try to get any meaningful text
+            if not content_text:
+                all_text = soup.get_text()
+                # Clean up the text
+                lines = [line.strip() for line in all_text.split("\n") if line.strip()]
+                content_text = "\n".join(lines[:20])  # First 20 meaningful lines
+
+            return content_text[:2000]  # Limit content length
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"
+
+
+@register_tool()
+class CMAGuidelinesTool(BaseTool):
+    """
+    Canadian Medical Association (CMA) Infobase Guidelines Search Tool.
+    Searches the CMA Infobase with 1200+ Canadian clinical practice guidelines.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        self.base_url = "https://joulecma.ca"
+        self.search_url = f"{self.base_url}/infobase"
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+        )
+
+    def run(self, arguments):
+        query = arguments.get("query", "")
+        limit = arguments.get("limit", 10)
+
+        if not query:
+            return {"error": "Query parameter is required"}
+
+        return self._search_cma_guidelines(query, limit)
+
+    def _search_cma_guidelines(self, query, limit):
+        """Search CMA Infobase guidelines using web scraping."""
+        try:
+            time.sleep(1)  # Be respectful
+
+            # Try to search CMA Infobase
+            try:
+                # CMA search typically uses form parameters
+                search_params = {"search": query, "type": "guideline", "limit": limit}
+
+                response = self.session.get(
+                    self.search_url, params=search_params, timeout=30
+                )
+                response.raise_for_status()
+
+                soup = BeautifulSoup(response.content, "html.parser")
+
+                # Find guideline results
+                guidelines = []
+
+                # Try different selectors for guideline results
+                result_selectors = [
+                    "div.guideline-item",
+                    "div.search-result",
+                    "div.result-item",
+                    "article.guideline",
+                    "div.item",
+                    "li.guideline",
+                ]
+
+                results = []
+                for selector in result_selectors:
+                    results = soup.select(selector)
+                    if results:
+                        break
+
+                if not results:
+                    # Fallback: look for any div with guideline-related content
+                    results = soup.find_all(
+                        "div",
+                        class_=lambda x: x
+                        and any(
+                            keyword in x.lower()
+                            for keyword in ["guideline", "result", "item", "card"]
+                        ),
+                    )
+
+                for result in results[:limit]:
+                    try:
+                        # Extract title
+                        title_elem = (
+                            result.find("h3")
+                            or result.find("h2")
+                            or result.find("a", class_="title")
+                            or result.find("a")
+                        )
+                        if not title_elem:
+                            continue
+
+                        title = title_elem.get_text().strip()
+                        if not title or len(title) < 10:
+                            continue
+
+                        # Extract URL
+                        link_elem = result.find("a", href=True)
+                        if not link_elem:
+                            continue
+
+                        url = link_elem.get("href", "")
+                        if url.startswith("/"):
+                            url = self.base_url + url
+                        elif not url.startswith("http"):
+                            continue
+
+                        # Extract description/summary
+                        desc_elem = (
+                            result.find("p")
+                            or result.find("div", class_="description")
+                            or result.find("div", class_="summary")
+                        )
+                        description = desc_elem.get_text().strip() if desc_elem else ""
+
+                        # Extract organization
+                        org_elem = (
+                            result.find("span", class_="organization")
+                            or result.find("div", class_="org")
+                            or result.find("cite")
+                        )
+                        organization = (
+                            org_elem.get_text().strip()
+                            if org_elem
+                            else "Canadian Medical Association"
+                        )
+
+                        # Extract date
+                        date_elem = (
+                            result.find("time")
+                            or result.find("span", class_="date")
+                            or result.find("div", class_="date")
+                        )
+                        date = date_elem.get_text().strip() if date_elem else ""
+
+                        # Extract content from the guideline page
+                        content = self._extract_guideline_content(url)
+
+                        guidelines.append(
+                            {
+                                "title": title,
+                                "url": url,
+                                "description": description,
+                                "content": content,
+                                "date": date,
+                                "source": "CMA",
+                                "organization": organization,
+                                "is_guideline": True,
+                                "official": True,
+                            }
+                        )
+
+                    except Exception:
+                        continue
+
+                if guidelines:
+                    return guidelines
+
+            except requests.exceptions.RequestException as e:
+                print(f"CMA Infobase access failed: {e}, trying fallback search...")
+
+            # Fallback: Return sample guidelines based on query
+            return self._get_fallback_cma_guidelines(query, limit)
+
+        except Exception as e:
+            return {
+                "error": f"Error processing CMA guidelines: {str(e)}",
+                "source": "CMA",
+            }
+
+    def _get_fallback_cma_guidelines(self, query, limit):
+        """Provide fallback guidelines when direct access fails."""
+        # This would contain sample guidelines based on common queries
+        # For now, return a message indicating the issue
+        return [
+            {
+                "title": f"CMA Infobase Guidelines Search for '{query}'",
+                "url": self.search_url,
+                "description": "CMA Infobase access temporarily unavailable. Please try again later or visit the CMA website directly.",
+                "content": "The Canadian Medical Association Infobase contains over 1200 evidence-based clinical practice guidelines developed or endorsed by Canadian healthcare organizations.",
+                "date": "",
+                "source": "CMA",
+                "organization": "Canadian Medical Association",
+                "is_guideline": False,
+                "official": True,
+                "is_placeholder": True,
+                "note": "Direct access to CMA Infobase failed. Please visit joulecma.ca/infobase for full access.",
+            }
+        ]
+
+    def _extract_guideline_content(self, url):
+        """Extract actual content from a guideline URL."""
+        try:
+            time.sleep(0.5)  # Be respectful
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Extract main content
+            content_selectors = [
+                "main",
+                ".content",
+                ".article-content",
+                ".guideline-content",
+                "article",
+                ".main-content",
+            ]
+
+            content_text = ""
+            for selector in content_selectors:
+                content_elem = soup.select_one(selector)
+                if content_elem:
+                    # Get all text content
+                    paragraphs = content_elem.find_all("p")
+                    content_parts = []
+                    for p in paragraphs:
+                        text = p.get_text().strip()
+                        if len(text) > 20:  # Skip very short paragraphs
+                            content_parts.append(text)
+
+                    if content_parts:
+                        content_text = "\n\n".join(
+                            content_parts[:10]
+                        )  # Limit to first 10 paragraphs
+                        break
+
+            # If no main content found, try to get any meaningful text
+            if not content_text:
+                all_text = soup.get_text()
+                # Clean up the text
+                lines = [line.strip() for line in all_text.split("\n") if line.strip()]
+                content_text = "\n".join(lines[:20])  # First 20 meaningful lines
+
+            return content_text[:2000]  # Limit content length
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"