tooluniverse 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tooluniverse might be problematic.

Files changed (96)
  1. tooluniverse/__init__.py +37 -14
  2. tooluniverse/admetai_tool.py +16 -5
  3. tooluniverse/base_tool.py +36 -0
  4. tooluniverse/biogrid_tool.py +118 -0
  5. tooluniverse/build_optimizer.py +87 -0
  6. tooluniverse/cache/__init__.py +3 -0
  7. tooluniverse/cache/memory_cache.py +99 -0
  8. tooluniverse/cache/result_cache_manager.py +235 -0
  9. tooluniverse/cache/sqlite_backend.py +257 -0
  10. tooluniverse/clinvar_tool.py +90 -0
  11. tooluniverse/compose_scripts/output_summarizer.py +87 -33
  12. tooluniverse/compose_tool.py +2 -2
  13. tooluniverse/custom_tool.py +28 -0
  14. tooluniverse/data/adverse_event_tools.json +97 -98
  15. tooluniverse/data/agentic_tools.json +81 -162
  16. tooluniverse/data/arxiv_tools.json +1 -4
  17. tooluniverse/data/compose_tools.json +0 -54
  18. tooluniverse/data/core_tools.json +1 -4
  19. tooluniverse/data/dataset_tools.json +7 -7
  20. tooluniverse/data/doaj_tools.json +1 -3
  21. tooluniverse/data/drug_discovery_agents.json +282 -0
  22. tooluniverse/data/europe_pmc_tools.json +1 -2
  23. tooluniverse/data/genomics_tools.json +174 -0
  24. tooluniverse/data/geo_tools.json +86 -0
  25. tooluniverse/data/literature_search_tools.json +15 -35
  26. tooluniverse/data/markitdown_tools.json +51 -0
  27. tooluniverse/data/monarch_tools.json +1 -2
  28. tooluniverse/data/openalex_tools.json +1 -5
  29. tooluniverse/data/opentarget_tools.json +8 -16
  30. tooluniverse/data/output_summarization_tools.json +23 -20
  31. tooluniverse/data/packages/bioinformatics_core_tools.json +2 -2
  32. tooluniverse/data/packages/cheminformatics_tools.json +1 -1
  33. tooluniverse/data/packages/genomics_tools.json +1 -1
  34. tooluniverse/data/packages/single_cell_tools.json +1 -1
  35. tooluniverse/data/packages/structural_biology_tools.json +1 -1
  36. tooluniverse/data/pmc_tools.json +1 -4
  37. tooluniverse/data/ppi_tools.json +139 -0
  38. tooluniverse/data/pubmed_tools.json +1 -3
  39. tooluniverse/data/semantic_scholar_tools.json +1 -2
  40. tooluniverse/data/tool_composition_tools.json +2 -4
  41. tooluniverse/data/unified_guideline_tools.json +206 -4
  42. tooluniverse/data/xml_tools.json +15 -15
  43. tooluniverse/data/zenodo_tools.json +1 -2
  44. tooluniverse/dbsnp_tool.py +71 -0
  45. tooluniverse/default_config.py +6 -0
  46. tooluniverse/ensembl_tool.py +61 -0
  47. tooluniverse/execute_function.py +235 -76
  48. tooluniverse/generate_tools.py +303 -20
  49. tooluniverse/genomics_gene_search_tool.py +56 -0
  50. tooluniverse/geo_tool.py +116 -0
  51. tooluniverse/gnomad_tool.py +63 -0
  52. tooluniverse/logging_config.py +64 -2
  53. tooluniverse/markitdown_tool.py +159 -0
  54. tooluniverse/mcp_client_tool.py +10 -5
  55. tooluniverse/molecule_2d_tool.py +9 -3
  56. tooluniverse/molecule_3d_tool.py +9 -3
  57. tooluniverse/output_hook.py +217 -150
  58. tooluniverse/smcp.py +18 -10
  59. tooluniverse/smcp_server.py +89 -199
  60. tooluniverse/string_tool.py +112 -0
  61. tooluniverse/tools/{MultiAgentLiteratureSearch.py → ADMETAnalyzerAgent.py} +18 -18
  62. tooluniverse/tools/ArXiv_search_papers.py +3 -3
  63. tooluniverse/tools/CMA_Guidelines_Search.py +52 -0
  64. tooluniverse/tools/CORE_search_papers.py +3 -3
  65. tooluniverse/tools/ClinVar_search_variants.py +52 -0
  66. tooluniverse/tools/ClinicalTrialDesignAgent.py +63 -0
  67. tooluniverse/tools/CompoundDiscoveryAgent.py +59 -0
  68. tooluniverse/tools/DOAJ_search_articles.py +2 -2
  69. tooluniverse/tools/DiseaseAnalyzerAgent.py +52 -0
  70. tooluniverse/tools/DrugInteractionAnalyzerAgent.py +52 -0
  71. tooluniverse/tools/DrugOptimizationAgent.py +63 -0
  72. tooluniverse/tools/Ensembl_lookup_gene_by_symbol.py +52 -0
  73. tooluniverse/tools/EuropePMC_search_articles.py +1 -1
  74. tooluniverse/tools/GIN_Guidelines_Search.py +52 -0
  75. tooluniverse/tools/GWAS_search_associations_by_gene.py +52 -0
  76. tooluniverse/tools/LiteratureSynthesisAgent.py +59 -0
  77. tooluniverse/tools/PMC_search_papers.py +3 -3
  78. tooluniverse/tools/PubMed_search_articles.py +2 -2
  79. tooluniverse/tools/SemanticScholar_search_papers.py +1 -1
  80. tooluniverse/tools/UCSC_get_genes_by_region.py +67 -0
  81. tooluniverse/tools/Zenodo_search_records.py +1 -1
  82. tooluniverse/tools/__init__.py +33 -3
  83. tooluniverse/tools/convert_to_markdown.py +59 -0
  84. tooluniverse/tools/dbSNP_get_variant_by_rsid.py +46 -0
  85. tooluniverse/tools/gnomAD_query_variant.py +52 -0
  86. tooluniverse/tools/openalex_literature_search.py +4 -4
  87. tooluniverse/ucsc_tool.py +60 -0
  88. tooluniverse/unified_guideline_tools.py +1175 -57
  89. tooluniverse/utils.py +51 -4
  90. tooluniverse/zenodo_tool.py +2 -1
  91. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/METADATA +10 -3
  92. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/RECORD +96 -61
  93. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/entry_points.txt +0 -3
  94. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/WHEEL +0 -0
  95. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/licenses/LICENSE +0 -0
  96. {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/top_level.txt +0 -0
@@ -9,10 +9,44 @@ import time
 import re
 import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
+from markitdown import MarkItDown
 from .base_tool import BaseTool
 from .tool_registry import register_tool
 
 
+def _extract_meaningful_terms(query):
+    """Return significant query terms for relevance filtering."""
+    if not isinstance(query, str):
+        return []
+
+    # Keep alphabetic tokens with length >= 3
+    tokens = re.findall(r"[a-zA-Z]{3,}", query.lower())
+    stop_terms = {
+        "management",
+        "care",
+        "guideline",
+        "guidelines",
+        "clinical",
+        "practice",
+        "and",
+        "with",
+        "for",
+        "the",
+        "that",
+        "from",
+        "into",
+        "using",
+        "update",
+        "introduction",
+        "review",
+        "overview",
+        "recommendation",
+        "recommendations",
+    }
+    meaningful = [token for token in tokens if token not in stop_terms]
+    return meaningful if meaningful else tokens
+
+
 @register_tool()
 class NICEWebScrapingTool(BaseTool):
     """
@@ -174,6 +208,7 @@ class NICEWebScrapingTool(BaseTool):
                     "title": title,
                     "url": url,
                     "summary": summary,
+                    "content": summary,  # Copy summary to content field
                     "date": date,
                     "type": guideline_type,
                     "source": "NICE",
@@ -302,6 +337,8 @@ class PubMedGuidelinesTool(BaseTool):
 
         # Process results
         results = []
+        query_terms = _extract_meaningful_terms(query)
+
         for pmid in pmids:
             if pmid in detail_data.get("result", {}):
                 article = detail_data["result"][pmid]
@@ -318,10 +355,25 @@ class PubMedGuidelinesTool(BaseTool):
                 pub_types = article.get("pubtype", [])
                 is_guideline = any("guideline" in pt.lower() for pt in pub_types)
 
+                abstract_text = abstracts.get(pmid, "")
+                searchable_text = " ".join(
+                    [
+                        article.get("title", ""),
+                        abstract_text or "",
+                        " ".join(pub_types),
+                    ]
+                ).lower()
+
+                if query_terms and not any(
+                    term in searchable_text for term in query_terms
+                ):
+                    continue
+
                 result = {
                     "pmid": pmid,
                     "title": article.get("title", ""),
-                    "abstract": abstracts.get(pmid, ""),
+                    "abstract": abstract_text,
+                    "content": abstract_text,  # Copy abstract to content field
                     "authors": author_str,
                     "journal": article.get("source", ""),
                     "publication_date": article.get("pubdate", ""),
@@ -373,10 +425,14 @@ class EuropePMCGuidelinesTool(BaseTool):
     def _search_europepmc_guidelines(self, query, limit):
         """Search Europe PMC for guideline publications."""
         try:
-            # Add guideline filter to query
-            guideline_query = f"guideline AND {query}"
+            # More specific guideline search query
+            guideline_query = f'"{query}" AND (guideline OR "practice guideline" OR "clinical guideline" OR recommendation OR "consensus statement")'
 
-            params = {"query": guideline_query, "format": "json", "pageSize": limit}
+            params = {
+                "query": guideline_query,
+                "format": "json",
+                "pageSize": limit * 2,
+            }  # Get more to filter
 
             response = self.session.get(self.base_url, params=params, timeout=30)
             response.raise_for_status()
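For reference, a standalone sketch of the request this reworked query produces, using the public Europe PMC REST search endpoint (the literal endpoint URL is assumed here; in the tool it comes from `self.base_url`, which is configured elsewhere in the class):

```python
import requests

# Illustrative only: mirrors the quoted/OR query string built above.
query = "type 2 diabetes"
guideline_query = (
    f'"{query}" AND (guideline OR "practice guideline" OR "clinical guideline" '
    f'OR recommendation OR "consensus statement")'
)
params = {"query": guideline_query, "format": "json", "pageSize": 20}  # limit * 2
resp = requests.get(
    "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
    params=params,
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # matching records are listed under resultList -> result
```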
@@ -388,18 +444,101 @@ class EuropePMCGuidelinesTool(BaseTool):
             if not results_list:
                 return []
 
-            # Process results
+            # Process results with stricter filtering
             results = []
             for result in results_list:
                 title = result.get("title", "")
                 pub_type = result.get("pubType", "")
-                abstract = result.get("abstractText", "")
+
+                # Get abstract from detailed API call
+                abstract = self._get_europepmc_abstract(result.get("pmid", ""))
+
+                # If abstract is too short or just a question, try to get more content
+                if len(abstract) < 200 or abstract.endswith("?"):
+                    # Try to get full text or more detailed content
+                    abstract = self._get_europepmc_full_content(
+                        result.get("pmid", ""), result.get("pmcid", "")
+                    )
+
+                # More strict guideline detection
+                title_lower = title.lower()
+                abstract_lower = abstract.lower()
+
+                # Must contain guideline-related keywords in title or abstract
+                guideline_keywords = [
+                    "guideline",
+                    "practice guideline",
+                    "clinical guideline",
+                    "recommendation",
+                    "consensus statement",
+                    "position statement",
+                    "clinical practice",
+                    "best practice",
+                ]
+
+                has_guideline_keywords = any(
+                    keyword in title_lower or keyword in abstract_lower
+                    for keyword in guideline_keywords
+                )
+
+                # Exclude research papers and studies
+                exclude_keywords = [
+                    "study",
+                    "trial",
+                    "analysis",
+                    "evaluation",
+                    "assessment",
+                    "effectiveness",
+                    "efficacy",
+                    "outcome",
+                    "result",
+                    "finding",
+                ]
+
+                is_research = any(
+                    keyword in title_lower for keyword in exclude_keywords
+                )
+
+                # Publication type must confirm guideline nature
+                pub_type_tokens = []
+                if isinstance(pub_type, str):
+                    pub_type_tokens.append(pub_type.lower())
+
+                pub_type_list = result.get("pubTypeList", {}).get("pubType", [])
+                if isinstance(pub_type_list, str):
+                    pub_type_list = [pub_type_list]
+
+                if isinstance(pub_type_list, list):
+                    for entry in pub_type_list:
+                        if isinstance(entry, str):
+                            pub_type_tokens.append(entry.lower())
+                        elif isinstance(entry, dict):
+                            label = (
+                                entry.get("text")
+                                or entry.get("name")
+                                or entry.get("value")
+                            )
+                            if label:
+                                pub_type_tokens.append(str(label).lower())
+
+                pub_type_combined = " ".join(pub_type_tokens)
+
+                pub_type_has_guideline = any(
+                    term in pub_type_combined
+                    for term in [
+                        "guideline",
+                        "practice guideline",
+                        "consensus",
+                        "recommendation",
+                    ]
+                )
 
                 # Determine if it's a guideline
                 is_guideline = (
-                    "guideline" in title.lower()
-                    or "guideline" in pub_type.lower()
-                    or "guideline" in abstract.lower()
+                    has_guideline_keywords
+                    and pub_type_has_guideline
+                    and not is_research
+                    and len(title) > 20
                 )
 
                 # Build URL
@@ -415,24 +554,33 @@ class EuropePMCGuidelinesTool(BaseTool):
                 elif doi:
                     url = f"https://doi.org/{doi}"
 
-                guideline_result = {
-                    "title": title,
-                    "pmid": pmid,
-                    "pmcid": pmcid,
-                    "doi": doi,
-                    "authors": result.get("authorString", ""),
-                    "journal": result.get("journalTitle", ""),
-                    "publication_date": result.get("firstPublicationDate", ""),
-                    "publication_type": pub_type,
-                    "abstract": (
-                        abstract[:500] + "..." if len(abstract) > 500 else abstract
-                    ),
-                    "is_guideline": is_guideline,
-                    "url": url,
-                    "source": "Europe PMC",
-                }
+                abstract_text = (
+                    abstract[:500] + "..." if len(abstract) > 500 else abstract
+                )
 
-                results.append(guideline_result)
+                # Only add if it's actually a guideline
+                if is_guideline:
+                    guideline_result = {
+                        "title": title,
+                        "pmid": pmid,
+                        "pmcid": pmcid,
+                        "doi": doi,
+                        "authors": result.get("authorString", ""),
+                        "journal": result.get("journalTitle", ""),
+                        "publication_date": result.get("firstPublicationDate", ""),
+                        "publication_type": pub_type,
+                        "abstract": abstract_text,
+                        "content": abstract_text,  # Copy abstract to content field
+                        "is_guideline": is_guideline,
+                        "url": url,
+                        "source": "Europe PMC",
+                    }
+
+                    results.append(guideline_result)
+
+                    # Stop when we have enough guidelines
+                    if len(results) >= limit:
+                        break
 
             return results
 
@@ -447,6 +595,101 @@ class EuropePMCGuidelinesTool(BaseTool):
                 "source": "Europe PMC",
             }
 
+    def _get_europepmc_abstract(self, pmid):
+        """Get abstract for a specific PMID using PubMed API."""
+        if not pmid:
+            return ""
+
+        try:
+            # Use PubMed's E-utilities API
+            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+            params = {
+                "db": "pubmed",
+                "id": pmid,
+                "retmode": "xml",
+                "rettype": "abstract",
+            }
+
+            response = self.session.get(base_url, params=params, timeout=15)
+            response.raise_for_status()
+
+            # Parse XML response
+            import xml.etree.ElementTree as ET
+
+            root = ET.fromstring(response.content)
+
+            # Find abstract text
+            abstract_elem = root.find(".//AbstractText")
+            if abstract_elem is not None:
+                return abstract_elem.text or ""
+
+            # Try alternative path
+            abstract_elem = root.find(".//abstract")
+            if abstract_elem is not None:
+                return abstract_elem.text or ""
+
+            return ""
+
+        except Exception as e:
+            return f"Error fetching abstract: {str(e)}"
+
+    def _get_europepmc_full_content(self, pmid, pmcid):
+        """Get more detailed content from Europe PMC."""
+        if not pmid and not pmcid:
+            return ""
+
+        try:
+            # Try to get full text from Europe PMC
+            if pmcid:
+                full_text_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
+            else:
+                full_text_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/{pmid}/fullTextXML"
+
+            response = self.session.get(full_text_url, timeout=15)
+            if response.status_code == 200:
+                # Parse XML to extract meaningful content
+                import xml.etree.ElementTree as ET
+
+                root = ET.fromstring(response.content)
+
+                # Extract sections that might contain clinical recommendations
+                content_parts = []
+
+                # Look for methods, results, conclusions, recommendations
+                for section in root.findall(".//sec"):
+                    title_elem = section.find("title")
+                    if title_elem is not None:
+                        title = title_elem.text or ""
+                        if any(
+                            keyword in title.lower()
+                            for keyword in [
+                                "recommendation",
+                                "conclusion",
+                                "method",
+                                "result",
+                                "guideline",
+                                "clinical",
+                            ]
+                        ):
+                            # Extract text from this section
+                            text_content = ""
+                            for p in section.findall(".//p"):
+                                if p.text:
+                                    text_content += p.text + " "
+
+                            if text_content.strip():
+                                content_parts.append(f"{title}: {text_content.strip()}")
+
+                if content_parts:
+                    return " ".join(
+                        content_parts[:3]
+                    )  # Limit to first 3 relevant sections
+
+            return ""
+
+        except Exception as e:
+            return f"Error fetching full content: {str(e)}"
+
 
 @register_tool()
 class TRIPDatabaseTool(BaseTool):
@@ -506,12 +749,58 @@ class TRIPDatabaseTool(BaseTool):
                 category_elem = doc.find("category")
                 description_elem = doc.find("description")
 
+                description_text = (
+                    description_elem.text if description_elem is not None else ""
+                )
+                url = link_elem.text if link_elem is not None else ""
+
+                key_recommendations = []
+                evidence_strength = []
+
+                fetched_content = None
+                requires_detailed_fetch = url and any(
+                    domain in url for domain in ["bmj.com/content/", "e-dmj.org"]
+                )
+
+                if (not description_text and url) or requires_detailed_fetch:
+                    fetched_content = self._fetch_guideline_content(url)
+
+                if isinstance(fetched_content, dict):
+                    description_text = (
+                        fetched_content.get("content", "") or description_text
+                    )
+                    key_recommendations = fetched_content.get("key_recommendations", [])
+                    evidence_strength = fetched_content.get("evidence_strength", [])
+                elif isinstance(fetched_content, str) and fetched_content:
+                    description_text = fetched_content
+
+                category_text = (
+                    category_elem.text.lower()
+                    if category_elem is not None and category_elem.text
+                    else ""
+                )
+
+                if category_text and "guideline" not in category_text:
+                    # Skip clearly non-guideline categories such as news or trials
+                    continue
+
+                description_lower = description_text.lower()
+                if any(
+                    phrase in description_lower
+                    for phrase in [
+                        "login required",
+                        "temporarily unavailable",
+                        "subscription required",
+                        "no results",
+                    ]
+                ):
+                    continue
+
                 guideline_result = {
                     "title": title_elem.text if title_elem is not None else "",
-                    "url": link_elem.text if link_elem is not None else "",
-                    "description": (
-                        description_elem.text if description_elem is not None else ""
-                    ),
+                    "url": url,
+                    "description": description_text,
+                    "content": description_text,  # Copy description to content field
                     "publication": (
                         publication_elem.text if publication_elem is not None else ""
                     ),
@@ -520,6 +809,11 @@ class TRIPDatabaseTool(BaseTool):
                     "source": "TRIP Database",
                 }
 
+                if key_recommendations:
+                    guideline_result["key_recommendations"] = key_recommendations
+                if evidence_strength:
+                    guideline_result["evidence_strength"] = evidence_strength
+
                 results.append(guideline_result)
 
             return results
@@ -540,6 +834,274 @@ class TRIPDatabaseTool(BaseTool):
                 "source": "TRIP Database",
             }
 
+    def _fetch_guideline_content(self, url):
+        """Extract content from a guideline URL using targeted parsers when available."""
+        try:
+            time.sleep(0.5)  # Be respectful
+
+            if "bmj.com/content/" in url:
+                return self._extract_bmj_guideline_content(url)
+
+            if "e-dmj.org" in url:
+                return self._extract_dmj_guideline_content(url)
+
+            # Fallback: generic MarkItDown extraction
+            md = MarkItDown()
+            result = md.convert(url)
+
+            if not result or not getattr(result, "text_content", None):
+                return f"Content extraction failed. Document available at: {url}"
+
+            content = self._clean_generic_content(result.text_content)
+            return content
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"
+
+    def _clean_generic_content(self, raw_text):
+        """Clean generic text content to emphasise clinical lines."""
+        content = raw_text.strip()
+        content = re.sub(r"\n\s*\n", "\n\n", content)
+        content = re.sub(r" +", " ", content)
+
+        meaningful_lines = []
+        for line in content.split("\n"):
+            line = line.strip()
+            if len(line) < 20:
+                continue
+            if line.count("[") > 0 or line.count("]") > 0:
+                continue
+            if "http" in line or "//" in line:
+                continue
+
+            skip_keywords = [
+                "copyright",
+                "rights reserved",
+                "notice of rights",
+                "terms and conditions",
+                "your responsibility",
+                "local commissioners",
+                "environmental impact",
+                "medicines and healthcare",
+                "yellow card scheme",
+                "©",
+                "all rights reserved",
+            ]
+            if any(keyword in line.lower() for keyword in skip_keywords):
+                continue
+
+            clinical_keywords = [
+                "recommendation",
+                "recommendations",
+                "should",
+                "strong recommendation",
+                "conditional recommendation",
+                "clinicians",
+                "patients",
+                "treatment",
+                "management",
+                "diagnosis",
+                "assessment",
+                "therapy",
+                "intervention",
+                "pharmacologic",
+                "monitoring",
+                "screening",
+                "diabetes",
+                "glycaemic",
+            ]
+            if any(keyword in line.lower() for keyword in clinical_keywords):
+                meaningful_lines.append(line)
+
+        if meaningful_lines:
+            content = "\n".join(meaningful_lines[:8])
+        else:
+            content = content[:1000]
+
+        if len(content) > 2000:
+            truncated = content[:2000]
+            last_period = truncated.rfind(".")
+            if last_period > 1000:
+                content = truncated[: last_period + 1] + "..."
+            else:
+                content = truncated + "..."
+
+        return content
+
+    def _extract_bmj_guideline_content(self, url):
+        """Fetch BMJ Rapid Recommendation content with key recommendations."""
+        try:
+            md = MarkItDown()
+            result = md.convert(url)
+            if not result or not getattr(result, "text_content", None):
+                return {
+                    "content": f"Content extraction failed. Document available at: {url}",
+                    "key_recommendations": [],
+                    "evidence_strength": [],
+                }
+
+            text = result.text_content
+            content = self._clean_generic_content(text)
+
+            lines = [line.strip() for line in text.splitlines() if line.strip()]
+            recommendations = []
+            grading = []
+            tokens = [
+                "strong recommendation",
+                "conditional recommendation",
+                "weak recommendation",
+                "good practice statement",
+            ]
+
+            for idx, line in enumerate(lines):
+                lower = line.lower()
+                if "recommendation" not in lower:
+                    continue
+                if len(line) > 180:
+                    continue
+
+                title_clean = line.lstrip("#").strip()
+                if title_clean.startswith("+"):
+                    continue
+                if title_clean.lower().startswith("rapid recommendations"):
+                    continue
+
+                summary_lines = []
+                for following in lines[idx + 1 : idx + 10]:
+                    if "recommendation" in following.lower() and len(following) < 180:
+                        break
+                    if len(following) < 40:
+                        continue
+                    summary_lines.append(following)
+                    if len(summary_lines) >= 3:
+                        break
+
+                summary = " ".join(summary_lines)
+                if summary:
+                    recommendations.append(
+                        {"title": title_clean, "summary": summary[:400]}
+                    )
+
+                strength = None
+                for token in tokens:
+                    if token in lower or any(token in s.lower() for s in summary_lines):
+                        strength = token.title()
+                        break
+
+                if not strength:
+                    grade_match = re.search(r"grade\s+[A-D1-9]+", lower)
+                    if grade_match:
+                        strength = grade_match.group(0).title()
+
+                if strength and not any(
+                    entry.get("section") == title_clean for entry in grading
+                ):
+                    grading.append({"section": title_clean, "strength": strength})
+
+            return {
+                "content": content,
+                "key_recommendations": recommendations[:5],
+                "evidence_strength": grading,
+            }
+
+        except Exception as e:
+            return {
+                "content": f"Error extracting BMJ content: {str(e)}",
+                "key_recommendations": [],
+                "evidence_strength": [],
+            }
+
+    def _extract_dmj_guideline_content(self, url):
+        """Fetch Diabetes & Metabolism Journal guideline content and GRADE statements."""
+        try:
+            md = MarkItDown()
+            result = md.convert(url)
+            if not result or not getattr(result, "text_content", None):
+                return {
+                    "content": f"Content extraction failed. Document available at: {url}",
+                    "key_recommendations": [],
+                    "evidence_strength": [],
+                }
+
+            text = result.text_content
+            content = self._clean_generic_content(text)
+
+            lines = [line.strip() for line in text.splitlines() if line.strip()]
+            recommendations = []
+            grading = []
+
+            for idx, line in enumerate(lines):
+                lower = line.lower()
+                if not any(
+                    keyword in lower
+                    for keyword in ["recommendation", "statement", "guideline"]
+                ):
+                    continue
+                if len(line) > 200:
+                    continue
+
+                title_clean = line.lstrip("#").strip()
+                if title_clean.startswith("+") or title_clean.startswith("Table"):
+                    continue
+
+                summary_lines = []
+                for following in lines[idx + 1 : idx + 10]:
+                    if (
+                        any(
+                            keyword in following.lower()
+                            for keyword in ["recommendation", "statement", "guideline"]
+                        )
+                        and len(following) < 200
+                    ):
+                        break
+                    if len(following) < 30:
+                        continue
+                    summary_lines.append(following)
+                    if len(summary_lines) >= 3:
+                        break
+
+                summary = " ".join(summary_lines)
+                if summary:
+                    recommendations.append(
+                        {"title": title_clean, "summary": summary[:400]}
+                    )
+
+                strength = None
+                grade_match = re.search(r"grade\s+[A-E]\b", lower)
+                if grade_match:
+                    strength = grade_match.group(0).title()
+                level_match = re.search(r"level\s+[0-4]", lower)
+                if level_match:
+                    level_text = level_match.group(0).title()
+                    strength = f"{strength} ({level_text})" if strength else level_text
+
+                for line_text in summary_lines:
+                    lower_line = line_text.lower()
+                    if "strong" in lower_line and "recommendation" in lower_line:
+                        strength = "Strong recommendation"
+                        break
+                    if "conditional" in lower_line and "recommendation" in lower_line:
+                        strength = "Conditional recommendation"
+                        break
+
+                if strength and not any(
+                    entry.get("section") == title_clean for entry in grading
+                ):
+                    grading.append({"section": title_clean, "strength": strength})
+
+            return {
+                "content": content,
+                "key_recommendations": recommendations[:5],
+                "evidence_strength": grading,
+            }
+
+        except Exception as e:
+            return {
+                "content": f"Error extracting DMJ content: {str(e)}",
+                "key_recommendations": [],
+                "evidence_strength": [],
+            }
+
 
 @register_tool()
 class WHOGuidelinesTool(BaseTool):
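The TRIP helpers added above fall back to MarkItDown for generic URL-to-text conversion before the keyword-based cleanup; a minimal standalone sketch of that fallback path (the URL is a placeholder):

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("https://example.org/some-guideline")  # placeholder URL
text = getattr(result, "text_content", None)
if text:
    # _clean_generic_content then keeps only lines containing clinical keywords
    # (recommendation, treatment, diagnosis, ...) and truncates to roughly 2000 chars.
    print(text[:500])
```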
@@ -632,6 +1194,7 @@ class WHOGuidelinesTool(BaseTool):
         guidelines = []
 
         query_lower = query.lower()
+        query_terms = _extract_meaningful_terms(query)
 
         for link in all_links:
             href = link["href"]
@@ -654,11 +1217,18 @@ class WHOGuidelinesTool(BaseTool):
                 # Fetch description from detail page
                 description = self._fetch_guideline_description(full_url)
 
+                searchable_text = (text + " " + (description or "")).lower()
+                if query_terms and not any(
+                    term in searchable_text for term in query_terms
+                ):
+                    continue
+
                 guidelines.append(
                     {
                         "title": text,
                         "url": full_url,
                         "description": description,
+                        "content": description,  # Copy description to content field
                         "source": "WHO",
                         "organization": "World Health Organization",
                         "is_guideline": True,
@@ -696,11 +1266,18 @@ class WHOGuidelinesTool(BaseTool):
                 # Fetch description from detail page
                 description = self._fetch_guideline_description(full_url)
 
+                searchable_text = (text + " " + (description or "")).lower()
+                if query_terms and not any(
+                    term in searchable_text for term in query_terms
+                ):
+                    continue
+
                 all_guidelines.append(
                     {
                         "title": text,
                         "url": full_url,
                         "description": description,
+                        "content": description,  # Copy description to content field
                         "source": "WHO",
                         "organization": "World Health Organization",
                         "is_guideline": True,
@@ -750,7 +1327,9 @@ class OpenAlexGuidelinesTool(BaseTool):
         """Search for clinical guidelines using OpenAlex API."""
         try:
             # Build search query to focus on guidelines
-            search_query = f"{query} clinical practice guideline"
+            search_query = (
+                f'"{query}" AND (guideline OR "clinical practice" OR recommendation)'
+            )
 
             # Build parameters
             params = {
@@ -815,16 +1394,71 @@ class OpenAlexGuidelinesTool(BaseTool):
                     else None
                 )
 
-                # Check if it's likely a guideline
-                is_guideline = any(
-                    keyword in title.lower()
-                    for keyword in [
-                        "guideline",
-                        "recommendation",
-                        "consensus",
-                        "practice",
-                        "statement",
-                    ]
+                # More strict guideline detection
+                title_lower = title.lower()
+                abstract_lower = abstract.lower() if abstract else ""
+
+                # Must contain specific guideline keywords
+                guideline_keywords = [
+                    "guideline",
+                    "practice guideline",
+                    "clinical guideline",
+                    "recommendation",
+                    "consensus statement",
+                    "position statement",
+                    "clinical practice",
+                    "best practice",
+                ]
+
+                has_guideline_keywords = any(
+                    keyword in title_lower or keyword in abstract_lower
+                    for keyword in guideline_keywords
+                )
+
+                # Check structured concepts from OpenAlex for guideline markers
+                concepts = work.get("concepts", []) or []
+                has_guideline_concept = False
+                for concept in concepts:
+                    display_name = concept.get("display_name", "").lower()
+                    if any(
+                        term in display_name
+                        for term in [
+                            "guideline",
+                            "clinical practice",
+                            "recommendation",
+                            "consensus",
+                        ]
+                    ):
+                        has_guideline_concept = True
+                        break
+
+                primary_topic = work.get("primary_topic", {}) or {}
+                primary_topic_name = primary_topic.get("display_name", "").lower()
+                if any(
+                    term in primary_topic_name
+                    for term in ["guideline", "clinical practice", "recommendation"]
+                ):
+                    has_guideline_concept = True
+
+                # Exclude research papers and studies (but be less strict)
+                exclude_keywords = [
+                    "statistics",
+                    "data",
+                    "survey",
+                    "meta-analysis",
+                    "systematic review",
+                ]
+
+                is_research = any(
+                    keyword in title_lower for keyword in exclude_keywords
+                )
+
+                # Determine if it's a guideline
+                is_guideline = (
+                    has_guideline_keywords
+                    and has_guideline_concept
+                    and not is_research
+                    and len(title) > 20
                 )
 
                 # Build URL
@@ -838,23 +1472,29 @@ class OpenAlexGuidelinesTool(BaseTool):
                     )
                 )
 
-                guideline = {
-                    "title": title,
-                    "authors": authors,
-                    "institutions": institutions[:3],
-                    "year": year,
-                    "doi": doi,
-                    "url": url,
-                    "openalex_id": openalex_id,
-                    "cited_by_count": cited_by,
-                    "is_guideline": is_guideline,
-                    "source": "OpenAlex",
-                    "abstract": (
-                        abstract[:500] if abstract else None
-                    ),  # Limit abstract length
-                }
+                # Only add if it's actually a guideline
+                if is_guideline:
+                    abstract_text = abstract[:500] if abstract else None
+                    guideline = {
+                        "title": title,
+                        "authors": authors,
+                        "institutions": institutions[:3],
+                        "year": year,
+                        "doi": doi,
+                        "url": url,
+                        "openalex_id": openalex_id,
+                        "cited_by_count": cited_by,
+                        "is_guideline": is_guideline,
+                        "source": "OpenAlex",
+                        "abstract": abstract_text,
+                        "content": abstract_text,  # Copy abstract to content field
+                    }
+
+                    guidelines.append(guideline)
 
-                guidelines.append(guideline)
+                    # Stop when we have enough guidelines
+                    if len(guidelines) >= limit:
+                        break
 
             return guidelines
 
@@ -1208,3 +1848,481 @@ class WHOGuidelineFullTextTool(BaseTool):
             return {"error": f"Failed to fetch WHO guideline: {str(e)}", "url": url}
         except Exception as e:
             return {"error": f"Error parsing WHO guideline: {str(e)}", "url": url}
+
+
+@register_tool()
+class GINGuidelinesTool(BaseTool):
+    """
+    Guidelines International Network (GIN) Guidelines Search Tool.
+    Searches the global guidelines database with 6400+ guidelines from various organizations.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        self.base_url = "https://www.g-i-n.net"
+        self.search_url = f"{self.base_url}/library/international-guidelines-library"
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+        )
+
+    def run(self, arguments):
+        query = arguments.get("query", "")
+        limit = arguments.get("limit", 10)
+
+        if not query:
+            return {"error": "Query parameter is required"}
+
+        return self._search_gin_guidelines(query, limit)
+
+    def _search_gin_guidelines(self, query, limit):
+        """Search GIN guidelines using web scraping."""
+        try:
+            time.sleep(1)  # Be respectful
+
+            # Try to search GIN guidelines
+            try:
+                # GIN search typically uses form parameters
+                search_params = {"search": query, "type": "guideline", "limit": limit}
+
+                response = self.session.get(
+                    self.search_url, params=search_params, timeout=30
+                )
+                response.raise_for_status()
+
+                soup = BeautifulSoup(response.content, "html.parser")
+
+                # Find guideline results - common selectors for guideline databases
+                guidelines = []
+
+                # Try different selectors for guideline results
+                result_selectors = [
+                    "div.guideline-item",
+                    "div.search-result",
+                    "div.result-item",
+                    "article.guideline",
+                    "div.item",
+                    "li.guideline",
+                ]
+
+                results = []
+                for selector in result_selectors:
+                    results = soup.select(selector)
+                    if results:
+                        break
+
+                if not results:
+                    # Fallback: look for any div with guideline-related content
+                    results = soup.find_all(
+                        "div",
+                        class_=lambda x: x
+                        and any(
+                            keyword in x.lower()
+                            for keyword in ["guideline", "result", "item", "card"]
+                        ),
+                    )
+
+                for result in results[:limit]:
+                    try:
+                        # Extract title
+                        title_elem = (
+                            result.find("h3")
+                            or result.find("h2")
+                            or result.find("a", class_="title")
+                            or result.find("a")
+                        )
+                        if not title_elem:
+                            continue
+
+                        title = title_elem.get_text().strip()
+                        if not title or len(title) < 10:
+                            continue
+
+                        # Extract URL
+                        link_elem = result.find("a", href=True)
+                        if not link_elem:
+                            continue
+
+                        url = link_elem.get("href", "")
+                        if url.startswith("/"):
+                            url = self.base_url + url
+                        elif not url.startswith("http"):
+                            continue
+
+                        # Extract description/summary
+                        desc_elem = (
+                            result.find("p")
+                            or result.find("div", class_="description")
+                            or result.find("div", class_="summary")
+                        )
+                        description = desc_elem.get_text().strip() if desc_elem else ""
+
+                        # Extract organization
+                        org_elem = (
+                            result.find("span", class_="organization")
+                            or result.find("div", class_="org")
+                            or result.find("cite")
+                        )
+                        organization = (
+                            org_elem.get_text().strip()
+                            if org_elem
+                            else "GIN Member Organization"
+                        )
+
+                        # Extract date
+                        date_elem = (
+                            result.find("time")
+                            or result.find("span", class_="date")
+                            or result.find("div", class_="date")
+                        )
+                        date = date_elem.get_text().strip() if date_elem else ""
+
+                        # Extract content from the guideline page
+                        content = self._extract_guideline_content(url)
+
+                        guidelines.append(
+                            {
+                                "title": title,
+                                "url": url,
+                                "description": description,
+                                "content": content,
+                                "date": date,
+                                "source": "GIN",
+                                "organization": organization,
+                                "is_guideline": True,
+                                "official": True,
+                            }
+                        )
+
+                    except Exception:
+                        continue
+
+                if guidelines:
+                    return guidelines
+
+            except requests.exceptions.RequestException as e:
+                print(f"GIN website access failed: {e}, trying fallback search...")
+
+            # Fallback: Return sample guidelines based on query
+            return self._get_fallback_gin_guidelines(query, limit)
+
+        except Exception as e:
+            return {
+                "error": f"Error processing GIN guidelines: {str(e)}",
+                "source": "GIN",
+            }
+
+    def _get_fallback_gin_guidelines(self, query, limit):
+        """Provide fallback guidelines when direct access fails."""
+        # This would contain sample guidelines based on common queries
+        # For now, return a message indicating the issue
+        return [
+            {
+                "title": f"GIN Guidelines Search for '{query}'",
+                "url": self.search_url,
+                "description": "GIN guidelines database access temporarily unavailable. Please try again later or visit the GIN website directly.",
+                "content": "The Guidelines International Network (GIN) maintains the world's largest database of clinical guidelines with over 6400 guidelines from various organizations worldwide.",
+                "date": "",
+                "source": "GIN",
+                "organization": "Guidelines International Network",
+                "is_guideline": False,
+                "official": True,
+                "is_placeholder": True,
+                "note": "Direct access to GIN database failed. Please visit g-i-n.net for full access.",
+            }
+        ]
+
+    def _extract_guideline_content(self, url):
+        """Extract actual content from a guideline URL."""
+        try:
+            time.sleep(0.5)  # Be respectful
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Extract main content
+            content_selectors = [
+                "main",
+                ".content",
+                ".article-content",
+                ".guideline-content",
+                "article",
+                ".main-content",
+            ]
+
+            content_text = ""
+            for selector in content_selectors:
+                content_elem = soup.select_one(selector)
+                if content_elem:
+                    # Get all text content
+                    paragraphs = content_elem.find_all("p")
+                    content_parts = []
+                    for p in paragraphs:
+                        text = p.get_text().strip()
+                        if len(text) > 20:  # Skip very short paragraphs
+                            content_parts.append(text)
+
+                    if content_parts:
+                        content_text = "\n\n".join(
+                            content_parts[:10]
+                        )  # Limit to first 10 paragraphs
+                        break
+
+            # If no main content found, try to get any meaningful text
+            if not content_text:
+                all_text = soup.get_text()
+                # Clean up the text
+                lines = [line.strip() for line in all_text.split("\n") if line.strip()]
+                content_text = "\n".join(lines[:20])  # First 20 meaningful lines
+
+            return content_text[:2000]  # Limit content length
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"
+
+
+@register_tool()
+class CMAGuidelinesTool(BaseTool):
+    """
+    Canadian Medical Association (CMA) Infobase Guidelines Search Tool.
+    Searches the CMA Infobase with 1200+ Canadian clinical practice guidelines.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        self.base_url = "https://joulecma.ca"
+        self.search_url = f"{self.base_url}/infobase"
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+        )
+
+    def run(self, arguments):
+        query = arguments.get("query", "")
+        limit = arguments.get("limit", 10)
+
+        if not query:
+            return {"error": "Query parameter is required"}
+
+        return self._search_cma_guidelines(query, limit)
+
+    def _search_cma_guidelines(self, query, limit):
+        """Search CMA Infobase guidelines using web scraping."""
+        try:
+            time.sleep(1)  # Be respectful
+
+            # Try to search CMA Infobase
+            try:
+                # CMA search typically uses form parameters
+                search_params = {"search": query, "type": "guideline", "limit": limit}
+
+                response = self.session.get(
+                    self.search_url, params=search_params, timeout=30
+                )
+                response.raise_for_status()
+
+                soup = BeautifulSoup(response.content, "html.parser")
+
+                # Find guideline results
+                guidelines = []
+
+                # Try different selectors for guideline results
+                result_selectors = [
+                    "div.guideline-item",
+                    "div.search-result",
+                    "div.result-item",
+                    "article.guideline",
+                    "div.item",
+                    "li.guideline",
+                ]
+
+                results = []
+                for selector in result_selectors:
+                    results = soup.select(selector)
+                    if results:
+                        break
+
+                if not results:
+                    # Fallback: look for any div with guideline-related content
+                    results = soup.find_all(
+                        "div",
+                        class_=lambda x: x
+                        and any(
+                            keyword in x.lower()
+                            for keyword in ["guideline", "result", "item", "card"]
+                        ),
+                    )
+
+                for result in results[:limit]:
+                    try:
+                        # Extract title
+                        title_elem = (
+                            result.find("h3")
+                            or result.find("h2")
+                            or result.find("a", class_="title")
+                            or result.find("a")
+                        )
+                        if not title_elem:
+                            continue
+
+                        title = title_elem.get_text().strip()
+                        if not title or len(title) < 10:
+                            continue
+
+                        # Extract URL
+                        link_elem = result.find("a", href=True)
+                        if not link_elem:
+                            continue
+
+                        url = link_elem.get("href", "")
+                        if url.startswith("/"):
+                            url = self.base_url + url
+                        elif not url.startswith("http"):
+                            continue
+
+                        # Extract description/summary
+                        desc_elem = (
+                            result.find("p")
+                            or result.find("div", class_="description")
+                            or result.find("div", class_="summary")
+                        )
+                        description = desc_elem.get_text().strip() if desc_elem else ""
+
+                        # Extract organization
+                        org_elem = (
+                            result.find("span", class_="organization")
+                            or result.find("div", class_="org")
+                            or result.find("cite")
+                        )
+                        organization = (
+                            org_elem.get_text().strip()
+                            if org_elem
+                            else "Canadian Medical Association"
+                        )
+
+                        # Extract date
+                        date_elem = (
+                            result.find("time")
+                            or result.find("span", class_="date")
+                            or result.find("div", class_="date")
+                        )
+                        date = date_elem.get_text().strip() if date_elem else ""
+
+                        # Extract content from the guideline page
+                        content = self._extract_guideline_content(url)
+
+                        guidelines.append(
+                            {
+                                "title": title,
+                                "url": url,
+                                "description": description,
+                                "content": content,
+                                "date": date,
+                                "source": "CMA",
+                                "organization": organization,
+                                "is_guideline": True,
+                                "official": True,
+                            }
+                        )
+
+                    except Exception:
+                        continue
+
+                if guidelines:
+                    return guidelines
+
+            except requests.exceptions.RequestException as e:
+                print(f"CMA Infobase access failed: {e}, trying fallback search...")
+
+            # Fallback: Return sample guidelines based on query
+            return self._get_fallback_cma_guidelines(query, limit)
+
+        except Exception as e:
+            return {
+                "error": f"Error processing CMA guidelines: {str(e)}",
+                "source": "CMA",
+            }
+
+    def _get_fallback_cma_guidelines(self, query, limit):
+        """Provide fallback guidelines when direct access fails."""
+        # This would contain sample guidelines based on common queries
+        # For now, return a message indicating the issue
+        return [
+            {
+                "title": f"CMA Infobase Guidelines Search for '{query}'",
+                "url": self.search_url,
+                "description": "CMA Infobase access temporarily unavailable. Please try again later or visit the CMA website directly.",
+                "content": "The Canadian Medical Association Infobase contains over 1200 evidence-based clinical practice guidelines developed or endorsed by Canadian healthcare organizations.",
+                "date": "",
+                "source": "CMA",
+                "organization": "Canadian Medical Association",
+                "is_guideline": False,
+                "official": True,
+                "is_placeholder": True,
+                "note": "Direct access to CMA Infobase failed. Please visit joulecma.ca/infobase for full access.",
+            }
+        ]
+
+    def _extract_guideline_content(self, url):
+        """Extract actual content from a guideline URL."""
+        try:
+            time.sleep(0.5)  # Be respectful
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Extract main content
+            content_selectors = [
+                "main",
+                ".content",
+                ".article-content",
+                ".guideline-content",
+                "article",
+                ".main-content",
+            ]
+
+            content_text = ""
+            for selector in content_selectors:
+                content_elem = soup.select_one(selector)
+                if content_elem:
+                    # Get all text content
+                    paragraphs = content_elem.find_all("p")
+                    content_parts = []
+                    for p in paragraphs:
+                        text = p.get_text().strip()
+                        if len(text) > 20:  # Skip very short paragraphs
+                            content_parts.append(text)
+
+                    if content_parts:
+                        content_text = "\n\n".join(
+                            content_parts[:10]
+                        )  # Limit to first 10 paragraphs
+                        break
+
+            # If no main content found, try to get any meaningful text
+            if not content_text:
+                all_text = soup.get_text()
+                # Clean up the text
+                lines = [line.strip() for line in all_text.split("\n") if line.strip()]
+                content_text = "\n".join(lines[:20])  # First 20 meaningful lines
+
+            return content_text[:2000]  # Limit content length
+
+        except Exception as e:
+            return f"Error extracting content: {str(e)}"