aix 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aix/contexts.py CHANGED
@@ -548,6 +548,8 @@ def notebook_to_markdown(
548
548
  return target_file
549
549
 
550
550
 
551
+ dflt_converters["ipynb"] = notebook_to_markdown
552
+
551
553
  # --------------------------------------------------------------------------------------
552
554
  # Download articles from a markdown string and save them as PDF files
553
555
 
@@ -559,11 +561,123 @@ def notebook_to_markdown(
559
561
 
560
562
  import os
561
563
  import re
564
+ from typing import Callable, Iterator, Pattern, Tuple, Optional
562
565
  import requests
563
566
 
564
567
  DFLT_SAVE_DIR = os.path.expanduser("~/Downloads")
565
568
 
566
569
 
570
# Default pattern for inline markdown hyperlinks: [context](destination).
# Compiled once at import time instead of on every call.
_MARKDOWN_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")

# Optional link title that may follow the URL inside the parentheses,
# e.g. [text](https://example.com "Title") or [text](https://example.com 'Title').
_MARKDOWN_LINK_TITLE = re.compile(r"""\s+(?:"[^"]*"|'[^']*')\s*$""")


def _markdown_link_extractor(match: re.Match) -> Tuple[str, str]:
    """Return (context, url) for a markdown hyperlink match, without any link title."""
    destination = match.group(2).strip()
    # Without this, a titled link would yield 'https://x "Title"' as its URL.
    return match.group(1), _MARKDOWN_LINK_TITLE.sub("", destination)


def extract_urls(
    markdown: str,
    pattern: Optional[Pattern] = None,
    extractor: Optional[Callable[[re.Match], Tuple[str, str]]] = None,
) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs and their context from a markdown string.

    Args:
        markdown: The markdown string to process
        pattern: A compiled regex pattern to match URLs and their context.
            Defaults to matching markdown hyperlinks [context](url)
        extractor: A function that extracts (context, url) from a match.
            Defaults to taking group 1 as the context and group 2 as the URL,
            stripping surrounding whitespace and any quoted link title from
            the URL.

    Returns:
        Iterator of (context, url) pairs, lazily produced in order of
        appearance in ``markdown``.

    >>> text = "[Google](https://google.com) and [GitHub](https://github.com)"
    >>> list(extract_urls(text))
    [('Google', 'https://google.com'), ('GitHub', 'https://github.com')]

    A link title is not part of the URL:

    >>> list(extract_urls('[Doc](https://example.com/a.pdf "The Title")'))
    [('Doc', 'https://example.com/a.pdf')]
    """
    if pattern is None:
        pattern = _MARKDOWN_LINK_PATTERN

    if extractor is None:
        extractor = _markdown_link_extractor

    for match in pattern.finditer(markdown):
        yield extractor(match)
603
+
604
+
605
+ # Example alternative patterns and extractors
606
+
607
+
608
def extract_with_surrounding_context(
    markdown: str, context_chars: int = 30
) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs with surrounding text as context.

    Bare ``http://`` / ``https://`` URLs are located anywhere in the text.
    Trailing sentence punctuation and unbalanced closing brackets (such as the
    ``)`` that closes a markdown link) are not considered part of the URL.

    Args:
        markdown: The markdown string to process
        context_chars: Number of characters to include before and after URL

    Returns:
        Iterator of (context, url) pairs
    """
    # Pattern to match URLs with a simple validation
    pattern = re.compile(r"https?://[^\s]+")
    # Closing bracket -> its opener; a closer is kept only when it is balanced
    # inside the URL (e.g. https://en.wikipedia.org/wiki/Foo_(bar)).
    openers = {")": "(", "]": "[", ">": "<"}
    trailing_punctuation = ".,;:!?\"'"

    def trim_url(raw: str) -> str:
        """Drop trailing punctuation / unbalanced closers picked up by ``[^\\s]+``."""
        url = raw
        while url:
            last = url[-1]
            if last in trailing_punctuation:
                url = url[:-1]
            elif last in openers and url.count(last) > url.count(openers[last]):
                url = url[:-1]
            else:
                break
        return url

    def surrounding_context_extractor(match: re.Match) -> Tuple[str, str]:
        url = trim_url(match.group(0))
        # Measure the window from the end of the trimmed URL, not the raw match.
        url_end = match.start() + len(url)
        start = max(0, match.start() - context_chars)
        end = min(len(markdown), url_end + context_chars)
        context = markdown[start:end].strip()
        return context, url

    return extract_urls(markdown, pattern, surrounding_context_extractor)
632
+
633
+
634
def extract_urls_only(markdown: str) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs with empty context.

    Args:
        markdown: The markdown string to process

    Returns:
        Iterator of (empty_context, url) pairs
    """
    # More comprehensive URL pattern:
    #   scheme://host            host chars or %XX escapes
    #   (:port)?                 e.g. http://localhost:8080
    #   (/path... | ?query... | #fragment...)?
    # A query or fragment directly after the host must be non-empty, so the
    # "?" ending a sentence ("seen https://a.com?") is not pulled into the URL.
    pattern = re.compile(
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
        r"(?::\d+)?"
        r"(?:/[^\s]*|[?#][^\s]+)?"
    )

    def url_only_extractor(match: re.Match) -> Tuple[str, str]:
        return "", match.group(0)

    return extract_urls(markdown, pattern, url_only_extractor)
652
+
653
+
654
def extract_html_links(markdown: str) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs from HTML anchor tags.

    Matching is case-insensitive, accepts double- or single-quoted ``href``
    values (with optional whitespace around ``=``), and allows the anchor text
    to span several lines.

    Args:
        markdown: The markdown or HTML string to process

    Returns:
        Iterator of (anchor_text, url) pairs
    """
    # Simple pattern for HTML anchor tags (regex-based, not a full HTML parser).
    # Group 1: double-quoted href, group 2: single-quoted href, group 3: text.
    pattern = re.compile(
        r"""<a\s+(?:[^>]*?\s+)?href\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)</a\s*>""",
        re.IGNORECASE | re.DOTALL,
    )

    def html_link_extractor(match: re.Match) -> Tuple[str, str]:
        # Note the order is reversed in HTML: href first, then text
        href = match.group(1) if match.group(1) is not None else match.group(2)
        return match.group(3), href

    return extract_urls(markdown, pattern, html_link_extractor)
672
+
673
+
674
# Expose the specialised extractors as attributes of ``extract_urls`` so that
# callers can reach them through the main function, e.g.
# ``extract_urls.only_urls(text)``.
extract_urls.with_surrounding_context = extract_with_surrounding_context
extract_urls.only_urls = extract_urls_only
extract_urls.html_links = extract_html_links

# NOTE: DFLT_SAVE_DIR is already defined right after the imports of this
# section with the same value, so it is not redefined here.
679
+
680
+
567
681
  def download_articles(
568
682
  md_string: str,
569
683
  save_dir: str = DFLT_SAVE_DIR,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: aix
3
- Version: 0.0.22
3
+ Version: 0.0.23
4
4
  Summary: Artificial Intelligence eXtensions
5
5
  Home-page: https://github.com/thorwhalen/aix
6
6
  Author: Thor Whalen
@@ -1,5 +1,5 @@
1
1
  aix/__init__.py,sha256=5_Ktv6lJbdwGf6c94mLYGi00zMTn2HkrUOvHAawhr_4,1921
2
- aix/contexts.py,sha256=_SPPR1oMa59nh-BV1jdKtY27UdAqDxiSL65yps5Opkk,33942
2
+ aix/contexts.py,sha256=uLjAkIlumPrBECEOfTrFrvNsjBIFJFppNVZ__Pe3aBE,37576
3
3
  aix/misc.py,sha256=evC4FqE63z_gnZ_4vCLsfKZkksuPBDlfK0fI8jHEbGg,204
4
4
  aix/np.py,sha256=D6uTumkK5Y9kB_XbSqtMzzBsnuai9WZWLVa6-sWybls,194
5
5
  aix/pd.py,sha256=LqJ13OEOox6K6vs9hMYkhBRgCu0EMPiYSnd2no4RdDc,197
@@ -9,8 +9,8 @@ aix/util.py,sha256=d0VjSbpTNzjGFH_upNOnaUnrRawrVbXdlFBan1Q9CRo,107
9
9
  aix/gen_ai/__init__.py,sha256=ky5WRID0rIb8KLxtulB9t2CN_GKUxu1KdiRN-n92q2U,2341
10
10
  aix/gen_ai/google_genai.py,sha256=KRYc52DQtn-V5vycULyoNpoHcR3lcBt1Z0DHj0XYcuI,966
11
11
  aix/gen_ai/openai_genai.py,sha256=RzJy7pIu4dngUThEJdALqZpexHK_quDkug-SjAXm41E,539
12
- aix-0.0.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
13
- aix-0.0.22.dist-info/METADATA,sha256=1k3kyzwTw_iZj7fqnwj1SZRbQhgto3rKAWFGJmLqIHs,6010
14
- aix-0.0.22.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
15
- aix-0.0.22.dist-info/top_level.txt,sha256=JV67V91ws1X6NwMtcBSxqB7HJx0xOuo_of1K7yg33Z0,4
16
- aix-0.0.22.dist-info/RECORD,,
12
+ aix-0.0.23.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
13
+ aix-0.0.23.dist-info/METADATA,sha256=Ff5jkUtLowI3yTkaVT8tbzyet-cDeuVSfda_KyfWs_U,6010
14
+ aix-0.0.23.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
15
+ aix-0.0.23.dist-info/top_level.txt,sha256=JV67V91ws1X6NwMtcBSxqB7HJx0xOuo_of1K7yg33Z0,4
16
+ aix-0.0.23.dist-info/RECORD,,
File without changes
File without changes