aix 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aix/contexts.py +114 -0
- {aix-0.0.22.dist-info → aix-0.0.23.dist-info}/METADATA +1 -1
- {aix-0.0.22.dist-info → aix-0.0.23.dist-info}/RECORD +6 -6
- {aix-0.0.22.dist-info → aix-0.0.23.dist-info}/LICENSE +0 -0
- {aix-0.0.22.dist-info → aix-0.0.23.dist-info}/WHEEL +0 -0
- {aix-0.0.22.dist-info → aix-0.0.23.dist-info}/top_level.txt +0 -0
aix/contexts.py
CHANGED
|
@@ -548,6 +548,8 @@ def notebook_to_markdown(
|
|
|
548
548
|
return target_file
|
|
549
549
|
|
|
550
550
|
|
|
551
|
+
# Register notebook_to_markdown in dflt_converters under the "ipynb" key
# (presumably keyed by file extension -- confirm against dflt_converters' users).
dflt_converters["ipynb"] = notebook_to_markdown
|
|
552
|
+
|
|
551
553
|
# --------------------------------------------------------------------------------------
|
|
552
554
|
# Download articles from a markdown string and save them as PDF files
|
|
553
555
|
|
|
@@ -559,11 +561,123 @@ def notebook_to_markdown(
|
|
|
559
561
|
|
|
560
562
|
import os
|
|
561
563
|
import re
|
|
564
|
+
from typing import Callable, Iterator, Optional, Pattern, Tuple, Union
|
|
562
565
|
import requests
|
|
563
566
|
|
|
564
567
|
# Default save directory: "~/Downloads" with "~" expanded to the current
# user's home directory. Used as the default `save_dir` of download_articles.
DFLT_SAVE_DIR = os.path.expanduser("~/Downloads")
|
|
565
568
|
|
|
566
569
|
|
|
570
|
+
def extract_urls(
    markdown: str,
    pattern: Optional[Union[str, Pattern]] = None,
    extractor: Optional[Callable[[re.Match], Tuple[str, str]]] = None,
) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs and their context from a markdown string.

    Args:
        markdown: The markdown string to process
        pattern: The regex used to find URLs and their context, either a
            compiled pattern or a pattern string (compiled here).
            Defaults to matching markdown hyperlinks [context](url)
        extractor: A function that extracts (context, url) from a match.
            Defaults to returning groups 1 and 2 of the match, so a custom
            pattern used without a custom extractor needs at least two groups

    Returns:
        Iterator of (context, url) pairs, produced lazily in match order

    >>> text = "[Google](https://google.com) and [GitHub](https://github.com)"
    >>> list(extract_urls(text))
    [('Google', 'https://google.com'), ('GitHub', 'https://github.com')]

    A pattern may also be given as a plain string:

    >>> list(extract_urls("<https://a.io>", r"<([^>]+)>", lambda m: ("", m.group(1))))
    [('', 'https://a.io')]
    """
    if pattern is None:
        # Default pattern matches markdown hyperlinks: [context](url)
        pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
    elif isinstance(pattern, str):
        # Accept raw pattern strings too; a str has no .finditer, so without
        # this the loop below would fail with AttributeError.
        pattern = re.compile(pattern)

    if extractor is None:
        # Default extractor for markdown hyperlinks: group 1 is the link
        # text (context), group 2 is the link target (url).
        def extractor(match: re.Match) -> Tuple[str, str]:
            return match.group(1), match.group(2)

    for match in pattern.finditer(markdown):
        yield extractor(match)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
# Example alternative patterns and extractors
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def extract_with_surrounding_context(
    markdown: str, context_chars: int = 30
) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs with surrounding text as context.

    Bare ``http://`` / ``https://`` URLs are located by scanning for runs of
    non-whitespace characters. Trailing sentence punctuation and closing
    brackets that have no matching opener inside the URL are not treated as
    part of it, so ``[Google](https://google.com)`` yields
    ``https://google.com`` rather than ``https://google.com)``.

    Args:
        markdown: The markdown string to process
        context_chars: Number of characters to include before and after URL.
            Negative values are treated as 0

    Returns:
        Iterator of (context, url) pairs. The context is the slice of
        ``markdown`` spanning the URL plus ``context_chars`` characters on
        each side (clipped to the string bounds), with outer whitespace
        stripped; it therefore contains the URL itself
    """
    # Pattern to match URLs with a simple validation
    pattern = re.compile(r"https?://[^\s]+")

    # A negative window would slice inside the URL (or from the wrong end of
    # the string), so clamp it.
    context_chars = max(0, context_chars)

    trailing_punctuation = ".,;:!?*~'\""
    openers = {")": "(", "]": "[", "}": "{", ">": "<"}

    def strip_tail(url: str) -> str:
        # Drop characters the greedy pattern picked up from the surrounding
        # text: punctuation, and closers with no opener inside the URL.
        # Balanced brackets, e.g. ".../Foo_(bar)", are kept.
        while url:
            last = url[-1]
            if last in trailing_punctuation:
                url = url[:-1]
            elif last in openers and url.count(last) > url.count(openers[last]):
                url = url[:-1]
            else:
                break
        return url

    def surrounding_context_extractor(match: re.Match) -> Tuple[str, str]:
        url = strip_tail(match.group(0))
        # Measure the window from the cleaned URL, not the raw match.
        url_end = match.start() + len(url)
        start = max(0, match.start() - context_chars)
        end = min(len(markdown), url_end + context_chars)
        context = markdown[start:end].strip()
        return context, url

    return extract_urls(markdown, pattern, surrounding_context_extractor)
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def extract_urls_only(markdown: str) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs with empty context.

    Matches ``http://`` / ``https://`` URLs made of a host (word characters,
    ``-``, ``.`` or percent-escapes), an optional ``:port``, and an optional
    remainder starting with ``/``, ``?`` or ``#`` that runs to the next
    whitespace. Trailing sentence punctuation and closing brackets that have
    no matching opener inside the URL are dropped from the result.

    Args:
        markdown: The markdown string to process

    Returns:
        Iterator of (empty_context, url) pairs; the context is always ""
    """
    # More comprehensive URL pattern. The port and the [/?#] alternatives
    # keep URLs such as "http://localhost:8080/x" or "https://x.com?a=1"
    # from being cut off right after the host.
    pattern = re.compile(
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?::\d+)?(?:[/?#][^\s]*)?"
    )

    trailing_punctuation = ".,;:!?*~'\""
    openers = {")": "(", "]": "[", "}": "{", ">": "<"}

    def strip_tail(url: str) -> str:
        # Drop characters the pattern picked up from the surrounding text:
        # punctuation, and closers with no opener inside the URL.
        # Balanced brackets, e.g. ".../Foo_(bar)", are kept.
        while url:
            last = url[-1]
            if last in trailing_punctuation:
                url = url[:-1]
            elif last in openers and url.count(last) > url.count(openers[last]):
                url = url[:-1]
            else:
                break
        return url

    def url_only_extractor(match: re.Match) -> Tuple[str, str]:
        return "", strip_tail(match.group(0))

    return extract_urls(markdown, pattern, url_only_extractor)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def extract_html_links(markdown: str) -> Iterator[Tuple[str, str]]:
    """
    Extract URLs from HTML anchor tags.

    Recognises ``<a ... href="...">text</a>`` with the ``href`` value in
    double or single quotes, in any letter case, and with anchor text that
    may span several lines. This is a regex scan, not an HTML parser: the
    anchor text is returned raw (any nested markup included) and anchors
    with an unquoted ``href`` are not matched.

    Args:
        markdown: The markdown or HTML string to process

    Returns:
        Iterator of (anchor_text, url) pairs
    """
    # Simple pattern for HTML anchor tags.
    # Group 1 is the opening quote (reused as \1 so the closing quote must
    # match), group 2 is the href value, group 3 is the anchor text.
    pattern = re.compile(
        r"""<a\s+(?:[^>]*?\s+)?href\s*=\s*(["'])(.*?)\1[^>]*>(.*?)</a\s*>""",
        re.IGNORECASE | re.DOTALL,
    )

    def html_link_extractor(match: re.Match) -> Tuple[str, str]:
        # Note the order is reversed in HTML: href first, then text
        return match.group(3), match.group(2)

    return extract_urls(markdown, pattern, html_link_extractor)
|
|
672
|
+
|
|
673
|
+
|
|
674
|
+
# Expose the alternative extractors as attributes of extract_urls so they can
# be reached through the main entry point, e.g. extract_urls.only_urls(text).
extract_urls.with_surrounding_context = extract_with_surrounding_context
extract_urls.only_urls = extract_urls_only
extract_urls.html_links = extract_html_links
|
|
679
|
+
|
|
680
|
+
|
|
567
681
|
def download_articles(
|
|
568
682
|
md_string: str,
|
|
569
683
|
save_dir: str = DFLT_SAVE_DIR,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
aix/__init__.py,sha256=5_Ktv6lJbdwGf6c94mLYGi00zMTn2HkrUOvHAawhr_4,1921
|
|
2
|
-
aix/contexts.py,sha256=
|
|
2
|
+
aix/contexts.py,sha256=uLjAkIlumPrBECEOfTrFrvNsjBIFJFppNVZ__Pe3aBE,37576
|
|
3
3
|
aix/misc.py,sha256=evC4FqE63z_gnZ_4vCLsfKZkksuPBDlfK0fI8jHEbGg,204
|
|
4
4
|
aix/np.py,sha256=D6uTumkK5Y9kB_XbSqtMzzBsnuai9WZWLVa6-sWybls,194
|
|
5
5
|
aix/pd.py,sha256=LqJ13OEOox6K6vs9hMYkhBRgCu0EMPiYSnd2no4RdDc,197
|
|
@@ -9,8 +9,8 @@ aix/util.py,sha256=d0VjSbpTNzjGFH_upNOnaUnrRawrVbXdlFBan1Q9CRo,107
|
|
|
9
9
|
aix/gen_ai/__init__.py,sha256=ky5WRID0rIb8KLxtulB9t2CN_GKUxu1KdiRN-n92q2U,2341
|
|
10
10
|
aix/gen_ai/google_genai.py,sha256=KRYc52DQtn-V5vycULyoNpoHcR3lcBt1Z0DHj0XYcuI,966
|
|
11
11
|
aix/gen_ai/openai_genai.py,sha256=RzJy7pIu4dngUThEJdALqZpexHK_quDkug-SjAXm41E,539
|
|
12
|
-
aix-0.0.
|
|
13
|
-
aix-0.0.
|
|
14
|
-
aix-0.0.
|
|
15
|
-
aix-0.0.
|
|
16
|
-
aix-0.0.
|
|
12
|
+
aix-0.0.23.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
13
|
+
aix-0.0.23.dist-info/METADATA,sha256=Ff5jkUtLowI3yTkaVT8tbzyet-cDeuVSfda_KyfWs_U,6010
|
|
14
|
+
aix-0.0.23.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
15
|
+
aix-0.0.23.dist-info/top_level.txt,sha256=JV67V91ws1X6NwMtcBSxqB7HJx0xOuo_of1K7yg33Z0,4
|
|
16
|
+
aix-0.0.23.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|