aix 0.0.21.tar.gz → 0.0.23.tar.gz
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {aix-0.0.21 → aix-0.0.23}/PKG-INFO +1 -1
- {aix-0.0.21 → aix-0.0.23}/aix/contexts.py +116 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/PKG-INFO +1 -1
- {aix-0.0.21 → aix-0.0.23}/setup.cfg +1 -1
- {aix-0.0.21 → aix-0.0.23}/LICENSE +0 -0
- {aix-0.0.21 → aix-0.0.23}/README.md +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/__init__.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/gen_ai/__init__.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/gen_ai/google_genai.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/gen_ai/openai_genai.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/misc.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/np.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/pd.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/sk.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/stores.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix/util.py +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/SOURCES.txt +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/dependency_links.txt +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/not-zip-safe +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/requires.txt +0 -0
- {aix-0.0.21 → aix-0.0.23}/aix.egg-info/top_level.txt +0 -0
- {aix-0.0.21 → aix-0.0.23}/setup.py +0 -0
@@ -260,6 +260,8 @@ def bytes_to_markdown(
 
     Returns:
         str: Markdown-formatted text
+
+    See also: https://github.com/thorwhalen/aix/discussions/3#discussioncomment-12387852
     """
     converter = converters.get(input_format.lower(), None)
     if converter is not None:
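For orientation, a minimal call sketch follows. The diff shows only the docstring and the registry lookup, so the bytes-first positional argument and the "html" format key are assumptions, not confirmed API:

```python
# Hypothetical usage of bytes_to_markdown; the argument shape is assumed
# from the function name, the docstring, and the converters lookup above.
from aix.contexts import bytes_to_markdown

html_bytes = b"<h1>Title</h1><p>Body text.</p>"
# input_format is lowercased before the registry lookup, so "HTML" and
# "html" should resolve to the same registered converter.
md = bytes_to_markdown(html_bytes, input_format="HTML")
print(md)  # str: Markdown-formatted text, per the docstring
```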
@@ -546,6 +548,8 @@ def notebook_to_markdown(
     return target_file
 
 
+dflt_converters["ipynb"] = notebook_to_markdown
+
 # --------------------------------------------------------------------------------------
 # Download articles from a markdown string and save them as PDF files
 
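The registration above plugs notebook conversion into the same registry that `bytes_to_markdown` consults. Here is a self-contained sketch of that dispatch pattern; the registry entries are illustrative placeholders, not the package's actual contents:

```python
from typing import Callable, Dict

# Illustrative registry: lowercase format key -> converter callable.
# The real dflt_converters in aix/contexts.py is populated elsewhere.
converters: Dict[str, Callable[[bytes], str]] = {
    "txt": lambda b: b.decode("utf-8"),
    "md": lambda b: b.decode("utf-8"),
}

def convert(data: bytes, input_format: str) -> str:
    # Mirrors the lookup shown in the first hunk:
    # converter = converters.get(input_format.lower(), None)
    converter = converters.get(input_format.lower(), None)
    if converter is None:
        raise ValueError(f"No converter registered for {input_format!r}")
    return converter(data)

print(convert(b"# hello", "MD"))  # case-insensitive key lookup
```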
@@ -557,11 +561,123 @@ def notebook_to_markdown(
 
 import os
 import re
+from typing import Callable, Iterator, Pattern, Tuple, Optional
 import requests
 
 DFLT_SAVE_DIR = os.path.expanduser("~/Downloads")
 
 
+def extract_urls(
+    markdown: str,
+    pattern: Optional[Pattern] = None,
+    extractor: Optional[Callable[[re.Match], Tuple[str, str]]] = None,
+) -> Iterator[Tuple[str, str]]:
+    """
+    Extract URLs and their context from a markdown string.
+
+    Args:
+        markdown: The markdown string to process
+        pattern: A compiled regex pattern to match URLs and their context
+            Defaults to matching markdown hyperlinks [context](url)
+        extractor: A function that extracts (context, url) from a match
+            Defaults to extracting from markdown hyperlinks
+
+    Returns:
+        Iterator of (context, url) pairs
+
+    >>> text = "[Google](https://google.com) and [GitHub](https://github.com)"
+    >>> list(extract_urls(text))
+    [('Google', 'https://google.com'), ('GitHub', 'https://github.com')]
+    """
+    if pattern is None:
+        # Default pattern matches markdown hyperlinks: [context](url)
+        pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+
+    if extractor is None:
+        # Default extractor for markdown hyperlinks
+        def extractor(match: re.Match) -> Tuple[str, str]:
+            return match.group(1), match.group(2)
+
+    for match in pattern.finditer(markdown):
+        yield extractor(match)
+
+
+# Example alternative patterns and extractors
+
+
+def extract_with_surrounding_context(
+    markdown: str, context_chars: int = 30
+) -> Iterator[Tuple[str, str]]:
+    """
+    Extract URLs with surrounding text as context.
+
+    Args:
+        markdown: The markdown string to process
+        context_chars: Number of characters to include before and after URL
+
+    Returns:
+        Iterator of (context, url) pairs
+    """
+    # Pattern to match URLs with a simple validation
+    pattern = re.compile(r"https?://[^\s]+")
+
+    def surrounding_context_extractor(match: re.Match) -> Tuple[str, str]:
+        url = match.group(0)
+        start = max(0, match.start() - context_chars)
+        end = min(len(markdown), match.end() + context_chars)
+        context = markdown[start:end].strip()
+        return context, url
+
+    return extract_urls(markdown, pattern, surrounding_context_extractor)
+
+
+def extract_urls_only(markdown: str) -> Iterator[Tuple[str, str]]:
+    """
+    Extract URLs with empty context.
+
+    Args:
+        markdown: The markdown string to process
+
+    Returns:
+        Iterator of (empty_context, url) pairs
+    """
+    # More comprehensive URL pattern
+    pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^\s]*)?")
+
+    def url_only_extractor(match: re.Match) -> Tuple[str, str]:
+        url = match.group(0)
+        return "", url
+
+    return extract_urls(markdown, pattern, url_only_extractor)
+
+
+def extract_html_links(markdown: str) -> Iterator[Tuple[str, str]]:
+    """
+    Extract URLs from HTML anchor tags.
+
+    Args:
+        markdown: The markdown or HTML string to process
+
+    Returns:
+        Iterator of (anchor_text, url) pairs
+    """
+    # Simple pattern for HTML anchor tags
+    pattern = re.compile(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)</a>')
+
+    def html_link_extractor(match: re.Match) -> Tuple[str, str]:
+        # Note the order is reversed in HTML: href first, then text
+        return match.group(2), match.group(1)
+
+    return extract_urls(markdown, pattern, html_link_extractor)
+
+
+extract_urls.with_surrounding_context = extract_with_surrounding_context
+extract_urls.only_urls = extract_urls_only
+extract_urls.html_links = extract_html_links
+
+DFLT_SAVE_DIR = os.path.expanduser("~/Downloads")
+
+
 def download_articles(
     md_string: str,
     save_dir: str = DFLT_SAVE_DIR,
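Since the full bodies of the new helpers are visible in the hunk above, they are easy to exercise. A short usage sketch follows; only the sample strings are invented, while the functions and their attribute-style aliases are exactly as added in the diff:

```python
from aix.contexts import extract_urls

md_text = "[Google](https://google.com) and [GitHub](https://github.com)"

# Default behavior: markdown hyperlinks, yielding (context, url) pairs.
print(list(extract_urls(md_text)))
# [('Google', 'https://google.com'), ('GitHub', 'https://github.com')]

# Bare URLs with empty context, via the alias attached to extract_urls.
print(list(extract_urls.only_urls("See https://example.com/docs for details")))
# [('', 'https://example.com/docs')]

# HTML anchor tags; the extractor swaps href/text back to (text, url).
html = 'See <a href="https://example.com">Example</a> here'
print(list(extract_urls.html_links(html)))
# [('Example', 'https://example.com')]

# Surrounding-text context: a character window around each bare URL.
text = "Read more at https://example.com today"
print(list(extract_urls.with_surrounding_context(text, context_chars=10)))
# [('d more at https://example.com today', 'https://example.com')]
```

One design note: attaching the variants as attributes of `extract_urls` keeps a single entry point while exposing alternative pattern/extractor pairings, and each variant simply delegates back to `extract_urls` with its own regex and extractor.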