janito 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,137 +0,0 @@
- """
- Specialized functionality for handling news aggregator sites.
- """
-
- from typing import Tuple, List
- from urllib.parse import urlparse
- from bs4 import BeautifulSoup
- import re
-
- from janito.tools.rich_console import print_info, print_success, print_warning
- from janito.tools.usage_tracker import track_usage
- from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
- # Import moved to function to avoid circular imports
- from janito.tools.fetch_webpage.extractors import extract_clean_text
- from janito.tools.fetch_webpage.chunking import chunk_large_content
-
-
- @track_usage('web_content')
- def fetch_and_extract_news_aggregator(url: str, max_stories: int = 15) -> Tuple[str, bool]:
-     """
-     Specialized extraction for news aggregator sites like Google News.
-
-     Args:
-         url: The URL of the news aggregator site
-         max_stories: Maximum number of stories to extract
-
-     Returns:
-         A tuple containing (extracted_content, is_error)
-     """
-     domain = urlparse(url).netloc
-
-     # Check if we have a specific strategy for this domain
-     strategy = None
-     for site_domain, site_strategy in SITE_SPECIFIC_STRATEGIES.items():
-         if site_domain in domain:
-             strategy = site_strategy
-             break
-
-     if not strategy:
-         print_warning(f"News Extraction: No specific strategy found for {domain}. Using general extraction.")
-         from janito.tools.fetch_webpage.core import fetch_and_extract
-         return fetch_and_extract(url)
-
-     print_info(f"Using specialized extraction for {domain}", "News Extraction")
-
-     # Import here to avoid circular imports
-     from janito.tools.fetch_webpage.core import fetch_webpage
-
-     # Fetch the page
-     html_content, is_error = fetch_webpage(url, max_size=2000000) # Limit to 2MB for news sites
-
-     if is_error:
-         return html_content, True
-
-     # Extract content using the site-specific strategy
-     extracted_text = extract_clean_text(
-         html_content,
-         method=strategy.get("method", "beautifulsoup"),
-         url=url,
-         target_strings=strategy.get("target_strings", [])
-     )
-
-     if not extracted_text or len(extracted_text) < 100:
-         return f"Could not extract meaningful content from {url}", True
-
-     # Get article titles and snippets using BeautifulSoup
-     soup = BeautifulSoup(html_content, 'html.parser')
-
-     article_titles = []
-     article_snippets = []
-
-     # Use site-specific selectors
-     selectors = strategy.get("article_selectors", ["article", "h3", "h4"])
-
-     # Find article elements
-     for selector in selectors:
-         elements = soup.select(selector)
-         for element in elements[:max_stories*2]: # Get more than we need, then filter
-             text = element.get_text(strip=True)
-             if text and len(text) > 15:
-                 if len(text) < 200: # Likely a title
-                     if text not in article_titles:
-                         article_titles.append(text)
-                 else: # Likely a snippet
-                     if text not in article_snippets:
-                         article_snippets.append(text)
-
-     # Limit to requested number of stories
-     article_titles = article_titles[:max_stories]
-     article_snippets = article_snippets[:max_stories]
-
-     # Format the result
-     result = ["# Top Stories\n"]
-
-     # Add titles and snippets
-     for i, title in enumerate(article_titles):
-         result.append(f"## {title}")
-         # Try to find a matching snippet
-         snippet_added = False
-         for snippet in article_snippets:
-             # Check if any significant words from title appear in snippet
-             title_words = set(re.findall(r'\b\w{5,}\b', title.lower()))
-             if any(word in snippet.lower() for word in title_words if len(word) > 4):
-                 result.append(f"{snippet[:300]}...")
-                 snippet_added = True
-                 break
-
-         if not snippet_added and i < len(article_snippets):
-             result.append(f"{article_snippets[i][:300]}...")
-
-         result.append("") # Add spacing between articles
-
-     # If we didn't get enough specific articles, add some generic extracted content
-     if len(article_titles) < 3:
-         # Chunk the generic extracted content
-         chunks = chunk_large_content(extracted_text, chunk_size=2000, overlap=200)
-         relevant_chunks = []
-
-         # Find chunks that look like news
-         for chunk in chunks[:10]:
-             if any(marker in chunk for marker in [":", " - ", "reports", "according to", "says"]):
-                 relevant_chunks.append(chunk)
-
-         if relevant_chunks:
-             result.append("# Additional News Content\n")
-             result.append("\n".join(relevant_chunks[:3]))
-
-     max_length = strategy.get("max_length", 15000)
-     final_text = "\n".join(result)
-
-     # Truncate if needed
-     if len(final_text) > max_length:
-         print_info(f"Truncating content from {len(final_text)} to {max_length} characters", "News Extraction")
-         final_text = final_text[:max_length] + "..."
-
-     print_success(f"Successfully extracted {len(article_titles)} news stories", "News Extraction")
-     return final_text, False
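For context, here is a minimal sketch of how the removed helper could have been called against the 0.13.0 layout (both files in this diff are gone in 0.14.0). The module name `news` is an assumption; the diff does not show the deleted file's path.

```python
# Hedged sketch, not part of the diff: invoking the removed news-aggregator helper.
# The module name "news" is assumed; only the package janito.tools.fetch_webpage is known.
from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator

content, is_error = fetch_and_extract_news_aggregator(
    "https://news.google.com", max_stories=10
)
if not is_error:
    print(content)  # Markdown-style digest starting with "# Top Stories"
```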
@@ -1,108 +0,0 @@
- """
- Utility functions and constants for the fetch_webpage package.
- """
-
- import re
- import html
- import unicodedata
-
- # Dictionary of known content types and extraction strategies
- SITE_SPECIFIC_STRATEGIES = {
-     "news.google.com": {
-         "method": "beautifulsoup",
-         "target_strings": [
-             "Top stories", "Headlines", "For you",
-             "U.S.", "World", "Business", "Technology",
-             "Entertainment", "Sports", "Science", "Health"
-         ],
-         "max_length": 20000,
-         "article_selectors": ["article", "h3", "h4", ".ipQwMb", ".BOz6fb", ".MgUUmf"]
-     },
-     "news.yahoo.com": {
-         "method": "beautifulsoup",
-         "target_strings": ["Top Stories", "Trending News"],
-         "max_length": 20000,
-         "article_selectors": [".js-stream-content", ".js-content", "h3", "h2"]
-     },
-     "msn.com": {
-         "method": "newspaper",
-         "max_length": 20000,
-         "target_strings": ["Top stories", "Headlines"]
-     },
-     "reddit.com": {
-         "method": "trafilatura",
-         "target_strings": ["comments", "Posted by", "communities"],
-         "max_length": 15000,
-         "article_selectors": [".Post", "h1", "h2", ".title"]
-     },
-     "twitter.com": {
-         "method": "beautifulsoup",
-         "target_strings": ["Trending", "Following", "For you"],
-         "max_length": 15000,
-         "article_selectors": [".tweet", ".content", "[data-testid='tweet']"]
-     }
- }
-
-
- def clean_text(text: str) -> str:
-     """
-     Clean extracted text by removing extra whitespace, normalizing Unicode, etc.
-
-     Args:
-         text: The text to clean
-
-     Returns:
-         Cleaned text
-     """
-     # Decode HTML entities
-     text = html.unescape(text)
-
-     # Normalize Unicode characters
-     text = unicodedata.normalize('NFKC', text)
-
-     # Remove excess whitespace
-     text = re.sub(r'\s+', ' ', text)
-
-     # Remove duplicate newlines (but preserve paragraph breaks)
-     text = re.sub(r'\n{3,}', '\n\n', text)
-
-     # Remove very short lines that are often menu items or UI elements
-     lines = [line for line in text.split('\n') if len(line.strip()) > 20]
-     text = '\n'.join(lines)
-
-     return text.strip()
-
-
- def similar_text(text1: str, text2: str, threshold: float = 0.7) -> bool:
-     """
-     Check if two text strings are similar using a simple similarity metric.
-
-     Args:
-         text1: First text string
-         text2: Second text string
-         threshold: Similarity threshold (0-1)
-
-     Returns:
-         True if texts are similar, False otherwise
-     """
-     # Simple character-based similarity
-     if len(text1) == 0 or len(text2) == 0:
-         return False
-
-     # If one string is much shorter than the other, they're not similar
-     if len(text1) < len(text2) * 0.5 or len(text2) < len(text1) * 0.5:
-         return False
-
-     # Check for substring relationship
-     if text1 in text2 or text2 in text1:
-         return True
-
-     # Simple character-based similarity for short strings
-     if len(text1) < 200 and len(text2) < 200:
-         shorter = text1 if len(text1) <= len(text2) else text2
-         longer = text2 if len(text1) <= len(text2) else text1
-
-         matches = sum(c1 == c2 for c1, c2 in zip(shorter, longer))
-         return matches / len(shorter) >= threshold
-
-     return False
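Similarly, a hedged sketch of the two removed utilities in isolation; the module path `janito.tools.fetch_webpage.utils` matches the import shown in the first hunk, and this reflects the 0.13.0 layout only.

```python
# Hedged sketch, not part of the diff: standalone use of the removed utilities (0.13.0 only).
from janito.tools.fetch_webpage.utils import clean_text, similar_text

raw = "Breaking&nbsp;news:&nbsp;&nbsp;markets rally on strong earnings reports today"
print(clean_text(raw))  # HTML entities decoded, Unicode normalized, whitespace collapsed

# A substring relationship counts as similar; very different lengths do not.
print(similar_text("markets rally today", "markets rally today!"))  # True
print(similar_text("markets rally", "a completely different and much longer sentence"))  # False
```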