janito 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- janito/__init__.py +1 -1
- janito/cli/agent/__init__.py +7 -0
- janito/cli/agent/conversation.py +149 -0
- janito/cli/agent/initialization.py +172 -0
- janito/cli/agent/query.py +108 -0
- janito/cli/agent.py +7 -282
- janito/cli/app.py +105 -9
- janito/cli/commands/__init__.py +12 -0
- janito/cli/commands/config.py +242 -0
- janito/cli/commands/history.py +119 -0
- janito/cli/commands/profile.py +72 -0
- janito/cli/commands/validation.py +24 -0
- janito/cli/commands/workspace.py +31 -0
- janito/cli/commands.py +9 -326
- janito/config.py +37 -0
- janito/data/instructions_template.txt +9 -5
- janito/tools/__init__.py +8 -2
- janito/tools/bash/bash.py +3 -1
- janito/tools/bash/unix_persistent_bash.py +183 -181
- janito/tools/bash/win_persistent_bash.py +4 -2
- janito/tools/fetch_webpage/__init__.py +22 -33
- janito/tools/fetch_webpage/core.py +182 -155
- janito/tools/rich_console.py +46 -9
- janito/tools/search_text.py +225 -238
- janito/tools/str_replace_editor/handlers/str_replace.py +3 -1
- janito/tools/str_replace_editor/handlers/view.py +14 -8
- janito/tools/think.py +37 -0
- janito/tools/usage_tracker.py +1 -0
- janito-0.14.0.dist-info/METADATA +396 -0
- janito-0.14.0.dist-info/RECORD +53 -0
- janito/test_file.py +0 -4
- janito/tools/fetch_webpage/chunking.py +0 -76
- janito/tools/fetch_webpage/extractors.py +0 -276
- janito/tools/fetch_webpage/news.py +0 -137
- janito/tools/fetch_webpage/utils.py +0 -108
- janito-0.12.0.dist-info/METADATA +0 -203
- janito-0.12.0.dist-info/RECORD +0 -47
- {janito-0.12.0.dist-info → janito-0.14.0.dist-info}/WHEEL +0 -0
- {janito-0.12.0.dist-info → janito-0.14.0.dist-info}/entry_points.txt +0 -0
- {janito-0.12.0.dist-info → janito-0.14.0.dist-info}/licenses/LICENSE +0 -0
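One upgrade note implied by the listing: the `janito.tools.fetch_webpage` helper modules (`chunking`, `extractors`, `news`, `utils`) are removed in 0.14.0 while `core.py` grows, so 0.12.0 code that imported those submodules directly will raise `ImportError` after upgrading. A minimal illustration of the breakage (the 0.14.0 replacement API is not shown in this diff, so it is not named here):

```python
# Imports that resolve on janito 0.12.0 but fail on 0.14.0, where these
# submodules no longer ship in the wheel (see the file listing above).
try:
    from janito.tools.fetch_webpage.extractors import extract_clean_text
    from janito.tools.fetch_webpage.utils import clean_text, similar_text
except ImportError:
    # Callers must switch to whatever the 0.14.0 fetch_webpage package exports.
    extract_clean_text = clean_text = similar_text = None
```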
--- janito/tools/fetch_webpage/extractors.py
+++ /dev/null
@@ -1,276 +0,0 @@
-"""
-Content extraction methods for web pages.
-"""
-
-from typing import List, Dict, Union, Optional
-from bs4 import BeautifulSoup
-import trafilatura
-from newspaper import Article
-import re
-
-from janito.tools.rich_console import print_info, print_success, print_warning
-from janito.tools.fetch_webpage.utils import clean_text, similar_text
-
-
-def extract_clean_text(html_content: str, method: str = 'trafilatura',
-                       url: Optional[str] = None, target_strings: List[str] = None) -> str:
-    """
-    Extract clean, relevant text from HTML content using various methods.
-
-    Args:
-        html_content: The HTML content to extract text from
-        method: The extraction method to use ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
-        url: Optional URL for methods that require it (like newspaper)
-        target_strings: Optional list of strings to target specific content sections
-
-    Returns:
-        Extracted text content
-    """
-    print_info(f"Extracting content using method: {method}", "Content Extraction")
-
-    extracted_text = ""
-
-    if method == 'trafilatura' or method == 'all':
-        try:
-            traf_text = trafilatura.extract(html_content, include_links=False,
-                                            include_tables=False, include_images=False,
-                                            favor_precision=True)
-            if traf_text and len(traf_text) > 100:
-                if method == 'trafilatura':
-                    print_success("Successfully extracted content with Trafilatura", "Content Extraction")
-                    return clean_text(traf_text)
-                extracted_text = traf_text
-                print_success("Successfully extracted content with Trafilatura", "Content Extraction")
-        except Exception as e:
-            print_warning(f"Content Extraction: Trafilatura extraction error: {str(e)}")
-
-    if method == 'newspaper' or method == 'all':
-        if not url:
-            print_warning("Content Extraction: URL required for newspaper extraction but not provided")
-        else:
-            try:
-                article = Article(url)
-                article.download(html_content)
-                article.parse()
-                np_text = article.text
-                if np_text and len(np_text) > 100:
-                    if method == 'newspaper':
-                        print_success("Successfully extracted content with Newspaper3k", "Content Extraction")
-                        return clean_text(np_text)
-                    if not extracted_text or len(np_text) > len(extracted_text):
-                        extracted_text = np_text
-                        print_success("Successfully extracted content with Newspaper3k", "Content Extraction")
-            except Exception as e:
-                print_warning(f"Content Extraction: Newspaper extraction error: {str(e)}")
-
-    if method == 'beautifulsoup' or method == 'all':
-        try:
-            soup = BeautifulSoup(html_content, 'html.parser')
-
-            # Remove script, style, and other non-content elements
-            for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
-                element.decompose()
-
-            # Extract text from paragraph and heading tags
-            paragraphs = []
-            for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article']):
-                text = tag.get_text(strip=True)
-                if text and len(text) > 20:  # Skip very short pieces that might be UI elements
-                    paragraphs.append(text)
-
-            bs_text = "\n\n".join(paragraphs)
-            if bs_text and len(bs_text) > 100:
-                if method == 'beautifulsoup':
-                    print_success("Successfully extracted content with BeautifulSoup", "Content Extraction")
-                    return clean_text(bs_text)
-                if not extracted_text or len(bs_text) > len(extracted_text):
-                    extracted_text = bs_text
-                    print_success("Successfully extracted content with BeautifulSoup", "Content Extraction")
-        except Exception as e:
-            print_warning(f"Content Extraction: BeautifulSoup extraction error: {str(e)}")
-
-    if not extracted_text:
-        print_warning("Content Extraction: Could not extract meaningful content with any method")
-        # Fall back to the raw text with HTML tags removed
-        extracted_text = BeautifulSoup(html_content, 'html.parser').get_text(separator='\n')
-
-    return clean_text(extracted_text)
-
-
-def extract_targeted_content(html_content: str, target_strings: List[str],
-                             context_size: int = 500) -> str:
-    """
-    Extract content sections that contain specific target strings.
-
-    Args:
-        html_content: The HTML content to search within
-        target_strings: List of strings to search for in the content
-        context_size: Number of characters to include before and after each match
-
-    Returns:
-        Extracted content focusing on sections containing target strings
-    """
-    if not target_strings:
-        return ""
-
-    print_info(f"Extracting content targeted around {len(target_strings)} search strings", "Targeted Extraction")
-
-    # First clean the HTML to make text extraction easier
-    soup = BeautifulSoup(html_content, 'html.parser')
-
-    # Remove script, style, and other non-content elements
-    for element in soup(['script', 'style', 'header', 'footer', 'nav']):
-        element.decompose()
-
-    # Get the full text content
-    full_text = soup.get_text(' ', strip=True)
-    full_text = re.sub(r'\s+', ' ', full_text)  # Normalize whitespace
-
-    matched_sections = []
-    for target in target_strings:
-        if not target or len(target) < 3:
-            continue
-
-        # Try exact match first
-        if target in full_text:
-            indices = [m.start() for m in re.finditer(re.escape(target), full_text)]
-            for idx in indices:
-                start = max(0, idx - context_size)
-                end = min(len(full_text), idx + len(target) + context_size)
-                section = full_text[start:end]
-                # Add ellipsis if we're showing a fragment
-                if start > 0:
-                    section = "..." + section
-                if end < len(full_text):
-                    section = section + "..."
-                matched_sections.append(section)
-        else:
-            # Try fuzzy search if no exact match (looking for words in the target string)
-            words = [w for w in target.lower().split() if len(w) > 3]
-            if words:
-                for word in words:
-                    pattern = r'\b' + re.escape(word) + r'\b'
-                    matches = list(re.finditer(pattern, full_text.lower()))
-                    for match in matches[:3]:  # Limit to first 3 matches per word
-                        idx = match.start()
-                        start = max(0, idx - context_size)
-                        end = min(len(full_text), idx + len(word) + context_size)
-                        section = full_text[start:end]
-                        if start > 0:
-                            section = "..." + section
-                        if end < len(full_text):
-                            section = section + "..."
-                        matched_sections.append(section)
-
-    # Deduplicate similar sections
-    unique_sections = []
-    for section in matched_sections:
-        if not any(similar_text(section, existing, threshold=0.7) for existing in unique_sections):
-            unique_sections.append(section)
-
-    if not unique_sections:
-        print_warning("Targeted Extraction: No content sections found matching the target strings")
-        return ""
-
-    # Join the sections with paragraph breaks
-    result = "\n\n".join(unique_sections)
-    print_success(f"Found {len(unique_sections)} relevant content sections", "Targeted Extraction")
-
-    return clean_text(result)
-
-
-def extract_structured_content(html_content: str, url: str = None,
-                               target_strings: List[str] = None) -> Dict[str, Union[str, List[str]]]:
-    """
-    Extract structured content from a webpage, including title, main text, and key points.
-
-    Args:
-        html_content: The HTML content to extract from
-        url: Optional URL for methods that require it
-        target_strings: Optional list of strings to target specific content sections
-
-    Returns:
-        Dictionary with structured content elements
-    """
-    soup = BeautifulSoup(html_content, 'html.parser')
-
-    # Extract title
-    title = ""
-    if soup.title:
-        title = soup.title.text.strip()
-
-    # Try to get more specific title from h1 if title looks generic
-    if not title or len(title) < 10:
-        h1_tags = soup.find_all('h1')
-        if h1_tags and len(h1_tags[0].text.strip()) > 10:
-            title = h1_tags[0].text.strip()
-
-    # Extract main content using trafilatura (our primary extractor)
-    main_text = extract_clean_text(html_content, method='trafilatura', url=url)
-
-    # If target strings are provided, prioritize content around those strings
-    targeted_text = ""
-    if target_strings:
-        targeted_text = extract_targeted_content(html_content, target_strings)
-        if targeted_text:
-            main_text = targeted_text
-
-    # Extract key points (using headers)
-    key_points = []
-    for header in soup.find_all(['h1', 'h2', 'h3']):
-        text = header.text.strip()
-        if text and len(text) > 5 and text not in key_points:
-            key_points.append(text)
-
-    # For news aggregators like Google News, look for news article titles specifically
-    if url and ('news.google.com' in url or 'news.yahoo.com' in url or 'msn.com/news' in url):
-        print_info("Detected news aggregator site, searching for article titles", "Content Extraction")
-
-        # Look for common news article title patterns
-        article_titles = []
-
-        # Google News specific article elements
-        articles = soup.find_all('article')
-        for article in articles[:20]:  # Limit to first 20 articles
-            # Try to find the headline
-            headline = article.find(['h3', 'h4'])
-            if headline:
-                title = headline.text.strip()
-                if title and len(title) > 15 and title not in article_titles:  # Skip short titles
-                    article_titles.append(title)
-
-        # Add these to our key points
-        if article_titles:
-            key_points = article_titles + key_points
-
-    # Limit key points to most important ones
-    key_points = key_points[:15]
-
-    # Extract potential highlights (often in <strong>, <b>, <em> tags)
-    highlights = []
-    for tag in soup.find_all(['strong', 'b', 'em']):
-        text = tag.text.strip()
-        if text and len(text) > 15 and text not in highlights:
-            highlights.append(text)
-
-    # Limit highlights to most important ones
-    highlights = highlights[:5]
-
-    # Create a summary of the extracted content
-    summary = ""
-    if len(main_text) > 200:
-        # Extract first paragraph or two for summary
-        paragraphs = main_text.split('\n\n')
-        summary = '\n\n'.join(paragraphs[:2])
-        if len(summary) > 500:
-            summary = summary[:500] + "..."
-
-    return {
-        "title": title,
-        "main_text": main_text,
-        "key_points": key_points,
-        "highlights": highlights,
-        "summary": summary,
-        "word_count": len(main_text.split()),
-        "targeted_extraction": bool(target_strings and targeted_text)
-    }
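For reference, the hunk above is the whole of the removed extractors module. A minimal sketch of how its public helpers were called under janito 0.12.0, assuming that version and its extraction dependencies (beautifulsoup4, trafilatura, newspaper3k) are installed; the sample HTML is made up:

```python
from janito.tools.fetch_webpage.extractors import (
    extract_clean_text,
    extract_structured_content,
)

html = (
    "<html><head><title>Example page</title></head><body>"
    "<h1>Example heading</h1><p>" + "Some body text. " * 20 + "</p>"
    "</body></html>"
)

# Single-method extraction; method is 'trafilatura', 'newspaper',
# 'beautifulsoup', or 'all'.
text = extract_clean_text(html, method="beautifulsoup")

# Structured extraction returns a dict with title, main_text, key_points,
# highlights, summary, word_count, and targeted_extraction.
info = extract_structured_content(html, target_strings=["Example heading"])
print(info["title"], info["word_count"])
```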
--- janito/tools/fetch_webpage/news.py
+++ /dev/null
@@ -1,137 +0,0 @@
-"""
-Specialized functionality for handling news aggregator sites.
-"""
-
-from typing import Tuple, List
-from urllib.parse import urlparse
-from bs4 import BeautifulSoup
-import re
-
-from janito.tools.rich_console import print_info, print_success, print_warning
-from janito.tools.usage_tracker import track_usage
-from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
-# Import moved to function to avoid circular imports
-from janito.tools.fetch_webpage.extractors import extract_clean_text
-from janito.tools.fetch_webpage.chunking import chunk_large_content
-
-
-@track_usage('web_content')
-def fetch_and_extract_news_aggregator(url: str, max_stories: int = 15) -> Tuple[str, bool]:
-    """
-    Specialized extraction for news aggregator sites like Google News.
-
-    Args:
-        url: The URL of the news aggregator site
-        max_stories: Maximum number of stories to extract
-
-    Returns:
-        A tuple containing (extracted_content, is_error)
-    """
-    domain = urlparse(url).netloc
-
-    # Check if we have a specific strategy for this domain
-    strategy = None
-    for site_domain, site_strategy in SITE_SPECIFIC_STRATEGIES.items():
-        if site_domain in domain:
-            strategy = site_strategy
-            break
-
-    if not strategy:
-        print_warning(f"News Extraction: No specific strategy found for {domain}. Using general extraction.")
-        from janito.tools.fetch_webpage.core import fetch_and_extract
-        return fetch_and_extract(url)
-
-    print_info(f"Using specialized extraction for {domain}", "News Extraction")
-
-    # Import here to avoid circular imports
-    from janito.tools.fetch_webpage.core import fetch_webpage
-
-    # Fetch the page
-    html_content, is_error = fetch_webpage(url, max_size=2000000)  # Limit to 2MB for news sites
-
-    if is_error:
-        return html_content, True
-
-    # Extract content using the site-specific strategy
-    extracted_text = extract_clean_text(
-        html_content,
-        method=strategy.get("method", "beautifulsoup"),
-        url=url,
-        target_strings=strategy.get("target_strings", [])
-    )
-
-    if not extracted_text or len(extracted_text) < 100:
-        return f"Could not extract meaningful content from {url}", True
-
-    # Get article titles and snippets using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'html.parser')
-
-    article_titles = []
-    article_snippets = []
-
-    # Use site-specific selectors
-    selectors = strategy.get("article_selectors", ["article", "h3", "h4"])
-
-    # Find article elements
-    for selector in selectors:
-        elements = soup.select(selector)
-        for element in elements[:max_stories*2]:  # Get more than we need, then filter
-            text = element.get_text(strip=True)
-            if text and len(text) > 15:
-                if len(text) < 200:  # Likely a title
-                    if text not in article_titles:
-                        article_titles.append(text)
-                else:  # Likely a snippet
-                    if text not in article_snippets:
-                        article_snippets.append(text)
-
-    # Limit to requested number of stories
-    article_titles = article_titles[:max_stories]
-    article_snippets = article_snippets[:max_stories]
-
-    # Format the result
-    result = ["# Top Stories\n"]
-
-    # Add titles and snippets
-    for i, title in enumerate(article_titles):
-        result.append(f"## {title}")
-        # Try to find a matching snippet
-        snippet_added = False
-        for snippet in article_snippets:
-            # Check if any significant words from title appear in snippet
-            title_words = set(re.findall(r'\b\w{5,}\b', title.lower()))
-            if any(word in snippet.lower() for word in title_words if len(word) > 4):
-                result.append(f"{snippet[:300]}...")
-                snippet_added = True
-                break
-
-        if not snippet_added and i < len(article_snippets):
-            result.append(f"{article_snippets[i][:300]}...")
-
-        result.append("")  # Add spacing between articles
-
-    # If we didn't get enough specific articles, add some generic extracted content
-    if len(article_titles) < 3:
-        # Chunk the generic extracted content
-        chunks = chunk_large_content(extracted_text, chunk_size=2000, overlap=200)
-        relevant_chunks = []
-
-        # Find chunks that look like news
-        for chunk in chunks[:10]:
-            if any(marker in chunk for marker in [":", " - ", "reports", "according to", "says"]):
-                relevant_chunks.append(chunk)
-
-        if relevant_chunks:
-            result.append("# Additional News Content\n")
-            result.append("\n".join(relevant_chunks[:3]))
-
-    max_length = strategy.get("max_length", 15000)
-    final_text = "\n".join(result)
-
-    # Truncate if needed
-    if len(final_text) > max_length:
-        print_info(f"Truncating content from {len(final_text)} to {max_length} characters", "News Extraction")
-        final_text = final_text[:max_length] + "..."
-
-    print_success(f"Successfully extracted {len(article_titles)} news stories", "News Extraction")
-    return final_text, False
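The hunk above is the removed news-aggregator helper. A sketch of its single entry point as it behaved in 0.12.0; it fetches a live page, so it needs network access, and the output depends on the aggregator's current markup matching the selectors in SITE_SPECIFIC_STRATEGIES:

```python
from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator

content, is_error = fetch_and_extract_news_aggregator(
    "https://news.google.com", max_stories=5
)
if is_error:
    print(f"Extraction failed: {content}")
else:
    # Markdown-style "# Top Stories" digest, truncated to the strategy's max_length.
    print(content)
```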
--- janito/tools/fetch_webpage/utils.py
+++ /dev/null
@@ -1,108 +0,0 @@
-"""
-Utility functions and constants for the fetch_webpage package.
-"""
-
-import re
-import html
-import unicodedata
-
-# Dictionary of known content types and extraction strategies
-SITE_SPECIFIC_STRATEGIES = {
-    "news.google.com": {
-        "method": "beautifulsoup",
-        "target_strings": [
-            "Top stories", "Headlines", "For you",
-            "U.S.", "World", "Business", "Technology",
-            "Entertainment", "Sports", "Science", "Health"
-        ],
-        "max_length": 20000,
-        "article_selectors": ["article", "h3", "h4", ".ipQwMb", ".BOz6fb", ".MgUUmf"]
-    },
-    "news.yahoo.com": {
-        "method": "beautifulsoup",
-        "target_strings": ["Top Stories", "Trending News"],
-        "max_length": 20000,
-        "article_selectors": [".js-stream-content", ".js-content", "h3", "h2"]
-    },
-    "msn.com": {
-        "method": "newspaper",
-        "max_length": 20000,
-        "target_strings": ["Top stories", "Headlines"]
-    },
-    "reddit.com": {
-        "method": "trafilatura",
-        "target_strings": ["comments", "Posted by", "communities"],
-        "max_length": 15000,
-        "article_selectors": [".Post", "h1", "h2", ".title"]
-    },
-    "twitter.com": {
-        "method": "beautifulsoup",
-        "target_strings": ["Trending", "Following", "For you"],
-        "max_length": 15000,
-        "article_selectors": [".tweet", ".content", "[data-testid='tweet']"]
-    }
-}
-
-
-def clean_text(text: str) -> str:
-    """
-    Clean extracted text by removing extra whitespace, normalizing Unicode, etc.
-
-    Args:
-        text: The text to clean
-
-    Returns:
-        Cleaned text
-    """
-    # Decode HTML entities
-    text = html.unescape(text)
-
-    # Normalize Unicode characters
-    text = unicodedata.normalize('NFKC', text)
-
-    # Remove excess whitespace
-    text = re.sub(r'\s+', ' ', text)
-
-    # Remove duplicate newlines (but preserve paragraph breaks)
-    text = re.sub(r'\n{3,}', '\n\n', text)
-
-    # Remove very short lines that are often menu items or UI elements
-    lines = [line for line in text.split('\n') if len(line.strip()) > 20]
-    text = '\n'.join(lines)
-
-    return text.strip()
-
-
-def similar_text(text1: str, text2: str, threshold: float = 0.7) -> bool:
-    """
-    Check if two text strings are similar using a simple similarity metric.
-
-    Args:
-        text1: First text string
-        text2: Second text string
-        threshold: Similarity threshold (0-1)
-
-    Returns:
-        True if texts are similar, False otherwise
-    """
-    # Simple character-based similarity
-    if len(text1) == 0 or len(text2) == 0:
-        return False
-
-    # If one string is much shorter than the other, they're not similar
-    if len(text1) < len(text2) * 0.5 or len(text2) < len(text1) * 0.5:
-        return False
-
-    # Check for substring relationship
-    if text1 in text2 or text2 in text1:
-        return True
-
-    # Simple character-based similarity for short strings
-    if len(text1) < 200 and len(text2) < 200:
-        shorter = text1 if len(text1) <= len(text2) else text2
-        longer = text2 if len(text1) <= len(text2) else text1
-
-        matches = sum(c1 == c2 for c1, c2 in zip(shorter, longer))
-        return matches / len(shorter) >= threshold
-
-    return False
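The hunk above is the removed utils module. Its two helpers are pure string functions; a quick illustration of their 0.12.0 behavior:

```python
from janito.tools.fetch_webpage.utils import clean_text, similar_text

raw = "Breaking&nbsp;news:   markets  rally\n\n\nfollowing stronger earnings"
print(clean_text(raw))
# -> "Breaking news: markets rally following stronger earnings"
#    (entities decoded, Unicode normalized, whitespace collapsed)

# similar_text treats comparably sized strings that share a substring (or a
# high position-by-position character overlap) as duplicates.
print(similar_text("Markets rally on earnings", "Markets rally on earnings today"))  # True
print(similar_text("Markets rally on earnings", "Central bank holds rates"))         # False
```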