ngpt 3.4.5-py3-none-any.whl → 3.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ngpt/utils/web_search.py +386 -66
- {ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/METADATA +94 -125
- {ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/RECORD +6 -6
- {ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/WHEEL +0 -0
- {ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/entry_points.txt +0 -0
- {ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/licenses/LICENSE +0 -0
ngpt/utils/web_search.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Web search utilities for nGPT using
+Web search utilities for nGPT using BeautifulSoup4.
 
 This module provides functionality to search the web and extract
 information from search results to enhance AI prompts.
@@ -7,11 +7,13 @@ information from search results to enhance AI prompts.
 
 import re
 from typing import List, Dict, Any, Optional
-from
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs
 import requests
 import sys
 import datetime
+from bs4 import BeautifulSoup
+from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction
+import json
 
 # Get actual logger from global context instead of using standard logging
 from . import log
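The new `parse_qs` import exists to unwrap DuckDuckGo's redirect links, which carry the real target URL in a `uddg` query parameter. A minimal sketch of that decoding step (the redirect path below is made up for illustration):

```python
from urllib.parse import urlparse, parse_qs

# Hypothetical DuckDuckGo redirect href as it appears in the HTML results page
href = "/l/?uddg=https%3A%2F%2Fexample.com%2Farticle&rut=abc123"

params = parse_qs(urlparse(href).query)
actual_url = params.get('uddg', [None])[0]  # parse_qs also percent-decodes the value
print(actual_url)  # https://example.com/article
```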
@@ -39,7 +41,7 @@ def get_logger():
 
 def perform_web_search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
     """
-    Search
+    Search DuckDuckGo directly and return relevant results.
 
     Args:
         query: The search query
@@ -50,17 +52,58 @@ def perform_web_search(query: str, max_results: int = 5) -> List[Dict[str, Any]]
     """
     logger = get_logger()
     try:
-
-
+        # Headers to mimic a browser request
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        }
+
+        # DuckDuckGo search URL
+        encoded_query = requests.utils.quote(query)
+        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
+
+        # Fetch search results
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        # Parse HTML response with html.parser (no lxml dependency)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        results = []
+
+        # Extract search results
+        for result in soup.select('.result')[:max_results]:
+            title_elem = result.select_one('.result__title')
+            snippet_elem = result.select_one('.result__snippet')
+            url_elem = result.select_one('.result__url')
+
+            # Extract actual URL from DDG's redirect URL if needed
+            href = title_elem.find('a')['href'] if title_elem and title_elem.find('a') else None
+            if href and href.startswith('/'):
+                # Parse DDG redirect URL to get actual URL
+                parsed_url = urlparse(href)
+                query_params = parse_qs(parsed_url.query)
+                actual_url = query_params.get('uddg', [None])[0]
+            else:
+                actual_url = href
+
+            # Add result to list
+            if title_elem and actual_url:
+                results.append({
+                    'title': title_elem.get_text(strip=True),
+                    'href': actual_url,
+                    'body': snippet_elem.get_text(strip=True) if snippet_elem else ''
+                })
+
         return results
     except Exception as e:
         logger.error(f"Error performing web search: {str(e)}")
         logger.info("Web search encountered an issue, but will continue with available results")
         return []
 
-def extract_article_content(url: str, max_chars: int =
+def extract_article_content(url: str, max_chars: int = 5000) -> Optional[str]:
     """
-    Extract and clean content from a webpage URL
+    Extract and clean content from a webpage URL using a hybrid approach
+    inspired by trafilatura and readability algorithms.
 
     Args:
         url: The URL to extract content from
@@ -81,82 +124,333 @@ def extract_article_content(url: str, max_chars: int = 2000) -> Optional[str]:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',  # Do Not Track
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
+            'Cache-Control': 'max-age=0',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Pragma': 'no-cache',
         }
 
         logger.info(f"Fetching content from {url}")
 
         try:
-            #
-
+            # Fetch the page content
+            response = requests.get(url, headers=headers, timeout=15)
 
-
-
-
-
-
-
-
-
-
+            if response.status_code == 200:
+                # Try to detect the encoding if not properly specified
+                if response.encoding == 'ISO-8859-1':
+                    # Try to detect encoding from content
+                    possible_encoding = re.search(r'charset=["\'](.*?)["\']', response.text)
+                    if possible_encoding:
+                        response.encoding = possible_encoding.group(1)
+                    else:
+                        # Default to UTF-8 if we can't detect
+                        response.encoding = 'utf-8'
+
+                # Parse with BeautifulSoup using html.parser
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Extract main content using multiple strategies
+                extracted_content = None
+
+                # ---------- PREPROCESSING ----------
+                # Clone the soup before preprocessing
+                processed_soup = BeautifulSoup(str(soup), 'html.parser')
+
+                # Remove all script, style tags and comments
+                for element in processed_soup.find_all(['script', 'style', 'noscript']):
+                    element.decompose()
 
-
-
-
+                # Remove HTML comments
+                for comment in processed_soup.find_all(text=lambda text: isinstance(text, Comment)):
+                    comment.extract()
+
+                # Remove hidden elements
+                for hidden in processed_soup.find_all(style=lambda s: s and isinstance(s, str) and ('display:none' in s.lower() or 'visibility:hidden' in s.lower())):
+                    hidden.decompose()
+                for hidden in processed_soup.find_all(hidden=True):
+                    hidden.decompose()
+                for hidden in processed_soup.find_all(class_=lambda c: c and isinstance(c, str) and any(x in c.lower() for x in ['hidden', 'invisible'])):
+                    hidden.decompose()
+
+                # Handle iframes and frames
+                for frame in processed_soup.find_all(['iframe', 'frame']):
+                    frame.decompose()
+
+                # ---------- SITE-SPECIFIC HANDLING ----------
+                domain = parsed_url.netloc.lower()
+
+                # Wikipedia-specific extraction
+                if 'wikipedia.org' in domain:
+                    content_div = processed_soup.select_one('#mw-content-text')
+                    if content_div:
+                        # Remove tables, references, navigation elements
+                        for unwanted in content_div.select('table, .reference, .reflist, .navbox, .vertical-navbox, .thumbcaption, .mw-editsection, .mw-headline, .toc, #toc'):
+                            unwanted.decompose()
+                        extracted_content = content_div.get_text(separator=' ', strip=True)
+
+                # News site specific handling
+                news_sites = {
+                    'cnn.com': ['article', '.article__content', '.l-container', '.body-text', '#body-text'],
+                    'bbc.com': ['.article__body-content', '.story-body__inner', '[data-component="text-block"]'],
+                    'nytimes.com': ['article', '.meteredContent', '.StoryBodyCompanionColumn', '.article-body'],
+                    'reuters.com': ['article', '.ArticleBody__content___3MtHP', '.article-body'],
+                    'theguardian.com': ['.article-body-commercial-selector', '.content__article-body', '.dcr-1cas96z'],
+                    'washingtonpost.com': ['.article-body', '.teaser-content'],
+                    'apnews.com': ['.Article', '.RichTextStoryBody'],
+                    'indiatimes.com': ['.article-body', '.article_content', '.article-desc', '.Normal'],
+                    'cnbc.com': ['.ArticleBody-articleBody', '.group-article-body', '.article-body'],
+                    'thehindu.com': ['.article-body', '.article-text', '#content-body-14269002']
+                }
+
+                if not extracted_content:
+                    # Check if we're on a known news site
+                    for site, selectors in news_sites.items():
+                        if site in domain:
+                            for selector in selectors:
+                                content_element = processed_soup.select_one(selector)
+                                if content_element:
+                                    # Clean the news content
+                                    for unwanted in content_element.select('aside, figure, .ad, .ads, .advertisement, .social, .share, .related, .newsletter, .more-on, .read-more, .promotions'):
+                                        unwanted.decompose()
+                                    extracted_content = content_element.get_text(separator=' ', strip=True)
+                                    break
+                            if extracted_content:
+                                break
+
+                # ---------- JSON-LD EXTRACTION ----------
+                if not extracted_content:
+                    # Look for structured data in JSON-LD format
+                    json_ld = processed_soup.find_all('script', type='application/ld+json')
+                    for script in json_ld:
+                        try:
+                            script_content = script.string
+                            if not script_content:  # Skip empty scripts
+                                continue
+
+                            # Clean the JSON string (some sites have invalid JSON)
+                            script_content = re.sub(r'[\n\t\r]', '', script_content)
+                            script_content = script_content.strip()
+
+                            data = json.loads(script_content)
+                            # Handle both single objects and arrays of objects
+                            if isinstance(data, list):
+                                data_list = data
+                            else:
+                                data_list = [data]
+
+                            for item in data_list:
+                                article_body = None
+                                # Try to find articleBody or various content fields
+                                if isinstance(item, dict):
+                                    # Check for common content fields directly
+                                    for field in ['articleBody', 'description', 'text', 'mainEntityOfPage']:
+                                        if field in item and isinstance(item[field], str) and len(item[field]) > 200:
+                                            article_body = item[field]
+                                            break
+
+                                    # Check in nested objects
+                                    if not article_body and '@graph' in item and isinstance(item['@graph'], list):
+                                        for graph_item in item['@graph']:
+                                            if isinstance(graph_item, dict):
+                                                for field in ['articleBody', 'description', 'text']:
+                                                    if field in graph_item and isinstance(graph_item[field], str) and len(graph_item[field]) > 200:
+                                                        article_body = graph_item[field]
+                                                        break
+                                            if article_body:
+                                                break
+
+                                if article_body:
+                                    extracted_content = article_body
+                                    break
+
+                            if extracted_content:
+                                break
+                        except (json.JSONDecodeError, TypeError, AttributeError, ValueError) as e:
+                            logger.debug(f"Error parsing JSON-LD: {str(e)}")
+                            continue
+
+                # ---------- META DESCRIPTION EXTRACTION ----------
+                meta_description = None
+                meta_tag = processed_soup.find('meta', attrs={'name': 'description'}) or processed_soup.find('meta', attrs={'property': 'og:description'})
+                if meta_tag and meta_tag.get('content'):
+                    meta_description = meta_tag.get('content')
+
+                # ---------- CONTENT ANALYSIS ----------
+                if not extracted_content:
+                    # Get all content blocks (divs, sections, articles)
+                    content_blocks = []
 
-                #
-
-
+                    # Prioritize semantic tags
+                    for tag in ['article', 'main', 'section', 'div']:
+                        blocks = processed_soup.find_all(tag)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        for block in blocks:
+                            # Skip if too small
+                            text = block.get_text(strip=True)
+                            if len(text) < 200:
+                                continue
+
+                            # Calculate content metrics
+                            char_count = len(text)
+                            link_density = calculate_link_density(block)
+                            p_count = len(block.find_all('p'))
+                            p_text_length = sum(len(p.get_text(strip=True)) for p in block.find_all('p'))
+                            p_density = p_text_length / char_count if char_count > 0 else 0
+
+                            # Skip blocks with high link density (likely navigation)
+                            if link_density > 0.5:
+                                continue
+
+                            # Calculate readability scores
+                            text_density = char_count / (len(str(block)) + 1)  # Text to HTML ratio
+
+                            # Score content blocks
+                            score = 0
+
+                            # Prefer blocks with many paragraphs
+                            score += min(p_count * 5, 50)  # Max 50 points for paragraphs
+
+                            # Prefer blocks with high paragraph text density
+                            score += min(int(p_density * 100), 50)  # Max 50 points for paragraph density
+
+                            # Penalize high link density
+                            score -= int(link_density * 100)
+
+                            # Boost for high text density
+                            score += min(int(text_density * 30), 30)  # Max 30 points for text density
+
+                            # Boost for certain attributes and classes
+                            content_indicators = ['content', 'article', 'story', 'post', 'text', 'body', 'entry']
+
+                            # Check class and id attributes
+                            for attr in ['class', 'id']:
+                                attr_val = block.get(attr, '')
+                                if attr_val:
+                                    if isinstance(attr_val, list):
+                                        attr_val = ' '.join(attr_val)
+                                    for indicator in content_indicators:
+                                        if indicator in attr_val.lower():
+                                            score += 30
+                                            break
+
+                            # Penalty for boilerplate indicators
+                            boilerplate_indicators = ['sidebar', 'menu', 'nav', 'banner', 'ad', 'footer', 'header', 'comment', 'share', 'related']
+                            for attr in ['class', 'id']:
+                                attr_val = block.get(attr, '')
+                                if attr_val:
+                                    if isinstance(attr_val, list):
+                                        attr_val = ' '.join(attr_val)
+                                    for indicator in boilerplate_indicators:
+                                        if indicator in attr_val.lower():
+                                            score -= 50
+                                            break
+
+                            # Add to content blocks if score is positive
+                            if score > 0:
+                                content_blocks.append({
+                                    'element': block,
+                                    'score': score,
+                                    'char_count': char_count,
+                                    'text': text
+                                })
+
+                    # Sort content blocks by score
+                    if content_blocks:
+                        content_blocks.sort(key=lambda x: x['score'], reverse=True)
+                        best_block = content_blocks[0]['element']
+
+                        # Clean up the best block
+                        for unwanted in best_block.find_all(['aside', 'nav', 'footer', 'header']):
+                            unwanted.decompose()
+
+                        extracted_content = best_block.get_text(separator=' ', strip=True)
 
-
-
-
-
+                # ---------- PARAGRAPH EXTRACTION FALLBACK ----------
+                if not extracted_content:
+                    # Get all paragraphs with substantial content
+                    paragraphs = []
+                    for p in processed_soup.find_all('p'):
+                        text = p.get_text(strip=True)
+                        if len(text) > 40:  # Only consider substantial paragraphs
+                            # Calculate link density
+                            link_density = calculate_link_density(p)
+                            if link_density < 0.25:  # Skip if too many links
+                                paragraphs.append(text)
+
+                    if paragraphs:
+                        extracted_content = ' '.join(paragraphs)
 
+                # If we have content, clean it up
+                if extracted_content:
+                    # Clean whitespace
+                    extracted_content = re.sub(r'\s+', ' ', extracted_content).strip()
+
+                    # Remove URLs
+                    extracted_content = re.sub(r'https?://\S+', '', extracted_content)
+
+                    # Remove email addresses
+                    extracted_content = re.sub(r'\S+@\S+', '', extracted_content)
+
+                    # Remove social media handles
+                    extracted_content = re.sub(r'@\w+', '', extracted_content)
+
+                    # Replace multiple spaces with single space
+                    extracted_content = re.sub(r' +', ' ', extracted_content)
+
+                    # Normalize quotes and apostrophes
+                    extracted_content = extracted_content.replace('“', '"').replace('”', '"')
+                    extracted_content = extracted_content.replace('‘', "'").replace('’', "'")
+
+                    # Remove any remaining HTML entities
+                    extracted_content = re.sub(r'&[a-zA-Z]+;', ' ', extracted_content)
+
+                    # Remove short lines that are likely navigation/menu items
+                    lines = extracted_content.split('\n')
+                    extracted_content = ' '.join([line for line in lines if len(line) > 40 or '.' in line])
+
+                    # Combine with meta description if available and content is short
+                    if meta_description and len(extracted_content) < 500:
+                        extracted_content = meta_description + " " + extracted_content
+
+                    # Truncate if needed
+                    if len(extracted_content) > max_chars:
+                        # Try to break at a sentence boundary
+                        cutoff_point = max_chars
+                        for i in range(max_chars - 1, max_chars - 300, -1):
+                            if i < len(extracted_content) and extracted_content[i] in ['.', '!', '?']:
+                                cutoff_point = i + 1
+                                break
+
+                        extracted_content = extracted_content[:cutoff_point]
+
+                    return extracted_content
+                else:
+                    # Return meta description if nothing else was found
+                    if meta_description:
+                        return meta_description
+
+                    logger.error(f"No content extracted from {url}")
+                    return None
             else:
                 logger.error(f"Request to {url} returned status code {response.status_code}")
-
-        except ImportError:
-            logger.error("Trafilatura not installed. Install with 'pip install trafilatura'")
-            # Try direct requests only
-            try:
-                response = requests.get(url, headers=headers, timeout=10)
-                if response.status_code == 200:
-                    # Very basic HTML cleaning
-                    html_content = response.text
-                    text = re.sub(r'<[^>]+>', ' ', html_content)
-                    text = re.sub(r'\s+', ' ', text).strip()
-
-                    if text:
-                        if len(text) > max_chars:
-                            text = text[:max_chars] + "..."
-                        return text
-            except Exception as req_error:
-                logger.error(f"Direct request fallback failed: {str(req_error)}")
+                return None
 
         except Exception as e:
-            logger.error(f"Error extracting content with
-            # Try
+            logger.error(f"Error extracting content with hybrid approach: {str(e)}")
+            # Try a basic fallback
             try:
                 response = requests.get(url, headers=headers, timeout=10)
                 if response.status_code == 200:
-
-                    text
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    # Just get the text without images, scripts, styles, etc.
+                    for tag in soup(['script', 'style', 'img', 'nav', 'footer', 'header']):
+                        tag.decompose()
+                    text = soup.get_text(separator=' ', strip=True)
                     text = re.sub(r'\s+', ' ', text).strip()
 
                     if text:
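The block-scoring heuristic above is easiest to read with numbers plugged in. A worked example with invented metrics for a single candidate block, using the same weights and caps as the code:

```python
# Invented metrics for one hypothetical <article class="article-content"> block
p_count = 12         # paragraphs found in the block
p_density = 0.85     # share of the block's text that sits inside <p> tags
link_density = 0.10  # share of the text that is link text
text_density = 0.60  # text-to-HTML ratio

score = 0
score += min(p_count * 5, 50)             # +50 (capped at 50)
score += min(int(p_density * 100), 50)    # +50 (capped at 50)
score -= int(link_density * 100)          # -10
score += min(int(text_density * 30), 30)  # +18
score += 30                               # class name contains a content indicator ("article", "content")
print(score)  # 138
```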
@@ -164,14 +458,40 @@ def extract_article_content(url: str, max_chars: int = 2000) -> Optional[str]:
                            text = text[:max_chars] + "..."
                        return text
            except Exception as req_error:
-                logger.error(f"
+                logger.error(f"Basic fallback failed: {str(req_error)}")
 
            return None
     except Exception as e:
         logger.error(f"Error extracting content from {url}: {str(e)}")
         return None
 
-def
+def calculate_link_density(element):
+    """
+    Calculate the ratio of link text to all text in an element.
+    Used to identify navigation-heavy areas.
+
+    Args:
+        element: BeautifulSoup element
+
+    Returns:
+        Float between 0 and 1 indicating link density
+    """
+    try:
+        if element is None:
+            return 0
+
+        text_length = len(element.get_text(strip=True))
+        if text_length == 0:
+            return 0
+
+        links = element.find_all('a')
+        link_text_length = sum(len(a.get_text(strip=True)) for a in links)
+
+        return link_text_length / text_length
+    except Exception:
+        return 0
+
+def get_web_search_results(query: str, max_results: int = 5, max_chars_per_result: int = 5000) -> Dict[str, Any]:
     """
     Get formatted web search results ready to be included in AI prompts.
 
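`calculate_link_density` is the filter both the block scorer and the paragraph fallback rely on to drop navigation-heavy markup. A small sketch of the kind of values it returns (the HTML fragments are invented for illustration):

```python
from bs4 import BeautifulSoup
from ngpt.utils.web_search import calculate_link_density

nav = BeautifulSoup('<div><a href="/">Home</a> <a href="/about">About</a></div>', 'html.parser').div
para = BeautifulSoup('<p>A long explanatory sentence with a single <a href="#">link</a> in it.</p>', 'html.parser').p

print(calculate_link_density(nav))   # 1.0 -> treated as navigation and skipped
print(calculate_link_density(para))  # small (≈0.08) -> kept as content
```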
{ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ngpt
-Version: 3.
+Version: 3.5.1
 Summary: Swiss army knife for LLMs: powerful CLI and interactive chatbot in one package. Seamlessly work with OpenAI, Ollama, Groq, Claude, Gemini, or any OpenAI-compatible API to generate code, craft git commits, rewrite text, and execute shell commands.
 Project-URL: Homepage, https://github.com/nazdridoy/ngpt
 Project-URL: Repository, https://github.com/nazdridoy/ngpt
@@ -28,12 +28,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Utilities
 Requires-Python: >=3.8
-Requires-Dist:
+Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: prompt-toolkit>=3.0.0
 Requires-Dist: pyperclip>=1.8.0
 Requires-Dist: requests>=2.31.0
 Requires-Dist: rich>=10.0.0
-Requires-Dist: trafilatura>=1.6.0
 Description-Content-Type: text/markdown
 
 # nGPT
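For existing installs, the dependency swap is picked up automatically on upgrade; pip resolves the new requirement on its own (a sketch — exact resolver output varies):

```bash
pip install --upgrade ngpt   # now pulls in beautifulsoup4>=4.12.0; trafilatura is no longer required
```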
@@ -62,10 +61,10 @@ Description-Content-Type: text/markdown
 
 - ✅ **Versatile**: Powerful and easy-to-use CLI tool for various AI tasks
 - 🪶 **Lightweight**: Minimal dependencies with everything you need included
-- 🔄 **API Flexibility**: Works with OpenAI, Ollama, Groq, Claude, Gemini, and any compatible endpoint
+- 🔄 **API Flexibility**: Works with OpenAI, Ollama, Groq, Claude, Gemini, and any OpenAI-compatible endpoint
 - 💬 **Interactive Chat**: Continuous conversation with memory in modern UI
 - 📊 **Streaming Responses**: Real-time output for better user experience
-- 🔍 **Web Search**: Enhance any model with contextual information from the web
+- 🔍 **Web Search**: Enhance any model with contextual information from the web, using advanced content extraction to identify the most relevant information from web pages
 - 📥 **Stdin Processing**: Process piped content by using `{}` placeholder in prompts
 - 🎨 **Markdown Rendering**: Beautiful formatting of markdown and code with syntax highlighting
 - ⚡ **Real-time Markdown**: Stream responses with live updating syntax highlighting and formatting
@@ -85,37 +84,47 @@ See the [Feature Overview](https://nazdridoy.github.io/ngpt/overview/) for more
 
 
 ## Table of Contents
-- [Quick Start](#quick-start)
 - [Features](#features)
-- [Documentation](#documentation)
 - [Installation](#installation)
+- [Quick Start](#quick-start)
 - [Usage](#usage)
+- [Command Line Options](#command-line-options)
 - [Documentation](https://nazdridoy.github.io/ngpt/)
-
+- [Documentation](#documentation)
 - [Configuration](#configuration)
-- [
+- [API Key Setup](#api-key-setup)
+- [OpenAI API Key](#openai-api-key)
+- [Google Gemini API Key](#google-gemini-api-key)
 - [CLI Configuration](#cli-configuration)
 - [Interactive Configuration](#interactive-configuration)
 - [Configuration File](#configuration-file)
 - [Configuration Priority](#configuration-priority)
-- [API Key Setup](#api-key-setup)
-- [OpenAI API Key](#openai-api-key)
-- [Google Gemini API Key](#google-gemini-api-key)
 - [Contributing](#contributing)
 - [License](#license)
 
-##
+## Installation
 
 ```bash
-#
+# Installation with pip
 pip install ngpt
 
-# Or install with uv (faster)
+# Or install with uv (faster installation)
 uv pip install ngpt
 
-# Or install globally as a CLI tool (recommended)
+# Or install globally as a CLI tool (recommended for command-line usage)
 uv tool install ngpt
 
+# Arch Linux: install from AUR
+paru -S ngpt
+```
+
+Requires Python 3.8 or newer.
+
+For detailed installation instructions, see the [Installation Guide](https://nazdridoy.github.io/ngpt/installation/).
+
+## Quick Start
+
+```bash
 # Chat with default settings
 ngpt "Tell me about quantum computing"
 
@@ -200,58 +209,6 @@ ngpt --provider Groq "Explain quantum computing"
 # Compare outputs from different providers
 ngpt --provider OpenAI "Explain quantum physics" > openai_response.txt
 ngpt --provider Ollama "Explain quantum physics" > ollama_response.txt
-```
-
-For more examples and detailed usage, visit the [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/).
-
-## Documentation
-
-Comprehensive documentation, including usage guides and examples, is available at:
-
-**[https://nazdridoy.github.io/ngpt/](https://nazdridoy.github.io/ngpt/)**
-
-Key documentation sections:
-- [Installation Guide](https://nazdridoy.github.io/ngpt/installation/)
-- [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/)
-- [Configuration Guide](https://nazdridoy.github.io/ngpt/configuration/)
-- [Examples & Tutorials](https://nazdridoy.github.io/ngpt/examples/basic/)
-
-## Installation
-
-```bash
-# Installation with pip
-pip install ngpt
-
-# Or install with uv (faster installation)
-uv pip install ngpt
-
-# Or install globally as a CLI tool (recommended for command-line usage)
-uv tool install ngpt
-
-# Arch Linux: install from AUR
-paru -S ngpt
-```
-
-Requires Python 3.8 or newer.
-
-For detailed installation instructions, see the [Installation Guide](https://nazdridoy.github.io/ngpt/installation/).
-
-## Usage
-
-### As a CLI Tool
-
-```bash
-# Basic chat (default mode)
-ngpt "Hello, how are you?"
-
-# Interactive chat session with conversation history
-ngpt -i
-
-# Log conversation to a file
-ngpt --interactive --log conversation.log
-
-# Use custom system prompt to guide AI behavior
-ngpt --preprompt "You are a Python programming tutor" "Explain decorators"
 
 # Show all API configurations
 ngpt --show-config --all
@@ -277,47 +234,16 @@ ngpt -s "list all files in current directory"
 # On Windows generates: dir
 # On Linux/macOS generates: ls -la
 
-# Generate
-# Returns only code without markdown formatting or explanations
+# Generate code (using -c or --code flag)
 ngpt -c "create a python function that calculates fibonacci numbers"
 
 # Use multiline text editor for complex prompts (using -t or --text flag)
-# Opens an interactive editor with syntax highlighting and intuitive controls
 ngpt -t
 ```
 
-For more
-
-## Configuration
-
-### API Key Setup
-
-#### OpenAI API Key
-1. Create an account at [OpenAI](https://platform.openai.com/)
-2. Navigate to API keys: https://platform.openai.com/api-keys
-3. Click "Create new secret key" and copy your API key
-4. Configure nGPT with your key:
-```bash
-ngpt --config
-# Enter provider: OpenAI
-# Enter API key: your-openai-api-key
-# Enter base URL: https://api.openai.com/v1/
-# Enter model: gpt-3.5-turbo (or other model)
-```
+For more examples and detailed usage, visit the [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/).
 
-
-1. Create or use an existing Google account
-2. Go to [Google AI Studio](https://aistudio.google.com/)
-3. Navigate to API keys in the left sidebar (or visit https://aistudio.google.com/app/apikey)
-4. Create an API key and copy it
-5. Configure nGPT with your key:
-```bash
-ngpt --config
-# Enter provider: Gemini
-# Enter API key: your-gemini-api-key
-# Enter base URL: https://generativelanguage.googleapis.com/v1beta/openai
-# Enter model: gemini-2.0-flash
-```
+## Usage
 
 ### Command Line Options
 
@@ -326,8 +252,8 @@ For more CLI examples and detailed usage information, see the [CLI Usage Guide](
 usage: ngpt [-h] [-v] [--language LANGUAGE] [--config [CONFIG]] [--config-index CONFIG_INDEX] [--provider PROVIDER]
 [--remove] [--show-config] [--all] [--list-models] [--list-renderers] [--cli-config [COMMAND ...]]
 [--api-key API_KEY] [--base-url BASE_URL] [--model MODEL] [--web-search] [--temperature TEMPERATURE]
-[--top_p TOP_P] [--max_tokens MAX_TOKENS] [--log [FILE]] [--preprompt PREPROMPT] [--no-stream
-
+[--top_p TOP_P] [--max_tokens MAX_TOKENS] [--log [FILE]] [--preprompt PREPROMPT] [--no-stream | --prettify |
+--stream-prettify] [--renderer {auto,rich,glow}] [--rec-chunk] [--diff [FILE]] [--chunk-size CHUNK_SIZE]
 [--analyses-chunk-size ANALYSES_CHUNK_SIZE] [--max-msg-lines MAX_MSG_LINES]
 [--max-recursion-depth MAX_RECURSION_DEPTH] [-i | -s | -c | -t | -p | -r | -g]
 [prompt]
@@ -346,12 +272,10 @@ options::
 
 Configuration Options::
 
---config [CONFIG] Path to a custom config file or, if no value provided, enter interactive
-configuration mode to create a new config
+--config [CONFIG] Path to a custom config file or, if no value provided, enter interactive configuration mode to create a new config
 --config-index CONFIG_INDEX Index of the configuration to use or edit (default: 0)
 --provider PROVIDER Provider name to identify the configuration to use
---remove Remove the configuration at the specified index (requires --config and
---config-index or --provider)
+--remove Remove the configuration at the specified index (requires --config and --config-index or --provider)
 --show-config Show the current configuration(s) and exit
 --all Show details for all configurations (requires --show-config)
 --list-models List all available models for the current configuration and exit
@@ -363,30 +287,28 @@ Global Options::
 --api-key API_KEY API key for the service
 --base-url BASE_URL Base URL for the API
 --model MODEL Model to use
---web-search Enable web search capability
-feature)
+--web-search Enable web search capability using DuckDuckGo to enhance prompts with relevant information
 --temperature TEMPERATURE Set temperature (controls randomness, default: 0.7)
 --top_p TOP_P Set top_p (controls diversity, default: 1.0)
 --max_tokens MAX_TOKENS Set max response length in tokens
---log [FILE] Set filepath to log conversation to, or create a temporary log file if no path
-provided
+--log [FILE] Set filepath to log conversation to, or create a temporary log file if no path provided
 --preprompt PREPROMPT Set custom system prompt to control AI behavior
---
-
-
-
+--renderer {auto,rich,glow} Select which markdown renderer to use with --prettify or --stream-prettify (auto, rich, or glow)
+
+Output Display Options (mutually exclusive)::
+
+--no-stream Return the whole response without streaming or formatting
+--prettify Render complete response with markdown and code formatting (non-streaming)
+--stream-prettify Stream response with real-time markdown rendering (default)
 
 Git Commit Message Options::
 
 --rec-chunk Process large diffs in chunks with recursive analysis if needed
---diff [FILE] Use diff from specified file instead of staged changes. If used without a path,
-uses the path from CLI config.
+--diff [FILE] Use diff from specified file instead of staged changes. If used without a path, uses the path from CLI config.
 --chunk-size CHUNK_SIZE Number of lines per chunk when chunking is enabled (default: 200)
---analyses-chunk-size ANALYSES_CHUNK_SIZE
-Number of lines per chunk when recursively chunking analyses (default: 200)
+--analyses-chunk-size ANALYSES_CHUNK_SIZE Number of lines per chunk when recursively chunking analyses (default: 200)
 --max-msg-lines MAX_MSG_LINES Maximum number of lines in commit message before condensing (default: 20)
---max-recursion-depth MAX_RECURSION_DEPTH
-Maximum recursion depth for commit message condensing (default: 3)
+--max-recursion-depth MAX_RECURSION_DEPTH Maximum recursion depth for commit message condensing (default: 3)
 
 Modes (mutually exclusive)::
 
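The reworded `--web-search` help text reflects the DuckDuckGo-backed pipeline in `web_search.py` above. A typical invocation, in the same style as the README's other examples (output depends on the configured provider and on live search results):

```bash
# Enrich the prompt with search results before querying the model
ngpt --web-search "What's new in the latest Python release?"
```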
@@ -394,15 +316,62 @@ Modes (mutually exclusive)::
 -s, --shell Generate and execute shell commands
 -c, --code Generate code
 -t, --text Enter multi-line text input (submit with Ctrl+D)
--p, --pipe Read from stdin and use content with prompt. Use {} in prompt as placeholder
-for stdin content
+-p, --pipe Read from stdin and use content with prompt. Use {} in prompt as placeholder for stdin content
 -r, --rewrite Rewrite text from stdin to be more natural while preserving tone and meaning
 -g, --gitcommsg Generate AI-powered git commit messages from staged changes or diff file
 ```
 
 > **Note**: For better visualization of conventional commit messages on GitHub, you can use the [GitHub Commit Labels](https://greasyfork.org/en/scripts/526153-github-commit-labels) userscript, which adds colorful labels to your commits.
 
-For a complete reference of all available options, see the [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/).
+For a complete reference of all available options, detailed CLI examples and usage information, see the [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/).
+
+
+## Documentation
+
+Comprehensive documentation, including usage guides and examples, is available at:
+
+**[https://nazdridoy.github.io/ngpt/](https://nazdridoy.github.io/ngpt/)**
+
+Key documentation sections:
+- [Installation Guide](https://nazdridoy.github.io/ngpt/installation/)
+- [CLI Usage Guide](https://nazdridoy.github.io/ngpt/usage/cli_usage/)
+- [Configuration Guide](https://nazdridoy.github.io/ngpt/configuration/)
+- [Examples & Tutorials](https://nazdridoy.github.io/ngpt/examples/basic/)
+- [Git Commit Message Guide](https://nazdridoy.github.io/ngpt/usage/gitcommsg/)
+
+
+## Configuration
+
+### API Key Setup
+
+#### OpenAI API Key
+1. Create an account at [OpenAI](https://platform.openai.com/)
+2. Navigate to API keys: https://platform.openai.com/api-keys
+3. Click "Create new secret key" and copy your API key
+4. Configure nGPT with your key:
+```bash
+ngpt --config
+# Enter provider: OpenAI
+# Enter API key: your-openai-api-key
+# Enter base URL: https://api.openai.com/v1/
+# Enter model: gpt-3.5-turbo (or other model)
+```
+
+#### Google Gemini API Key
+1. Create or use an existing Google account
+2. Go to [Google AI Studio](https://aistudio.google.com/)
+3. Navigate to API keys in the left sidebar (or visit https://aistudio.google.com/app/apikey)
+4. Create an API key and copy it
+5. Configure nGPT with your key:
+```bash
+ngpt --config
+# Enter provider: Gemini
+# Enter API key: your-gemini-api-key
+# Enter base URL: https://generativelanguage.googleapis.com/v1beta/openai
+# Enter model: gemini-2.0-flash
+```
+
+For more detailed information, refer to the [API Key Setup documentation](https://nazdridoy.github.io/ngpt/configuration/#api-key-setup).
 
 ### CLI Configuration
 
{ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/RECORD
CHANGED
@@ -20,9 +20,9 @@ ngpt/utils/__init__.py,sha256=qu_66I1Vtav2f1LDiPn5J3DUsbK7o1CSScMcTkYqxoM,1179
 ngpt/utils/cli_config.py,sha256=Ug8cECBTIuzOwkBWidLTfs-OAdOsCMJ2bNa70pOADfw,11195
 ngpt/utils/config.py,sha256=wsArA4osnh8fKqOvtsPqqBxAz3DpdjtaWUFaRtnUdyc,10452
 ngpt/utils/log.py,sha256=f1jg2iFo35PAmsarH8FVL_62plq4VXH0Mu2QiP6RJGw,15934
-ngpt/utils/web_search.py,sha256=
-ngpt-3.
-ngpt-3.
-ngpt-3.
-ngpt-3.
-ngpt-3.
+ngpt/utils/web_search.py,sha256=TK_c2U8MYM86f9J_oEzi0UZ46JohvyxdjfonHZZZqfY,30718
+ngpt-3.5.1.dist-info/METADATA,sha256=djECxREmLWeO-ugYuco3gi2xEJqaGuFpjJopqa3veLI,23886
+ngpt-3.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ngpt-3.5.1.dist-info/entry_points.txt,sha256=SqAAvLhMrsEpkIr4YFRdUeyuXQ9o0IBCeYgE6AVojoI,44
+ngpt-3.5.1.dist-info/licenses/LICENSE,sha256=mQkpWoADxbHqE0HRefYLJdm7OpdrXBr3vNv5bZ8w72M,1065
+ngpt-3.5.1.dist-info/RECORD,,

{ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/WHEEL
File without changes

{ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/entry_points.txt
File without changes

{ngpt-3.4.5.dist-info → ngpt-3.5.1.dist-info}/licenses/LICENSE
File without changes