ai_url_aggregator-0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_url_aggregator-0.1/LICENSE +21 -0
- ai_url_aggregator-0.1/PKG-INFO +137 -0
- ai_url_aggregator-0.1/README.md +120 -0
- ai_url_aggregator-0.1/ai_url_aggregator/__init__.py +210 -0
- ai_url_aggregator-0.1/ai_url_aggregator.egg-info/PKG-INFO +137 -0
- ai_url_aggregator-0.1/ai_url_aggregator.egg-info/SOURCES.txt +9 -0
- ai_url_aggregator-0.1/ai_url_aggregator.egg-info/dependency_links.txt +1 -0
- ai_url_aggregator-0.1/ai_url_aggregator.egg-info/requires.txt +2 -0
- ai_url_aggregator-0.1/ai_url_aggregator.egg-info/top_level.txt +1 -0
- ai_url_aggregator-0.1/pyproject.toml +31 -0
- ai_url_aggregator-0.1/setup.cfg +4 -0
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Carlos A. Planchón

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,137 @@
Metadata-Version: 2.2
Name: ai_url_aggregator
Version: 0.1
Summary: Extracts all text results from an XPath query on a parsel Selector object.
Author-email: "Carlos A. Planchón" <carlosandresplanchonprestes@gmail.com>
License: MIT License
Project-URL: repository, https://github.com/carlosplanchon/ai_url_aggregator.git
Keywords: openai,url,research,grabber
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Build Tools
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.13
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: openai
Requires-Dist: requests

# ai_url_aggregator

> **Note**: *This is a small experimental library, provided as-is.*

**ai_url_aggregator** is a Python tool that leverages **Perplexity** and **OpenAI** to search the internet for relevant URLs, filter and deduplicate them, check their availability, and then select the most important ones based on GPT analysis.

---

## Features

1. **Search Across Models**
   Uses Perplexity’s `sonar-reasoning` model to query the internet for URLs related to your prompt.
2. **Clean & Filter**
   - Prefers `https://` links when both `http://` and `https://` are found for the same domain.
   - Removes duplicates by collecting results into a `set`.
3. **Online Check**
   - Verifies each URL’s availability (status codes `200` or `403`) using `requests`.
4. **Relevance Ranking**
   - Uses an OpenAI model to select the most important websites from the deduplicated list of online URLs.

---

## Installation

### 1. Install via PyPI

```bash
pip install ai_url_aggregator
```

### 2. Set Environment Variables

You must provide your **Perplexity** and **OpenAI** API keys:

```bash
export PERPLEXITY_API_KEY="PERPLEXITY_API_KEY"
export OPENAI_API_KEY="OPENAI_API_KEY"
```

Replace `"PERPLEXITY_API_KEY"` and `"OPENAI_API_KEY"` with your actual API keys.

### 3. (Optional) Install from Source

1. **Clone or Download** this repository.
2. **Install Dependencies**:
   ```bash
   pip install -r requirements.txt
   ```
   This ensures all required libraries (like `openai`, `requests`, etc.) are installed.

---

## How It Works

1. **`query_models(query: str) -> list[str]`**
   - Sends a query to Perplexity’s `sonar-reasoning` model.
   - Parses the Perplexity output with an OpenAI model into a structured list of URLs.

2. **`keep_https(urls: list[str]) -> list[str]`**
   - Selects the `https://` version of a URL when both protocols appear, otherwise keeps the `http://` one.

3. **`execute_query_multiple_times(query: str, num_runs: int) -> list[str]`**
   - Runs the query multiple times to gather more URLs.
   - Deduplicates results using a `set`.

4. **`check_urls_online(urls: list[str]) -> list[str]`**
   - Pings each URL to see if it’s reachable (status `200` or `403`).

5. **`search_for_web_urls(query: str, num_runs: int) -> list[str]`**
   - Brings all the above together:
     1. Executes a query multiple times.
     2. Prefers HTTPS versions of each domain.
     3. Verifies URL reachability.
     4. Returns a final list of online, deduplicated URLs.

6. **`get_top_relevant_websites(website_urls: list[str]) -> list[Website]`**
   - Uses an OpenAI model to select the most relevant (important) websites from the final list of URLs.

---

## Usage Example

Once installed and your environment variables are set, you can run the example below (it also uses the optional `prettyprinter` package for nicer output):

```python
import prettyprinter
from ai_url_aggregator import (
    search_for_web_urls,
    get_top_relevant_websites
)

# Optional: install prettyprinter extras for nicer output
prettyprinter.install_extras()

# Example query:
query = "Give me a list of all the real estate agencies in Uruguay."

# Step 1: Get a cleaned, deduplicated, and verified list of URLs
online_urls = search_for_web_urls(query=query)

print("--- Online URLs ---")
prettyprinter.cpprint(online_urls)

# Step 2: Get the most important websites from the final list
most_important_websites = get_top_relevant_websites(website_urls=online_urls)

print("--- Most Important Websites ---")
prettyprinter.cpprint(most_important_websites)
```

---

## License

This project is distributed under the **MIT License**. See `LICENSE` for more information.

---

All suggestions and improvements are welcome!
@@ -0,0 +1,120 @@
# ai_url_aggregator

> **Note**: *This is a small experimental library, provided as-is.*

**ai_url_aggregator** is a Python tool that leverages **Perplexity** and **OpenAI** to search the internet for relevant URLs, filter and deduplicate them, check their availability, and then select the most important ones based on GPT analysis.

---

## Features

1. **Search Across Models**
   Uses Perplexity’s `sonar-reasoning` model to query the internet for URLs related to your prompt.
2. **Clean & Filter**
   - Prefers `https://` links when both `http://` and `https://` are found for the same domain.
   - Removes duplicates by collecting results into a `set`.
3. **Online Check**
   - Verifies each URL’s availability (status codes `200` or `403`) using `requests`.
4. **Relevance Ranking**
   - Uses an OpenAI model to select the most important websites from the deduplicated list of online URLs.

---

## Installation

### 1. Install via PyPI

```bash
pip install ai_url_aggregator
```

### 2. Set Environment Variables

You must provide your **Perplexity** and **OpenAI** API keys:

```bash
export PERPLEXITY_API_KEY="PERPLEXITY_API_KEY"
export OPENAI_API_KEY="OPENAI_API_KEY"
```

Replace `"PERPLEXITY_API_KEY"` and `"OPENAI_API_KEY"` with your actual API keys.

### 3. (Optional) Install from Source

1. **Clone or Download** this repository.
2. **Install Dependencies**:
   ```bash
   pip install -r requirements.txt
   ```
   This ensures all required libraries (like `openai`, `requests`, etc.) are installed.

---

## How It Works

1. **`query_models(query: str) -> list[str]`**
   - Sends a query to Perplexity’s `sonar-reasoning` model.
   - Parses the Perplexity output with an OpenAI model into a structured list of URLs.

2. **`keep_https(urls: list[str]) -> list[str]`**
   - Selects the `https://` version of a URL when both protocols appear, otherwise keeps the `http://` one (see the sketch after this list).

3. **`execute_query_multiple_times(query: str, num_runs: int) -> list[str]`**
   - Runs the query multiple times to gather more URLs.
   - Deduplicates results using a `set`.

4. **`check_urls_online(urls: list[str]) -> list[str]`**
   - Pings each URL to see if it’s reachable (status `200` or `403`).

5. **`search_for_web_urls(query: str, num_runs: int) -> list[str]`**
   - Brings all the above together:
     1. Executes a query multiple times.
     2. Prefers HTTPS versions of each domain.
     3. Verifies URL reachability.
     4. Returns a final list of online, deduplicated URLs.

6. **`get_top_relevant_websites(website_urls: list[str]) -> list[Website]`**
   - Uses an OpenAI model to select the most relevant (important) websites from the final list of URLs.
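
To make the HTTPS preference concrete, here is a small sketch with made-up URLs. Note that `keep_https` keys on everything after `//`, so the same domain with different paths is kept more than once:

```python
# Importing from the package reads the API key environment variables,
# so they must be set first (see the tip above).
from ai_url_aggregator import keep_https

# Hypothetical input mixing http:// and https:// duplicates of one domain.
urls = [
    "http://example.com",
    "https://example.com",
    "http://only-http.example.org",
]

print(keep_https(urls=urls))
# ['https://example.com', 'http://only-http.example.org']
```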

---

## Usage Example

Once installed and your environment variables are set, you can run the example below (it also uses the optional `prettyprinter` package for nicer output):

```python
import prettyprinter
from ai_url_aggregator import (
    search_for_web_urls,
    get_top_relevant_websites
)

# Optional: install prettyprinter extras for nicer output
prettyprinter.install_extras()

# Example query:
query = "Give me a list of all the real estate agencies in Uruguay."

# Step 1: Get a cleaned, deduplicated, and verified list of URLs
online_urls = search_for_web_urls(query=query)

print("--- Online URLs ---")
prettyprinter.cpprint(online_urls)

# Step 2: Get the most important websites from the final list
most_important_websites = get_top_relevant_websites(website_urls=online_urls)

print("--- Most Important Websites ---")
prettyprinter.cpprint(most_important_websites)
```
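
`search_for_web_urls` accepts a `num_runs` parameter (default `5` in the source) that controls how many times the Perplexity query is repeated before deduplication, and `get_top_relevant_websites` returns pydantic `Website` objects rather than plain strings. A small sketch building on the example above:

```python
# Fewer runs means fewer Perplexity/OpenAI calls, but usually fewer unique URLs.
online_urls = search_for_web_urls(query=query, num_runs=3)

# Each result is a `Website` model with `name` and `url` fields.
for site in get_top_relevant_websites(website_urls=online_urls):
    print(f"{site.name}: {site.url}")
```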

---

## License

This project is distributed under the **MIT License**. See `LICENSE` for more information.

---

All suggestions and improvements are welcome!
@@ -0,0 +1,210 @@
#!/usr/bin/env python3

from openai import OpenAI

import os
import json

from pydantic import BaseModel
from pydantic import Field
from typing import List


import requests


PERPLEXITY_API_KEY = os.environ["PERPLEXITY_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]


perplexity_client = OpenAI(
    api_key=PERPLEXITY_API_KEY,
    base_url="https://api.perplexity.ai"
)

openai_client = OpenAI(
    api_key=OPENAI_API_KEY
)


class UrlList(BaseModel):
    urls: List[str]


def query_models(query: str) -> list[str]:
    perplexity_messages = [
        {
            "role": "user",
            "content": (
                query
            ),
        },
    ]

    # MODEL:
    # https://sonar.perplexity.ai/
    # Chat completion without streaming.
    response = perplexity_client.chat.completions.create(
        # model="sonar-pro",
        # sonar-reasoning is Perplexity's US-hosted reasoning model
        # (based on DeepSeek R1).
        model="sonar-reasoning",
        messages=perplexity_messages,
    )
    # print(response)

    response_content: str = response.choices[0].message.content
    print(response_content)

    openai_messages = [
        {
            "role": "user",
            "content": (
                response_content
            ),
        },
    ]

    url_list_response = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=openai_messages,
        response_format=UrlList,
        temperature=0
    )

    url_list: list[str] = url_list_response.choices[0].message.parsed.urls

    return url_list


def keep_https(urls: list[str]) -> list[str]:
    # Create a dictionary to store the HTTPS version of each domain
    https_urls = {}

    for url in urls:
        try:
            # Extract the domain (without protocol)
            domain = url.split('//')[1]

            # If the URL is HTTPS, store it in the dictionary
            if url.startswith('https://'):
                https_urls[domain] = url

            # If the URL is HTTP and no HTTPS version has been stored yet,
            # store the HTTP version.
            elif url.startswith('http://') and domain not in https_urls:
                https_urls[domain] = url

        except Exception:
            ...

    # Return the values (URLs) from the dictionary
    return list(https_urls.values())


def execute_query_multiple_times(
    query: str,
    num_runs: int = 5
) -> list[str]:
    print("--- GET UNIQUE URLS ---")
    # Initialize an empty set to store unique URLs
    unique_urls = set()

    # Execute the function `num_runs` times
    for i in range(num_runs):
        print(f"i: {i}")
        # Get the URLs from the function
        urls: list[str] = query_models(query=query)

        urls = [url.rstrip("/") for url in urls]

        # Add the URLs to the set (duplicates will be automatically ignored)
        unique_urls.update(urls)

    # Convert the set back to a list if needed
    unique_urls_list = list(unique_urls)

    # Return the list of unique URLs
    return unique_urls_list


def check_urls_online(urls: list[str]) -> list[str]:
    online_urls = []

    for url in urls:
        try:
            # Set a timeout to avoid waiting too long.
            response = requests.head(url, timeout=5)
            if response.status_code in [200, 403]:
                online_urls.append(url)
            else:
                ...
        except requests.exceptions.RequestException:
            # Fallback to GET if HEAD fails
            try:
                response = requests.get(url, timeout=5)
                if response.status_code in [200, 403]:
                    online_urls.append(url)
            except Exception:
                ...
    return online_urls


def search_for_web_urls(
    query: str,
    num_runs: int = 5
) -> list[str]:
    query_response: list[str] = execute_query_multiple_times(
        query=query,
        num_runs=num_runs
    )
    prefer_https_urls = keep_https(
        urls=query_response
    )
    online_urls: list[str] = check_urls_online(
        urls=prefer_https_urls
    )

    return online_urls


class Website(BaseModel):
    name: str = Field(description="The name of the website.")
    url: str = Field(description="The url of the website.")


class WebsiteList(BaseModel):
    websites_list: list[Website] = Field(description="A short list.")


def get_top_relevant_websites(website_urls: list[str]) -> list[Website]:
    websites_list_dump = json.dumps(website_urls)

    # Get important websites with GPT.
    openai_messages = [
        {
            "role": "user",
            "content": (
                websites_list_dump
            ),
        },
        {
            "role": "system",
            "content": (
                "Select the most important websites from this JSON. Return a short list."
            ),
        }
    ]

    important_websites_filtered_by_gpt = \
        openai_client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=openai_messages,
            response_format=WebsiteList,
            temperature=0
        )

    parsed_websites_list: list[Website] = \
        important_websites_filtered_by_gpt.choices[
            0].message.parsed.websites_list

    return parsed_websites_list
@@ -0,0 +1,137 @@
Metadata-Version: 2.2
Name: ai_url_aggregator
Version: 0.1
Summary: Extracts all text results from an XPath query on a parsel Selector object.
Author-email: "Carlos A. Planchón" <carlosandresplanchonprestes@gmail.com>
License: MIT License
Project-URL: repository, https://github.com/carlosplanchon/ai_url_aggregator.git
Keywords: openai,url,research,grabber
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Build Tools
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.13
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: openai
Requires-Dist: requests

# ai_url_aggregator

> **Note**: *This is a small experimental library, provided as-is.*

**ai_url_aggregator** is a Python tool that leverages **Perplexity** and **OpenAI** to search the internet for relevant URLs, filter and deduplicate them, check their availability, and then select the most important ones based on GPT analysis.

---

## Features

1. **Search Across Models**
   Uses Perplexity’s `sonar-reasoning` model to query the internet for URLs related to your prompt.
2. **Clean & Filter**
   - Prefers `https://` links when both `http://` and `https://` are found for the same domain.
   - Removes duplicates by collecting results into a `set`.
3. **Online Check**
   - Verifies each URL’s availability (status codes `200` or `403`) using `requests`.
4. **Relevance Ranking**
   - Uses an OpenAI model to select the most important websites from the deduplicated list of online URLs.

---

## Installation

### 1. Install via PyPI

```bash
pip install ai_url_aggregator
```

### 2. Set Environment Variables

You must provide your **Perplexity** and **OpenAI** API keys:

```bash
export PERPLEXITY_API_KEY="PERPLEXITY_API_KEY"
export OPENAI_API_KEY="OPENAI_API_KEY"
```

Replace `"PERPLEXITY_API_KEY"` and `"OPENAI_API_KEY"` with your actual API keys.

### 3. (Optional) Install from Source

1. **Clone or Download** this repository.
2. **Install Dependencies**:
   ```bash
   pip install -r requirements.txt
   ```
   This ensures all required libraries (like `openai`, `requests`, etc.) are installed.

---

## How It Works

1. **`query_models(query: str) -> list[str]`**
   - Sends a query to Perplexity’s `sonar-reasoning` model.
   - Parses the Perplexity output with an OpenAI model into a structured list of URLs.

2. **`keep_https(urls: list[str]) -> list[str]`**
   - Selects the `https://` version of a URL when both protocols appear, otherwise keeps the `http://` one.

3. **`execute_query_multiple_times(query: str, num_runs: int) -> list[str]`**
   - Runs the query multiple times to gather more URLs.
   - Deduplicates results using a `set`.

4. **`check_urls_online(urls: list[str]) -> list[str]`**
   - Pings each URL to see if it’s reachable (status `200` or `403`).

5. **`search_for_web_urls(query: str, num_runs: int) -> list[str]`**
   - Brings all the above together:
     1. Executes a query multiple times.
     2. Prefers HTTPS versions of each domain.
     3. Verifies URL reachability.
     4. Returns a final list of online, deduplicated URLs.

6. **`get_top_relevant_websites(website_urls: list[str]) -> list[Website]`**
   - Uses an OpenAI model to select the most relevant (important) websites from the final list of URLs.

---

## Usage Example

Once installed and your environment variables are set, you can run the example below (it also uses the optional `prettyprinter` package for nicer output):

```python
import prettyprinter
from ai_url_aggregator import (
    search_for_web_urls,
    get_top_relevant_websites
)

# Optional: install prettyprinter extras for nicer output
prettyprinter.install_extras()

# Example query:
query = "Give me a list of all the real estate agencies in Uruguay."

# Step 1: Get a cleaned, deduplicated, and verified list of URLs
online_urls = search_for_web_urls(query=query)

print("--- Online URLs ---")
prettyprinter.cpprint(online_urls)

# Step 2: Get the most important websites from the final list
most_important_websites = get_top_relevant_websites(website_urls=online_urls)

print("--- Most Important Websites ---")
prettyprinter.cpprint(most_important_websites)
```

---

## License

This project is distributed under the **MIT License**. See `LICENSE` for more information.

---

All suggestions and improvements are welcome!
@@ -0,0 +1,9 @@
LICENSE
README.md
pyproject.toml
ai_url_aggregator/__init__.py
ai_url_aggregator.egg-info/PKG-INFO
ai_url_aggregator.egg-info/SOURCES.txt
ai_url_aggregator.egg-info/dependency_links.txt
ai_url_aggregator.egg-info/requires.txt
ai_url_aggregator.egg-info/top_level.txt
@@ -0,0 +1 @@

@@ -0,0 +1 @@
ai_url_aggregator
@@ -0,0 +1,31 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[options]
package_dir = "ai_url_aggregator"
packages = ["ai_url_aggregator"]

[project]
name = "ai_url_aggregator"
version = "0.1"
authors = [
    {name = "Carlos A. Planchón", email = "carlosandresplanchonprestes@gmail.com"}
]
description = "Extracts all text results from an XPath query on a parsel Selector object."
classifiers = [
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Build Tools",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.13",
]
license = {text = "MIT License"}
keywords = ["openai", "url", "research", "grabber"]
readme = "README.md"
dependencies = [
    "openai",
    "requests"
]

[project.urls]
repository = "https://github.com/carlosplanchon/ai_url_aggregator.git"