SmartWebSearch 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 LIN WAI CHON
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: SmartWebSearch
3
+ Version: 1.2.1
4
+ Summary: SmartWebSearch is a Python package that combines the Tavily search API with Retrieval-Augmented Generation (RAG), LLM-powered query expansion, and web content extraction to perform intelligent, deep web searches with automated summarization.
5
+ Author: LIN WAI CHON
6
+ Author-email: jacksonlam.temp@gmail.com
7
+ License-File: LICENSE
8
+ Requires-Dist: requests
9
+ Requires-Dist: bs4
10
+ Requires-Dist: selenium
11
+ Requires-Dist: markdownify
12
+ Requires-Dist: tavily
13
+ Requires-Dist: numpy
14
+ Requires-Dist: sentence_transformers
15
+ Requires-Dist: langchain_text_splitters
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: license-file
19
+ Dynamic: requires-dist
20
+ Dynamic: summary
@@ -0,0 +1,90 @@
1
+ # Smart Web Search Package
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
5
+
6
+ SmartWebSearch is a Python package that combines the Tavily search API with Retrieval-Augmented Generation (RAG), LLM-powered query expansion, and web content extraction to perform intelligent, deep web searches with automated summarization.
7
+
8
+ ## Package Version
9
+ - 1.2.1
10
+
11
+ ## Features
12
+ - 🌐 **Web Search** – Uses Tavily API to fetch relevant search results.
13
+ - 🧠 **Query Expansion** – Leverages LLMs (e.g., DeepSeek) to decompose complex queries and generate auxiliary searches.
14
+ - 📄 **Content Extraction** – Fetches full page content using headless Chrome and filters noise.
15
+ - 🔍 **RAG Pipeline** – Embeds documents with multilingual models (e.g., multilingual-e5-base) and retrieves context-aware chunks.
16
+ - 📝 **Summarization** – Summarizes retrieved content using LLMs.
17
+
18
+ ## Environment
19
+ - **Python 3.12 or above**
20
+ - **Windows 11 Pro 64-bit** (macOS has not been tested)
21
+ - **Python Packages** (requests, bs4, selenium, markdownify, tavily, numpy, sentence_transformers, langchain_text_splitters)
22
+
23
+ ## Installation
24
+ - **The SmartWebSearch Package**: Download the SmartWebSearch package [here](https://github.com/LittleWai07/smart-web-search-package/archive/refs/heads/main.zip) or clone it with the command `git clone https://github.com/LittleWai07/smart-web-search-package.git` (Git is required to run this command)
25
+ - **Required Python Packages**: Install the required Python packages with the command `pip install -r requirements.txt`
26
+
27
+ ## API Keys
28
+ You need two API keys:
29
+ - **Tavily API key**: Sign up and get the API key [here](https://www.tavily.com) (1,000 free quotas per month)
30
+ - **OpenAI Compatible API key**: e.g., from [OpenAI](https://platform.openai.com/), [DeepSeek](https://platform.deepseek.com/), etc.
31
+
32
+ ## 🔒 Security Note
33
+
34
+ For security reasons, **never hard-code your API keys directly in your source code**.
35
+ Instead, store them in environment variables, a `.env` file or a `*.json` file and load them into your program.
36
+
37
+ ## Quick Start
38
+ Fill in the API keys and the following required parameters manually.
39
+ - **Tavily API Key**: The Tavily search API key (The key starts with `tvly-dev-`).
40
+ - **OpenAI Compatible API Key**: The API key for the OpenAI Compatible API platform (The key usually starts with `sk-`).
41
+ - **AI Model**: The id of the AI model used for summarization. (Default: `deepseek-chat`)
42
+ - **OpenAI Compatible API Base URL**: The base URL of the OpenAI Compatible API platform (The URL usually ends with `/chat/completions`) (Default: `https://api.deepseek.com/chat/completions`)
43
+
44
+ ```python
45
+ """
46
+ SmartWebSearch
47
+ ~~~~~~~~~~~~
48
+ An example of how to use the SmartWebSearch package.
49
+ """
50
+
51
+ # Import the SmartWebSearch package
52
+ import SmartWebSearch as sws
53
+
54
+ # --------------------------------------------------------------------
55
+ # You can configure for different API providers by changing the
56
+ # model and base_url. Below are some examples:
57
+ # --------------------------------------------------------------------
58
+
59
+ # Example 1: Using DeepSeek (default)
60
+ search = sws.SmartWebSearch(
61
+ "<Tavily API Key>",
62
+ "<OpenAI Compatible API Key>",
63
+ model="deepseek-chat",
64
+ openai_comp_api_base_url="https://api.deepseek.com/chat/completions"
65
+ )
66
+
67
+ # Example 2: Using OpenAI
68
+ # search = sws.SmartWebSearch(
69
+ # "<Tavily API Key>",
70
+ # "<OpenAI Compatible API Key>",
71
+ # model="gpt-4-turbo-preview",
72
+ # openai_comp_api_base_url="https://api.openai.com/v1/chat/completions"
73
+ # )
74
+
75
+ # --------------------------------------------------------------------
76
+ # Run a search
77
+ # --------------------------------------------------------------------
78
+ prompt = input("Enter a prompt: ")
79
+
80
+ print("=== Normal Search (Tavily summaries) ===")
81
+ print(search.search(prompt))
82
+
83
+ print("\n=== Deep Search (full page content + RAG) ===")
84
+ print(search.deepsearch(prompt))
85
+ ```
86
+
87
+ **Note**: The documentation of this package will be completed in the future.
88
+
89
+ ## License
90
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,43 @@
1
+ """
2
+ SmartWebSearch.ChromeDriver
3
+ ~~~~~~~~~~~~
4
+
5
+ This module implements the ChromeDriver.
6
+ """
7
+
8
+ # Import the required modules
9
+ from selenium.webdriver import Chrome
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.chrome.options import Options
12
+
13
+ # The ChromeDriver class
14
class ChromeDriver:
    """
    Thin wrapper that owns a single headless Chrome WebDriver session.

    The configured options are kept on ``chrome_options`` and the live
    browser on ``driver``.
    """

    def __init__(self) -> None:
        """
        Start a headless Chrome browser.

        Returns:
            None
        """

        # Build the launch options: no visible window, sandbox disabled
        launch_options: Options = Options()
        for flag in ("--headless", "--no-sandbox"):
            launch_options.add_argument(flag)
        self.chrome_options: Options = launch_options

        # Launch the browser and cap the page load timeout at 20
        self.driver: Chrome = Chrome(options = launch_options)
        self.driver.set_page_load_timeout(20)

    def quit(self) -> None:
        """
        Shut down the underlying Chrome browser.

        Returns:
            None
        """

        self.driver.quit()
@@ -0,0 +1,111 @@
1
+ """
2
+ SmartWebSearch.Debugger
3
+ ~~~~~~~~~~~~
4
+
5
+ This module implements the Debugger Tool for the package.
6
+ """
7
+
8
+ # Import the required modules
9
+ import os
10
+ import datetime
11
+ from typing import Any, TypeAlias, Literal
12
+
13
+ # Type Alias
14
+ _DebugType: TypeAlias = Literal['INFO', 'WARNING', 'ERROR', 'FILE']
15
+ _DebugImportance: TypeAlias = Literal['LOW', 'MEDIUM', 'HIGH']
16
+
17
+ # Configuration Class
18
class DebuggerConfiguration:
    """
    Global switches for the Debugger Tool.

    The flags are plain class attributes, so they are changed by assigning
    to the class itself (e.g. ``DebuggerConfiguration.DEBUGGING = True``).
    """

    # Master switch: debug output is produced only when this is True
    DEBUGGING: bool = False

    # When True (and DEBUGGING is True), debug files are written to disk
    CREATE_DEBUG_FILES: bool = True

    # When True, messages with 'LOW' importance are dropped
    SKIP_LOW_IMPORTANCE: bool = False
31
+
32
+ # Functions
33
def clear_debug_files() -> None:
    """
    Delete leftover debug files from the current working directory.

    Every entry whose name starts with "debug-" is passed to os.remove().
    Entries that cannot be removed (a directory with that prefix, a file that
    is locked or has already disappeared) are skipped, so that the cleanup,
    which runs when this module is imported, can never abort the import.

    Returns:
        None
    """

    # Loop through the entries of the current directory
    for entry in os.listdir():
        # Only entries carrying the debug prefix are candidates
        if not entry.startswith("debug-"):
            continue

        try:
            os.remove(entry)
        except OSError:
            # Best-effort cleanup: os.remove() raises OSError subclasses for
            # directories and for files that are locked or already gone
            continue


# Remove debug files left behind by earlier runs as soon as the module loads
clear_debug_files()
53
+
54
+ # Functions
55
def show_debug(*values: Any, type: _DebugType = 'INFO', importance: _DebugImportance = 'MEDIUM') -> None:
    """
    Print the values to the console if DEBUGGING is True.

    The output line is prefixed with ``[DEBUGGER] <TYPE - I>``, where ``I`` is
    the first letter of the importance level.

    Args:
        *values (Any): The values to print, forwarded to print() unchanged.
        type (_DebugType) = 'INFO': The type of debug message.
        importance (_DebugImportance) = 'MEDIUM': The importance of the message. Forced to 'HIGH' when type is 'ERROR'.

    Returns:
        None
    """

    # Errors are always reported with high importance
    if type == 'ERROR':
        importance = 'HIGH'

    # Drop low importance messages when the configuration asks for it
    if importance == 'LOW' and DebuggerConfiguration.SKIP_LOW_IMPORTANCE:
        return

    # Print the values only while debugging is enabled
    if DebuggerConfiguration.DEBUGGING:
        print(f'[DEBUGGER] <{type} - {importance[0]}>', *values)
76
+
77
def create_debug_file(filename: str, ext: str, content: str) -> None:
    """
    Create a debug file with the given filename and content.

    The file is written to ``debug-<filename>-<timestamp>.<ext>``, where spaces
    and underscores in the filename are replaced by dashes and the timestamp
    has the form ``YYYY-MM-DD-HH-MM-SS``. Nothing is written unless both
    DEBUGGING and CREATE_DEBUG_FILES are True.

    Args:
        filename (str): The name of the file to create.
        ext (str): The extension of the file to create.
        content (str): The content to write to the file.

    Returns:
        None
    """

    # If not debugging, return
    if not DebuggerConfiguration.DEBUGGING: return

    # If not creating debug files, return
    if not DebuggerConfiguration.CREATE_DEBUG_FILES: return

    # Replace all spaces and underscores in the filename with dashes
    filename = filename.replace(" ", "-").replace("_", "-")

    # Build the path once so the file on disk and the log message always agree
    timestamp: str = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    path: str = f"debug-{filename}-{timestamp}.{ext}"

    # Create the directory of the file that is actually written; the "debug-"
    # prefix is part of the first path component, so the directory of the bare
    # filename would be the wrong one
    directory: str = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok = True)

    # Write the content to the file
    with open(path, "w", encoding = "utf-8") as f:
        f.write(content)

    # Show debug message
    show_debug(f"Created debug file: '{path}', content length: {len(content)}", type = 'FILE')
@@ -0,0 +1,97 @@
1
+ """
2
+ SmartWebSearch.KeyCheck
3
+ ~~~~~~~~~~~~
4
+
5
+ This module implements the KeyCheck Tool for the package.
6
+ """
7
+
8
+ # Import the required modules
9
+ import requests
10
+
11
+ # Exception Class
12
+ class InvalidKeyError(Exception):
13
+ """
14
+ An exception for invalid API keys.
15
+ """
16
+
17
+ def __init__(self, message: str) -> None:
18
+ """
19
+ Initialize the InvalidKeyError object.
20
+
21
+ Args:
22
+ message (str): The error message.
23
+ """
24
+
25
+ self.message: str = message
26
+ super().__init__(self.message)
27
+
28
+ # KeyCheck Class
29
class KeyCheck:
    """
    A class for checking the validity of API keys.

    Each check sends one request to the provider and treats an HTTP 200
    response as "valid". Any other status counts as a failed check.
    """

    # When True, a failed check raises InvalidKeyError instead of returning False
    RAISE_ERROR: bool = True

    # Seconds to wait for a provider response before requests gives up;
    # without a timeout a stalled connection would block the caller forever
    REQUEST_TIMEOUT: float = 30.0

    @staticmethod
    def _mask_key(key: str) -> str:
        """
        Return a redacted form of an API key that is safe to put in messages.

        Args:
            key (str): The API key to redact.

        Returns:
            str: The first and last four characters joined by "..." for keys longer than 12 characters, otherwise one "*" per character.
        """

        key = str(key)

        # Short keys would be mostly revealed by a prefix/suffix, so hide them fully
        if len(key) <= 12:
            return "*" * len(key)

        return f"{key[:4]}...{key[-4:]}"

    # Check if the OpenAI Compatible API key is valid
    @staticmethod
    def check_openai_comp_api_key(openai_comp_api_key: str, model: str = "deepseek-chat", openai_comp_api_base_url: str = "https://api.deepseek.com/chat/completions") -> bool:
        """
        Check if the OpenAI Compatible API key is valid.

        Args:
            openai_comp_api_key (str): The OpenAI Compatible API key.
            model (str) = "deepseek-chat": The model to use.
            openai_comp_api_base_url (str) = "https://api.deepseek.com/chat/completions": The OpenAI Compatible API base URL.

        Returns:
            bool: True if the key is valid, False otherwise.

        Raises:
            InvalidKeyError: If the response status is not 200 and RAISE_ERROR is True.
        """

        # Send a minimal chat request to the OpenAI Compatible API
        res: requests.Response = requests.post(
            openai_comp_api_base_url,
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {openai_comp_api_key}"
            },
            json = {
                "model": model,
                "messages": [{"role": "user", "content": "Hello!"}]
            },
            timeout = KeyCheck.REQUEST_TIMEOUT
        )

        is_valid: bool = res.status_code == 200

        # Report the failure without exposing the secret itself
        if not is_valid and KeyCheck.RAISE_ERROR:
            raise InvalidKeyError(f"Invalid OpenAI Compatible API key: {KeyCheck._mask_key(openai_comp_api_key)} (HTTP {res.status_code})")

        # Return True if the key is valid, False otherwise
        return is_valid

    # Check if the Tavily API key is valid
    @staticmethod
    def check_tavily_api_key(tavily_api_key: str) -> bool:
        """
        Check if the Tavily API key is valid.

        Args:
            tavily_api_key (str): The Tavily API key.

        Returns:
            bool: True if the key is valid, False otherwise.

        Raises:
            InvalidKeyError: If the response status is not 200 and RAISE_ERROR is True.
        """

        # Send a request to the Tavily usage endpoint
        res: requests.Response = requests.get(
            "https://api.tavily.com/usage",
            headers = {
                "Authorization": f"Bearer {tavily_api_key}"
            },
            timeout = KeyCheck.REQUEST_TIMEOUT
        )

        is_valid: bool = res.status_code == 200

        # Report the failure without exposing the secret itself
        if not is_valid and KeyCheck.RAISE_ERROR:
            raise InvalidKeyError(f"Invalid Tavily API key: {KeyCheck._mask_key(tavily_api_key)} (HTTP {res.status_code})")

        # Return True if the key is valid, False otherwise
        return is_valid
@@ -0,0 +1,190 @@
1
+ """
2
+ SmartWebSearch.Progress
3
+ ~~~~~~~~~~~~
4
+
5
+ This module implements the progress for the web searching module.
6
+ """
7
+
8
+ # Import the required modules
9
+ from typing import Any, TypeAlias, Literal, Callable
10
+ from datetime import datetime
11
+
12
+ # Type Alias
13
+ _ProgressStatus: TypeAlias = Literal['IDLE', 'STORMING', 'STORMED', 'SEARCHING', 'SEARCHED', 'PARSING', 'PARSED', 'KL_BASE_CREATING', 'KL_BASE_CREATED', 'KL_BASE_MATCHING', 'KL_BASE_MATCHED', 'CONCLUDING', 'CONCLUDED', 'PART_COMPLETED', 'COMPLETED', 'REQUEST_TIMEOUT']
14
+
15
+ # Progress Classes
16
+ class ProgressStatusSelector:
17
+ """
18
+ A class representing the status of a web searching operation.
19
+ """
20
+
21
+ # Constants
22
+ IDLE: _ProgressStatus = 'IDLE'
23
+ STORMING: _ProgressStatus = 'STORMING'
24
+ STORMED: _ProgressStatus = 'STORMED'
25
+ SEARCHING: _ProgressStatus = 'SEARCHING'
26
+ SEARCHED: _ProgressStatus = 'SEARCHED'
27
+ PARSING: _ProgressStatus = 'PARSING'
28
+ PARSED: _ProgressStatus = 'PARSED'
29
+ KL_BASE_CREATING: _ProgressStatus = 'KL_BASE_CREATING'
30
+ KL_BASE_CREATED: _ProgressStatus = 'KL_BASE_CREATED'
31
+ KL_BASE_MATCHING: _ProgressStatus = 'KL_BASE_MATCHING'
32
+ KL_BASE_MATCHED: _ProgressStatus = 'KL_BASE_MATCHED'
33
+ CONCLUDING: _ProgressStatus = 'CONCLUDING'
34
+ CONCLUDED: _ProgressStatus = 'CONCLUDED'
35
+ PART_COMPLETED: _ProgressStatus = 'PART_COMPLETED'
36
+ COMPLETED: _ProgressStatus = 'COMPLETED'
37
+ REQUEST_TIMEOUT: _ProgressStatus = 'REQUEST_TIMEOUT'
38
+
39
class _ProgressData:
    """
    Read-only snapshot of one progress report of a web searching operation.

    All fields are fixed at construction time and exposed through properties
    without setters.
    """

    def __init__(self, status: _ProgressStatus = 'IDLE', message: str = None, data: Any = None, progress: float = None, timestamp: datetime = None) -> None:
        """
        Capture the fields of a progress report.

        Args:
            status (_ProgressStatus) = 'IDLE': The status of the progress.
            message (str) = None: The message to display.
            data (Any) = None: The data to display.
            progress (float) = None: The progress of the operation. Range: [0.0, 1.0].
            timestamp (datetime) = None: The timestamp of the progress. The current datetime is used when omitted.

        Returns:
            None
        """

        # Fall back to "now" when the caller did not supply a timestamp
        self.__timestamp: datetime = timestamp or datetime.now()
        self.__progress: float = progress
        self.__data: Any = data
        self.__message: str = message
        self.__status: _ProgressStatus = status

    def __str__(self) -> str:
        """
        Render every field as ``name='value'`` inside ``_ProgressData(...)``.

        Returns:
            str: The string representation of the snapshot.
        """

        fields = (
            ("status", self.__status),
            ("message", self.__message),
            ("data", self.__data),
            ("progress", self.__progress),
            ("timestamp", self.__timestamp),
        )
        rendered = ", ".join(f"{name}='{value}'" for name, value in fields)
        return f"_ProgressData({rendered})"

    @property
    def status(self) -> _ProgressStatus:
        """
        The status stored in this snapshot.

        Returns:
            _ProgressStatus: The status of the progress.
        """
        return self.__status

    @property
    def message(self) -> str:
        """
        The message stored in this snapshot.

        Returns:
            str: The message of the progress.
        """
        return self.__message

    @property
    def data(self) -> Any:
        """
        The data stored in this snapshot.

        Returns:
            Any: The data of the progress.
        """
        return self.__data

    @property
    def progress(self) -> float:
        """
        The progress value stored in this snapshot.

        Returns:
            float: The progress of the operation.
        """
        return self.__progress

    @property
    def timestamp(self) -> datetime:
        """
        The timestamp stored in this snapshot.

        Returns:
            datetime: The timestamp of the progress.
        """
        return self.__timestamp
124
+
125
class Progress:
    """
    A class representing the progress of a web searching operation.

    It stores the most recent progress snapshot and forwards every update to
    the registered listeners.
    """

    def __init__(self) -> None:
        """
        Initializes a new instance of the Progress class.

        Returns:
            None
        """

        # Start with a default snapshot and no listeners
        self.__current_progress: _ProgressData = _ProgressData()
        self.__progress_listeners: list[Callable[[_ProgressData], None]] = []

    def add_progress_listener(self, listener: Callable[[_ProgressData], None]) -> None:
        """
        Adds a listener to the progress of a web searching operation.

        Args:
            listener (Callable[[_ProgressData], None]): The callback function to add. It is called with the new progress snapshot on every update.

        Returns:
            None
        """

        # Add the listener to the list of listeners
        self.__progress_listeners.append(listener)

    def remove_progress_listener(self, listener: Callable) -> None:
        """
        Removes a listener from the progress of a web searching operation.

        Args:
            listener (Callable): The callback function to remove.

        Returns:
            None

        Raises:
            ValueError: If the listener is not currently registered.
        """

        # Remove the listener from the list of listeners
        self.__progress_listeners.remove(listener)

    def _update_progress(self, status: _ProgressStatus, message: str = None, data: Any = None, progress: float = None, timestamp: datetime = None) -> None:
        """
        Updates the progress of a web searching operation.

        Args:
            status (_ProgressStatus): The status of the progress.
            message (str, optional): The message to display. Defaults to None.
            data (Any, optional): The data to display. Defaults to None.
            progress (float, optional): The progress of the operation. Defaults to None. Range: [0.0, 1.0].
            timestamp (datetime, optional): The timestamp of the progress. Defaults to current datetime.

        Returns:
            None
        """

        # Update the progress
        self.__current_progress = _ProgressData(status, message, data, progress, timestamp)

        # Call the listeners from a snapshot of the list: a listener may add or
        # remove listeners (including itself) while being notified, and
        # mutating the list that is being iterated would skip entries
        for listener in tuple(self.__progress_listeners):
            listener(self.__current_progress)