mini-search-engine 1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mini_search_engine-1.0.0/LICENSE +21 -0
- mini_search_engine-1.0.0/MANIFEST.in +3 -0
- mini_search_engine-1.0.0/PKG-INFO +116 -0
- mini_search_engine-1.0.0/README.md +75 -0
- mini_search_engine-1.0.0/mini_search_engine/__init__.py +3 -0
- mini_search_engine-1.0.0/mini_search_engine/engine.py +267 -0
- mini_search_engine-1.0.0/mini_search_engine.egg-info/PKG-INFO +116 -0
- mini_search_engine-1.0.0/mini_search_engine.egg-info/SOURCES.txt +13 -0
- mini_search_engine-1.0.0/mini_search_engine.egg-info/dependency_links.txt +1 -0
- mini_search_engine-1.0.0/mini_search_engine.egg-info/requires.txt +4 -0
- mini_search_engine-1.0.0/mini_search_engine.egg-info/top_level.txt +1 -0
- mini_search_engine-1.0.0/requirements.txt +4 -0
- mini_search_engine-1.0.0/setup.cfg +4 -0
- mini_search_engine-1.0.0/setup.py +43 -0
- mini_search_engine-1.0.0/tests/test_engine.py +125 -0
--- /dev/null
+++ mini_search_engine-1.0.0/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Jules
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null
+++ mini_search_engine-1.0.0/PKG-INFO
@@ -0,0 +1,116 @@
+Metadata-Version: 2.4
+Name: mini_search_engine
+Version: 1.0.0
+Summary: A simple search engine library scraping Google and DuckDuckGo.
+Home-page: https://github.com/yourusername/mini-search-engine
+Author: Jules
+Author-email: jules@example.com
+Project-URL: Bug Reports, https://github.com/yourusername/mini-search-engine/issues
+Project-URL: Source, https://github.com/yourusername/mini-search-engine
+Keywords: search,google,duckduckgo,scraping,crawler
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests
+Requires-Dist: beautifulsoup4
+Requires-Dist: lxml
+Requires-Dist: certifi
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
(lines 42-116 are the Markdown long description embedded by setuptools, identical to mini_search_engine-1.0.0/README.md below; omitted here)
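A quick illustrative note on the metadata above (not part of the diff): once the package is installed, the same fields can be read back through the standard library. A minimal sketch, assuming Python 3.8+ for `importlib.metadata`:

```python
# Read back the PKG-INFO fields shown above from an installed copy.
from importlib.metadata import metadata, requires

md = metadata("mini_search_engine")
print(md["Summary"])                   # the Summary: field above
print(requires("mini_search_engine"))  # ['requests', 'beautifulsoup4', 'lxml', 'certifi']
```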
--- /dev/null
+++ mini_search_engine-1.0.0/README.md
@@ -0,0 +1,75 @@
+# Mini Search Engine
+
+A flexible Python library that scrapes Google and DuckDuckGo for search results. It supports extensive search features including pagination, safe search, and time filtering.
+
+## Features
+
+- **Multi-Engine Support**: Search via Google or DuckDuckGo.
+- **Smart Fallback**: Automatically tries the next available engine if the primary one is blocked (e.g., falls back to DuckDuckGo if Google blocks).
+- **Pagination**: Retrieve as many results as you need.
+- **Safe Search**: Control safe search strictness (`strict`, `moderate`, `off`).
+- **Time Filtering**: Filter results by day, week, month, or year.
+- **No API Key Needed**: Uses direct web scraping.
+
+## Installation
+
+You can install the package directly from PyPI:
+
+```bash
+pip install mini-search-engine
+```
+
+Or install locally from source:
+
+```bash
+git clone https://github.com/yourusername/mini-search-engine.git
+cd mini-search-engine
+pip install .
+```
+
+## Usage
+
+```python
+from mini_search_engine import SearchEngine
+import logging
+
+# Optional: Enable logging to see what's happening under the hood
+logging.basicConfig(level=logging.INFO)
+
+engine = SearchEngine()
+
+# Simple Search
+results = engine.search("python programming")
+for res in results:
+    print(res['title'], res['link'])
+
+# Advanced Search with Filters
+results = engine.search(
+    "latest python news",
+    engine="auto",    # Try Google, then DDG
+    limit=20,         # Get 20 results (handles pagination automatically)
+    safe="strict",    # Strict safe search
+    time_range="w"    # Past week
+)
+
+print(f"\nFound {len(results)} results:")
+for i, res in enumerate(results):
+    print(f"#{i+1} [{res['source']}] {res['title']}")
+    print(f"Link: {res['link']}")
+    print(f"Snippet: {res['snippet']}")
+    print("-" * 30)
+```
+
+## API Reference
+
+### `search(query, engine="auto", limit=10, safe="moderate", time_range=None)`
+
+- `query` (str): The search query.
+- `engine` (str): `'google'`, `'ddg'`, or `'auto'`. Defaults to `'auto'`.
+- `limit` (int): Number of results to return. Defaults to `10`.
+- `safe` (str): Safe search level: `'strict'`, `'moderate'`, `'off'`. Defaults to `'moderate'`.
+- `time_range` (str): `'d'` (day), `'w'` (week), `'m'` (month), `'y'` (year). Defaults to `None` (any time).
+
+## License
+
+MIT License
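As a supplement to the README's usage block (illustrative only, not shipped in the package), the documented `search()` signature and result keys are enough to build small pipelines, e.g. dumping results to JSON:

```python
# Minimal sketch: persist search results, relying only on the documented
# API (each result dict carries title/link/snippet/source keys).
import json

from mini_search_engine import SearchEngine

engine = SearchEngine()
results = engine.search("python packaging", engine="ddg", limit=5, time_range="m")

with open("results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(results)} results to results.json")
```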
--- /dev/null
+++ mini_search_engine-1.0.0/mini_search_engine/engine.py
@@ -0,0 +1,267 @@
+import requests
+from bs4 import BeautifulSoup
+import logging
+import time
+import random
+import urllib.parse
+
+# Configure logger
+logger = logging.getLogger(__name__)
+
+class SearchEngine:
+    def __init__(self):
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        })
+
+    def build_db(self, start_urls=None, max_pages=50, max_workers=10, timeout=None):
+        """
+        Deprecated: This method is no longer needed as the search engine
+        now queries the web directly. Kept for compatibility.
+        """
+        print("Note: build_db() is deprecated and does nothing. The engine searches the live web now.")
+        logger.warning("build_db() called but is deprecated.")
+
+    def search(self, query, engine="auto", limit=10, safe="moderate", time_range=None):
+        """
+        Searches the web for the query with extensive control.
+
+        Args:
+            query (str): The search query.
+            engine (str): 'google', 'ddg', or 'auto'. Defaults to 'auto'.
+                If the specified engine fails, it will attempt the other one.
+            limit (int): Approximate number of results to return. Defaults to 10.
+            safe (str): Safe search level. 'strict', 'moderate', or 'off'.
+                Defaults to 'moderate'.
+            time_range (str): 'd' (day), 'w' (week), 'm' (month), 'y' (year).
+                Defaults to None (any time).
+
+        Returns:
+            list: A list of result dictionaries.
+        """
+        print(f"Searching for: '{query}' (Engine: {engine}, Limit: {limit}, Safe: {safe}, Time: {time_range})")
+
+        # Determine execution order
+        if engine.lower() == 'google':
+            engines_to_try = ['google', 'ddg']
+        elif engine.lower() in ('ddg', 'duckduckgo'):
+            engines_to_try = ['ddg', 'google']
+        else:
+            engines_to_try = ['google', 'ddg']  # Default preference
+
+        for eng in engines_to_try:
+            results = []
+            if eng == 'google':
+                results = self._search_google(query, limit, safe, time_range)
+            elif eng == 'ddg':
+                results = self._search_duckduckgo(query, limit, safe, time_range)
+
+            if results:
+                return results[:limit]
+
+            print(f"{eng.capitalize()} returned no results or failed. Trying next available engine...")
+
+        print("All engines failed to return results.")
+        return []
+
+    def _search_google(self, query, limit, safe, time_range):
+        base_url = "https://www.google.com/search"
+        results = []
+        start = 0
+
+        # Map parameters
+        params = {
+            "q": query,
+            "hl": "en"
+        }
+
+        # Safe Search: Google only exposes 'safe=active' (strict). Omitting
+        # the parameter yields the default (moderate) filtering, and there is
+        # no reliable URL value that forces filtering fully off, so the
+        # parameter is sent for 'strict' and omitted for 'moderate'/'off'.
+        if safe == 'strict':
+            params['safe'] = 'active'
+
+        # Time Range: map d/w/m/y onto Google's tbs=qdr:<code> filter
+        if time_range in ('d', 'w', 'm', 'y'):
+            params['tbs'] = f"qdr:{time_range}"
+
+        while len(results) < limit:
+            # Use a copy of params to avoid mutation issues in mocks/retries
+            current_params = params.copy()
+            current_params['start'] = start
+
+            try:
+                # Add random delay if paging
+                if start > 0:
+                    time.sleep(random.uniform(1.0, 2.5))
+
+                response = self.session.get(base_url, params=current_params, timeout=10)
+                response.raise_for_status()
+
+                if "systems have detected unusual traffic" in response.text or "recaptcha" in response.text.lower():
+                    logger.warning("Google blocked the request.")
+                    break
+
+                soup = BeautifulSoup(response.text, "lxml")
+                current_page_results = []
+
+                for g in soup.select("div.g"):
+                    title_elem = g.select_one("h3")
+                    link_elem = g.select_one("a")
+
+                    if title_elem and link_elem:
+                        title = title_elem.get_text()
+                        link = link_elem["href"]
+
+                        if "/url?q=" in link:
+                            raw_link = link.split("/url?q=")[1].split("&")[0]
+                            link = urllib.parse.unquote(raw_link)
+
+                        snippet = "No snippet"
+                        snippet_div = g.select_one("div.VwiC3b, div.IsZvec, span.aCOpRe")
+                        if snippet_div:
+                            snippet = snippet_div.get_text()
+
+                        res = {
+                            "title": title,
+                            "link": link,
+                            "snippet": snippet,
+                            "score": 1.0,
+                            "source": "google"
+                        }
+                        current_page_results.append(res)
+                        results.append(res)
+
+                        if len(results) >= limit:
+                            break
+
+                if not current_page_results:
+                    # No more results on this page
+                    break
+
+                start += 10
+
+            except Exception as e:
+                logger.error(f"Google search failed: {e}")
+                break
+
+        return results
+
+    def _search_duckduckgo(self, query, limit, safe, time_range):
+        url = "https://html.duckduckgo.com/html/"
+        results = []
+
+        # Initial Params
+        data = {
+            "q": query,
+            "kl": "us-en"  # Default region
+        }
+
+        # Safe Search: DuckDuckGo's documented values are kp=1 (strict),
+        # kp=-1 (moderate) and kp=-2 (off).
+        if safe == 'strict':
+            data['kp'] = '1'
+        elif safe == 'off':
+            data['kp'] = '-2'
+        else:
+            data['kp'] = '-1'  # moderate
+
+        # Time Range: DuckDuckGo takes df=d/w/m/y directly
+        if time_range in ('d', 'w', 'm', 'y'):
+            data['df'] = time_range
+
+        while len(results) < limit:
+            try:
+                # Add random delay if paging
+                if len(results) > 0:
+                    time.sleep(random.uniform(0.5, 1.5))
+
+                response = self.session.post(url, data=data, timeout=10)
+                response.raise_for_status()
+
+                soup = BeautifulSoup(response.text, "html.parser")
+                current_page_results = []
+
+                # Parse Results
+                for result in soup.select(".result"):
+                    if "result--ad" in result.get("class", []):
+                        continue
+
+                    title_elem = result.select_one(".result__a")
+                    snippet_elem = result.select_one(".result__snippet")
+
+                    if title_elem:
+                        title = title_elem.get_text(strip=True)
+                        raw_link = title_elem["href"]
+                        link = raw_link
+
+                        # Decode DDG redirection (uddg=<target> in the href)
+                        if "uddg=" in raw_link:
+                            try:
+                                parsed = urllib.parse.urlparse(raw_link)
+                                qs = urllib.parse.parse_qs(parsed.query)
+                                if 'uddg' in qs:
+                                    link = qs['uddg'][0]
+                            except Exception:
+                                pass
+
+                        snippet = snippet_elem.get_text(strip=True) if snippet_elem else "No snippet"
+
+                        res = {
+                            "title": title,
+                            "link": link,
+                            "snippet": snippet,
+                            "score": 1.0,
+                            "source": "duckduckgo"
+                        }
+                        current_page_results.append(res)
+                        results.append(res)
+
+                        if len(results) >= limit:
+                            break
+
+                if not current_page_results:
+                    break
+
+                # Check for limit
+                if len(results) >= limit:
+                    break
+
+                # Pagination: find the "Next" form
+                # (a form with action="/html/" and an input value="Next")
+                next_form = None
+                for form in soup.select("form[action='/html/']"):
+                    if form.select_one("input[value='Next']"):
+                        next_form = form
+                        break
+
+                if next_form:
+                    # Extract inputs for the next request
+                    new_data = {}
+                    for inp in next_form.select("input"):
+                        name = inp.get("name")
+                        value = inp.get("value")
+                        if name:
+                            new_data[name] = value
+                    data = new_data
+                else:
+                    # No next page
+                    break
+
+            except Exception as e:
+                logger.error(f"DuckDuckGo search failed: {e}")
+                break
+
+        return results
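The trickiest parts of `engine.py` above are the two link-unwrapping steps. A standalone sketch of just that logic, mirroring `_search_google` and `_search_duckduckgo` (the example URLs are made up):

```python
import urllib.parse

def unwrap_google(href):
    # Google result anchors are often wrapped as /url?q=<target>&...
    if "/url?q=" in href:
        return urllib.parse.unquote(href.split("/url?q=")[1].split("&")[0])
    return href

def unwrap_ddg(href):
    # DuckDuckGo HTML results redirect via //duckduckgo.com/l/?uddg=<target>
    if "uddg=" in href:
        qs = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
        if "uddg" in qs:
            return qs["uddg"][0]
    return href

print(unwrap_google("/url?q=https%3A%2F%2Fexample.com%2Fpage&sa=U"))
print(unwrap_ddg("//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage"))
# both print: https://example.com/page
```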
--- /dev/null
+++ mini_search_engine-1.0.0/mini_search_engine.egg-info/PKG-INFO
@@ -0,0 +1,116 @@
(116 lines, byte-identical to mini_search_engine-1.0.0/PKG-INFO above; omitted here)
--- /dev/null
+++ mini_search_engine-1.0.0/mini_search_engine.egg-info/SOURCES.txt
@@ -0,0 +1,13 @@
+LICENSE
+MANIFEST.in
+README.md
+requirements.txt
+setup.py
+mini_search_engine/__init__.py
+mini_search_engine/engine.py
+mini_search_engine.egg-info/PKG-INFO
+mini_search_engine.egg-info/SOURCES.txt
+mini_search_engine.egg-info/dependency_links.txt
+mini_search_engine.egg-info/requires.txt
+mini_search_engine.egg-info/top_level.txt
+tests/test_engine.py
--- /dev/null
+++ mini_search_engine-1.0.0/mini_search_engine.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
--- /dev/null
+++ mini_search_engine-1.0.0/mini_search_engine.egg-info/top_level.txt
@@ -0,0 +1 @@
+mini_search_engine
--- /dev/null
+++ mini_search_engine-1.0.0/setup.py
@@ -0,0 +1,43 @@
+from setuptools import setup, find_packages
+import pathlib
+
+# Read the contents of your README file
+here = pathlib.Path(__file__).parent.resolve()
+long_description = (here / "README.md").read_text(encoding="utf-8")
+
+setup(
+    name="mini_search_engine",
+    version="1.0.0",
+    description="A simple search engine library scraping Google and DuckDuckGo.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/yourusername/mini-search-engine",
+    author="Jules",
+    author_email="jules@example.com",
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Operating System :: OS Independent",
+    ],
+    keywords="search, google, duckduckgo, scraping, crawler",
+    packages=find_packages(),
+    python_requires=">=3.6",
+    install_requires=[
+        "requests",
+        "beautifulsoup4",
+        "lxml",
+        "certifi"
+    ],
+    project_urls={
+        "Bug Reports": "https://github.com/yourusername/mini-search-engine/issues",
+        "Source": "https://github.com/yourusername/mini-search-engine",
+    },
+)
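One note on `packages=find_packages()` above: `tests/` ships without an `__init__.py` (see SOURCES.txt), so only the library package itself is collected. An illustrative check, run from an unpacked source tree:

```python
# Sketch: confirm which packages setuptools would pick up for this layout.
from setuptools import find_packages

print(find_packages())  # expected: ['mini_search_engine']
```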
--- /dev/null
+++ mini_search_engine-1.0.0/tests/test_engine.py
@@ -0,0 +1,125 @@
+import unittest
+from unittest.mock import patch, Mock
+import os
+import sys
+
+# Add the parent directory to the path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from mini_search_engine.engine import SearchEngine
+
+class TestMiniSearchEngine(unittest.TestCase):
+
+    def setUp(self):
+        self.engine = SearchEngine()
+
+    @patch('mini_search_engine.engine.requests.Session.get')
+    def test_search_google_basic(self, mock_get):
+        # Mock Google response
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.text = """
+        <html><body>
+        <div class="g">
+            <h3>Google Result</h3>
+            <a href="/url?q=http://google.com/res">Link</a>
+            <div class="VwiC3b">Snippet</div>
+        </div>
+        </body></html>
+        """
+        mock_get.return_value = mock_response
+
+        # Set limit=1 to avoid looping endlessly with the same mock response
+        results = self.engine.search("test", engine="google", limit=1)
+
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['source'], "google")
+
+        # Verify params
+        args, kwargs = mock_get.call_args
+        self.assertEqual(kwargs['params']['q'], "test")
+
+    @patch('mini_search_engine.engine.requests.Session.post')
+    def test_search_ddg_basic(self, mock_post):
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.text = """
+        <html><body>
+        <div class="result">
+            <a class="result__a" href="//duckduckgo.com/l/?uddg=http://ddg.com/res">DDG Result</a>
+            <div class="result__snippet">Snippet</div>
+        </div>
+        </body></html>
+        """
+        mock_post.return_value = mock_response
+
+        results = self.engine.search("test", engine="ddg")
+
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['source'], "duckduckgo")
+        self.assertEqual(results[0]['link'], "http://ddg.com/res")
+
+    @patch('mini_search_engine.engine.requests.Session.get')
+    @patch('mini_search_engine.engine.requests.Session.post')
+    def test_fallback_logic(self, mock_post, mock_get):
+        # Google fails (blocked)
+        mock_google = Mock()
+        mock_google.status_code = 200
+        mock_google.text = "systems have detected unusual traffic"
+        mock_get.return_value = mock_google
+
+        # DDG succeeds
+        mock_ddg = Mock()
+        mock_ddg.status_code = 200
+        mock_ddg.text = """
+        <html><body>
+        <div class="result">
+            <a class="result__a" href="http://res.com">Res</a>
+        </div>
+        </body></html>
+        """
+        mock_post.return_value = mock_ddg
+
+        # Request 'auto' (defaults to Google -> DDG)
+        results = self.engine.search("test", engine="auto")
+
+        self.assertTrue(mock_get.called)
+        self.assertTrue(mock_post.called)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['source'], "duckduckgo")
+
+    @patch('mini_search_engine.engine.requests.Session.get')
+    def test_google_pagination_and_limit(self, mock_get):
+        # Mock 2 pages
+        page1 = Mock()
+        page1.status_code = 200
+        page1.text = """
+        <div class="g"><h3>R1</h3><a href="l1">L</a></div>
+        <div class="g"><h3>R2</h3><a href="l2">L</a></div>
+        """
+
+        page2 = Mock()
+        page2.status_code = 200
+        page2.text = """
+        <div class="g"><h3>R3</h3><a href="l3">L</a></div>
+        """
+
+        mock_get.side_effect = [page1, page2]
+
+        # Limit 3 results
+        results = self.engine.search("test", engine="google", limit=3)
+
+        self.assertEqual(len(results), 3)
+        self.assertEqual(results[0]['title'], "R1")
+        self.assertEqual(results[2]['title'], "R3")
+
+        # Verify calls
+        self.assertEqual(mock_get.call_count, 2)
+        # Check start params
+        args1, kwargs1 = mock_get.call_args_list[0]
+        self.assertEqual(kwargs1['params']['start'], 0)
+        args2, kwargs2 = mock_get.call_args_list[1]
+        self.assertEqual(kwargs2['params']['start'], 10)
+
+if __name__ == '__main__':
+    unittest.main()
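The suite above uses only `unittest` and `unittest.mock`, so no extra tooling is needed to run it. A minimal sketch, equivalent to `python -m unittest discover tests` from the unpacked source root:

```python
# Discover and run the package's tests programmatically.
import unittest

suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)
```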