mini_search_engine-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@ LICENSE
+ MIT License
+
+ Copyright (c) 2023 Jules
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,3 @@ MANIFEST.in
+ include README.md
+ include requirements.txt
+ include LICENSE
@@ -0,0 +1,116 @@ PKG-INFO
+ Metadata-Version: 2.4
+ Name: mini_search_engine
+ Version: 1.0.0
+ Summary: A simple search engine library scraping Google and DuckDuckGo.
+ Home-page: https://github.com/yourusername/mini-search-engine
+ Author: Jules
+ Author-email: jules@example.com
+ Project-URL: Bug Reports, https://github.com/yourusername/mini-search-engine/issues
+ Project-URL: Source, https://github.com/yourusername/mini-search-engine
+ Keywords: search,google,duckduckgo,scraping,crawler
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: lxml
+ Requires-Dist: certifi
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license-file
+ Dynamic: project-url
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Mini Search Engine
+
+ A flexible Python library that scrapes Google and DuckDuckGo for search results. It supports pagination, safe search, and time filtering.
+
+ ## Features
+
+ - **Multi-Engine Support**: Search via Google or DuckDuckGo.
+ - **Smart Fallback**: Automatically tries the next available engine if the primary one is blocked (e.g., falls back to DuckDuckGo if Google blocks the request).
+ - **Pagination**: Retrieve as many results as you need.
+ - **Safe Search**: Control safe search strictness (`strict`, `moderate`, `off`).
+ - **Time Filtering**: Filter results by day, week, month, or year.
+ - **No API Key Needed**: Uses direct web scraping.
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ pip install mini-search-engine
+ ```
+
+ Or install locally from source:
+
+ ```bash
+ git clone https://github.com/yourusername/mini-search-engine.git
+ cd mini-search-engine
+ pip install .
+ ```
+
+ ## Usage
+
+ ```python
+ from mini_search_engine import SearchEngine
+ import logging
+
+ # Optional: Enable logging to see what's happening under the hood
+ logging.basicConfig(level=logging.INFO)
+
+ engine = SearchEngine()
+
+ # Simple Search
+ results = engine.search("python programming")
+ for res in results:
+     print(res['title'], res['link'])
+
+ # Advanced Search with Filters
+ results = engine.search(
+     "latest python news",
+     engine="auto",   # Try Google, then DDG
+     limit=20,        # Get 20 results (handles pagination automatically)
+     safe="strict",   # Strict safe search
+     time_range="w"   # Past week
+ )
+
+ print(f"\nFound {len(results)} results:")
+ for i, res in enumerate(results):
+     print(f"#{i+1} [{res['source']}] {res['title']}")
+     print(f"Link: {res['link']}")
+     print(f"Snippet: {res['snippet']}")
+     print("-" * 30)
+ ```
+
+ ## API Reference
+
+ ### `search(query, engine="auto", limit=10, safe="moderate", time_range=None)`
+
+ - `query` (str): The search query.
+ - `engine` (str): `'google'`, `'ddg'`, or `'auto'`. Defaults to `'auto'`. If the chosen engine fails, the other one is tried.
+ - `limit` (int): Maximum number of results to return. Defaults to `10`.
+ - `safe` (str): Safe search level: `'strict'`, `'moderate'`, or `'off'`. Defaults to `'moderate'`.
+ - `time_range` (str): `'d'` (day), `'w'` (week), `'m'` (month), or `'y'` (year). Defaults to `None` (any time).
+
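+ Each result is a plain dict with `title`, `link`, `snippet`, `score`, and `source` keys. As a quick sketch using only the parameters documented above, here is a DuckDuckGo-first search restricted to the past month:
+
+ ```python
+ from mini_search_engine import SearchEngine
+
+ engine = SearchEngine()
+
+ # 'ddg' tries DuckDuckGo first and falls back to Google if it fails
+ results = engine.search("web scraping", engine="ddg", limit=5, time_range="m")
+
+ if not results:
+     print("No results; both engines may have blocked or returned nothing.")
+ for res in results:
+     print(f"[{res['source']}] {res['title']} -> {res['link']}")
+ ```
+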
+ ## License
+
+ MIT License
@@ -0,0 +1,75 @@ README.md
+ # Mini Search Engine
+
+ A flexible Python library that scrapes Google and DuckDuckGo for search results. It supports pagination, safe search, and time filtering.
+
+ ## Features
+
+ - **Multi-Engine Support**: Search via Google or DuckDuckGo.
+ - **Smart Fallback**: Automatically tries the next available engine if the primary one is blocked (e.g., falls back to DuckDuckGo if Google blocks the request).
+ - **Pagination**: Retrieve as many results as you need.
+ - **Safe Search**: Control safe search strictness (`strict`, `moderate`, `off`).
+ - **Time Filtering**: Filter results by day, week, month, or year.
+ - **No API Key Needed**: Uses direct web scraping.
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ pip install mini-search-engine
+ ```
+
+ Or install locally from source:
+
+ ```bash
+ git clone https://github.com/yourusername/mini-search-engine.git
+ cd mini-search-engine
+ pip install .
+ ```
+
+ ## Usage
+
+ ```python
+ from mini_search_engine import SearchEngine
+ import logging
+
+ # Optional: Enable logging to see what's happening under the hood
+ logging.basicConfig(level=logging.INFO)
+
+ engine = SearchEngine()
+
+ # Simple Search
+ results = engine.search("python programming")
+ for res in results:
+     print(res['title'], res['link'])
+
+ # Advanced Search with Filters
+ results = engine.search(
+     "latest python news",
+     engine="auto",   # Try Google, then DDG
+     limit=20,        # Get 20 results (handles pagination automatically)
+     safe="strict",   # Strict safe search
+     time_range="w"   # Past week
+ )
+
+ print(f"\nFound {len(results)} results:")
+ for i, res in enumerate(results):
+     print(f"#{i+1} [{res['source']}] {res['title']}")
+     print(f"Link: {res['link']}")
+     print(f"Snippet: {res['snippet']}")
+     print("-" * 30)
+ ```
+
+ ## API Reference
+
+ ### `search(query, engine="auto", limit=10, safe="moderate", time_range=None)`
+
+ - `query` (str): The search query.
+ - `engine` (str): `'google'`, `'ddg'`, or `'auto'`. Defaults to `'auto'`. If the chosen engine fails, the other one is tried.
+ - `limit` (int): Maximum number of results to return. Defaults to `10`.
+ - `safe` (str): Safe search level: `'strict'`, `'moderate'`, or `'off'`. Defaults to `'moderate'`.
+ - `time_range` (str): `'d'` (day), `'w'` (week), `'m'` (month), or `'y'` (year). Defaults to `None` (any time).
+
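+ Each result is a plain dict with `title`, `link`, `snippet`, `score`, and `source` keys. As a quick sketch using only the parameters documented above, here is a DuckDuckGo-first search restricted to the past month:
+
+ ```python
+ from mini_search_engine import SearchEngine
+
+ engine = SearchEngine()
+
+ # 'ddg' tries DuckDuckGo first and falls back to Google if it fails
+ results = engine.search("web scraping", engine="ddg", limit=5, time_range="m")
+
+ if not results:
+     print("No results; both engines may have blocked or returned nothing.")
+ for res in results:
+     print(f"[{res['source']}] {res['title']} -> {res['link']}")
+ ```
+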
+ ## License
+
+ MIT License
@@ -0,0 +1,3 @@ mini_search_engine/__init__.py
+ from .engine import SearchEngine
+
+ __all__ = ['SearchEngine']
@@ -0,0 +1,267 @@ mini_search_engine/engine.py
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ import time
+ import random
+ import urllib.parse
+
+ # Configure logger
+ logger = logging.getLogger(__name__)
+
+ class SearchEngine:
+     def __init__(self):
+         self.session = requests.Session()
+         self.session.headers.update({
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+         })
+
+     def build_db(self, start_urls=None, max_pages=50, max_workers=10, timeout=None):
+         """
+         Deprecated: This method is no longer needed as the search engine
+         now queries the web directly. Kept for compatibility.
+         """
+         print("Note: build_db() is deprecated and does nothing. The engine searches the live web now.")
+         logger.warning("build_db() called but is deprecated.")
+
+     def search(self, query, engine="auto", limit=10, safe="moderate", time_range=None):
+         """
+         Searches the web for the query with extensive control.
+
+         Args:
+             query (str): The search query.
+             engine (str): 'google', 'ddg', or 'auto'. Defaults to 'auto'.
+                 If the specified engine fails, it will attempt the other one.
+             limit (int): Approximate number of results to return. Defaults to 10.
+             safe (str): Safe search level. 'strict', 'moderate', or 'off'.
+                 Defaults to 'moderate'.
+             time_range (str): 'd' (day), 'w' (week), 'm' (month), 'y' (year).
+                 Defaults to None (any time).
+
+         Returns:
+             list: A list of result dictionaries.
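+
+         Example::
+
+             engine = SearchEngine()
+             news = engine.search("python news", engine="ddg", time_range="w")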
+ """
43
+ print(f"Searching for: '{query}' (Engine: {engine}, Limit: {limit}, Safe: {safe}, Time: {time_range})")
44
+
45
+ # Determine execution order
46
+ engines_to_try = []
47
+ if engine.lower() == 'google':
48
+ engines_to_try = ['google', 'ddg']
49
+ elif engine.lower() == 'ddg' or engine.lower() == 'duckduckgo':
50
+ engines_to_try = ['ddg', 'google']
51
+ else:
52
+ engines_to_try = ['google', 'ddg'] # Default preference
53
+
54
+ for eng in engines_to_try:
55
+ results = []
56
+ if eng == 'google':
57
+ results = self._search_google(query, limit, safe, time_range)
58
+ elif eng == 'ddg':
59
+ results = self._search_duckduckgo(query, limit, safe, time_range)
60
+
61
+ if results:
62
+ return results[:limit]
63
+
64
+ print(f"{eng.capitalize()} returned no results or failed. Trying next available engine...")
65
+
66
+ print("All engines failed to return results.")
67
+ return []
+
+     def _search_google(self, query, limit, safe, time_range):
+         base_url = "https://www.google.com/search"
+         results = []
+         start = 0
+
+         # Map parameters
+         params = {
+             "q": query,
+             "hl": "en"
+         }
+
+         # Safe Search: Google only exposes an explicit strict mode via
+         # safe=active; there is no reliable URL parameter for "off", so
+         # 'off' and 'moderate' both omit the parameter (Google's default).
+         if safe == 'strict':
+             params['safe'] = 'active'
+
+         # Time Range: Google encodes recency as tbs=qdr:<d|w|m|y>
+         if time_range in ('d', 'w', 'm', 'y'):
+             params['tbs'] = f"qdr:{time_range}"
+
+         while len(results) < limit:
+             # Use a copy of params to avoid mutation issues in mocks/retries
+             current_params = params.copy()
+             current_params['start'] = start
+
+             try:
+                 # Add random delay if paging
+                 if start > 0:
+                     time.sleep(random.uniform(1.0, 2.5))
+
+                 response = self.session.get(base_url, params=current_params, timeout=10)
+                 response.raise_for_status()
+
+                 if "systems have detected unusual traffic" in response.text or "recaptcha" in response.text.lower():
+                     logger.warning("Google blocked the request.")
+                     break
+
+                 soup = BeautifulSoup(response.text, "lxml")
+                 current_page_results = []
+
+                 for g in soup.select("div.g"):
+                     title_elem = g.select_one("h3")
+                     link_elem = g.select_one("a")
+
+                     if title_elem and link_elem:
+                         title = title_elem.get_text()
+                         link = link_elem["href"]
+
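+                         # Google wraps result links as /url?q=<target>&...;
+                         # extract and decode the real destination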
+ if "/url?q=" in link:
126
+ raw_link = link.split("/url?q=")[1].split("&")[0]
127
+ link = urllib.parse.unquote(raw_link)
128
+
129
+ snippet = "No snippet"
130
+ snippet_div = g.select_one("div.VwiC3b, div.IsZvec, span.aCOpRe")
131
+ if snippet_div:
132
+ snippet = snippet_div.get_text()
133
+
134
+ res = {
135
+ "title": title,
136
+ "link": link,
137
+ "snippet": snippet,
138
+ "score": 1.0,
139
+ "source": "google"
140
+ }
141
+ current_page_results.append(res)
142
+ results.append(res)
143
+
144
+ if len(results) >= limit:
145
+ break
146
+
147
+ if not current_page_results:
148
+ # No more results on this page
149
+ break
150
+
151
+ start += 10
152
+
153
+ except Exception as e:
154
+ logger.error(f"Google search failed: {e}")
155
+ break
156
+
157
+ return results
158
+
+     def _search_duckduckgo(self, query, limit, safe, time_range):
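+         # DuckDuckGo's JavaScript-free HTML front end; it accepts plain form POSTs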
+ url = "https://html.duckduckgo.com/html/"
161
+ results = []
162
+
163
+ # Initial Params
164
+ data = {
165
+ "q": query,
166
+ "kl": "us-en" # Default region
167
+ }
168
+
+         # Safe Search: DuckDuckGo's documented kp values are
+         # 1 (strict), -1 (moderate, the default) and -2 (off)
+         if safe == 'strict':
+             data['kp'] = '1'
+         elif safe == 'off':
+             data['kp'] = '-2'
+         else:
+             data['kp'] = '-1'  # moderate
+
+         # Time Range: DuckDuckGo encodes recency as df=<d|w|m|y>
+         if time_range in ('d', 'w', 'm', 'y'):
+             data['df'] = time_range
+
+         while len(results) < limit:
+             try:
+                 # Add random delay if paging
+                 if len(results) > 0:
+                     time.sleep(random.uniform(0.5, 1.5))
+
+                 response = self.session.post(url, data=data, timeout=10)
+                 response.raise_for_status()
+
+                 soup = BeautifulSoup(response.text, "html.parser")
+                 current_page_results = []
+
+                 # Parse Results
+                 for result in soup.select(".result"):
+                     if "result--ad" in result.get("class", []):
+                         continue
+
+                     title_elem = result.select_one(".result__a")
+                     snippet_elem = result.select_one(".result__snippet")
+
+                     if title_elem:
+                         title = title_elem.get_text(strip=True)
+                         raw_link = title_elem["href"]
+                         link = raw_link
+
+                         # Decode DDG redirection
+                         if "uddg=" in raw_link:
+                             try:
+                                 parsed = urllib.parse.urlparse(raw_link)
+                                 qs = urllib.parse.parse_qs(parsed.query)
+                                 if 'uddg' in qs:
+                                     link = qs['uddg'][0]
+                             except Exception:
+                                 pass
+
+                         snippet = snippet_elem.get_text(strip=True) if snippet_elem else "No snippet"
+
+                         res = {
+                             "title": title,
+                             "link": link,
+                             "snippet": snippet,
+                             "score": 1.0,
+                             "source": "duckduckgo"
+                         }
+                         current_page_results.append(res)
+                         results.append(res)
+
+                         if len(results) >= limit:
+                             break
+
+                 if not current_page_results:
+                     break
+
+                 # Check for limit before fetching another page
+                 if len(results) >= limit:
+                     break
+
+                 # Pagination: Find the "Next" form
+                 # Usually a form with action="/html/" and input value="Next"
+                 next_form = None
+                 for form in soup.select("form[action='/html/']"):
+                     if form.select_one("input[value='Next']"):
+                         next_form = form
+                         break
+
+                 if next_form:
+                     # Extract inputs for the next request
+                     new_data = {}
+                     for inp in next_form.select("input"):
+                         name = inp.get("name")
+                         value = inp.get("value")
+                         if name:
+                             new_data[name] = value
+                     data = new_data
+                 else:
+                     # No next page
+                     break
+
+             except Exception as e:
+                 logger.error(f"DuckDuckGo search failed: {e}")
+                 break
+
+         return results
@@ -0,0 +1,116 @@ mini_search_engine.egg-info/PKG-INFO
+ Metadata-Version: 2.4
+ Name: mini_search_engine
+ Version: 1.0.0
+ Summary: A simple search engine library scraping Google and DuckDuckGo.
+ Home-page: https://github.com/yourusername/mini-search-engine
+ Author: Jules
+ Author-email: jules@example.com
+ Project-URL: Bug Reports, https://github.com/yourusername/mini-search-engine/issues
+ Project-URL: Source, https://github.com/yourusername/mini-search-engine
+ Keywords: search,google,duckduckgo,scraping,crawler
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: lxml
+ Requires-Dist: certifi
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license-file
+ Dynamic: project-url
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Mini Search Engine
+
+ A flexible Python library that scrapes Google and DuckDuckGo for search results. It supports pagination, safe search, and time filtering.
+
+ ## Features
+
+ - **Multi-Engine Support**: Search via Google or DuckDuckGo.
+ - **Smart Fallback**: Automatically tries the next available engine if the primary one is blocked (e.g., falls back to DuckDuckGo if Google blocks the request).
+ - **Pagination**: Retrieve as many results as you need.
+ - **Safe Search**: Control safe search strictness (`strict`, `moderate`, `off`).
+ - **Time Filtering**: Filter results by day, week, month, or year.
+ - **No API Key Needed**: Uses direct web scraping.
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ pip install mini-search-engine
+ ```
+
+ Or install locally from source:
+
+ ```bash
+ git clone https://github.com/yourusername/mini-search-engine.git
+ cd mini-search-engine
+ pip install .
+ ```
+
+ ## Usage
+
+ ```python
+ from mini_search_engine import SearchEngine
+ import logging
+
+ # Optional: Enable logging to see what's happening under the hood
+ logging.basicConfig(level=logging.INFO)
+
+ engine = SearchEngine()
+
+ # Simple Search
+ results = engine.search("python programming")
+ for res in results:
+     print(res['title'], res['link'])
+
+ # Advanced Search with Filters
+ results = engine.search(
+     "latest python news",
+     engine="auto",   # Try Google, then DDG
+     limit=20,        # Get 20 results (handles pagination automatically)
+     safe="strict",   # Strict safe search
+     time_range="w"   # Past week
+ )
+
+ print(f"\nFound {len(results)} results:")
+ for i, res in enumerate(results):
+     print(f"#{i+1} [{res['source']}] {res['title']}")
+     print(f"Link: {res['link']}")
+     print(f"Snippet: {res['snippet']}")
+     print("-" * 30)
+ ```
+
+ ## API Reference
+
+ ### `search(query, engine="auto", limit=10, safe="moderate", time_range=None)`
+
+ - `query` (str): The search query.
+ - `engine` (str): `'google'`, `'ddg'`, or `'auto'`. Defaults to `'auto'`. If the chosen engine fails, the other one is tried.
+ - `limit` (int): Maximum number of results to return. Defaults to `10`.
+ - `safe` (str): Safe search level: `'strict'`, `'moderate'`, or `'off'`. Defaults to `'moderate'`.
+ - `time_range` (str): `'d'` (day), `'w'` (week), `'m'` (month), or `'y'` (year). Defaults to `None` (any time).
+
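+ Each result is a plain dict with `title`, `link`, `snippet`, `score`, and `source` keys. As a quick sketch using only the parameters documented above, here is a DuckDuckGo-first search restricted to the past month:
+
+ ```python
+ from mini_search_engine import SearchEngine
+
+ engine = SearchEngine()
+
+ # 'ddg' tries DuckDuckGo first and falls back to Google if it fails
+ results = engine.search("web scraping", engine="ddg", limit=5, time_range="m")
+
+ if not results:
+     print("No results; both engines may have blocked or returned nothing.")
+ for res in results:
+     print(f"[{res['source']}] {res['title']} -> {res['link']}")
+ ```
+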
+ ## License
+
+ MIT License
@@ -0,0 +1,13 @@ mini_search_engine.egg-info/SOURCES.txt
+ LICENSE
+ MANIFEST.in
+ README.md
+ requirements.txt
+ setup.py
+ mini_search_engine/__init__.py
+ mini_search_engine/engine.py
+ mini_search_engine.egg-info/PKG-INFO
+ mini_search_engine.egg-info/SOURCES.txt
+ mini_search_engine.egg-info/dependency_links.txt
+ mini_search_engine.egg-info/requires.txt
+ mini_search_engine.egg-info/top_level.txt
+ tests/test_engine.py
@@ -0,0 +1,4 @@ mini_search_engine.egg-info/requires.txt
+ requests
+ beautifulsoup4
+ lxml
+ certifi
@@ -0,0 +1 @@ mini_search_engine.egg-info/top_level.txt
+ mini_search_engine
@@ -0,0 +1,4 @@ requirements.txt
+ beautifulsoup4>=4.0.0
+ requests>=2.0.0
+ lxml>=4.6.0
+ certifi>=2020.0.0
@@ -0,0 +1,4 @@ setup.cfg
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,43 @@ setup.py
+ from setuptools import setup, find_packages
+ import pathlib
+
+ # Read the contents of your README file
+ here = pathlib.Path(__file__).parent.resolve()
+ long_description = (here / "README.md").read_text(encoding="utf-8")
+
+ setup(
+     name="mini_search_engine",
+     version="1.0.0",
+     description="A simple search engine library scraping Google and DuckDuckGo.",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/yourusername/mini-search-engine",
+     author="Jules",
+     author_email="jules@example.com",
+     classifiers=[
+         "Development Status :: 5 - Production/Stable",
+         "Intended Audience :: Developers",
+         "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+         "License :: OSI Approved :: MIT License",
+         "Programming Language :: Python :: 3",
+         "Programming Language :: Python :: 3.6",
+         "Programming Language :: Python :: 3.7",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Programming Language :: Python :: 3.10",
+         "Operating System :: OS Independent",
+     ],
+     keywords="search, google, duckduckgo, scraping, crawler",
+     packages=find_packages(),
+     python_requires=">=3.6",
+     install_requires=[
+         "requests",
+         "beautifulsoup4",
+         "lxml",
+         "certifi"
+     ],
+     project_urls={
+         "Bug Reports": "https://github.com/yourusername/mini-search-engine/issues",
+         "Source": "https://github.com/yourusername/mini-search-engine",
+     },
+ )
@@ -0,0 +1,125 @@ tests/test_engine.py
+ import unittest
+ from unittest.mock import patch, Mock
+ import os
+ import sys
+
+ # Add the parent directory to the path
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from mini_search_engine.engine import SearchEngine
+
+ class TestMiniSearchEngine(unittest.TestCase):
+
+     def setUp(self):
+         self.engine = SearchEngine()
+
+     @patch('mini_search_engine.engine.requests.Session.get')
+     def test_search_google_basic(self, mock_get):
+         # Mock Google response
+         mock_response = Mock()
+         mock_response.status_code = 200
+         mock_response.text = """
+         <html><body>
+         <div class="g">
+             <h3>Google Result</h3>
+             <a href="/url?q=http://google.com/res">Link</a>
+             <div class="VwiC3b">Snippet</div>
+         </div>
+         </body></html>
+         """
+         mock_get.return_value = mock_response
+
+         # Set limit=1 to avoid looping endlessly with the same mock response
+         results = self.engine.search("test", engine="google", limit=1)
+
+         self.assertEqual(len(results), 1)
+         self.assertEqual(results[0]['source'], "google")
+
+         # Verify params
+         args, kwargs = mock_get.call_args
+         self.assertEqual(kwargs['params']['q'], "test")
+
+     @patch('mini_search_engine.engine.requests.Session.post')
+     def test_search_ddg_basic(self, mock_post):
+         mock_response = Mock()
+         mock_response.status_code = 200
+         mock_response.text = """
+         <html><body>
+         <div class="result">
+             <a class="result__a" href="//duckduckgo.com/l/?uddg=http://ddg.com/res">DDG Result</a>
+             <div class="result__snippet">Snippet</div>
+         </div>
+         </body></html>
+         """
+         mock_post.return_value = mock_response
+
+         results = self.engine.search("test", engine="ddg")
+
+         self.assertEqual(len(results), 1)
+         self.assertEqual(results[0]['source'], "duckduckgo")
+         self.assertEqual(results[0]['link'], "http://ddg.com/res")
+
+     @patch('mini_search_engine.engine.requests.Session.get')
+     @patch('mini_search_engine.engine.requests.Session.post')
+     def test_fallback_logic(self, mock_post, mock_get):
+         # Google fails (blocked)
+         mock_google = Mock()
+         mock_google.status_code = 200
+         mock_google.text = "systems have detected unusual traffic"
+         mock_get.return_value = mock_google
+
+         # DDG succeeds
+         mock_ddg = Mock()
+         mock_ddg.status_code = 200
+         mock_ddg.text = """
+         <html><body>
+         <div class="result">
+             <a class="result__a" href="http://res.com">Res</a>
+         </div>
+         </body></html>
+         """
+         mock_post.return_value = mock_ddg
+
+         # Request 'auto' (defaults to Google -> DDG)
+         results = self.engine.search("test", engine="auto")
+
+         self.assertTrue(mock_get.called)
+         self.assertTrue(mock_post.called)
+         self.assertEqual(len(results), 1)
+         self.assertEqual(results[0]['source'], "duckduckgo")
+
+     @patch('mini_search_engine.engine.requests.Session.get')
+     def test_google_pagination_and_limit(self, mock_get):
+         # Mock 2 pages
+         page1 = Mock()
+         page1.status_code = 200
+         page1.text = """
+         <div class="g"><h3>R1</h3><a href="l1">L</a></div>
+         <div class="g"><h3>R2</h3><a href="l2">L</a></div>
+         """
+
+         page2 = Mock()
+         page2.status_code = 200
+         page2.text = """
+         <div class="g"><h3>R3</h3><a href="l3">L</a></div>
+         """
+
+         mock_get.side_effect = [page1, page2]
+
+         # Limit 3 results
+         results = self.engine.search("test", engine="google", limit=3)
+
+         self.assertEqual(len(results), 3)
+         self.assertEqual(results[0]['title'], "R1")
+         self.assertEqual(results[2]['title'], "R3")
+
+         # Verify calls
+         self.assertEqual(mock_get.call_count, 2)
+         # Check start params
+         args1, kwargs1 = mock_get.call_args_list[0]
+         self.assertEqual(kwargs1['params']['start'], 0)
+         args2, kwargs2 = mock_get.call_args_list[1]
+         self.assertEqual(kwargs2['params']['start'], 10)
+
+ if __name__ == '__main__':
+     unittest.main()