softhauzpy 0.0.81__tar.gz → 0.0.91__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/PKG-INFO +1 -1
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/setup.py +1 -1
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy/main.py +16 -9
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy.egg-info/PKG-INFO +1 -1
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/README.md +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/setup.cfg +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy/__init__.py +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy.egg-info/SOURCES.txt +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy.egg-info/dependency_links.txt +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy.egg-info/requires.txt +0 -0
- {softhauzpy-0.0.81 → softhauzpy-0.0.91}/softhauzpy.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.81
|
|
3
|
+
Version: 0.0.91
|
|
4
4
|
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
5
|
Home-page: https://softhauz.ca
|
|
6
6
|
Author: Karen Urate
|
|
@@ -94,12 +94,13 @@ def detect_input_type(value: str) -> str:
|
|
|
94
94
|
|
|
95
95
|
Parameters
|
|
96
96
|
----------
|
|
97
|
-
|
|
97
|
+
page_url : String - Accepts either the URL to fetch, a local path to an HTML file, or the text/HTML String content of the page.
|
|
98
98
|
title : String - Optional document title (included in the returned text header when provided).
|
|
99
99
|
author : String - Optional document author (included in the returned text header when provided).
|
|
100
100
|
description : String - Optional description (included in the returned text header when provided).
|
|
101
101
|
creation_date : String - Optional creation date string (included in the returned text header when provided).
|
|
102
102
|
modified_date : String - Optional last-modified date string (included in the returned text header when provided).
|
|
103
|
+
assigned_location : String - Optional the URL to assign if the value passed in page_url is the text/HTML String content of the page.
|
|
103
104
|
|
|
104
105
|
|
|
105
106
|
Returns
|
|
@@ -127,7 +128,8 @@ def extract_pure_text(
|
|
|
127
128
|
author: str | None = None,
|
|
128
129
|
description: str | None = None,
|
|
129
130
|
creation_date: str | None = None,
|
|
130
|
-
modified_date: str | None = None
|
|
131
|
+
modified_date: str | None = None,
|
|
132
|
+
assigned_location: str | None = None) -> dict:
|
|
131
133
|
|
|
132
134
|
input_type = detect_input_type(page_url)
|
|
133
135
|
|
|
@@ -163,11 +165,12 @@ def extract_pure_text(
|
|
|
163
165
|
if modified_date:
|
|
164
166
|
header_parts.append(f"Last Modified: {modified_date}")
|
|
165
167
|
if page_url:
|
|
166
|
-
|
|
168
|
+
if (input_type == "url") or (input_type == "html"):
|
|
169
|
+
header_parts.append(f"URL: {page_url}")
|
|
167
170
|
|
|
168
171
|
header = " ".join(header_parts)
|
|
169
172
|
result = {
|
|
170
|
-
"url": page_url,
|
|
173
|
+
"url": page_url if ((input_type == "url") or (input_type == "html")) else assigned_location,
|
|
171
174
|
"title": title,
|
|
172
175
|
"author": author,
|
|
173
176
|
"description": description,
|
|
@@ -220,16 +223,20 @@ def get_search_results_list(page_list=[], keywords='') -> list:
|
|
|
220
223
|
|
|
221
224
|
if len(url) == 0 or len(url) < 1:
|
|
222
225
|
continue
|
|
223
|
-
|
|
226
|
+
|
|
224
227
|
title = page[1] or ''
|
|
225
228
|
author = page[2] or ''
|
|
226
229
|
description = page[3] or ''
|
|
227
230
|
creation_date = page[4] or ''
|
|
228
231
|
modified_date = page[5] or ''
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
|
|
232
|
-
|
|
232
|
+
assigned_location = page[6] if len(page[6])>3 else ''
|
|
233
|
+
|
|
234
|
+
if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date, assigned_location=assigned_location)["content"]).lower():
|
|
235
|
+
|
|
236
|
+
if detect_input_type(url) != "url":
|
|
237
|
+
results.append((assigned_location, title, author, description, creation_date, modified_date))
|
|
238
|
+
else:
|
|
239
|
+
results.append((url, title, author, description, creation_date, modified_date))
|
|
233
240
|
|
|
234
241
|
return results
|
|
235
242
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.81
|
|
3
|
+
Version: 0.0.91
|
|
4
4
|
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
5
|
Home-page: https://softhauz.ca
|
|
6
6
|
Author: Karen Urate
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|