softhauzpy 0.0.81__tar.gz → 0.0.91__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.81
3
+ Version: 0.0.91
4
4
  Summary: softhauzpy is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
5
5
  Home-page: https://softhauz.ca
6
6
  Author: Karen Urate
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as f:
5
5
 
6
6
  setup(
7
7
  name='softhauzpy',
8
- version='0.0.81',
8
+ version='0.0.91',
9
9
  author='Karen Urate',
10
10
  author_email='karen.urate@softhauz.ca',
11
11
  packages=find_packages(),
@@ -94,12 +94,13 @@ def detect_input_type(value: str) -> str:
94
94
 
95
95
  Parameters
96
96
  ----------
97
- url : String - The URL to fetch.
97
+ page_url : String - Accepts either the URL to fetch, a local path to an HTML file, or the text/HTML String content of the page.
98
98
  title : String - Optional document title (included in the returned text header when provided).
99
99
  author : String - Optional document author (included in the returned text header when provided).
100
100
  description : String - Optional description (included in the returned text header when provided).
101
101
  creation_date : String - Optional creation date string (included in the returned text header when provided).
102
102
  modified_date : String - Optional last-modified date string (included in the returned text header when provided).
103
+ assigned_location : String - Optional. The URL to assign if the value passed in page_url is the text/HTML String content of the page.
103
104
 
104
105
 
105
106
  Returns
@@ -127,7 +128,8 @@ def extract_pure_text(
127
128
  author: str | None = None,
128
129
  description: str | None = None,
129
130
  creation_date: str | None = None,
130
- modified_date: str | None = None) -> dict:
131
+ modified_date: str | None = None,
132
+ assigned_location: str | None = None) -> dict:
131
133
 
132
134
  input_type = detect_input_type(page_url)
133
135
 
@@ -163,11 +165,12 @@ def extract_pure_text(
163
165
  if modified_date:
164
166
  header_parts.append(f"Last Modified: {modified_date}")
165
167
  if page_url:
166
- header_parts.append(f"URL: {page_url}")
168
+ if (input_type == "url") or (input_type == "html"):
169
+ header_parts.append(f"URL: {page_url}")
167
170
 
168
171
  header = " ".join(header_parts)
169
172
  result = {
170
- "url": page_url,
173
+ "url": page_url if ((input_type == "url") or (input_type == "html")) else assigned_location,
171
174
  "title": title,
172
175
  "author": author,
173
176
  "description": description,
@@ -220,16 +223,20 @@ def get_search_results_list(page_list=[], keywords='') -> list:
220
223
 
221
224
  if len(url) == 0 or len(url) < 1:
222
225
  continue
223
-
226
+
224
227
  title = page[1] or ''
225
228
  author = page[2] or ''
226
229
  description = page[3] or ''
227
230
  creation_date = page[4] or ''
228
231
  modified_date = page[5] or ''
229
-
230
-
231
- if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
232
- results.append((url, title, author, description, creation_date, modified_date))
232
+ assigned_location = page[6] if len(page[6])>3 else ''
233
+
234
+ if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date, assigned_location=assigned_location)["content"]).lower():
235
+
236
+ if detect_input_type(url) != "url":
237
+ results.append((assigned_location, title, author, description, creation_date, modified_date))
238
+ else:
239
+ results.append((url, title, author, description, creation_date, modified_date))
233
240
 
234
241
  return results
235
242
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.81
3
+ Version: 0.0.91
4
4
  Summary: softhauzpy is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
5
5
  Home-page: https://softhauz.ca
6
6
  Author: Karen Urate
File without changes
File without changes