softhauzpy 0.0.7__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/PKG-INFO +4 -1
- softhauzpy-0.0.9/setup.py +22 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy/__init__.py +1 -1
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy/main.py +58 -13
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy.egg-info/PKG-INFO +4 -1
- softhauzpy-0.0.7/setup.py +0 -19
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/README.md +0 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/setup.cfg +0 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy.egg-info/SOURCES.txt +0 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy.egg-info/dependency_links.txt +0 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy.egg-info/requires.txt +0 -0
- {softhauzpy-0.0.7 → softhauzpy-0.0.9}/softhauzpy.egg-info/top_level.txt +0 -0
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.7
|
|
3
|
+
Version: 0.0.9
|
|
4
|
+
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
|
+
Home-page: https://softhauz.ca
|
|
4
6
|
Author: Karen Urate
|
|
5
7
|
Author-email: karen.urate@softhauz.ca
|
|
8
|
+
License: MIT
|
|
6
9
|
Description-Content-Type: text/markdown
|
|
7
10
|
|
|
8
11
|
# SofthauzPy
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
with open("README.md", "r", encoding="utf-8") as f:
|
|
4
|
+
long_description = f.read()
|
|
5
|
+
|
|
6
|
+
setup(
|
|
7
|
+
name='softhauzpy',
|
|
8
|
+
version='0.0.9',
|
|
9
|
+
author='Karen Urate',
|
|
10
|
+
author_email='karen.urate@softhauz.ca',
|
|
11
|
+
packages=find_packages(),
|
|
12
|
+
install_requires=[
|
|
13
|
+
'requests>=2.32.3',
|
|
14
|
+
'beautifulsoup4>=4.12.3',
|
|
15
|
+
'nltk>=3.9.4'
|
|
16
|
+
],
|
|
17
|
+
url="https://softhauz.ca",
|
|
18
|
+
license="MIT",
|
|
19
|
+
description="is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.",
|
|
20
|
+
long_description=long_description,
|
|
21
|
+
long_description_content_type="text/markdown",
|
|
22
|
+
)
|
|
@@ -3,7 +3,7 @@ from .main import incremental_update, highlight_query_terms, build_sitemap_urls
|
|
|
3
3
|
from .main import fingerprint_page, generate_snippet
|
|
4
4
|
|
|
5
5
|
# extractions
|
|
6
|
-
from .main import extract_structured_data, extract_headings
|
|
6
|
+
from .main import detect_input_type, extract_structured_data, extract_headings
|
|
7
7
|
from .main import extract_metadata, extract_links, extract_pure_text
|
|
8
8
|
|
|
9
9
|
# indexing
|
|
@@ -15,7 +15,7 @@ External Package List:
|
|
|
15
15
|
- nltk >= (v. 3.9.4)
|
|
16
16
|
|
|
17
17
|
"""
|
|
18
|
-
|
|
18
|
+
import os
|
|
19
19
|
import re
|
|
20
20
|
import json
|
|
21
21
|
import math
|
|
@@ -56,18 +56,51 @@ except Exception:
|
|
|
56
56
|
}
|
|
57
57
|
_NLTK_AVAILABLE = False
|
|
58
58
|
|
|
59
|
+
"""
|
|
60
|
+
Detect whether the input is a URL, an HTML file path, or a raw string.
|
|
61
|
+
|
|
62
|
+
Parameters
|
|
63
|
+
----------
|
|
64
|
+
value : String - The input to evaluate.
|
|
65
|
+
|
|
66
|
+
Returns
|
|
67
|
+
-------
|
|
68
|
+
url : String - 'url', 'html_file', or 'raw_string'
|
|
69
|
+
"""
|
|
70
|
+
def detect_input_type(value: str) -> str:
|
|
71
|
+
# Check if it's a URL (http/https/ftp scheme)
|
|
72
|
+
try:
|
|
73
|
+
parsed = urlparse(value)
|
|
74
|
+
if parsed.scheme in ("http", "https", "ftp", "ftps"):
|
|
75
|
+
return "url"
|
|
76
|
+
except Exception:
|
|
77
|
+
pass
|
|
78
|
+
|
|
79
|
+
# Check if it's a path to an existing HTML file
|
|
80
|
+
if os.path.isfile(value) and value.lower().endswith((".html", ".htm")):
|
|
81
|
+
return "html_file"
|
|
82
|
+
|
|
83
|
+
# Check if it looks like an HTML string even if not a file
|
|
84
|
+
stripped = value.strip()
|
|
85
|
+
if stripped.startswith("<") and stripped.endswith(">"):
|
|
86
|
+
return "raw_string"
|
|
87
|
+
|
|
88
|
+
# Default: treat as plain raw string
|
|
89
|
+
return "raw_string"
|
|
90
|
+
|
|
59
91
|
"""
|
|
60
92
|
|
|
61
93
|
Fetch a webpage and return only the pure text content found within its HTML tags.
|
|
62
94
|
|
|
63
95
|
Parameters
|
|
64
96
|
----------
|
|
65
|
-
|
|
97
|
+
page_url : String - Accepts either the URL to fetch, a local path to an HTML file, or the text/HTML String content of the page.
|
|
66
98
|
title : String - Optional document title (included in the returned text header when provided).
|
|
67
99
|
author : String - Optional document author (included in the returned text header when provided).
|
|
68
100
|
description : String - Optional description (included in the returned text header when provided).
|
|
69
101
|
creation_date : String - Optional creation date string (included in the returned text header when provided).
|
|
70
102
|
modified_date : String - Optional last-modified date string (included in the returned text header when provided).
|
|
103
|
+
assigned_location : String - Optional the URL to assign if the value passed in page_url is the text/HTML String content of the page.
|
|
71
104
|
|
|
72
105
|
|
|
73
106
|
Returns
|
|
@@ -88,8 +121,6 @@ except Exception:
|
|
|
88
121
|
If the server returns a non-2xx status code.
|
|
89
122
|
|
|
90
123
|
"""
|
|
91
|
-
|
|
92
|
-
|
|
93
124
|
def extract_pure_text(
|
|
94
125
|
page_url: str,
|
|
95
126
|
*,
|
|
@@ -97,11 +128,24 @@ def extract_pure_text(
|
|
|
97
128
|
author: str | None = None,
|
|
98
129
|
description: str | None = None,
|
|
99
130
|
creation_date: str | None = None,
|
|
100
|
-
modified_date: str | None = None
|
|
101
|
-
|
|
102
|
-
|
|
131
|
+
modified_date: str | None = None,
|
|
132
|
+
assigned_location: str | None = None) -> dict:
|
|
133
|
+
|
|
134
|
+
input_type = detect_input_type(page_url)
|
|
103
135
|
|
|
104
|
-
|
|
136
|
+
if input_type == "url":
|
|
137
|
+
response = fetch_page(page_url, timeout=15)
|
|
138
|
+
response.raise_for_status()
|
|
139
|
+
html_content = response.text
|
|
140
|
+
|
|
141
|
+
elif input_type == "html_file":
|
|
142
|
+
with open(page_url, "r", encoding="utf-8") as f:
|
|
143
|
+
html_content = f.read()
|
|
144
|
+
|
|
145
|
+
else:
|
|
146
|
+
html_content = page_url
|
|
147
|
+
|
|
148
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
105
149
|
|
|
106
150
|
for tag in soup.find_all(_SKIP_TAGS):
|
|
107
151
|
tag.decompose()
|
|
@@ -121,11 +165,12 @@ def extract_pure_text(
|
|
|
121
165
|
if modified_date:
|
|
122
166
|
header_parts.append(f"Last Modified: {modified_date}")
|
|
123
167
|
if page_url:
|
|
124
|
-
|
|
168
|
+
if (input_type == "url") or (input_type == "html"):
|
|
169
|
+
header_parts.append(f"URL: {page_url}")
|
|
125
170
|
|
|
126
171
|
header = " ".join(header_parts)
|
|
127
172
|
result = {
|
|
128
|
-
"url": page_url,
|
|
173
|
+
"url": page_url if ((input_type == "url") or (input_type == "html")) else assigned_location,
|
|
129
174
|
"title": title,
|
|
130
175
|
"author": author,
|
|
131
176
|
"description": description,
|
|
@@ -184,9 +229,9 @@ def get_search_results_list(page_list=[], keywords='') -> list:
|
|
|
184
229
|
description = page[3] or ''
|
|
185
230
|
creation_date = page[4] or ''
|
|
186
231
|
modified_date = page[5] or ''
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
|
|
232
|
+
assigned_location = page[6] or ''
|
|
233
|
+
|
|
234
|
+
if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date, assigned_location=assigned_location)["content"]).lower():
|
|
190
235
|
results.append((url, title, author, description, creation_date, modified_date))
|
|
191
236
|
|
|
192
237
|
return results
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.7
|
|
3
|
+
Version: 0.0.9
|
|
4
|
+
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
|
+
Home-page: https://softhauz.ca
|
|
4
6
|
Author: Karen Urate
|
|
5
7
|
Author-email: karen.urate@softhauz.ca
|
|
8
|
+
License: MIT
|
|
6
9
|
Description-Content-Type: text/markdown
|
|
7
10
|
|
|
8
11
|
# SofthauzPy
|
softhauzpy-0.0.7/setup.py
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from setuptools import setup, find_packages
|
|
2
|
-
|
|
3
|
-
with open("README.md", "r", encoding="utf-8") as f:
|
|
4
|
-
description = f.read()
|
|
5
|
-
|
|
6
|
-
setup(
|
|
7
|
-
name='softhauzpy',
|
|
8
|
-
version='0.0.7',
|
|
9
|
-
author='Karen Urate',
|
|
10
|
-
author_email='karen.urate@softhauz.ca',
|
|
11
|
-
packages=find_packages(),
|
|
12
|
-
install_requires=[
|
|
13
|
-
'requests>=2.32.3',
|
|
14
|
-
'beautifulsoup4>=4.12.3',
|
|
15
|
-
'nltk>=3.9.4'
|
|
16
|
-
],
|
|
17
|
-
long_description=description,
|
|
18
|
-
long_description_content_type="text/markdown",
|
|
19
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|