softhauzpy 0.0.91__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/PKG-INFO +1 -1
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/setup.py +1 -1
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy/main.py +25 -15
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy.egg-info/PKG-INFO +1 -1
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/README.md +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/setup.cfg +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy/__init__.py +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy.egg-info/SOURCES.txt +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy.egg-info/dependency_links.txt +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy.egg-info/requires.txt +0 -0
- {softhauzpy-0.0.91 → softhauzpy-0.1.0}/softhauzpy.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.91
|
|
3
|
+
Version: 0.1.0
|
|
4
4
|
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
5
|
Home-page: https://softhauz.ca
|
|
6
6
|
Author: Karen Urate
|
|
@@ -68,21 +68,31 @@ except Exception:
|
|
|
68
68
|
url : String - 'url', 'html_file', or 'raw_string'
|
|
69
69
|
"""
|
|
70
70
|
def detect_input_type(value: str) -> str:
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
if parsed.scheme in ("http", "https", "ftp", "ftps"):
|
|
75
|
-
return "url"
|
|
76
|
-
except Exception:
|
|
77
|
-
pass
|
|
78
|
-
|
|
79
|
-
# Check if it's a path to an existing HTML file
|
|
80
|
-
if os.path.isfile(value) and value.lower().endswith((".html", ".htm")):
|
|
81
|
-
return "html_file"
|
|
71
|
+
|
|
72
|
+
if not value:
|
|
73
|
+
return "raw_string"
|
|
82
74
|
|
|
83
|
-
# Check if it looks like an HTML string even if not a file
|
|
84
75
|
stripped = value.strip()
|
|
85
|
-
|
|
76
|
+
|
|
77
|
+
# Safeguard: Real network URLs are never massive HTML texts,
|
|
78
|
+
# and they don't start with standard layout angle brackets.
|
|
79
|
+
if stripped.startswith("<") or len(stripped) > 2048:
|
|
80
|
+
return "raw_string"
|
|
81
|
+
|
|
82
|
+
# Enforce standard prefix matching before relying on urlparse split tokens
|
|
83
|
+
if stripped.lower().startswith(("http://", "https://", "ftp://", "ftps://")):
|
|
84
|
+
try:
|
|
85
|
+
parsed = urlparse(stripped)
|
|
86
|
+
if parsed.scheme in ("http", "https", "ftp", "ftps") and parsed.netloc:
|
|
87
|
+
return "url"
|
|
88
|
+
except Exception:
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
# Check file path references safely
|
|
92
|
+
try:
|
|
93
|
+
if os.path.isfile(value) and value.lower().endswith((".html", ".htm")):
|
|
94
|
+
return "html_file"
|
|
95
|
+
except Exception:
|
|
86
96
|
return "raw_string"
|
|
87
97
|
|
|
88
98
|
# Default: treat as plain raw string
|
|
@@ -165,12 +175,12 @@ def extract_pure_text(
|
|
|
165
175
|
if modified_date:
|
|
166
176
|
header_parts.append(f"Last Modified: {modified_date}")
|
|
167
177
|
if page_url:
|
|
168
|
-
if (input_type == "url") or (input_type == "
|
|
178
|
+
if (input_type == "url") or (input_type == "html_file"):
|
|
169
179
|
header_parts.append(f"URL: {page_url}")
|
|
170
180
|
|
|
171
181
|
header = " ".join(header_parts)
|
|
172
182
|
result = {
|
|
173
|
-
"url": page_url if ((input_type == "url") or (input_type == "
|
|
183
|
+
"url": page_url if ((input_type == "url") or (input_type == "html_file")) else assigned_location,
|
|
174
184
|
"title": title,
|
|
175
185
|
"author": author,
|
|
176
186
|
"description": description,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: softhauzpy
|
|
3
|
-
Version: 0.0.91
|
|
3
|
+
Version: 0.1.0
|
|
4
4
|
Summary: is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
|
|
5
5
|
Home-page: https://softhauz.ca
|
|
6
6
|
Author: Karen Urate
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|