softhauzpy 0.0.7__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
+ Summary: SofthauzPy is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
5
+ Home-page: https://softhauz.ca
4
6
  Author: Karen Urate
5
7
  Author-email: karen.urate@softhauz.ca
8
+ License: MIT
6
9
  Description-Content-Type: text/markdown
7
10
 
8
11
  # SofthauzPy
@@ -0,0 +1,22 @@
"""Packaging script for the softhauzpy distribution."""
from setuptools import setup, find_packages

# The README is published as the long description on the package index.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

# NOTE(review): the package source annotates parameters as `str | None`;
# unless postponed annotation evaluation is enabled there, that syntax needs
# Python 3.10+, so consider declaring python_requires=">=3.10" - confirm.
setup(
    name='softhauzpy',
    version='0.0.9',
    author='Karen Urate',
    author_email='karen.urate@softhauz.ca',
    packages=find_packages(),
    install_requires=[
        'requests>=2.32.3',
        'beautifulsoup4>=4.12.3',
        'nltk>=3.9.4'
    ],
    url="https://softhauz.ca",
    license="MIT",
    # The summary is shown on its own in package listings, so it must be a
    # complete sentence that names the project instead of starting at "is".
    description="SofthauzPy is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.",
    long_description=long_description,
    long_description_content_type="text/markdown",
)
@@ -3,7 +3,7 @@ from .main import incremental_update, highlight_query_terms, build_sitemap_urls
3
3
  from .main import fingerprint_page, generate_snippet
4
4
 
5
5
  # extractions
6
- from .main import extract_structured_data, extract_headings
6
+ from .main import detect_input_type, extract_structured_data, extract_headings
7
7
  from .main import extract_metadata, extract_links, extract_pure_text
8
8
 
9
9
  # indexing
@@ -15,7 +15,7 @@ External Package List:
15
15
  - nltk >= (v. 3.9.4)
16
16
 
17
17
  """
18
-
18
+ import os
19
19
  import re
20
20
  import json
21
21
  import math
@@ -56,18 +56,51 @@ except Exception:
56
56
  }
57
57
  _NLTK_AVAILABLE = False
58
58
 
def detect_input_type(value: str) -> str:
    """
    Detect whether the input is a URL, an HTML file path, or a raw string.

    Parameters
    ----------
    value : String - The input to evaluate.

    Returns
    -------
    input_type : String - 'url' when value parses with an http, https, ftp or
        ftps scheme; 'html_file' when value names an existing file whose name
        ends in .html or .htm (case-insensitive); 'raw_string' for everything
        else, including HTML markup passed directly.
    """
    # urlparse raises ValueError for malformed authorities (for example an
    # unclosed IPv6 bracket). Such input is not a fetchable URL, so it falls
    # through to the remaining checks instead of propagating the error.
    try:
        scheme = urlparse(value).scheme
    except ValueError:
        scheme = ""
    if scheme in ("http", "https", "ftp", "ftps"):
        return "url"

    # Test the extension first, and only on the tail of the string: raw HTML
    # payloads can be large, so this avoids lower-casing the whole content
    # and skips the filesystem lookup for anything without an HTML suffix.
    if value[-5:].lower().endswith((".html", ".htm")) and os.path.isfile(value):
        return "html_file"

    # Markup and plain text are both handled as raw content by the caller.
    return "raw_string"
90
+
59
91
  """
60
92
 
61
93
  Fetch a webpage and return only the pure text content found within its HTML tags.
62
94
 
63
95
  Parameters
64
96
  ----------
65
- url : String - The URL to fetch.
97
+ page_url : String - Accepts either the URL to fetch, a local path to an HTML file, or the text/HTML String content of the page.
66
98
  title : String - Optional document title (included in the returned text header when provided).
67
99
  author : String - Optional document author (included in the returned text header when provided).
68
100
  description : String - Optional description (included in the returned text header when provided).
69
101
  creation_date : String - Optional creation date string (included in the returned text header when provided).
70
102
  modified_date : String - Optional last-modified date string (included in the returned text header when provided).
103
+ assigned_location : String - Optional URL to assign to the result when the value passed in page_url is the text/HTML String content of the page.
71
104
 
72
105
 
73
106
  Returns
@@ -88,8 +121,6 @@ except Exception:
88
121
  If the server returns a non-2xx status code.
89
122
 
90
123
  """
91
-
92
-
93
124
  def extract_pure_text(
94
125
  page_url: str,
95
126
  *,
@@ -97,11 +128,24 @@ def extract_pure_text(
97
128
  author: str | None = None,
98
129
  description: str | None = None,
99
130
  creation_date: str | None = None,
100
- modified_date: str | None = None) -> dict:
101
- response = fetch_page(page_url, timeout=15)
102
- response.raise_for_status()
131
+ modified_date: str | None = None,
132
+ assigned_location: str | None = None) -> dict:
133
+
134
+ input_type = detect_input_type(page_url)
103
135
 
104
- soup = BeautifulSoup(response.text, "html.parser")
136
+ if input_type == "url":
137
+ response = fetch_page(page_url, timeout=15)
138
+ response.raise_for_status()
139
+ html_content = response.text
140
+
141
+ elif input_type == "html_file":
142
+ with open(page_url, "r", encoding="utf-8") as f:
143
+ html_content = f.read()
144
+
145
+ else:
146
+ html_content = page_url
147
+
148
+ soup = BeautifulSoup(html_content, "html.parser")
105
149
 
106
150
  for tag in soup.find_all(_SKIP_TAGS):
107
151
  tag.decompose()
@@ -121,11 +165,12 @@ def extract_pure_text(
121
165
  if modified_date:
122
166
  header_parts.append(f"Last Modified: {modified_date}")
123
167
  if page_url:
124
- header_parts.append(f"URL: {page_url}")
168
+ if (input_type == "url") or (input_type == "html"):
169
+ header_parts.append(f"URL: {page_url}")
125
170
 
126
171
  header = " ".join(header_parts)
127
172
  result = {
128
- "url": page_url,
173
+ "url": page_url if ((input_type == "url") or (input_type == "html")) else assigned_location,
129
174
  "title": title,
130
175
  "author": author,
131
176
  "description": description,
@@ -184,9 +229,9 @@ def get_search_results_list(page_list=[], keywords='') -> list:
184
229
  description = page[3] or ''
185
230
  creation_date = page[4] or ''
186
231
  modified_date = page[5] or ''
187
-
188
-
189
- if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
232
+ assigned_location = page[6] or ''
233
+
234
+ if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date, assigned_location=assigned_location)["content"]).lower():
190
235
  results.append((url, title, author, description, creation_date, modified_date))
191
236
 
192
237
  return results
@@ -1,8 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
+ Summary: SofthauzPy is a comprehensive Python toolkit built for developers creating intelligent, data-driven web applications. It provides a powerful suite of web utilities including web scraping tools, crawling systems, content extraction pipelines, and search engine components that help developers build fully customizable in-house website search solutions.
5
+ Home-page: https://softhauz.ca
4
6
  Author: Karen Urate
5
7
  Author-email: karen.urate@softhauz.ca
8
+ License: MIT
6
9
  Description-Content-Type: text/markdown
7
10
 
8
11
  # SofthauzPy
softhauzpy-0.0.7/setup.py DELETED
@@ -1,19 +0,0 @@
1
- from setuptools import setup, find_packages
2
-
3
- with open("README.md", "r", encoding="utf-8") as f:
4
- description = f.read()
5
-
6
- setup(
7
- name='softhauzpy',
8
- version='0.0.7',
9
- author='Karen Urate',
10
- author_email='karen.urate@softhauz.ca',
11
- packages=find_packages(),
12
- install_requires=[
13
- 'requests>=2.32.3',
14
- 'beautifulsoup4>=4.12.3',
15
- 'nltk>=3.9.4'
16
- ],
17
- long_description=description,
18
- long_description_content_type="text/markdown",
19
- )
File without changes
File without changes