PyPI - softhauzpy - Versions diffs - 0.0.5__tar.gz → 0.0.7__tar.gz - Mend

softhauzpy 0.0.5tar.gz → 0.0.7tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: softhauzpy
-Version: 0.0.5
+Version: 0.0.7
 Author: Karen Urate
 Author-email: karen.urate@softhauz.ca
 Description-Content-Type: text/markdown

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as f:
 setup(
     name='softhauzpy',
-    version='0.0.5',
+    version='0.0.7',
     author='Karen Urate',
     author_email='karen.urate@softhauz.ca',
     packages=find_packages(),

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy/__init__.py RENAMED Viewed

@@ -7,7 +7,7 @@ from .main import extract_structured_data, extract_headings
 from .main import extract_metadata, extract_links, extract_pure_text
 # indexing
-from .main import load_index, save_index, search_index, compute_tfidf, build_inverted_index
+from .main import load_index, save_index, search_index, compute_tfidf, build_inverted_index, get_document_score
 # crawls and scrapes
 from .main import tokenize, chunk_text, crawl_site, parse_html, fetch_page, get_search_results_list

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy/main.py RENAMED Viewed

@@ -62,17 +62,17 @@ except Exception:
     Parameters
     ----------
-    url           : The URL to fetch.
-    title         : Optional document title (included in the returned text header when provided).
-    author        : Optional document author (included in the returned text header when provided).
-    description   : Optional description (included in the returned text header when provided).
-    creation_date : Optional creation date string (included in the returned text header when provided).
-    modified_date : Optional last-modified date string (included in the returned text header when provided).
+    url           : String - The URL to fetch.
+    title         : String - Optional document title (included in the returned text header when provided).
+    author        : String - Optional document author (included in the returned text header when provided).
+    description   : String - Optional description (included in the returned text header when provided).
+    creation_date : String - Optional creation date string (included in the returned text header when provided).
+    modified_date : String - Optional last-modified date string (included in the returned text header when provided).
     Returns
     -------
-    dict with keys:
+    Dictionary with Keys:
         "url"           : str
         "title"         : str | None
         "author"        : str | None
@@ -86,7 +86,7 @@ except Exception:
     ------
     requests.HTTPError
         If the server returns a non-2xx status code.
 """
@@ -146,33 +146,23 @@ def extract_pure_text(
    returned list contains detailed information about a page.
    Parameters:
-       page_list (list of tuples): A list where each tuple represents a page with the following elements:
-           - url (str): The URL of the page.
-           - title (str): The title of the page.
-           - author (str): The author of the page.
-           - description (str): A brief description of the page.
-           - creation_date (str): The date the page was created.
-           - modified_date (str): The date the page was last modified.
-       keywords (str): A string containing keywords to search for within the page entries.
+        page_list   : List - A list of tuples, where each tuple represents a page with the following elements:
+                                (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
+        keywords    : String - A string containing keywords to search for within the page entries.
    Returns:
-       list of tuples: A list of tuples matching the search criteria. Each tuple contains:
-           - url (str)
-           - title (str)
-           - author (str)
-           - description (str)
-           - creation_date (str)
-           - modified_date (str)
+        results     : List - A list of tuples matching the search criteria. Each tuple contains:
+                                (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
    Example:
        >>> pages = [
-       ...     ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
-       ...     ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
-       ... ]
+       ...     ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.", "2025-02-01", "2026-05-31"),
+       ...     ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
+       ... ]
        >>> get_search_results_list(pages, "sample")
        [
-           ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
-           ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
+            ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
+            ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
        ]
 """
@@ -205,12 +195,12 @@ def get_search_results_list(page_list=[], keywords='') -> list:
     Fetch a single URL with retry logic and polite delay.
     Args:
-        url:      Target URL.
-        timeout:  Per-request timeout in seconds.
-        retries:  Maximum number of attempts before giving up.
-        delay:    Seconds to wait between retries (doubles on each failure).
-        headers:  Optional extra HTTP headers (merged with a default UA).
-        session:  An existing requests.Session (useful for cookie sharing).
+        url         : String - Target URL.
+        timeout     : Integer - Per-request timeout in seconds.
+        retries     : Integer - Maximum number of attempts before giving up.
+        delay       : Float - Seconds to wait between retries (doubles on each failure).
+        headers     : Dictionary - Optional extra HTTP headers (merged with a default UA).
+        session     : requests.Session | None - An existing requests.Session (useful for cookie sharing).
     Returns:
         A requests.Response on success, or None after all retries fail.
@@ -256,8 +246,8 @@ def fetch_page(
     Parse raw HTML into a BeautifulSoup tree.
     Args:
-        html:   Raw HTML string or bytes.
-        parser: BS4 parser backend ('lxml', 'html.parser', 'html5lib').
+        html    : String | Bytes - Raw HTML string or bytes.
+        parser  : String - BS4 parser backend ('lxml', 'html.parser', 'html5lib').
     Returns:
         A BeautifulSoup object ready for querying.
@@ -283,12 +273,12 @@ def parse_html(
     language, and author where available.
     Args:
-        soup: Parsed BeautifulSoup object.
-        url:  Original URL (used as fallback for canonical).
+        soup    : BeautifulSoup - Parsed BeautifulSoup object.
+        url     : String - Original URL (used as fallback for canonical).
     Returns:
-        Dict with keys: title, description, keywords, og_title,
-        og_description, og_image, canonical, lang, author.
+        Dictionary with Keys:
+            title, description, keywords, og_title, og_description, og_image, canonical, lang, author
     Example:
         meta = extract_metadata(soup, url="https://example.com/page")
@@ -325,10 +315,10 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
     Collect all hyperlinks from a page, normalised to absolute URLs.
     Args:
-        soup:               Parsed page.
-        base_url:           Absolute URL of the page being parsed.
-        same_domain_only:   When True, filters out external domains.
-        exclude_extensions: File extensions to skip (e.g. ['.pdf', '.jpg']).
+        soup                : BeautifulSoup - Parsed page.
+        base_url            : String - Absolute URL of the page being parsed.
+        same_domain_only    : Boolean - When True, filters out external domains.
+        exclude_extensions  : List - File extensions to skip (e.g. ['.pdf', '.jpg']).
     Returns:
         Deduplicated list of absolute URL strings.
@@ -337,8 +327,6 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
         links = extract_links(soup, "https://docs.example.com/intro")
         # => ['https://docs.example.com/api', 'https://docs.example.com/faq']
 """
 def extract_links(
         soup: BeautifulSoup,
         base_url: str,
@@ -378,14 +366,15 @@ def extract_links(
     Returns a list of page records for further processing.
     Args:
-        start_url:        Root URL to begin crawling.
-        max_pages:        Hard cap on pages visited.
-        same_domain_only: Stay within the same hostname.
-        delay:            Polite pause (seconds) between requests.
-        session:          Reusable requests.Session.
+        start_url           : String - Root URL to begin crawling.
+        max_pages           : Integer - Hard cap on pages visited.
+        same_domain_only    : Boolean - Stay within the same hostname.
+        delay               : Float - Polite pause (seconds) between requests.
+        session             : requests.Session - Reusable requests.Session.
     Returns:
-        List of dicts, each with keys: url, html, soup, status_code.
+        List of dicts, each with keys:
+            url, html, soup, status_code
     Example:
         pages = crawl_site("https://docs.example.com", max_pages=50)
@@ -444,9 +433,9 @@ def crawl_site(
     boundary are still findable.
     Args:
-        text:       Full document text.
-        chunk_size: Maximum words per chunk.
-        overlap:    Words shared between consecutive chunks.
+        text        : String - Full document text.
+        chunk_size  : Integer - Maximum words per chunk.
+        overlap     : Integer - Words shared between consecutive chunks.
     Returns:
         List of text chunks.
@@ -478,10 +467,10 @@ def chunk_text(
     applies Porter stemming (if nltk is installed).
     Args:
-        text:             Input string.
-        remove_stopwords: Filter common English stopwords.
-        stem:             Apply stemming for root-form matching.
-        min_token_len:    Discard tokens shorter than this length.
+        text                : String - Input string.
+        remove_stopwords    : Boolean - Filter common English stopwords.
+        stem                : Boolean - Apply stemming for root-form matching.
+        min_token_len       : Integer - Discard tokens shorter than this length.
     Returns:
         List of processed token strings.
@@ -510,7 +499,39 @@ def tokenize(
         tokens = [_stemmer.stem(t) for t in tokens]
     return tokens
+"""
+    Score a page by counting how often each token of a query occurs in the page's text.
+    The score is the sum, over all query tokens, of (number of occurrences x unit), so the unit lets callers customize the weight of each occurrence.
+    Args:
+        url                 : String - The URL of the page
+        query               : String - The query to search.
+        remove_stopwords    : Boolean - Filter common English stopwords.
+        stem                : Boolean - Apply stemming for root-form matching.
+        unit                : Float - The desired score base metric for the document. Formula: count of occurrences x unit. Default is 1.0.
+    Returns:
+        Float - The document score: the total number of occurrences of the query tokens in the page text, multiplied by unit.
+    Example:
+        score = get_document_score("https://softhauz.ca/eng", query = "SofthauzPy", remove_stopwords = False, stem = False)
+        # 8.0
+"""
+def get_document_score(url:str, *, query:str = '', remove_stopwords: bool = False, stem: bool = False, unit: float = 1.0) -> float:
+    score = 0.0
+    text = extract_pure_text(url)["content"].lower()
+    tokens = tokenize(query, remove_stopwords = remove_stopwords, stem = stem)
+    for token in tokens:
+        token = token.lower()
+        occurrence = text.count(token)
+        if (occurrence > 0):
+            score += (occurrence * unit)
+    return score
 """
     Build an inverted index mapping tokens to list of (doc_id, frequency).
@@ -519,10 +540,9 @@ def tokenize(
     without scanning every document on every query.
     Args:
-        documents:  List of dicts, each containing at least text_field and
-                    id_field.
-        text_field: Key whose value is the text to index.
-        id_field:   Key used as the document identifier.
+        documents   : List - List of dicts, each containing at least text_field and id_field.
+        text_field  : String - Key whose value is the text to index.
+        id_field    : String - Key used as the document identifier.
     Returns:
         Dict: { token: [(doc_id, freq), ...] }
@@ -561,7 +581,7 @@ def build_inverted_index(
         id_field: str = "url",
 ) -> dict:
     index: dict[str, list[tuple[str, int]]] = defaultdict(list)
+    #  {'strictli': [('http://127.0.0.1:8000/eng', 2)], 'necessari': [('http://127.0.0.1:8000/eng', 3)]...}
     for doc in documents:
         doc_id = doc[id_field]
         text = doc.get(text_field, "")
@@ -580,16 +600,21 @@ def build_inverted_index(
     the whole corpus — the backbone of classical relevance ranking.
     Args:
-        documents:  Corpus as a list of dicts.
-        text_field: Field containing raw text.
-        id_field:   Field used as document identifier.
+        documents   : List - Corpus as a list of dicts.
+        text_field  : String - Field containing raw text.
+        id_field    : String - Field used as document identifier.
     Returns:
         Nested dict: { doc_id: { token: tfidf_score } }
     Example:
-        scores = compute_tfidf(docs)
-        top = sorted(scores["https://…"].items(), key=lambda x: -x[1])[:5]
+        documents = [
+            {"url": "https://example.com/ai",      "text": "Artificial intelligence is transforming the world"},
+            {"url": "https://example.com/ml",       "text": "Machine learning is a subset of artificial intelligence"},
+            {"url": "https://example.com/nlp",      "text": "Natural language processing enables machines to understand text"},
+        ]
+        scores = compute_tfidf(documents)
+        top = sorted(scores["https://example.com/ai"].items(), key=lambda x: -x[1])[:5]  # the five highest-scoring tokens for that document
 """
@@ -599,6 +624,7 @@ def compute_tfidf(
         text_field: str = "text",
         id_field: str = "url",
 ) -> dict[str, dict[str, float]]:
     N = len(documents)
     tf_store: dict[str, dict[str, float]] = {}
     doc_freq: Counter = Counter()
@@ -629,10 +655,10 @@ def compute_tfidf(
     then returns the top-k results by total relevance score.
     Args:
-        query:  Raw user query string.
-        index:  Inverted index from build_inverted_index().
-        tfidf:  TF-IDF matrix from compute_tfidf().
-        top_k:  Maximum results to return.
+        query   : String - Raw user query string.
+        index   : Dictionary - Inverted index from build_inverted_index().
+        tfidf   : Dictionary - TF-IDF matrix from compute_tfidf().
+        top_k   : Integer - Maximum results to return.
     Returns:
         List of (doc_id, score) tuples, highest score first.
@@ -669,10 +695,10 @@ def search_index(
     h1 matches a query is a simple way to improve ranking quality.
     Args:
-        soup: Parsed BeautifulSoup object.
+        soup    : BeautifulSoup - Parsed BeautifulSoup object.
     Returns:
-        List of dicts: [{ "level": 1, "text": "Getting Started" }, …]
+        List of dicts - [{ "level": 1, "text": "Getting Started" }, …]
     Example:
         headings = extract_headings(soup)
@@ -694,10 +720,10 @@ def extract_headings(soup: BeautifulSoup) -> list[dict]:
     returns the surrounding word window, mimicking Google's snippet style.
     Args:
-        text:       Full document text.
-        query:      User's search query.
-        window:     Words to show on each side of the match.
-        max_length: Hard character cap on the returned snippet.
+        text        : String - Full document text.
+        query       : String - User's search query.
+        window      : Integer - Words to show on each side of the match.
+        max_length  : Integer - Hard character cap on the returned snippet.
     Returns:
         A short excerpt string, potentially with leading/trailing ellipsis.
@@ -745,7 +771,7 @@ def generate_snippet(
     unchanged content.
     Args:
-        text: Extracted page text (from extract_text).
+        text    : String - Extracted page text (from extract_text).
     Returns:
         64-character hex string (SHA-256 digest).
@@ -769,10 +795,10 @@ def fingerprint_page(text: str) -> str:
     re-crawl every time the search service starts.
     Args:
-        index:    Inverted index from build_inverted_index().
-        tfidf:    TF-IDF scores from compute_tfidf().
-        metadata: Per-page metadata records (list of dicts).
-        path:     Output file path.
+        index       : Dictionary - Inverted index from build_inverted_index().
+        tfidf       : Dictionary - TF-IDF scores from compute_tfidf().
+        metadata    : List - Per-page metadata records (list of dicts).
+        path        : String - Output file path.
     Example:
         save_index(index, tfidf, page_metadata, "data/index.json")
@@ -800,13 +826,13 @@ def save_index(
     Deserialise a previously saved search index from JSON.
     Args:
-        path: File path written by save_index().
+        path    : String - File path written by save_index().
     Returns:
         Tuple (inverted_index, tfidf, metadata_list).
     Raises:
-        FileNotFoundError: If the file does not exist.
+        FileNotFoundError - if the file does not exist.
     Example:
         index, tfidf, metadata = load_index("data/index.json")
@@ -830,7 +856,7 @@ def load_index(path: str = "search_index.json") -> tuple[dict, dict, list]:
     ideal for enriching search results.
     Args:
-        soup: Parsed BeautifulSoup object.
+        soup    : BeautifulSoup - Parsed BeautifulSoup object.
     Returns:
         List of parsed JSON-LD objects found on the page.
@@ -863,8 +889,8 @@ def extract_structured_data(soup: BeautifulSoup) -> list[dict]:
     (sitemapindex elements) one level deep.
     Args:
-        base_url: Root URL of the site, e.g. "https://docs.example.com".
-        session:  Optional reusable requests.Session.
+        base_url    : String - Root URL of the site, e.g. "https://docs.example.com".
+        session     : requests.Session - Optional reusable requests.Session.
     Returns:
         Sorted, deduplicated list of page URLs listed in the sitemap(s).
@@ -914,10 +940,10 @@ def build_sitemap_urls(
     The matching is case-insensitive and handles whole words only.
     Args:
-        snippet:   Text excerpt (from generate_snippet).
-        query:     Original user query.
-        open_tag:  Opening HTML tag (default <mark>).
-        close_tag: Closing HTML tag (default </mark>).
+        snippet     : String - Text excerpt (from generate_snippet).
+        query       : String - Original user query.
+        open_tag    : String - Opening HTML tag (default <mark>).
+        close_tag   : String - Closing HTML tag (default </mark>).
     Returns:
         Snippet string with matching keywords wrapped in tags.
@@ -956,12 +982,12 @@ def highlight_query_terms(
     unchanged the function exits early, making scheduled re-crawls cheap.
     Args:
-        url:          Page to check and potentially re-index.
-        index:        Mutable inverted index (modified in place).
-        tfidf:        Mutable TF-IDF store (modified in place).
-        metadata:     Mutable metadata list (modified in place).
-        fingerprints: Dict mapping url to last known fingerprint (mutable).
-        session:      Optional requests.Session.
+        url             : String - Page to check and potentially re-index.
+        index           : Dictionary - Mutable inverted index (modified in place).
+        tfidf           : Dictionary - Mutable TF-IDF store (modified in place).
+        metadata        : List - Mutable metadata list (modified in place).
+        fingerprints    : Dictionary - Dict mapping url to last known fingerprint (mutable).
+        session         : requests.Session - Optional requests.Session.
     Returns:
         True if the page was re-indexed, False if it was unchanged.

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: softhauzpy
-Version: 0.0.5
+Version: 0.0.7
 Author: Karen Urate
 Author-email: karen.urate@softhauz.ca
 Description-Content-Type: text/markdown

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/README.md RENAMED Viewed

File without changes

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/setup.cfg RENAMED Viewed

File without changes

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/requires.txt RENAMED Viewed

File without changes

{softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/top_level.txt RENAMED Viewed

File without changes

softhauzpy 0.0.5__tar.gz → 0.0.7__tar.gz

softhauzpy 0.0.5tar.gz → 0.0.7tar.gz