softhauzpy 0.0.4__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Author: Karen Urate
5
5
  Author-email: karen.urate@softhauz.ca
6
6
  Description-Content-Type: text/markdown
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as f:
5
5
 
6
6
  setup(
7
7
  name='softhauzpy',
8
- version='0.0.4',
8
+ version='0.0.6',
9
9
  author='Karen Urate',
10
10
  author_email='karen.urate@softhauz.ca',
11
11
  packages=find_packages(),
@@ -62,17 +62,17 @@ except Exception:
62
62
 
63
63
  Parameters
64
64
  ----------
65
- url : The URL to fetch.
66
- title : Optional document title (included in the returned text header when provided).
67
- author : Optional document author (included in the returned text header when provided).
68
- description : Optional description (included in the returned text header when provided).
69
- creation_date : Optional creation date string (included in the returned text header when provided).
70
- modified_date : Optional last-modified date string (included in the returned text header when provided).
65
+ url : String - The URL to fetch.
66
+ title : String - Optional document title (included in the returned text header when provided).
67
+ author : String - Optional document author (included in the returned text header when provided).
68
+ description : String - Optional description (included in the returned text header when provided).
69
+ creation_date : String - Optional creation date string (included in the returned text header when provided).
70
+ modified_date : String - Optional last-modified date string (included in the returned text header when provided).
71
71
 
72
72
 
73
73
  Returns
74
74
  -------
75
- dict with keys:
75
+ Dictionary with Keys:
76
76
  "url" : str
77
77
  "title" : str | None
78
78
  "author" : str | None
@@ -86,7 +86,7 @@ except Exception:
86
86
  ------
87
87
  requests.HTTPError
88
88
  If the server returns a non-2xx status code.
89
-
89
+
90
90
  """
91
91
 
92
92
 
@@ -146,40 +146,32 @@ def extract_pure_text(
146
146
  returned list contains detailed information about a page.
147
147
 
148
148
  Parameters:
149
- page_list (list of tuples): A list where each tuple represents a page with the following elements:
150
- - url (str): The URL of the page.
151
- - title (str): The title of the page.
152
- - author (str): The author of the page.
153
- - description (str): A brief description of the page.
154
- - creation_date (str): The date the page was created.
155
- - modified_date (str): The date the page was last modified.
156
- keywords (str): A string containing keywords to search for within the page entries.
149
+ page_list : List - A list where each tuple represents a page with the following elements:
150
+ (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
151
+ keywords : String - A string containing keywords to search for within the page entries.
157
152
 
158
153
  Returns:
159
- list of tuples: A list of tuples matching the search criteria. Each tuple contains:
160
- - url (str)
161
- - title (str)
162
- - author (str)
163
- - description (str)
164
- - creation_date (str)
165
- - modified_date (str)
154
+ results : List - A list of tuples matching the search criteria. Each tuple contains:
155
+ (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
166
156
 
167
157
  Example:
168
158
  >>> pages = [
169
- ... ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
170
- ... ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
171
- ... ]
172
- >>> search_pages(pages, "sample")
159
+ ... ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
160
+ ... ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
161
+ ... ]
162
+ >>> get_search_results_list(pages, "sample")
173
163
  [
174
- ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
175
- ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
164
+ ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
165
+ ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
176
166
  ]
177
167
  """
178
168
 
179
169
 
180
170
  def get_search_results_list(page_list=[], keywords='') -> list:
181
- results = []
182
171
 
172
+ results = []
173
+ keywords = keywords.lower()
174
+
183
175
  for page in page_list:
184
176
 
185
177
  url = page[0]
@@ -192,24 +184,23 @@ def get_search_results_list(page_list=[], keywords='') -> list:
192
184
  description = page[3] or ''
193
185
  creation_date = page[4] or ''
194
186
  modified_date = page[5] or ''
195
-
196
- if keywords in extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]:
187
+
188
+
189
+ if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
197
190
  results.append((url, title, author, description, creation_date, modified_date))
198
191
 
199
192
  return results
200
193
 
201
-
202
-
203
194
  """
204
195
  Fetch a single URL with retry logic and polite delay.
205
196
 
206
197
  Args:
207
- url: Target URL.
208
- timeout: Per-request timeout in seconds.
209
- retries: Maximum number of attempts before giving up.
210
- delay: Seconds to wait between retries (doubles on each failure).
211
- headers: Optional extra HTTP headers (merged with a default UA).
212
- session: An existing requests.Session (useful for cookie sharing).
198
+ url : String - Target URL.
199
+ timeout : Integer - Per-request timeout in seconds.
200
+ retries : Integer - Maximum number of attempts before giving up.
201
+ delay : Float - Seconds to wait between retries (doubles on each failure).
202
+ headers : Dictionary - Optional extra HTTP headers (merged with a default UA).
203
+ session : requests.Session | None - An existing requests.Session (useful for cookie sharing).
213
204
 
214
205
  Returns:
215
206
  A requests.Response on success, or None after all retries fail.
@@ -255,8 +246,8 @@ def fetch_page(
255
246
  Parse raw HTML into a BeautifulSoup tree.
256
247
 
257
248
  Args:
258
- html: Raw HTML string or bytes.
259
- parser: BS4 parser backend ('lxml', 'html.parser', 'html5lib').
249
+ html : String | Bytes - Raw HTML string or bytes.
250
+ parser : String - BS4 parser backend ('lxml', 'html.parser', 'html5lib').
260
251
 
261
252
  Returns:
262
253
  A BeautifulSoup object ready for querying.
@@ -282,12 +273,12 @@ def parse_html(
282
273
  language, and author where available.
283
274
 
284
275
  Args:
285
- soup: Parsed BeautifulSoup object.
286
- url: Original URL (used as fallback for canonical).
276
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
277
+ url : String - Original URL (used as fallback for canonical).
287
278
 
288
279
  Returns:
289
- Dict with keys: title, description, keywords, og_title,
290
- og_description, og_image, canonical, lang, author.
280
+ Dictionary with Keys:
281
+ title, description, keywords, og_title, og_description, og_image, canonical, lang, author
291
282
 
292
283
  Example:
293
284
  meta = extract_metadata(soup, url="https://example.com/page")
@@ -324,10 +315,10 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
324
315
  Collect all hyperlinks from a page, normalised to absolute URLs.
325
316
 
326
317
  Args:
327
- soup: Parsed page.
328
- base_url: Absolute URL of the page being parsed.
329
- same_domain_only: When True, filters out external domains.
330
- exclude_extensions: File extensions to skip (e.g. ['.pdf', '.jpg']).
318
+ soup : BeautifulSoup - Parsed page.
319
+ base_url : String - Absolute URL of the page being parsed.
320
+ same_domain_only : Boolean - When True, filters out external domains.
321
+ exclude_extensions : List - File extensions to skip (e.g. ['.pdf', '.jpg']).
331
322
 
332
323
  Returns:
333
324
  Deduplicated list of absolute URL strings.
@@ -336,8 +327,6 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
336
327
  links = extract_links(soup, "https://docs.example.com/intro")
337
328
  # => ['https://docs.example.com/api', 'https://docs.example.com/faq']
338
329
  """
339
-
340
-
341
330
  def extract_links(
342
331
  soup: BeautifulSoup,
343
332
  base_url: str,
@@ -377,14 +366,15 @@ def extract_links(
377
366
  Returns a list of page records for further processing.
378
367
 
379
368
  Args:
380
- start_url: Root URL to begin crawling.
381
- max_pages: Hard cap on pages visited.
382
- same_domain_only: Stay within the same hostname.
383
- delay: Polite pause (seconds) between requests.
384
- session: Reusable requests.Session.
369
+ start_url : String - Root URL to begin crawling.
370
+ max_pages : Integer - Hard cap on pages visited.
371
+ same_domain_only : Boolean - Stay within the same hostname.
372
+ delay : Float - Polite pause (seconds) between requests.
373
+ session : requests.Session - Reusable requests.Session.
385
374
 
386
375
  Returns:
387
- List of dicts, each with keys: url, html, soup, status_code.
376
+ List of dicts, each with keys:
377
+ url, html, soup, status_code
388
378
 
389
379
  Example:
390
380
  pages = crawl_site("https://docs.example.com", max_pages=50)
@@ -443,9 +433,9 @@ def crawl_site(
443
433
  boundary are still findable.
444
434
 
445
435
  Args:
446
- text: Full document text.
447
- chunk_size: Maximum words per chunk.
448
- overlap: Words shared between consecutive chunks.
436
+ text : String - Full document text.
437
+ chunk_size : Integer - Maximum words per chunk.
438
+ overlap : Integer - Words shared between consecutive chunks.
449
439
 
450
440
  Returns:
451
441
  List of text chunks.
@@ -477,10 +467,10 @@ def chunk_text(
477
467
  applies Porter stemming (if nltk is installed).
478
468
 
479
469
  Args:
480
- text: Input string.
481
- remove_stopwords: Filter common English stopwords.
482
- stem: Apply stemming for root-form matching.
483
- min_token_len: Discard tokens shorter than this length.
470
+ text : String - Input string.
471
+ remove_stopwords : Boolean - Filter common English stopwords.
472
+ stem : Boolean - Apply stemming for root-form matching.
473
+ min_token_len : Integer - Discard tokens shorter than this length.
484
474
 
485
475
  Returns:
486
476
  List of processed token strings.
@@ -509,7 +499,39 @@ def tokenize(
509
499
  tokens = [_stemmer.stem(t) for t in tokens]
510
500
 
511
501
  return tokens
502
+
512
503
 
504
+ """
505
+ Score a page by counting how often each token of a query occurs in the page's text.
506
+ Obtain a document score based on query index and unit preference.
507
+
508
+ Args:
509
+ url : String - The URL of the page
510
+ query : String - The query to search.
511
+ remove_stopwords : Boolean - Filter common English stopwords.
512
+ stem : Boolean - Apply stemming for root-form matching.
513
+ unit : Float - Weight applied to each token occurrence. Formula: count of occurrences x unit, summed over all query tokens. Default is 1.0.
514
+
515
+ Returns:
516
+ Float - The document score: the sum of (occurrence count x unit) over every query token found in the page text.
517
+
518
+ Example:
519
+ score = get_document_score("https://softhauz.ca/eng", query = "SofthauzPy", remove_stopwords = False, stem = False)
520
+ # 8.0
521
+ """
522
+ def get_document_score(url:str, *, query:str = '', remove_stopwords: bool = False, stem: bool = False, unit: float = 1.0) -> float:
523
+
524
+ score = 0.0
525
+ text = extract_pure_text(url)["content"].lower()
526
+ tokens = tokenize(query, remove_stopwords = remove_stopwords, stem = stem)
527
+
528
+ for token in tokens:
529
+ token = token.lower()
530
+ occurrence = text.count(token)
531
+ if (occurrence > 0):
532
+ score += (occurrence * unit)
533
+
534
+ return score
513
535
 
514
536
  """
515
537
  Build an inverted index mapping tokens to list of (doc_id, frequency).
@@ -518,10 +540,9 @@ def tokenize(
518
540
  without scanning every document on every query.
519
541
 
520
542
  Args:
521
- documents: List of dicts, each containing at least text_field and
522
- id_field.
523
- text_field: Key whose value is the text to index.
524
- id_field: Key used as the document identifier.
543
+ documents : List - List of dicts, each containing at least text_field and id_field.
544
+ text_field : String - Key whose value is the text to index.
545
+ id_field : String - Key used as the document identifier.
525
546
 
526
547
  Returns:
527
548
  Dict: { token: [(doc_id, freq), ...] }
@@ -560,7 +581,7 @@ def build_inverted_index(
560
581
  id_field: str = "url",
561
582
  ) -> dict:
562
583
  index: dict[str, list[tuple[str, int]]] = defaultdict(list)
563
-
584
+ # {'strictli': [('http://127.0.0.1:8000/eng', 2)], 'necessari': [('http://127.0.0.1:8000/eng', 3)]...}
564
585
  for doc in documents:
565
586
  doc_id = doc[id_field]
566
587
  text = doc.get(text_field, "")
@@ -579,16 +600,21 @@ def build_inverted_index(
579
600
  the whole corpus — the backbone of classical relevance ranking.
580
601
 
581
602
  Args:
582
- documents: Corpus as a list of dicts.
583
- text_field: Field containing raw text.
584
- id_field: Field used as document identifier.
603
+ documents : List - Corpus as a list of dicts.
604
+ text_field : String - Field containing raw text.
605
+ id_field : String - Field used as document identifier.
585
606
 
586
607
  Returns:
587
608
  Nested dict: { doc_id: { token: tfidf_score } }
588
609
 
589
610
  Example:
590
- scores = compute_tfidf(docs)
591
- top = sorted(scores["https://…"].items(), key=lambda x: -x[1])[:5]
611
+ documents = [
612
+ {"url": "https://example.com/ai", "text": "Artificial intelligence is transforming the world"},
613
+ {"url": "https://example.com/ml", "text": "Machine learning is a subset of artificial intelligence"},
614
+ {"url": "https://example.com/nlp", "text": "Natural language processing enables machines to understand text"},
615
+ ]
616
+ scores = compute_tfidf(documents)
617
+ top = sorted(scores["https://…"].items(), key=lambda x: -x[1])[:5] # the five highest-scoring tokens for that document
592
618
  """
593
619
 
594
620
 
@@ -598,6 +624,7 @@ def compute_tfidf(
598
624
  text_field: str = "text",
599
625
  id_field: str = "url",
600
626
  ) -> dict[str, dict[str, float]]:
627
+
601
628
  N = len(documents)
602
629
  tf_store: dict[str, dict[str, float]] = {}
603
630
  doc_freq: Counter = Counter()
@@ -628,10 +655,10 @@ def compute_tfidf(
628
655
  then returns the top-k results by total relevance score.
629
656
 
630
657
  Args:
631
- query: Raw user query string.
632
- index: Inverted index from build_inverted_index().
633
- tfidf: TF-IDF matrix from compute_tfidf().
634
- top_k: Maximum results to return.
658
+ query : String - Raw user query string.
659
+ index : Dictionary - Inverted index from build_inverted_index().
660
+ tfidf : Dictionary - TF-IDF matrix from compute_tfidf().
661
+ top_k : Integer - Maximum results to return.
635
662
 
636
663
  Returns:
637
664
  List of (doc_id, score) tuples, highest score first.
@@ -668,10 +695,10 @@ def search_index(
668
695
  h1 matches a query is a simple way to improve ranking quality.
669
696
 
670
697
  Args:
671
- soup: Parsed BeautifulSoup object.
698
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
672
699
 
673
700
  Returns:
674
- List of dicts: [{ "level": 1, "text": "Getting Started" }, …]
701
+ List of dicts - [{ "level": 1, "text": "Getting Started" }, …]
675
702
 
676
703
  Example:
677
704
  headings = extract_headings(soup)
@@ -693,10 +720,10 @@ def extract_headings(soup: BeautifulSoup) -> list[dict]:
693
720
  returns the surrounding word window, mimicking Google's snippet style.
694
721
 
695
722
  Args:
696
- text: Full document text.
697
- query: User's search query.
698
- window: Words to show on each side of the match.
699
- max_length: Hard character cap on the returned snippet.
723
+ text : String - Full document text.
724
+ query : String - User's search query.
725
+ window : Integer - Words to show on each side of the match.
726
+ max_length : Integer - Hard character cap on the returned snippet.
700
727
 
701
728
  Returns:
702
729
  A short excerpt string, potentially with leading/trailing ellipsis.
@@ -744,7 +771,7 @@ def generate_snippet(
744
771
  unchanged content.
745
772
 
746
773
  Args:
747
- text: Extracted page text (from extract_text).
774
+ text : String - Extracted page text (from extract_text).
748
775
 
749
776
  Returns:
750
777
  64-character hex string (SHA-256 digest).
@@ -768,10 +795,10 @@ def fingerprint_page(text: str) -> str:
768
795
  re-crawl every time the search service starts.
769
796
 
770
797
  Args:
771
- index: Inverted index from build_inverted_index().
772
- tfidf: TF-IDF scores from compute_tfidf().
773
- metadata: Per-page metadata records (list of dicts).
774
- path: Output file path.
798
+ index : Dictionary - Inverted index from build_inverted_index().
799
+ tfidf : Dictionary - TF-IDF scores from compute_tfidf().
800
+ metadata : List - Per-page metadata records (list of dicts).
801
+ path : String - Output file path.
775
802
 
776
803
  Example:
777
804
  save_index(index, tfidf, page_metadata, "data/index.json")
@@ -799,13 +826,13 @@ def save_index(
799
826
  Deserialise a previously saved search index from JSON.
800
827
 
801
828
  Args:
802
- path: File path written by save_index().
829
+ path : String - File path written by save_index().
803
830
 
804
831
  Returns:
805
832
  Tuple (inverted_index, tfidf, metadata_list).
806
833
 
807
834
  Raises:
808
- FileNotFoundError: If the file does not exist.
835
+ FileNotFoundError - if the file does not exist.
809
836
 
810
837
  Example:
811
838
  index, tfidf, metadata = load_index("data/index.json")
@@ -829,7 +856,7 @@ def load_index(path: str = "search_index.json") -> tuple[dict, dict, list]:
829
856
  ideal for enriching search results.
830
857
 
831
858
  Args:
832
- soup: Parsed BeautifulSoup object.
859
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
833
860
 
834
861
  Returns:
835
862
  List of parsed JSON-LD objects found on the page.
@@ -862,8 +889,8 @@ def extract_structured_data(soup: BeautifulSoup) -> list[dict]:
862
889
  (sitemapindex elements) one level deep.
863
890
 
864
891
  Args:
865
- base_url: Root URL of the site, e.g. "https://docs.example.com".
866
- session: Optional reusable requests.Session.
892
+ base_url : String - Root URL of the site, e.g. "https://docs.example.com".
893
+ session : requests.Session - Optional reusable requests.Session.
867
894
 
868
895
  Returns:
869
896
  Sorted, deduplicated list of page URLs listed in the sitemap(s).
@@ -913,10 +940,10 @@ def build_sitemap_urls(
913
940
  The matching is case-insensitive and handles whole words only.
914
941
 
915
942
  Args:
916
- snippet: Text excerpt (from generate_snippet).
917
- query: Original user query.
918
- open_tag: Opening HTML tag (default <mark>).
919
- close_tag: Closing HTML tag (default </mark>).
943
+ snippet : String - Text excerpt (from generate_snippet).
944
+ query : String - Original user query.
945
+ open_tag : String - Opening HTML tag (default <mark>).
946
+ close_tag : String - Closing HTML tag (default </mark>).
920
947
 
921
948
  Returns:
922
949
  Snippet string with matching keywords wrapped in tags.
@@ -955,12 +982,12 @@ def highlight_query_terms(
955
982
  unchanged the function exits early, making scheduled re-crawls cheap.
956
983
 
957
984
  Args:
958
- url: Page to check and potentially re-index.
959
- index: Mutable inverted index (modified in place).
960
- tfidf: Mutable TF-IDF store (modified in place).
961
- metadata: Mutable metadata list (modified in place).
962
- fingerprints: Dict mapping url to last known fingerprint (mutable).
963
- session: Optional requests.Session.
985
+ url : String - Page to check and potentially re-index.
986
+ index : Dictionary - Mutable inverted index (modified in place).
987
+ tfidf : Dictionary - Mutable TF-IDF store (modified in place).
988
+ metadata : List - Mutable metadata list (modified in place).
989
+ fingerprints : Dictionary - Dict mapping url to last known fingerprint (mutable).
990
+ session : requests.Session - Optional requests.Session.
964
991
 
965
992
  Returns:
966
993
  True if the page was re-indexed, False if it was unchanged.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Author: Karen Urate
5
5
  Author-email: karen.urate@softhauz.ca
6
6
  Description-Content-Type: text/markdown
File without changes
File without changes