softhauzpy 0.0.5__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.5
3
+ Version: 0.0.6
4
4
  Author: Karen Urate
5
5
  Author-email: karen.urate@softhauz.ca
6
6
  Description-Content-Type: text/markdown
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as f:
5
5
 
6
6
  setup(
7
7
  name='softhauzpy',
8
- version='0.0.5',
8
+ version='0.0.6',
9
9
  author='Karen Urate',
10
10
  author_email='karen.urate@softhauz.ca',
11
11
  packages=find_packages(),
@@ -62,17 +62,17 @@ except Exception:
62
62
 
63
63
  Parameters
64
64
  ----------
65
- url : The URL to fetch.
66
- title : Optional document title (included in the returned text header when provided).
67
- author : Optional document author (included in the returned text header when provided).
68
- description : Optional description (included in the returned text header when provided).
69
- creation_date : Optional creation date string (included in the returned text header when provided).
70
- modified_date : Optional last-modified date string (included in the returned text header when provided).
65
+ url : String - The URL to fetch.
66
+ title : String - Optional document title (included in the returned text header when provided).
67
+ author : String - Optional document author (included in the returned text header when provided).
68
+ description : String - Optional description (included in the returned text header when provided).
69
+ creation_date : String - Optional creation date string (included in the returned text header when provided).
70
+ modified_date : String - Optional last-modified date string (included in the returned text header when provided).
71
71
 
72
72
 
73
73
  Returns
74
74
  -------
75
- dict with keys:
75
+ Dictionary with Keys:
76
76
  "url" : str
77
77
  "title" : str | None
78
78
  "author" : str | None
@@ -86,7 +86,7 @@ except Exception:
86
86
  ------
87
87
  requests.HTTPError
88
88
  If the server returns a non-2xx status code.
89
-
89
+
90
90
  """
91
91
 
92
92
 
@@ -146,33 +146,23 @@ def extract_pure_text(
146
146
  returned list contains detailed information about a page.
147
147
 
148
148
  Parameters:
149
- page_list (list of tuples): A list where each tuple represents a page with the following elements:
150
- - url (str): The URL of the page.
151
- - title (str): The title of the page.
152
- - author (str): The author of the page.
153
- - description (str): A brief description of the page.
154
- - creation_date (str): The date the page was created.
155
- - modified_date (str): The date the page was last modified.
156
- keywords (str): A string containing keywords to search for within the page entries.
149
+ page_list : List - A list where each tuple represents a page with the following elements:
150
+ (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
151
+ keywords : String - A string containing keywords to search for within the page entries.
157
152
 
158
153
  Returns:
159
- list of tuples: A list of tuples matching the search criteria. Each tuple contains:
160
- - url (str)
161
- - title (str)
162
- - author (str)
163
- - description (str)
164
- - creation_date (str)
165
- - modified_date (str)
154
+ results : List - A list of tuples matching the search criteria. Each tuple contains:
155
+ (url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
166
156
 
167
157
  Example:
168
158
  >>> pages = [
169
- ... ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
170
- ... ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
171
- ... ]
159
+ ... ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.", "2025-02-01", "2026-05-31"),
160
+ ... ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
161
+ ... ]
172
162
  >>> get_search_results_list(pages, "sample")
173
163
  [
174
- ("https://example.com", "Example Page", "Alice", "A sample page", "2023-01-01", "2023-01-05"),
175
- ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
164
+ ("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
165
+ ("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
176
166
  ]
177
167
  """
178
168
 
@@ -205,12 +195,12 @@ def get_search_results_list(page_list=[], keywords='') -> list:
205
195
  Fetch a single URL with retry logic and polite delay.
206
196
 
207
197
  Args:
208
- url: Target URL.
209
- timeout: Per-request timeout in seconds.
210
- retries: Maximum number of attempts before giving up.
211
- delay: Seconds to wait between retries (doubles on each failure).
212
- headers: Optional extra HTTP headers (merged with a default UA).
213
- session: An existing requests.Session (useful for cookie sharing).
198
+ url : String - Target URL.
199
+ timeout : Integer - Per-request timeout in seconds.
200
+ retries : Integer - Maximum number of attempts before giving up.
201
+ delay : Float - Seconds to wait between retries (doubles on each failure).
202
+ headers : Dictionary - Optional extra HTTP headers (merged with a default UA).
203
+ session : requests.Session | None - An existing requests.Session (useful for cookie sharing).
214
204
 
215
205
  Returns:
216
206
  A requests.Response on success, or None after all retries fail.
@@ -256,8 +246,8 @@ def fetch_page(
256
246
  Parse raw HTML into a BeautifulSoup tree.
257
247
 
258
248
  Args:
259
- html: Raw HTML string or bytes.
260
- parser: BS4 parser backend ('lxml', 'html.parser', 'html5lib').
249
+ html : String | Bytes - Raw HTML string or bytes.
250
+ parser : String - BS4 parser backend ('lxml', 'html.parser', 'html5lib').
261
251
 
262
252
  Returns:
263
253
  A BeautifulSoup object ready for querying.
@@ -283,12 +273,12 @@ def parse_html(
283
273
  language, and author where available.
284
274
 
285
275
  Args:
286
- soup: Parsed BeautifulSoup object.
287
- url: Original URL (used as fallback for canonical).
276
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
277
+ url : String - Original URL (used as fallback for canonical).
288
278
 
289
279
  Returns:
290
- Dict with keys: title, description, keywords, og_title,
291
- og_description, og_image, canonical, lang, author.
280
+ Dictionary with Keys:
281
+ title, description, keywords, og_title, og_description, og_image, canonical, lang, author
292
282
 
293
283
  Example:
294
284
  meta = extract_metadata(soup, url="https://example.com/page")
@@ -325,10 +315,10 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
325
315
  Collect all hyperlinks from a page, normalised to absolute URLs.
326
316
 
327
317
  Args:
328
- soup: Parsed page.
329
- base_url: Absolute URL of the page being parsed.
330
- same_domain_only: When True, filters out external domains.
331
- exclude_extensions: File extensions to skip (e.g. ['.pdf', '.jpg']).
318
+ soup : BeautifulSoup - Parsed page.
319
+ base_url : String - Absolute URL of the page being parsed.
320
+ same_domain_only : Boolean - When True, filters out external domains.
321
+ exclude_extensions : List - File extensions to skip (e.g. ['.pdf', '.jpg']).
332
322
 
333
323
  Returns:
334
324
  Deduplicated list of absolute URL strings.
@@ -337,8 +327,6 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
337
327
  links = extract_links(soup, "https://docs.example.com/intro")
338
328
  # => ['https://docs.example.com/api', 'https://docs.example.com/faq']
339
329
  """
340
-
341
-
342
330
  def extract_links(
343
331
  soup: BeautifulSoup,
344
332
  base_url: str,
@@ -378,14 +366,15 @@ def extract_links(
378
366
  Returns a list of page records for further processing.
379
367
 
380
368
  Args:
381
- start_url: Root URL to begin crawling.
382
- max_pages: Hard cap on pages visited.
383
- same_domain_only: Stay within the same hostname.
384
- delay: Polite pause (seconds) between requests.
385
- session: Reusable requests.Session.
369
+ start_url : String - Root URL to begin crawling.
370
+ max_pages : Integer - Hard cap on pages visited.
371
+ same_domain_only : Boolean - Stay within the same hostname.
372
+ delay : Float - Polite pause (seconds) between requests.
373
+ session : requests.Session - Reusable requests.Session.
386
374
 
387
375
  Returns:
388
- List of dicts, each with keys: url, html, soup, status_code.
376
+ List of dicts, each with keys:
377
+ url, html, soup, status_code
389
378
 
390
379
  Example:
391
380
  pages = crawl_site("https://docs.example.com", max_pages=50)
@@ -444,9 +433,9 @@ def crawl_site(
444
433
  boundary are still findable.
445
434
 
446
435
  Args:
447
- text: Full document text.
448
- chunk_size: Maximum words per chunk.
449
- overlap: Words shared between consecutive chunks.
436
+ text : String - Full document text.
437
+ chunk_size : Integer - Maximum words per chunk.
438
+ overlap : Integer - Words shared between consecutive chunks.
450
439
 
451
440
  Returns:
452
441
  List of text chunks.
@@ -478,10 +467,10 @@ def chunk_text(
478
467
  applies Porter stemming (if nltk is installed).
479
468
 
480
469
  Args:
481
- text: Input string.
482
- remove_stopwords: Filter common English stopwords.
483
- stem: Apply stemming for root-form matching.
484
- min_token_len: Discard tokens shorter than this length.
470
+ text : String - Input string.
471
+ remove_stopwords : Boolean - Filter common English stopwords.
472
+ stem : Boolean - Apply stemming for root-form matching.
473
+ min_token_len : Integer - Discard tokens shorter than this length.
485
474
 
486
475
  Returns:
487
476
  List of processed token strings.
@@ -510,7 +499,39 @@ def tokenize(
510
499
  tokens = [_stemmer.stem(t) for t in tokens]
511
500
 
512
501
  return tokens
502
+
503
+
504
+ """
505
+ Compute a customizable score for the page at a URL by counting how often each token of a query occurs in the page text.
506
+ Obtain a document score based on query index and unit preference.
507
+
508
+ Args:
509
+ url : String - The URL of the page
510
+ query : String - The query to search.
511
+ remove_stopwords : Boolean - Filter common English stopwords.
512
+ stem : Boolean - Apply stemming for root-form matching.
513
+ unit : Float - The desired score base metric for the document. Formula: count of occurrence x unit. Default is 1.0.
513
514
 
515
+ Returns:
516
+ Float - The document score: the sum, over all query tokens, of (occurrences of the token in the page text x unit).
517
+
518
+ Example:
519
+ score = get_document_score("https://softhauz.ca/eng", query = "SofthauzPy", remove_stopwords = False, stem = False)
520
+ # 8.0
521
+ """
522
+ def get_document_score(url:str, *, query:str = '', remove_stopwords: bool = False, stem: bool = False, unit: float = 1.0) -> float:
523
+
524
+ score = 0.0
525
+ text = extract_pure_text(url)["content"].lower()
526
+ tokens = tokenize(query, remove_stopwords = remove_stopwords, stem = stem)
527
+
528
+ for token in tokens:
529
+ token = token.lower()
530
+ occurrence = text.count(token)
531
+ if (occurrence > 0):
532
+ score += (occurrence * unit)
533
+
534
+ return score
514
535
 
515
536
  """
516
537
  Build an inverted index mapping tokens to list of (doc_id, frequency).
@@ -519,10 +540,9 @@ def tokenize(
519
540
  without scanning every document on every query.
520
541
 
521
542
  Args:
522
- documents: List of dicts, each containing at least text_field and
523
- id_field.
524
- text_field: Key whose value is the text to index.
525
- id_field: Key used as the document identifier.
543
+ documents : List - List of dicts, each containing at least text_field and id_field.
544
+ text_field : String - Key whose value is the text to index.
545
+ id_field : String - Key used as the document identifier.
526
546
 
527
547
  Returns:
528
548
  Dict: { token: [(doc_id, freq), ...] }
@@ -561,7 +581,7 @@ def build_inverted_index(
561
581
  id_field: str = "url",
562
582
  ) -> dict:
563
583
  index: dict[str, list[tuple[str, int]]] = defaultdict(list)
564
-
584
+ # Example of the resulting index: {'strictli': [('http://127.0.0.1:8000/eng', 2)], 'necessari': [('http://127.0.0.1:8000/eng', 3)]...}
565
585
  for doc in documents:
566
586
  doc_id = doc[id_field]
567
587
  text = doc.get(text_field, "")
@@ -580,16 +600,21 @@ def build_inverted_index(
580
600
  the whole corpus — the backbone of classical relevance ranking.
581
601
 
582
602
  Args:
583
- documents: Corpus as a list of dicts.
584
- text_field: Field containing raw text.
585
- id_field: Field used as document identifier.
603
+ documents : List - Corpus as a list of dicts.
604
+ text_field : String - Field containing raw text.
605
+ id_field : String - Field used as document identifier.
586
606
 
587
607
  Returns:
588
608
  Nested dict: { doc_id: { token: tfidf_score } }
589
609
 
590
610
  Example:
591
- scores = compute_tfidf(docs)
592
- top = sorted(scores["https://…"].items(), key=lambda x: -x[1])[:5]
611
+ documents = [
612
+ {"url": "https://example.com/ai", "text": "Artificial intelligence is transforming the world"},
613
+ {"url": "https://example.com/ml", "text": "Machine learning is a subset of artificial intelligence"},
614
+ {"url": "https://example.com/nlp", "text": "Natural language processing enables machines to understand text"},
615
+ ]
616
+ scores = compute_tfidf(documents)
617
+ top = sorted(scores["https://example.com/ai"].items(), key=lambda x: -x[1])[:5] # the five highest-scoring tokens for that document
593
618
  """
594
619
 
595
620
 
@@ -599,6 +624,7 @@ def compute_tfidf(
599
624
  text_field: str = "text",
600
625
  id_field: str = "url",
601
626
  ) -> dict[str, dict[str, float]]:
627
+
602
628
  N = len(documents)
603
629
  tf_store: dict[str, dict[str, float]] = {}
604
630
  doc_freq: Counter = Counter()
@@ -629,10 +655,10 @@ def compute_tfidf(
629
655
  then returns the top-k results by total relevance score.
630
656
 
631
657
  Args:
632
- query: Raw user query string.
633
- index: Inverted index from build_inverted_index().
634
- tfidf: TF-IDF matrix from compute_tfidf().
635
- top_k: Maximum results to return.
658
+ query : String - Raw user query string.
659
+ index : Dictionary - Inverted index from build_inverted_index().
660
+ tfidf : Dictionary - TF-IDF matrix from compute_tfidf().
661
+ top_k : Integer - Maximum results to return.
636
662
 
637
663
  Returns:
638
664
  List of (doc_id, score) tuples, highest score first.
@@ -669,10 +695,10 @@ def search_index(
669
695
  h1 matches a query is a simple way to improve ranking quality.
670
696
 
671
697
  Args:
672
- soup: Parsed BeautifulSoup object.
698
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
673
699
 
674
700
  Returns:
675
- List of dicts: [{ "level": 1, "text": "Getting Started" }, …]
701
+ List of dicts - [{ "level": 1, "text": "Getting Started" }, …]
676
702
 
677
703
  Example:
678
704
  headings = extract_headings(soup)
@@ -694,10 +720,10 @@ def extract_headings(soup: BeautifulSoup) -> list[dict]:
694
720
  returns the surrounding word window, mimicking Google's snippet style.
695
721
 
696
722
  Args:
697
- text: Full document text.
698
- query: User's search query.
699
- window: Words to show on each side of the match.
700
- max_length: Hard character cap on the returned snippet.
723
+ text : String - Full document text.
724
+ query : String - User's search query.
725
+ window : Integer - Words to show on each side of the match.
726
+ max_length : Integer - Hard character cap on the returned snippet.
701
727
 
702
728
  Returns:
703
729
  A short excerpt string, potentially with leading/trailing ellipsis.
@@ -745,7 +771,7 @@ def generate_snippet(
745
771
  unchanged content.
746
772
 
747
773
  Args:
748
- text: Extracted page text (from extract_text).
774
+ text : String - Extracted page text (from extract_text).
749
775
 
750
776
  Returns:
751
777
  64-character hex string (SHA-256 digest).
@@ -769,10 +795,10 @@ def fingerprint_page(text: str) -> str:
769
795
  re-crawl every time the search service starts.
770
796
 
771
797
  Args:
772
- index: Inverted index from build_inverted_index().
773
- tfidf: TF-IDF scores from compute_tfidf().
774
- metadata: Per-page metadata records (list of dicts).
775
- path: Output file path.
798
+ index : Dictionary - Inverted index from build_inverted_index().
799
+ tfidf : Dictionary - TF-IDF scores from compute_tfidf().
800
+ metadata : List - Per-page metadata records (list of dicts).
801
+ path : String - Output file path.
776
802
 
777
803
  Example:
778
804
  save_index(index, tfidf, page_metadata, "data/index.json")
@@ -800,13 +826,13 @@ def save_index(
800
826
  Deserialise a previously saved search index from JSON.
801
827
 
802
828
  Args:
803
- path: File path written by save_index().
829
+ path : String - File path written by save_index().
804
830
 
805
831
  Returns:
806
832
  Tuple (inverted_index, tfidf, metadata_list).
807
833
 
808
834
  Raises:
809
- FileNotFoundError: If the file does not exist.
835
+ FileNotFoundError - if the file does not exist.
810
836
 
811
837
  Example:
812
838
  index, tfidf, metadata = load_index("data/index.json")
@@ -830,7 +856,7 @@ def load_index(path: str = "search_index.json") -> tuple[dict, dict, list]:
830
856
  ideal for enriching search results.
831
857
 
832
858
  Args:
833
- soup: Parsed BeautifulSoup object.
859
+ soup : BeautifulSoup - Parsed BeautifulSoup object.
834
860
 
835
861
  Returns:
836
862
  List of parsed JSON-LD objects found on the page.
@@ -863,8 +889,8 @@ def extract_structured_data(soup: BeautifulSoup) -> list[dict]:
863
889
  (sitemapindex elements) one level deep.
864
890
 
865
891
  Args:
866
- base_url: Root URL of the site, e.g. "https://docs.example.com".
867
- session: Optional reusable requests.Session.
892
+ base_url : String - Root URL of the site, e.g. "https://docs.example.com".
893
+ session : requests.Session - Optional reusable requests.Session.
868
894
 
869
895
  Returns:
870
896
  Sorted, deduplicated list of page URLs listed in the sitemap(s).
@@ -914,10 +940,10 @@ def build_sitemap_urls(
914
940
  The matching is case-insensitive and handles whole words only.
915
941
 
916
942
  Args:
917
- snippet: Text excerpt (from generate_snippet).
918
- query: Original user query.
919
- open_tag: Opening HTML tag (default <mark>).
920
- close_tag: Closing HTML tag (default </mark>).
943
+ snippet : String - Text excerpt (from generate_snippet).
944
+ query : String - Original user query.
945
+ open_tag : String - Opening HTML tag (default <mark>).
946
+ close_tag : String - Closing HTML tag (default </mark>).
921
947
 
922
948
  Returns:
923
949
  Snippet string with matching keywords wrapped in tags.
@@ -956,12 +982,12 @@ def highlight_query_terms(
956
982
  unchanged the function exits early, making scheduled re-crawls cheap.
957
983
 
958
984
  Args:
959
- url: Page to check and potentially re-index.
960
- index: Mutable inverted index (modified in place).
961
- tfidf: Mutable TF-IDF store (modified in place).
962
- metadata: Mutable metadata list (modified in place).
963
- fingerprints: Dict mapping url to last known fingerprint (mutable).
964
- session: Optional requests.Session.
985
+ url : String - Page to check and potentially re-index.
986
+ index : Dictionary - Mutable inverted index (modified in place).
987
+ tfidf : Dictionary - Mutable TF-IDF store (modified in place).
988
+ metadata : List - Mutable metadata list (modified in place).
989
+ fingerprints : Dictionary - Dict mapping url to last known fingerprint (mutable).
990
+ session : requests.Session - Optional requests.Session.
965
991
 
966
992
  Returns:
967
993
  True if the page was re-indexed, False if it was unchanged.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: softhauzpy
3
- Version: 0.0.5
3
+ Version: 0.0.6
4
4
  Author: Karen Urate
5
5
  Author-email: karen.urate@softhauz.ca
6
6
  Description-Content-Type: text/markdown
File without changes
File without changes