softhauzpy 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/PKG-INFO +1 -1
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/setup.py +1 -1
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy/main.py +132 -105
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy.egg-info/PKG-INFO +1 -1
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/README.md +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/setup.cfg +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy/__init__.py +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy.egg-info/SOURCES.txt +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy.egg-info/dependency_links.txt +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy.egg-info/requires.txt +0 -0
- {softhauzpy-0.0.4 → softhauzpy-0.0.6}/softhauzpy.egg-info/top_level.txt +0 -0
|
@@ -62,17 +62,17 @@ except Exception:
|
|
|
62
62
|
|
|
63
63
|
Parameters
|
|
64
64
|
----------
|
|
65
|
-
url : The URL to fetch.
|
|
66
|
-
title : Optional document title (included in the returned text header when provided).
|
|
67
|
-
author : Optional document author (included in the returned text header when provided).
|
|
68
|
-
description : Optional description (included in the returned text header when provided).
|
|
69
|
-
creation_date : Optional creation date string (included in the returned text header when provided).
|
|
70
|
-
modified_date : Optional last-modified date string (included in the returned text header when provided).
|
|
65
|
+
url : String - The URL to fetch.
|
|
66
|
+
title : String - Optional document title (included in the returned text header when provided).
|
|
67
|
+
author : String - Optional document author (included in the returned text header when provided).
|
|
68
|
+
description : String - Optional description (included in the returned text header when provided).
|
|
69
|
+
creation_date : String - Optional creation date string (included in the returned text header when provided).
|
|
70
|
+
modified_date : String - Optional last-modified date string (included in the returned text header when provided).
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
Returns
|
|
74
74
|
-------
|
|
75
|
-
|
|
75
|
+
Dictionary with Keys:
|
|
76
76
|
"url" : str
|
|
77
77
|
"title" : str | None
|
|
78
78
|
"author" : str | None
|
|
@@ -86,7 +86,7 @@ except Exception:
|
|
|
86
86
|
------
|
|
87
87
|
requests.HTTPError
|
|
88
88
|
If the server returns a non-2xx status code.
|
|
89
|
-
|
|
89
|
+
|
|
90
90
|
"""
|
|
91
91
|
|
|
92
92
|
|
|
@@ -146,40 +146,32 @@ def extract_pure_text(
|
|
|
146
146
|
returned list contains detailed information about a page.
|
|
147
147
|
|
|
148
148
|
Parameters:
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
- author (str): The author of the page.
|
|
153
|
-
- description (str): A brief description of the page.
|
|
154
|
-
- creation_date (str): The date the page was created.
|
|
155
|
-
- modified_date (str): The date the page was last modified.
|
|
156
|
-
keywords (str): A string containing keywords to search for within the page entries.
|
|
149
|
+
page_list : List - A list of tuples, each representing a page with the following elements:
|
|
150
|
+
(url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
|
|
151
|
+
keywords : String - A string containing keywords to search for within the page entries.
|
|
157
152
|
|
|
158
153
|
Returns:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
- title (str)
|
|
162
|
-
- author (str)
|
|
163
|
-
- description (str)
|
|
164
|
-
- creation_date (str)
|
|
165
|
-
- modified_date (str)
|
|
154
|
+
results : List - A list of tuples matching the search criteria. Each tuple contains:
|
|
155
|
+
(url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
|
|
166
156
|
|
|
167
157
|
Example:
|
|
168
158
|
>>> pages = [
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
>>>
|
|
159
|
+
("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
|
|
160
|
+
("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
|
|
161
|
+
]
|
|
162
|
+
>>> get_search_results_list(pages, "sample")
|
|
173
163
|
[
|
|
174
|
-
|
|
175
|
-
|
|
164
|
+
("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
|
|
165
|
+
("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
|
|
176
166
|
]
|
|
177
167
|
"""
|
|
178
168
|
|
|
179
169
|
|
|
180
170
|
def get_search_results_list(page_list=[], keywords='') -> list:
|
|
181
|
-
results = []
|
|
182
171
|
|
|
172
|
+
results = []
|
|
173
|
+
keywords = keywords.lower()
|
|
174
|
+
|
|
183
175
|
for page in page_list:
|
|
184
176
|
|
|
185
177
|
url = page[0]
|
|
@@ -192,24 +184,23 @@ def get_search_results_list(page_list=[], keywords='') -> list:
|
|
|
192
184
|
description = page[3] or ''
|
|
193
185
|
creation_date = page[4] or ''
|
|
194
186
|
modified_date = page[5] or ''
|
|
195
|
-
|
|
196
|
-
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
if keywords in (extract_pure_text(url, title=title, author=author, description=description, creation_date=creation_date, modified_date=modified_date)["content"]).lower():
|
|
197
190
|
results.append((url, title, author, description, creation_date, modified_date))
|
|
198
191
|
|
|
199
192
|
return results
|
|
200
193
|
|
|
201
|
-
|
|
202
|
-
|
|
203
194
|
"""
|
|
204
195
|
Fetch a single URL with retry logic and polite delay.
|
|
205
196
|
|
|
206
197
|
Args:
|
|
207
|
-
url:
|
|
208
|
-
timeout:
|
|
209
|
-
retries:
|
|
210
|
-
delay:
|
|
211
|
-
headers:
|
|
212
|
-
session:
|
|
198
|
+
url : String - Target URL.
|
|
199
|
+
timeout : Integer - Per-request timeout in seconds.
|
|
200
|
+
retries : Integer - Maximum number of attempts before giving up.
|
|
201
|
+
delay : Float - Seconds to wait between retries (doubles on each failure).
|
|
202
|
+
headers : Dictionary - Optional extra HTTP headers (merged with a default UA).
|
|
203
|
+
session : requests.Session | None - An existing requests.Session (useful for cookie sharing).
|
|
213
204
|
|
|
214
205
|
Returns:
|
|
215
206
|
A requests.Response on success, or None after all retries fail.
|
|
@@ -255,8 +246,8 @@ def fetch_page(
|
|
|
255
246
|
Parse raw HTML into a BeautifulSoup tree.
|
|
256
247
|
|
|
257
248
|
Args:
|
|
258
|
-
html:
|
|
259
|
-
parser: BS4 parser backend ('lxml', 'html.parser', 'html5lib').
|
|
249
|
+
html : String - Raw HTML string or bytes.
|
|
250
|
+
parser : String - BS4 parser backend ('lxml', 'html.parser', 'html5lib').
|
|
260
251
|
|
|
261
252
|
Returns:
|
|
262
253
|
A BeautifulSoup object ready for querying.
|
|
@@ -282,12 +273,12 @@ def parse_html(
|
|
|
282
273
|
language, and author where available.
|
|
283
274
|
|
|
284
275
|
Args:
|
|
285
|
-
soup: Parsed BeautifulSoup object.
|
|
286
|
-
url:
|
|
276
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
277
|
+
url : String - Original URL (used as fallback for canonical).
|
|
287
278
|
|
|
288
279
|
Returns:
|
|
289
|
-
|
|
290
|
-
|
|
280
|
+
Dictionary with Keys:
|
|
281
|
+
title, description, keywords, og_title, og_description, og_image, canonical, lang, author
|
|
291
282
|
|
|
292
283
|
Example:
|
|
293
284
|
meta = extract_metadata(soup, url="https://example.com/page")
|
|
@@ -324,10 +315,10 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
|
|
|
324
315
|
Collect all hyperlinks from a page, normalised to absolute URLs.
|
|
325
316
|
|
|
326
317
|
Args:
|
|
327
|
-
soup:
|
|
328
|
-
base_url:
|
|
329
|
-
same_domain_only:
|
|
330
|
-
exclude_extensions: File extensions to skip (e.g. ['.pdf', '.jpg']).
|
|
318
|
+
soup : BeautifulSoup - Parsed page.
|
|
319
|
+
base_url : String - Absolute URL of the page being parsed.
|
|
320
|
+
same_domain_only : Boolean - When True, filters out external domains.
|
|
321
|
+
exclude_extensions : List - File extensions to skip (e.g. ['.pdf', '.jpg']).
|
|
331
322
|
|
|
332
323
|
Returns:
|
|
333
324
|
Deduplicated list of absolute URL strings.
|
|
@@ -336,8 +327,6 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
|
|
|
336
327
|
links = extract_links(soup, "https://docs.example.com/intro")
|
|
337
328
|
# => ['https://docs.example.com/api', 'https://docs.example.com/faq']
|
|
338
329
|
"""
|
|
339
|
-
|
|
340
|
-
|
|
341
330
|
def extract_links(
|
|
342
331
|
soup: BeautifulSoup,
|
|
343
332
|
base_url: str,
|
|
@@ -377,14 +366,15 @@ def extract_links(
|
|
|
377
366
|
Returns a list of page records for further processing.
|
|
378
367
|
|
|
379
368
|
Args:
|
|
380
|
-
start_url:
|
|
381
|
-
max_pages:
|
|
382
|
-
same_domain_only: Stay within the same hostname.
|
|
383
|
-
delay:
|
|
384
|
-
session:
|
|
369
|
+
start_url : String - Root URL to begin crawling.
|
|
370
|
+
max_pages : Integer - Hard cap on pages visited.
|
|
371
|
+
same_domain_only : Boolean - Stay within the same hostname.
|
|
372
|
+
delay : Float - Polite pause (seconds) between requests.
|
|
373
|
+
session : requests.Session - Reusable requests.Session.
|
|
385
374
|
|
|
386
375
|
Returns:
|
|
387
|
-
List of dicts, each with keys:
|
|
376
|
+
List of dicts, each with keys:
|
|
377
|
+
url, html, soup, status_code
|
|
388
378
|
|
|
389
379
|
Example:
|
|
390
380
|
pages = crawl_site("https://docs.example.com", max_pages=50)
|
|
@@ -443,9 +433,9 @@ def crawl_site(
|
|
|
443
433
|
boundary are still findable.
|
|
444
434
|
|
|
445
435
|
Args:
|
|
446
|
-
text:
|
|
447
|
-
chunk_size: Maximum words per chunk.
|
|
448
|
-
overlap:
|
|
436
|
+
text : String - Full document text.
|
|
437
|
+
chunk_size : Integer - Maximum words per chunk.
|
|
438
|
+
overlap : Integer - Words shared between consecutive chunks.
|
|
449
439
|
|
|
450
440
|
Returns:
|
|
451
441
|
List of text chunks.
|
|
@@ -477,10 +467,10 @@ def chunk_text(
|
|
|
477
467
|
applies Porter stemming (if nltk is installed).
|
|
478
468
|
|
|
479
469
|
Args:
|
|
480
|
-
text:
|
|
481
|
-
remove_stopwords: Filter common English stopwords.
|
|
482
|
-
stem:
|
|
483
|
-
min_token_len:
|
|
470
|
+
text : String - Input string.
|
|
471
|
+
remove_stopwords : Boolean - Filter common English stopwords.
|
|
472
|
+
stem : Boolean - Apply stemming for root-form matching.
|
|
473
|
+
min_token_len : Integer - Discard tokens shorter than this length.
|
|
484
474
|
|
|
485
475
|
Returns:
|
|
486
476
|
List of processed token strings.
|
|
@@ -509,7 +499,39 @@ def tokenize(
|
|
|
509
499
|
tokens = [_stemmer.stem(t) for t in tokens]
|
|
510
500
|
|
|
511
501
|
return tokens
|
|
502
|
+
|
|
512
503
|
|
|
504
|
+
"""
|
|
505
|
+
This is a function to customize the scoring of documents using the tokens of a query as indices.
|
|
506
|
+
Obtain a document score based on query index and unit preference.
|
|
507
|
+
|
|
508
|
+
Args:
|
|
509
|
+
url : String - The URL of the page
|
|
510
|
+
query : String - The query to search.
|
|
511
|
+
remove_stopwords : Boolean - Filter common English stopwords.
|
|
512
|
+
stem : Boolean - Apply stemming for root-form matching.
|
|
513
|
+
unit : Float - The score weight applied to each occurrence of a query token. Formula: count of occurrences x unit. Default is 1.0.
|
|
514
|
+
|
|
515
|
+
Returns:
|
|
516
|
+
Float - The document score: for each query token, its occurrence count in the page text multiplied by unit, summed over all tokens.
|
|
517
|
+
|
|
518
|
+
Example:
|
|
519
|
+
score = get_document_score("https://softhauz.ca/eng", query = "SofthauzPy", remove_stopwords = False, stem = False)
|
|
520
|
+
# 8.0
|
|
521
|
+
"""
|
|
522
|
+
def get_document_score(url:str, *, query:str = '', remove_stopwords: bool = False, stem: bool = False, unit: float = 1.0) -> float:
|
|
523
|
+
|
|
524
|
+
score = 0.0
|
|
525
|
+
text = extract_pure_text(url)["content"].lower()
|
|
526
|
+
tokens = tokenize(query, remove_stopwords = remove_stopwords, stem = stem)
|
|
527
|
+
|
|
528
|
+
for token in tokens:
|
|
529
|
+
token = token.lower()
|
|
530
|
+
occurrence = text.count(token)
|
|
531
|
+
if (occurrence > 0):
|
|
532
|
+
score += (occurrence * unit)
|
|
533
|
+
|
|
534
|
+
return score
|
|
513
535
|
|
|
514
536
|
"""
|
|
515
537
|
Build an inverted index mapping tokens to list of (doc_id, frequency).
|
|
@@ -518,10 +540,9 @@ def tokenize(
|
|
|
518
540
|
without scanning every document on every query.
|
|
519
541
|
|
|
520
542
|
Args:
|
|
521
|
-
documents:
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
id_field: Key used as the document identifier.
|
|
543
|
+
documents : List - List of dicts, each containing at least text_field and id_field.
|
|
544
|
+
text_field : String - Key whose value is the text to index.
|
|
545
|
+
id_field : String - Key used as the document identifier.
|
|
525
546
|
|
|
526
547
|
Returns:
|
|
527
548
|
Dict: { token: [(doc_id, freq), ...] }
|
|
@@ -560,7 +581,7 @@ def build_inverted_index(
|
|
|
560
581
|
id_field: str = "url",
|
|
561
582
|
) -> dict:
|
|
562
583
|
index: dict[str, list[tuple[str, int]]] = defaultdict(list)
|
|
563
|
-
|
|
584
|
+
# {'strictli': [('http://127.0.0.1:8000/eng', 2)], 'necessari': [('http://127.0.0.1:8000/eng', 3)]...}
|
|
564
585
|
for doc in documents:
|
|
565
586
|
doc_id = doc[id_field]
|
|
566
587
|
text = doc.get(text_field, "")
|
|
@@ -579,16 +600,21 @@ def build_inverted_index(
|
|
|
579
600
|
the whole corpus — the backbone of classical relevance ranking.
|
|
580
601
|
|
|
581
602
|
Args:
|
|
582
|
-
documents:
|
|
583
|
-
text_field: Field containing raw text.
|
|
584
|
-
id_field:
|
|
603
|
+
documents : List - Corpus as a list of dicts.
|
|
604
|
+
text_field : String - Field containing raw text.
|
|
605
|
+
id_field : String - Field used as document identifier.
|
|
585
606
|
|
|
586
607
|
Returns:
|
|
587
608
|
Nested dict: { doc_id: { token: tfidf_score } }
|
|
588
609
|
|
|
589
610
|
Example:
|
|
590
|
-
|
|
591
|
-
|
|
611
|
+
documents = [
|
|
612
|
+
{"url": "https://example.com/ai", "text": "Artificial intelligence is transforming the world"},
|
|
613
|
+
{"url": "https://example.com/ml", "text": "Machine learning is a subset of artificial intelligence"},
|
|
614
|
+
{"url": "https://example.com/nlp", "text": "Natural language processing enables machines to understand text"},
|
|
615
|
+
]
|
|
616
|
+
scores = compute_tfidf(documents)
|
|
617
|
+
top = sorted(scores["https://…"].items(), key=lambda x: -x[1])[:5] # these are the 1st five items
|
|
592
618
|
"""
|
|
593
619
|
|
|
594
620
|
|
|
@@ -598,6 +624,7 @@ def compute_tfidf(
|
|
|
598
624
|
text_field: str = "text",
|
|
599
625
|
id_field: str = "url",
|
|
600
626
|
) -> dict[str, dict[str, float]]:
|
|
627
|
+
|
|
601
628
|
N = len(documents)
|
|
602
629
|
tf_store: dict[str, dict[str, float]] = {}
|
|
603
630
|
doc_freq: Counter = Counter()
|
|
@@ -628,10 +655,10 @@ def compute_tfidf(
|
|
|
628
655
|
then returns the top-k results by total relevance score.
|
|
629
656
|
|
|
630
657
|
Args:
|
|
631
|
-
query:
|
|
632
|
-
index:
|
|
633
|
-
tfidf:
|
|
634
|
-
top_k:
|
|
658
|
+
query : String - Raw user query string.
|
|
659
|
+
index : Dictionary - Inverted index from build_inverted_index().
|
|
660
|
+
tfidf : Dictionary - TF-IDF matrix from compute_tfidf().
|
|
661
|
+
top_k : Integer - Maximum results to return.
|
|
635
662
|
|
|
636
663
|
Returns:
|
|
637
664
|
List of (doc_id, score) tuples, highest score first.
|
|
@@ -668,10 +695,10 @@ def search_index(
|
|
|
668
695
|
h1 matches a query is a simple way to improve ranking quality.
|
|
669
696
|
|
|
670
697
|
Args:
|
|
671
|
-
soup: Parsed BeautifulSoup object.
|
|
698
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
672
699
|
|
|
673
700
|
Returns:
|
|
674
|
-
List of dicts
|
|
701
|
+
List of dicts - [{ "level": 1, "text": "Getting Started" }, …]
|
|
675
702
|
|
|
676
703
|
Example:
|
|
677
704
|
headings = extract_headings(soup)
|
|
@@ -693,10 +720,10 @@ def extract_headings(soup: BeautifulSoup) -> list[dict]:
|
|
|
693
720
|
returns the surrounding word window, mimicking Google's snippet style.
|
|
694
721
|
|
|
695
722
|
Args:
|
|
696
|
-
text:
|
|
697
|
-
query:
|
|
698
|
-
window:
|
|
699
|
-
max_length: Hard character cap on the returned snippet.
|
|
723
|
+
text : String - Full document text.
|
|
724
|
+
query : String - User's search query.
|
|
725
|
+
window : Integer - Words to show on each side of the match.
|
|
726
|
+
max_length : Integer - Hard character cap on the returned snippet.
|
|
700
727
|
|
|
701
728
|
Returns:
|
|
702
729
|
A short excerpt string, potentially with leading/trailing ellipsis.
|
|
@@ -744,7 +771,7 @@ def generate_snippet(
|
|
|
744
771
|
unchanged content.
|
|
745
772
|
|
|
746
773
|
Args:
|
|
747
|
-
text: Extracted page text (from extract_text).
|
|
774
|
+
text : String - Extracted page text (from extract_text).
|
|
748
775
|
|
|
749
776
|
Returns:
|
|
750
777
|
64-character hex string (SHA-256 digest).
|
|
@@ -768,10 +795,10 @@ def fingerprint_page(text: str) -> str:
|
|
|
768
795
|
re-crawl every time the search service starts.
|
|
769
796
|
|
|
770
797
|
Args:
|
|
771
|
-
index:
|
|
772
|
-
tfidf:
|
|
773
|
-
metadata: Per-page metadata records (list of dicts).
|
|
774
|
-
path:
|
|
798
|
+
index : Dictionary - Inverted index from build_inverted_index().
|
|
799
|
+
tfidf : Dictionary - TF-IDF scores from compute_tfidf().
|
|
800
|
+
metadata : List - Per-page metadata records (list of dicts).
|
|
801
|
+
path : String - Output file path.
|
|
775
802
|
|
|
776
803
|
Example:
|
|
777
804
|
save_index(index, tfidf, page_metadata, "data/index.json")
|
|
@@ -799,13 +826,13 @@ def save_index(
|
|
|
799
826
|
Deserialise a previously saved search index from JSON.
|
|
800
827
|
|
|
801
828
|
Args:
|
|
802
|
-
path: File path written by save_index().
|
|
829
|
+
path : String - File path written by save_index().
|
|
803
830
|
|
|
804
831
|
Returns:
|
|
805
832
|
Tuple (inverted_index, tfidf, metadata_list).
|
|
806
833
|
|
|
807
834
|
Raises:
|
|
808
|
-
FileNotFoundError
|
|
835
|
+
FileNotFoundError - if the file does not exist.
|
|
809
836
|
|
|
810
837
|
Example:
|
|
811
838
|
index, tfidf, metadata = load_index("data/index.json")
|
|
@@ -829,7 +856,7 @@ def load_index(path: str = "search_index.json") -> tuple[dict, dict, list]:
|
|
|
829
856
|
ideal for enriching search results.
|
|
830
857
|
|
|
831
858
|
Args:
|
|
832
|
-
soup: Parsed BeautifulSoup object.
|
|
859
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
833
860
|
|
|
834
861
|
Returns:
|
|
835
862
|
List of parsed JSON-LD objects found on the page.
|
|
@@ -862,8 +889,8 @@ def extract_structured_data(soup: BeautifulSoup) -> list[dict]:
|
|
|
862
889
|
(sitemapindex elements) one level deep.
|
|
863
890
|
|
|
864
891
|
Args:
|
|
865
|
-
base_url: Root URL of the site, e.g. "https://docs.example.com".
|
|
866
|
-
session:
|
|
892
|
+
base_url : String - Root URL of the site, e.g. "https://docs.example.com".
|
|
893
|
+
session : requests.Session - Optional reusable requests.Session.
|
|
867
894
|
|
|
868
895
|
Returns:
|
|
869
896
|
Sorted, deduplicated list of page URLs listed in the sitemap(s).
|
|
@@ -913,10 +940,10 @@ def build_sitemap_urls(
|
|
|
913
940
|
The matching is case-insensitive and handles whole words only.
|
|
914
941
|
|
|
915
942
|
Args:
|
|
916
|
-
snippet:
|
|
917
|
-
query:
|
|
918
|
-
open_tag:
|
|
919
|
-
close_tag: Closing HTML tag (default </mark>).
|
|
943
|
+
snippet : String - Text excerpt (from generate_snippet).
|
|
944
|
+
query : String - Original user query.
|
|
945
|
+
open_tag : String - Opening HTML tag (default <mark>).
|
|
946
|
+
close_tag : String - Closing HTML tag (default </mark>).
|
|
920
947
|
|
|
921
948
|
Returns:
|
|
922
949
|
Snippet string with matching keywords wrapped in tags.
|
|
@@ -955,12 +982,12 @@ def highlight_query_terms(
|
|
|
955
982
|
unchanged the function exits early, making scheduled re-crawls cheap.
|
|
956
983
|
|
|
957
984
|
Args:
|
|
958
|
-
url:
|
|
959
|
-
index:
|
|
960
|
-
tfidf:
|
|
961
|
-
metadata:
|
|
962
|
-
fingerprints: Dict mapping url to last known fingerprint (mutable).
|
|
963
|
-
session:
|
|
985
|
+
url : String - Page to check and potentially re-index.
|
|
986
|
+
index : Dictionary - Mutable inverted index (modified in place).
|
|
987
|
+
tfidf : Dictionary - Mutable TF-IDF store (modified in place).
|
|
988
|
+
metadata : List - Mutable metadata list (modified in place).
|
|
989
|
+
fingerprints : Dictionary - Dict mapping url to last known fingerprint (mutable).
|
|
990
|
+
session : requests.Session - Optional requests.Session.
|
|
964
991
|
|
|
965
992
|
Returns:
|
|
966
993
|
True if the page was re-indexed, False if it was unchanged.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|