softhauzpy 0.0.5__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/PKG-INFO +1 -1
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/setup.py +1 -1
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy/__init__.py +1 -1
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy/main.py +125 -99
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/PKG-INFO +1 -1
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/README.md +0 -0
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/setup.cfg +0 -0
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/SOURCES.txt +0 -0
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/dependency_links.txt +0 -0
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/requires.txt +0 -0
- {softhauzpy-0.0.5 → softhauzpy-0.0.7}/softhauzpy.egg-info/top_level.txt +0 -0
|
@@ -7,7 +7,7 @@ from .main import extract_structured_data, extract_headings
|
|
|
7
7
|
from .main import extract_metadata, extract_links, extract_pure_text
|
|
8
8
|
|
|
9
9
|
# indexing
|
|
10
|
-
from .main import load_index, save_index, search_index, compute_tfidf, build_inverted_index
|
|
10
|
+
from .main import load_index, save_index, search_index, compute_tfidf, build_inverted_index, get_document_score
|
|
11
11
|
|
|
12
12
|
# crawls and scrapes
|
|
13
13
|
from .main import tokenize, chunk_text, crawl_site, parse_html, fetch_page, get_search_results_list
|
|
@@ -62,17 +62,17 @@ except Exception:
|
|
|
62
62
|
|
|
63
63
|
Parameters
|
|
64
64
|
----------
|
|
65
|
-
url : The URL to fetch.
|
|
66
|
-
title : Optional document title (included in the returned text header when provided).
|
|
67
|
-
author : Optional document author (included in the returned text header when provided).
|
|
68
|
-
description : Optional description (included in the returned text header when provided).
|
|
69
|
-
creation_date : Optional creation date string (included in the returned text header when provided).
|
|
70
|
-
modified_date : Optional last-modified date string (included in the returned text header when provided).
|
|
65
|
+
url : String - The URL to fetch.
|
|
66
|
+
title : String - Optional document title (included in the returned text header when provided).
|
|
67
|
+
author : String - Optional document author (included in the returned text header when provided).
|
|
68
|
+
description : String - Optional description (included in the returned text header when provided).
|
|
69
|
+
creation_date : String - Optional creation date string (included in the returned text header when provided).
|
|
70
|
+
modified_date : String - Optional last-modified date string (included in the returned text header when provided).
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
Returns
|
|
74
74
|
-------
|
|
75
|
-
|
|
75
|
+
Dictionary with Keys:
|
|
76
76
|
"url" : str
|
|
77
77
|
"title" : str | None
|
|
78
78
|
"author" : str | None
|
|
@@ -86,7 +86,7 @@ except Exception:
|
|
|
86
86
|
------
|
|
87
87
|
requests.HTTPError
|
|
88
88
|
If the server returns a non-2xx status code.
|
|
89
|
-
|
|
89
|
+
|
|
90
90
|
"""
|
|
91
91
|
|
|
92
92
|
|
|
@@ -146,33 +146,23 @@ def extract_pure_text(
|
|
|
146
146
|
returned list contains detailed information about a page.
|
|
147
147
|
|
|
148
148
|
Parameters:
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
- author (str): The author of the page.
|
|
153
|
-
- description (str): A brief description of the page.
|
|
154
|
-
- creation_date (str): The date the page was created.
|
|
155
|
-
- modified_date (str): The date the page was last modified.
|
|
156
|
-
keywords (str): A string containing keywords to search for within the page entries.
|
|
149
|
+
page_list : List - A list of tuples, each representing a page with the following elements:
|
|
150
|
+
(url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
|
|
151
|
+
keywords : String - A string containing keywords to search for within the page entries.
|
|
157
152
|
|
|
158
153
|
Returns:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
- title (str)
|
|
162
|
-
- author (str)
|
|
163
|
-
- description (str)
|
|
164
|
-
- creation_date (str)
|
|
165
|
-
- modified_date (str)
|
|
154
|
+
results : List - A list of tuples matching the search criteria. Each tuple contains:
|
|
155
|
+
(url: str, title: str, author: str, description: str, creation_date: str, modified_date: str)
|
|
166
156
|
|
|
167
157
|
Example:
|
|
168
158
|
>>> pages = [
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
159
|
+
("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
|
|
160
|
+
("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
|
|
161
|
+
]
|
|
172
162
|
>>> get_search_results_list(pages, "sample")
|
|
173
163
|
[
|
|
174
|
-
|
|
175
|
-
|
|
164
|
+
("https://softhauz.ca/eng", "Softhauz", "Urate, Karen", "This is the homepage.","2025-02-01", "2026-05-31"),
|
|
165
|
+
("https://another.com", "Another Page", "Bob", "Another sample page", "2023-02-01", "2023-02-05")
|
|
176
166
|
]
|
|
177
167
|
"""
|
|
178
168
|
|
|
@@ -205,12 +195,12 @@ def get_search_results_list(page_list=[], keywords='') -> list:
|
|
|
205
195
|
Fetch a single URL with retry logic and polite delay.
|
|
206
196
|
|
|
207
197
|
Args:
|
|
208
|
-
url:
|
|
209
|
-
timeout:
|
|
210
|
-
retries:
|
|
211
|
-
delay:
|
|
212
|
-
headers:
|
|
213
|
-
session:
|
|
198
|
+
url : String - Target URL.
|
|
199
|
+
timeout : Integer - Per-request timeout in seconds.
|
|
200
|
+
retries : Integer - Maximum number of attempts before giving up.
|
|
201
|
+
delay : Float - Seconds to wait between retries (doubles on each failure).
|
|
202
|
+
headers : Dictionary - Optional extra HTTP headers (merged with a default UA).
|
|
203
|
+
session : requests.Session | None - An existing requests.Session (useful for cookie sharing).
|
|
214
204
|
|
|
215
205
|
Returns:
|
|
216
206
|
A requests.Response on success, or None after all retries fail.
|
|
@@ -256,8 +246,8 @@ def fetch_page(
|
|
|
256
246
|
Parse raw HTML into a BeautifulSoup tree.
|
|
257
247
|
|
|
258
248
|
Args:
|
|
259
|
-
html:
|
|
260
|
-
parser: BS4 parser backend ('lxml', 'html.parser', 'html5lib').
|
|
249
|
+
html : String | Bytes - Raw HTML as a string or bytes.
|
|
250
|
+
parser : String - BS4 parser backend ('lxml', 'html.parser', 'html5lib').
|
|
261
251
|
|
|
262
252
|
Returns:
|
|
263
253
|
A BeautifulSoup object ready for querying.
|
|
@@ -283,12 +273,12 @@ def parse_html(
|
|
|
283
273
|
language, and author where available.
|
|
284
274
|
|
|
285
275
|
Args:
|
|
286
|
-
soup: Parsed BeautifulSoup object.
|
|
287
|
-
url:
|
|
276
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
277
|
+
url : String - Original URL (used as fallback for canonical).
|
|
288
278
|
|
|
289
279
|
Returns:
|
|
290
|
-
|
|
291
|
-
|
|
280
|
+
Dictionary with Keys:
|
|
281
|
+
title, description, keywords, og_title, og_description, og_image, canonical, lang, author
|
|
292
282
|
|
|
293
283
|
Example:
|
|
294
284
|
meta = extract_metadata(soup, url="https://example.com/page")
|
|
@@ -325,10 +315,10 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
|
|
|
325
315
|
Collect all hyperlinks from a page, normalised to absolute URLs.
|
|
326
316
|
|
|
327
317
|
Args:
|
|
328
|
-
soup:
|
|
329
|
-
base_url:
|
|
330
|
-
same_domain_only:
|
|
331
|
-
exclude_extensions: File extensions to skip (e.g. ['.pdf', '.jpg']).
|
|
318
|
+
soup : BeautifulSoup - Parsed page.
|
|
319
|
+
base_url : String - Absolute URL of the page being parsed.
|
|
320
|
+
same_domain_only : Boolean - When True, filters out external domains.
|
|
321
|
+
exclude_extensions : List - File extensions to skip (e.g. ['.pdf', '.jpg']).
|
|
332
322
|
|
|
333
323
|
Returns:
|
|
334
324
|
Deduplicated list of absolute URL strings.
|
|
@@ -337,8 +327,6 @@ def extract_metadata(soup: BeautifulSoup, url: str = "") -> dict:
|
|
|
337
327
|
links = extract_links(soup, "https://docs.example.com/intro")
|
|
338
328
|
# => ['https://docs.example.com/api', 'https://docs.example.com/faq']
|
|
339
329
|
"""
|
|
340
|
-
|
|
341
|
-
|
|
342
330
|
def extract_links(
|
|
343
331
|
soup: BeautifulSoup,
|
|
344
332
|
base_url: str,
|
|
@@ -378,14 +366,15 @@ def extract_links(
|
|
|
378
366
|
Returns a list of page records for further processing.
|
|
379
367
|
|
|
380
368
|
Args:
|
|
381
|
-
start_url:
|
|
382
|
-
max_pages:
|
|
383
|
-
same_domain_only: Stay within the same hostname.
|
|
384
|
-
delay:
|
|
385
|
-
session:
|
|
369
|
+
start_url : String - Root URL to begin crawling.
|
|
370
|
+
max_pages : Integer - Hard cap on pages visited.
|
|
371
|
+
same_domain_only : Boolean - Stay within the same hostname.
|
|
372
|
+
delay : Float - Polite pause (seconds) between requests.
|
|
373
|
+
session : requests.Session - Reusable requests.Session.
|
|
386
374
|
|
|
387
375
|
Returns:
|
|
388
|
-
List of dicts, each with keys:
|
|
376
|
+
List of dicts, each with keys:
|
|
377
|
+
url, html, soup, status_code
|
|
389
378
|
|
|
390
379
|
Example:
|
|
391
380
|
pages = crawl_site("https://docs.example.com", max_pages=50)
|
|
@@ -444,9 +433,9 @@ def crawl_site(
|
|
|
444
433
|
boundary are still findable.
|
|
445
434
|
|
|
446
435
|
Args:
|
|
447
|
-
text:
|
|
448
|
-
chunk_size: Maximum words per chunk.
|
|
449
|
-
overlap:
|
|
436
|
+
text : String - Full document text.
|
|
437
|
+
chunk_size : Integer - Maximum words per chunk.
|
|
438
|
+
overlap : Integer - Words shared between consecutive chunks.
|
|
450
439
|
|
|
451
440
|
Returns:
|
|
452
441
|
List of text chunks.
|
|
@@ -478,10 +467,10 @@ def chunk_text(
|
|
|
478
467
|
applies Porter stemming (if nltk is installed).
|
|
479
468
|
|
|
480
469
|
Args:
|
|
481
|
-
text:
|
|
482
|
-
remove_stopwords: Filter common English stopwords.
|
|
483
|
-
stem:
|
|
484
|
-
min_token_len:
|
|
470
|
+
text : String - Input string.
|
|
471
|
+
remove_stopwords : Boolean - Filter common English stopwords.
|
|
472
|
+
stem : Boolean - Apply stemming for root-form matching.
|
|
473
|
+
min_token_len : Integer - Discard tokens shorter than this length.
|
|
485
474
|
|
|
486
475
|
Returns:
|
|
487
476
|
List of processed token strings.
|
|
@@ -510,7 +499,39 @@ def tokenize(
|
|
|
510
499
|
tokens = [_stemmer.stem(t) for t in tokens]
|
|
511
500
|
|
|
512
501
|
return tokens
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
"""
|
|
505
|
+
Score a page by counting how often each token of a query occurs in the text extracted from the page.
|
|
506
|
+
Each occurrence of a query token adds `unit` to the score, so the result is (total occurrences x unit).
|
|
507
|
+
|
|
508
|
+
Args:
|
|
509
|
+
url : String - The URL of the page to score.
|
|
510
|
+
query : String - The query to search.
|
|
511
|
+
remove_stopwords : Boolean - Filter common English stopwords.
|
|
512
|
+
stem : Boolean - Apply stemming for root-form matching.
|
|
513
|
+
unit : Float - The weight added to the score for each occurrence of a query token. Formula: count of occurrences x unit. Default is 1.0.
|
|
513
514
|
|
|
515
|
+
Returns:
|
|
516
|
+
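Float - The document score: the sum, over all query tokens, of (occurrences of the token in the page text x unit). 0.0 when no token occurs.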
|
|
517
|
+
|
|
518
|
+
Example:
|
|
519
|
+
score = get_document_score("https://softhauz.ca/eng", query = "SofthauzPy", remove_stopwords = False, stem = False)
|
|
520
|
+
# 8.0
|
|
521
|
+
"""
|
|
522
|
+
def get_document_score(url:str, *, query:str = '', remove_stopwords: bool = False, stem: bool = False, unit: float = 1.0) -> float:
|
|
523
|
+
|
|
524
|
+
score = 0.0
|
|
525
|
+
text = extract_pure_text(url)["content"].lower()
|
|
526
|
+
tokens = tokenize(query, remove_stopwords = remove_stopwords, stem = stem)
|
|
527
|
+
|
|
528
|
+
for token in tokens:
|
|
529
|
+
token = token.lower()
|
|
530
|
+
occurrence = text.count(token)
|
|
531
|
+
if (occurrence > 0):
|
|
532
|
+
score += (occurrence * unit)
|
|
533
|
+
|
|
534
|
+
return score
|
|
514
535
|
|
|
515
536
|
"""
|
|
516
537
|
Build an inverted index mapping tokens to list of (doc_id, frequency).
|
|
@@ -519,10 +540,9 @@ def tokenize(
|
|
|
519
540
|
without scanning every document on every query.
|
|
520
541
|
|
|
521
542
|
Args:
|
|
522
|
-
documents:
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
id_field: Key used as the document identifier.
|
|
543
|
+
documents : List - List of dicts, each containing at least text_field and id_field.
|
|
544
|
+
text_field : String - Key whose value is the text to index.
|
|
545
|
+
id_field : String - Key used as the document identifier.
|
|
526
546
|
|
|
527
547
|
Returns:
|
|
528
548
|
Dict: { token: [(doc_id, freq), ...] }
|
|
@@ -561,7 +581,7 @@ def build_inverted_index(
|
|
|
561
581
|
id_field: str = "url",
|
|
562
582
|
) -> dict:
|
|
563
583
|
index: dict[str, list[tuple[str, int]]] = defaultdict(list)
|
|
564
|
-
|
|
584
|
+
# {'strictli': [('http://127.0.0.1:8000/eng', 2)], 'necessari': [('http://127.0.0.1:8000/eng', 3)]...}
|
|
565
585
|
for doc in documents:
|
|
566
586
|
doc_id = doc[id_field]
|
|
567
587
|
text = doc.get(text_field, "")
|
|
@@ -580,16 +600,21 @@ def build_inverted_index(
|
|
|
580
600
|
the whole corpus — the backbone of classical relevance ranking.
|
|
581
601
|
|
|
582
602
|
Args:
|
|
583
|
-
documents:
|
|
584
|
-
text_field: Field containing raw text.
|
|
585
|
-
id_field:
|
|
603
|
+
documents : List - Corpus as a list of dicts.
|
|
604
|
+
text_field : String - Field containing raw text.
|
|
605
|
+
id_field : String - Field used as document identifier.
|
|
586
606
|
|
|
587
607
|
Returns:
|
|
588
608
|
Nested dict: { doc_id: { token: tfidf_score } }
|
|
589
609
|
|
|
590
610
|
Example:
|
|
591
|
-
|
|
592
|
-
|
|
611
|
+
documents = [
|
|
612
|
+
{"url": "https://example.com/ai", "text": "Artificial intelligence is transforming the world"},
|
|
613
|
+
{"url": "https://example.com/ml", "text": "Machine learning is a subset of artificial intelligence"},
|
|
614
|
+
{"url": "https://example.com/nlp", "text": "Natural language processing enables machines to understand text"},
|
|
615
|
+
]
|
|
616
|
+
scores = compute_tfidf(documents)
|
|
617
|
+
top = sorted(scores["https://example.com/ai"].items(), key=lambda x: -x[1])[:5] # the five highest-scoring tokens for this document
|
|
593
618
|
"""
|
|
594
619
|
|
|
595
620
|
|
|
@@ -599,6 +624,7 @@ def compute_tfidf(
|
|
|
599
624
|
text_field: str = "text",
|
|
600
625
|
id_field: str = "url",
|
|
601
626
|
) -> dict[str, dict[str, float]]:
|
|
627
|
+
|
|
602
628
|
N = len(documents)
|
|
603
629
|
tf_store: dict[str, dict[str, float]] = {}
|
|
604
630
|
doc_freq: Counter = Counter()
|
|
@@ -629,10 +655,10 @@ def compute_tfidf(
|
|
|
629
655
|
then returns the top-k results by total relevance score.
|
|
630
656
|
|
|
631
657
|
Args:
|
|
632
|
-
query:
|
|
633
|
-
index:
|
|
634
|
-
tfidf:
|
|
635
|
-
top_k:
|
|
658
|
+
query : String - Raw user query string.
|
|
659
|
+
index : Dictionary - Inverted index from build_inverted_index().
|
|
660
|
+
tfidf : Dictionary - TF-IDF matrix from compute_tfidf().
|
|
661
|
+
top_k : Integer - Maximum results to return.
|
|
636
662
|
|
|
637
663
|
Returns:
|
|
638
664
|
List of (doc_id, score) tuples, highest score first.
|
|
@@ -669,10 +695,10 @@ def search_index(
|
|
|
669
695
|
h1 matches a query is a simple way to improve ranking quality.
|
|
670
696
|
|
|
671
697
|
Args:
|
|
672
|
-
soup: Parsed BeautifulSoup object.
|
|
698
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
673
699
|
|
|
674
700
|
Returns:
|
|
675
|
-
List of dicts
|
|
701
|
+
List of dicts - [{ "level": 1, "text": "Getting Started" }, …]
|
|
676
702
|
|
|
677
703
|
Example:
|
|
678
704
|
headings = extract_headings(soup)
|
|
@@ -694,10 +720,10 @@ def extract_headings(soup: BeautifulSoup) -> list[dict]:
|
|
|
694
720
|
returns the surrounding word window, mimicking Google's snippet style.
|
|
695
721
|
|
|
696
722
|
Args:
|
|
697
|
-
text:
|
|
698
|
-
query:
|
|
699
|
-
window:
|
|
700
|
-
max_length: Hard character cap on the returned snippet.
|
|
723
|
+
text : String - Full document text.
|
|
724
|
+
query : String - User's search query.
|
|
725
|
+
window : Integer - Words to show on each side of the match.
|
|
726
|
+
max_length : Integer - Hard character cap on the returned snippet.
|
|
701
727
|
|
|
702
728
|
Returns:
|
|
703
729
|
A short excerpt string, potentially with leading/trailing ellipsis.
|
|
@@ -745,7 +771,7 @@ def generate_snippet(
|
|
|
745
771
|
unchanged content.
|
|
746
772
|
|
|
747
773
|
Args:
|
|
748
|
-
text: Extracted page text (from extract_text).
|
|
774
|
+
text : String - Extracted page text (from extract_text).
|
|
749
775
|
|
|
750
776
|
Returns:
|
|
751
777
|
64-character hex string (SHA-256 digest).
|
|
@@ -769,10 +795,10 @@ def fingerprint_page(text: str) -> str:
|
|
|
769
795
|
re-crawl every time the search service starts.
|
|
770
796
|
|
|
771
797
|
Args:
|
|
772
|
-
index:
|
|
773
|
-
tfidf:
|
|
774
|
-
metadata: Per-page metadata records (list of dicts).
|
|
775
|
-
path:
|
|
798
|
+
index : Dictionary - Inverted index from build_inverted_index().
|
|
799
|
+
tfidf : Dictionary - TF-IDF scores from compute_tfidf().
|
|
800
|
+
metadata : List - Per-page metadata records (list of dicts).
|
|
801
|
+
path : String - Output file path.
|
|
776
802
|
|
|
777
803
|
Example:
|
|
778
804
|
save_index(index, tfidf, page_metadata, "data/index.json")
|
|
@@ -800,13 +826,13 @@ def save_index(
|
|
|
800
826
|
Deserialise a previously saved search index from JSON.
|
|
801
827
|
|
|
802
828
|
Args:
|
|
803
|
-
path: File path written by save_index().
|
|
829
|
+
path : String - File path written by save_index().
|
|
804
830
|
|
|
805
831
|
Returns:
|
|
806
832
|
Tuple (inverted_index, tfidf, metadata_list).
|
|
807
833
|
|
|
808
834
|
Raises:
|
|
809
|
-
FileNotFoundError
|
|
835
|
+
FileNotFoundError - if the file does not exist.
|
|
810
836
|
|
|
811
837
|
Example:
|
|
812
838
|
index, tfidf, metadata = load_index("data/index.json")
|
|
@@ -830,7 +856,7 @@ def load_index(path: str = "search_index.json") -> tuple[dict, dict, list]:
|
|
|
830
856
|
ideal for enriching search results.
|
|
831
857
|
|
|
832
858
|
Args:
|
|
833
|
-
soup: Parsed BeautifulSoup object.
|
|
859
|
+
soup : BeautifulSoup - Parsed BeautifulSoup object.
|
|
834
860
|
|
|
835
861
|
Returns:
|
|
836
862
|
List of parsed JSON-LD objects found on the page.
|
|
@@ -863,8 +889,8 @@ def extract_structured_data(soup: BeautifulSoup) -> list[dict]:
|
|
|
863
889
|
(sitemapindex elements) one level deep.
|
|
864
890
|
|
|
865
891
|
Args:
|
|
866
|
-
base_url: Root URL of the site, e.g. "https://docs.example.com".
|
|
867
|
-
session:
|
|
892
|
+
base_url : String - Root URL of the site, e.g. "https://docs.example.com".
|
|
893
|
+
session : requests.Session - Optional reusable requests.Session.
|
|
868
894
|
|
|
869
895
|
Returns:
|
|
870
896
|
Sorted, deduplicated list of page URLs listed in the sitemap(s).
|
|
@@ -914,10 +940,10 @@ def build_sitemap_urls(
|
|
|
914
940
|
The matching is case-insensitive and handles whole words only.
|
|
915
941
|
|
|
916
942
|
Args:
|
|
917
|
-
snippet:
|
|
918
|
-
query:
|
|
919
|
-
open_tag:
|
|
920
|
-
close_tag: Closing HTML tag (default </mark>).
|
|
943
|
+
snippet : String - Text excerpt (from generate_snippet).
|
|
944
|
+
query : String - Original user query.
|
|
945
|
+
open_tag : String - Opening HTML tag (default <mark>).
|
|
946
|
+
close_tag : String - Closing HTML tag (default </mark>).
|
|
921
947
|
|
|
922
948
|
Returns:
|
|
923
949
|
Snippet string with matching keywords wrapped in tags.
|
|
@@ -956,12 +982,12 @@ def highlight_query_terms(
|
|
|
956
982
|
unchanged the function exits early, making scheduled re-crawls cheap.
|
|
957
983
|
|
|
958
984
|
Args:
|
|
959
|
-
url:
|
|
960
|
-
index:
|
|
961
|
-
tfidf:
|
|
962
|
-
metadata:
|
|
963
|
-
fingerprints: Dict mapping url to last known fingerprint (mutable).
|
|
964
|
-
session:
|
|
985
|
+
url : String - Page to check and potentially re-index.
|
|
986
|
+
index : Dictionary - Mutable inverted index (modified in place).
|
|
987
|
+
tfidf : Dictionary - Mutable TF-IDF store (modified in place).
|
|
988
|
+
metadata : List - Mutable metadata list (modified in place).
|
|
989
|
+
fingerprints : Dictionary - Dict mapping url to last known fingerprint (mutable).
|
|
990
|
+
session : requests.Session - Optional requests.Session.
|
|
965
991
|
|
|
966
992
|
Returns:
|
|
967
993
|
True if the page was re-indexed, False if it was unchanged.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|