webscout-3.4-py3-none-any.whl → webscout-3.6-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.
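Most of the diff below is the wheel's RECORD file, where each entry has the form path,sha256=<digest>,<size>: the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing "=" padding stripped, and the size is in bytes. As a rough illustrative sketch (not part of either package), such an entry can be recomputed for a file inside an unpacked wheel like this:

import base64
import hashlib
from pathlib import Path

def record_entry(relpath: str) -> str:
    # Rebuild a RECORD line (path,sha256=<digest>,<size>) for a local file.
    data = Path(relpath).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{relpath},sha256={digest.decode()},{len(data)}"

# Hypothetical usage from the root of an extracted wheel:
# print(record_entry("webscout/version.py"))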
webscout-3.4.dist-info/RECORD → webscout-3.6.dist-info/RECORD
@@ -1,21 +1,9 @@
- DeepWEBS/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- DeepWEBS/documents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- DeepWEBS/documents/query_results_extractor.py,sha256=whd0NKLpcxW_6q3SkBOhMukr1K_c1PPYN92rf5EHRPM,4049
- DeepWEBS/documents/webpage_content_extractor.py,sha256=P4yHCkPTiBvMbORd8SKVt64rQFPJuj3iixcQoRU34Lw,5272
- DeepWEBS/networks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- DeepWEBS/networks/filepath_converter.py,sha256=JKMBew1TYe4TVoGTqgTWerq2Pam49_9u9TVUFCTDQyk,3183
- DeepWEBS/networks/google_searcher.py,sha256=-AdIpVkRgemsARnOt8WPkF2Id1baVlqDHyqX2qz8Aew,1966
- DeepWEBS/networks/network_configs.py,sha256=-Hb78_7SBx32h219FnU14qcHTvBdDUf_QAU6-RTL_e0,726
- DeepWEBS/networks/webpage_fetcher.py,sha256=vRB9T3o-nMgrMkG2NPHTDctNeXaPSKCmBXqu189h2ZI,3590
- DeepWEBS/utilsdw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- DeepWEBS/utilsdw/enver.py,sha256=vpI7s4_o_VL9govSryOv-z1zYK3pTEW3-H9QNN8JYtc,2472
- DeepWEBS/utilsdw/logger.py,sha256=Z0nFUcEGyU8r28yKiIyvEtO26xxpmJgbvNToTfwZecc,8174
  webscout/AIauto.py,sha256=xPGr_Z0h27XXNh4Wiufjn9TksDOqxqlaGcLUYKNP55w,18246
  webscout/AIbase.py,sha256=GoHbN8r0gq2saYRZv6LA-Fr9Jlcjv80STKFXUq2ZeGU,4710
- webscout/AIutel.py,sha256=Ghe9w1gqnCZTwjc3NzXymSamY3nP0zEep4NiATW32Qk,33454
- webscout/DWEBS.py,sha256=QT-7-dUgWhQ_H7EVZD53AVyXxyskoPMKCkFIpzkN56Q,7332
+ webscout/AIutel.py,sha256=MMfUvTQXYDtaFXsXtwKgv9V_qMK6WgOxdx7Wagdm2Lw,33542
+ webscout/DWEBS.py,sha256=QLuT1IKu0lnwdl7W6c-ctBAO7Jj0Zk3PYm6-13BC7rU,25740
  webscout/LLM.py,sha256=LbGCZdJf8A5dwfoGS4tyy39tAh5BDdhMZP0ScKaaQfU,4184
- webscout/__init__.py,sha256=pOqM5UGmljQN4jTrL3kyrjMv66VuTPyzfNlXZra9KLQ,1872
+ webscout/__init__.py,sha256=ugx2Z3KX710527ri6AWPRwme9HjuiQezAnwyltqzr4c,2038
  webscout/__main__.py,sha256=ZtTRgsRjUi2JOvYFLF1ZCh55Sdoz94I-BS-TlJC7WDU,126
  webscout/async_providers.py,sha256=holBv5SxanxVXc_92CBBaXHlB2IakB_fHnhyZaFjYF8,684
  webscout/cli.py,sha256=enw_dPTCG3sNC1TXt96XccnpRmF4Etr99nh-RbGYags,18784
@@ -27,14 +15,15 @@ webscout/transcriber.py,sha256=EddvTSq7dPJ42V3pQVnGuEiYQ7WjJ9uyeR9kMSxN7uY,20622
  webscout/utils.py,sha256=CxeXvp0rWIulUrEaPZMaNfg_tSuQLRSV8uuHA2chyKE,2603
  webscout/version.py,sha256=pTj22SSXb7rieyMXdGyEFmljJmZMa6FL_DaETjfeLwA,23
  webscout/voice.py,sha256=0QjXTHAQmCK07IDZXRc7JXem47cnPJH7u3X0sVP1-UQ,967
- webscout/webai.py,sha256=hnRfUI9AT3MgltP68bAmW5Tq4_aWcYytYeTFEsgS7u0,85991
+ webscout/webai.py,sha256=qkvhYdyF5wNdmW4rNdH3RbfQxabEWlGvCyAk2SbH04k,86602
  webscout/webscout_search.py,sha256=lFAot1-Qil_YfXieeLakDVDEX8Ckcima4ueXdOYwiMc,42804
  webscout/webscout_search_async.py,sha256=dooKGwLm0cwTml55Vy6NHPPY-nymEqX2h8laX94Zg5A,14537
+ webscout/websx_search.py,sha256=n-qVwiHozJEF-GFRPcAfh4k1d_tscTmDe1dNL-1ngcU,12094
  webscout/Local/__init__.py,sha256=RN6klpbabPGNX2YzPm_hdeUcQvieUwvJt22uAO2RKSM,238
  webscout/Local/_version.py,sha256=hC_EHWR519ZOsyRw9i6gXEfU5IAIR_B9d3THLVmkWXw,83
  webscout/Local/formats.py,sha256=BiZZSoN3e8S6-S-ykBL9ogSUs0vK11GaZ3ghc9U8GRk,18994
  webscout/Local/model.py,sha256=T_bzNNrxEyOyLyhp6fKwiuVBBkXC2a37LzJVCxFIxOU,30710
- webscout/Local/rawdog.py,sha256=LtA7bck2HyvWmovuaG86Iiquiz7XiMcxBlebo9IuGBY,35744
+ webscout/Local/rawdog.py,sha256=ojY_O8Vb1KvR34OwWdfLgllgaAK_7HMf64ElMATvCXs,36689
  webscout/Local/samplers.py,sha256=qXwU4eLXER-2aCYzcJcTgA6BeFmi5GMpTDUX1C9pTN4,4372
  webscout/Local/thread.py,sha256=Lyf_N2CaGAn2usSWSiUXLPAgpWub8vUu_tgFgtnvZVA,27408
  webscout/Local/utils.py,sha256=CSt9IqHhVGk_nJEnKvSFbLhC5nNf01e0MtwpgMmF9pA,6197
@@ -43,26 +32,28 @@ webscout/Provider/Berlin4h.py,sha256=zMpmWmdFCbcE3UWB-F9xbbTWZTfx4GnjnRf6sDoaiC0
  webscout/Provider/Blackboxai.py,sha256=HUk0moEGsgGvidD1LF9tbfaKdx7bPnGU_SrYPdcfHU8,17182
  webscout/Provider/ChatGPTUK.py,sha256=qmuCb_a71GNE5LelOb5AKJUBndvj7soebiNey4VdDvE,8570
  webscout/Provider/Cohere.py,sha256=IXnRosYOaMAA65nvsKmN6ZkJGSdZFYQYBidzuNaCqX8,8711
- webscout/Provider/Deepseek.py,sha256=HKsC-ePLSPqcrQbafy-IzR0BNqId3LfiIEhk9j9oTs4,10285
+ webscout/Provider/Deepinfra.py,sha256=kVnWARJdEtIeIsZwGw3POq8B2dO87bDcJso3uOeCeOA,18750
+ webscout/Provider/Deepseek.py,sha256=pnOB44ObuOfAsoi_bUGUvha3tfwd0rTJ9rnX-14QkL4,10550
  webscout/Provider/Gemini.py,sha256=_4DHWvlWuNAmVHPwHB1RjmryjTZZCthLa6lvPEHLvkQ,8451
  webscout/Provider/Groq.py,sha256=QfgP3hKUcqq5vUA4Pzuu3HAgpJkKwLWNjjsnxtkCYd8,21094
  webscout/Provider/Koboldai.py,sha256=KwWx2yPlvT9BGx37iNvSbgzWkJ9I8kSOmeg7sL1hb0M,15806
  webscout/Provider/Leo.py,sha256=wbuDR-vFjLptfRC6yDlk74tINqNvCOzpISsK92lIgGg,19987
  webscout/Provider/Llama2.py,sha256=gVMotyiBaDSqliwuDtFefHoOBn9V5m5Ze_YVtV0trt8,17525
- webscout/Provider/OpenGPT.py,sha256=SJskNkUGNNb3zdZY50xokzW-rwcSlHw8EN6WVv70dg8,18890
+ webscout/Provider/OpenGPT.py,sha256=ZymwLgNJSPlGZHW3msMlnRR7NxmALqJw9yuToqrRrhw,35515
  webscout/Provider/Openai.py,sha256=SjfVOwY94unVnXhvN0Fkome-q2-wi4mPJk_vCGq5Fjc,20617
  webscout/Provider/Perplexity.py,sha256=CPdKqkdlVejXDcf1uycNO4LPCVNUADSCetvyJEGepSw,8826
- webscout/Provider/Phind.py,sha256=NXiYNRs8h_6c3AGOUqFrvN01odBIQ_psSUBPaHiAUoE,19907
+ webscout/Provider/Phind.py,sha256=bkgKVtggRJSbJAG1tXviW9BqDvcgqPBlSr88Q6rlFHw,39226
  webscout/Provider/Poe.py,sha256=ObUxa-Fa2Dq7sJcV0hc65m09StS9uWsB2-bR2rSjXDY,7510
  webscout/Provider/Reka.py,sha256=F0ZXENkhARprj5biK3mRxwiuPH0BW3ga7EWsi8agbtE,8917
  webscout/Provider/ThinkAnyAI.py,sha256=_qFjj0djxxrranyEY33w14oizyRjzlVwMv_hzvVtwNc,11616
+ webscout/Provider/VTLchat.py,sha256=_sErGr-wOi16ZAfiGOo0bPsAEMkjzzwreEsIqjIZMIU,10041
  webscout/Provider/Xjai.py,sha256=BIlk2ouz9Kh_0Gg9hPvTqhI7XtcmWdg5vHSX_4uGrIs,9039
  webscout/Provider/Yepchat.py,sha256=2Eit-A7w1ph1GQKNQuur_yaDzI64r0yBGxCIjDefJxQ,19875
  webscout/Provider/Youchat.py,sha256=UVGBuGSjv4uRibn1xflmCjYcfrRTKnDvX3adhag6T98,7976
- webscout/Provider/__init__.py,sha256=nmZYPpXyp8s0xn4UO9IMhkV7-RfGqMdOa3CRmG0uuTg,1510
- webscout-3.4.dist-info/LICENSE.md,sha256=9P0imsudI7MEvZe2pOcg8rKBn6E5FGHQ-riYozZI-Bk,2942
- webscout-3.4.dist-info/METADATA,sha256=t8Hfgd5KshA4OOid1ovzYd83p890DBmZWGFWFn1pTFE,67227
- webscout-3.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- webscout-3.4.dist-info/entry_points.txt,sha256=Hh4YIIjvkqB9SVxZ2ri4DZUkgEu_WF_5_r_nZDIvfG8,73
- webscout-3.4.dist-info/top_level.txt,sha256=OD5YKy6Y3hldL7SmuxsiEDxAG4LgdSSWwzYk22MF9fk,18
- webscout-3.4.dist-info/RECORD,,
+ webscout/Provider/__init__.py,sha256=RaMdtYv7eQJ2vB8jXUHrkfNbx2DgRjbwc6DI40cOH1A,1809
+ webscout-3.6.dist-info/LICENSE.md,sha256=9P0imsudI7MEvZe2pOcg8rKBn6E5FGHQ-riYozZI-Bk,2942
+ webscout-3.6.dist-info/METADATA,sha256=YhKU0lcCGrDmUA-L1wy7ETzzmkWmbY5MvEfZg_6GlcU,67341
+ webscout-3.6.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
+ webscout-3.6.dist-info/entry_points.txt,sha256=Hh4YIIjvkqB9SVxZ2ri4DZUkgEu_WF_5_r_nZDIvfG8,73
+ webscout-3.6.dist-info/top_level.txt,sha256=nYIw7OKBQDr_Z33IzZUKidRD3zQEo8jOJYkMVMeN334,9
+ webscout-3.6.dist-info/RECORD,,
webscout-3.4.dist-info/WHEEL → webscout-3.6.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.42.0)
+ Generator: setuptools (70.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
webscout-3.4.dist-info/top_level.txt → webscout-3.6.dist-info/top_level.txt
@@ -1,2 +1 @@
- DeepWEBS
  webscout
DeepWEBS/__init__.py DELETED
File without changes
DeepWEBS/documents/__init__.py DELETED
File without changes
DeepWEBS/documents/query_results_extractor.py DELETED
@@ -1,99 +0,0 @@
- from bs4 import BeautifulSoup
- from pathlib import Path
- from DeepWEBS.utilsdw.logger import logger
-
- class QueryResultsExtractor:
-     def __init__(self) -> None:
-         self.query_results = []
-         self.related_questions = []
-
-     def load_html(self, html_path):
-         try:
-             with open(html_path, "r", encoding="utf-8") as f:
-                 html = f.read()
-             self.soup = BeautifulSoup(html, "html.parser")
-         except FileNotFoundError:
-             logger.error(f"File not found: {html_path}")
-         except Exception as e:
-             logger.error(f"Error loading HTML: {e}")
-
-     def extract_query_results(self):
-         try:
-             self.query = self.soup.find("textarea").text.strip()
-             query_result_elements = self.soup.find_all("div", class_="g")
-             for idx, result in enumerate(query_result_elements):
-                 try:
-                     site = result.find("cite").find_previous("span").text.strip()
-                     url = result.find("a")["href"]
-                     title = result.find("h3").text.strip()
-                     abstract_element_conditions = [
-                         {"data-sncf": "1"},
-                         {"class_": "ITZIwc"},
-                     ]
-                     for condition in abstract_element_conditions:
-                         abstract_element = result.find("div", condition)
-                         if abstract_element is not None:
-                             abstract = abstract_element.text.strip()
-                             break
-                     else:
-                         abstract = ""
-                     logger.mesg(
-                         f"{title}\n"
-                         f" - {site}\n"
-                         f" - {url}\n"
-                         f" - {abstract}\n"
-                         f"\n"
-                     )
-                     self.query_results.append(
-                         {
-                             "title": title,
-                             "site": site,
-                             "url": url,
-                             "abstract": abstract,
-                             "index": idx,
-                             "type": "web",
-                         }
-                     )
-                 except Exception as e:
-                     logger.error(f"Error extracting query result: {e}")
-             logger.success(f"- {len(query_result_elements)} query results")
-         except Exception as e:
-             logger.error(f"Error extracting query results: {e}")
-
-     def extract_related_questions(self):
-         try:
-             related_question_elements = self.soup.find_all(
-                 "div", class_="related-question-pair"
-             )
-             for question_element in related_question_elements:
-                 try:
-                     question = question_element.find("span").text.strip()
-                     print(question)
-                     self.related_questions.append(question)
-                 except Exception as e:
-                     logger.error(f"Error extracting related question: {e}")
-             logger.success(f"- {len(self.related_questions)} related questions")
-         except Exception as e:
-             logger.error(f"Error extracting related questions: {e}")
-
-     def extract(self, html_path):
-         self.load_html(html_path)
-         self.extract_query_results()
-         self.extract_related_questions()
-         self.search_results = {
-             "query": self.query,
-             "query_results": self.query_results,
-             "related_questions": self.related_questions,
-         }
-         return self.search_results
-
-
- if __name__ == "__main__":
-     html_path_root = Path(__file__).parents[1] / "files"
-     html_filename = "python_tutorials"
-     html_path = html_path_root / f"{html_filename}.html"
-     extractor = QueryResultsExtractor()
-     try:
-         extractor.extract(html_path)
-     except Exception as e:
-         logger.error(f"Error in main function: {e}")
DeepWEBS/documents/webpage_content_extractor.py DELETED
@@ -1,145 +0,0 @@
- import concurrent.futures
- import re
- from pathlib import Path
- from pprint import pprint
- from bs4 import BeautifulSoup
- from tiktoken import get_encoding as tiktoken_get_encoding
- from DeepWEBS.utilsdw.logger import logger
- from markdownify import markdownify
- from DeepWEBS.networks.network_configs import IGNORE_TAGS, IGNORE_CLASSES
- from termcolor import colored
-
-
- class WebpageContentExtractor:
-     def __init__(self):
-         self.tokenizer = tiktoken_get_encoding("cl100k_base")
-
-     def count_tokens(self, text):
-         tokens = self.tokenizer.encode(text)
-         token_count = len(tokens)
-         return token_count
-
-     def html_to_markdown(self, html_str, ignore_links=True):
-         if ignore_links:
-             markdown_str = markdownify(html_str, strip="a")
-         else:
-             markdown_str = markdownify(html_str)
-         markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
-
-         self.markdown_token_count = self.count_tokens(markdown_str)
-         logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
-
-         self.markdown_str = markdown_str
-
-         return self.markdown_str
-
-     def remove_elements_from_html(self, html_str):
-         soup = BeautifulSoup(html_str, "html.parser")
-         ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
-         ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
-         removed_element_counts = 0
-         for element in soup.find_all():
-             class_str = ""
-             id_str = ""
-             try:
-                 class_attr = element.get("class", [])
-                 if class_attr:
-                     class_str = " ".join(list(class_attr))
-                     if id_str:
-                         class_str = f"{class_str} {id_str}"
-             except:
-                 pass
-
-             try:
-                 id_str = element.get("id", "")
-             except:
-                 pass
-
-             if (
-                 (not element.text.strip())
-                 or (element.name in IGNORE_TAGS)
-                 or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
-                 or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
-             ):
-                 element.decompose()
-                 removed_element_counts += 1
-
-         logger.mesg(
-             f"- Elements: "
-             f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
-         )
-
-         html_str = str(soup)
-         self.html_str = html_str
-
-         return self.html_str
-
-     def extract(self, html_path):
-         logger.note(f"Extracting content from: {html_path}")
-
-         if not Path(html_path).exists():
-             logger.warn(f"File not found: {html_path}")
-             return ""
-
-         encodings = ["utf-8", "latin-1"]
-         for encoding in encodings:
-             try:
-                 with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
-                     html_str = rf.read()
-                 break
-             except UnicodeDecodeError:
-                 pass
-         else:
-             logger.warn(f"No matching encodings: {html_path}")
-             return ""
-
-         html_str = self.remove_elements_from_html(html_str)
-         markdown_str = self.html_to_markdown(html_str)
-         return markdown_str
-
-
- class BatchWebpageContentExtractor:
-     def __init__(self) -> None:
-         self.html_path_and_extracted_content_list = []
-         self.done_count = 0
-
-     def extract_single_html(self, html_path):
-         webpage_content_extractor = WebpageContentExtractor()
-         extracted_content = webpage_content_extractor.extract(html_path)
-         self.html_path_and_extracted_content_list.append(
-             {"html_path": html_path, "extracted_content": extracted_content}
-         )
-         self.done_count += 1
-         logger.success(
-             f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
-         )
-
-     def extract(self, html_paths):
-         self.html_path = html_paths
-         self.total_count = len(self.html_path)
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             futures = [
-                 executor.submit(self.extract_single_html, html_path)
-                 for html_path in self.html_path
-             ]
-             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                 result = future.result()
-
-         return self.html_path_and_extracted_content_list
-
-
- if __name__ == "__main__":
-     html_root = Path(__file__).parents[1] / "files" / "urls" / "python tutorials"
-     html_paths = [
-         html_root / html_filename
-         for html_filename in [
-             "docs.python.org_zh-cn_3_tutorial_interpreter.html",
-             "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html",
-             "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html",
-         ]
-     ]
-     batch_webpage_content_extractor = BatchWebpageContentExtractor()
-     html_path_and_extracted_content_list = batch_webpage_content_extractor.extract(
-         html_paths
-     )
-     # pprint(html_path_and_extracted_content_list)
DeepWEBS/networks/__init__.py DELETED
File without changes
DeepWEBS/networks/filepath_converter.py DELETED
@@ -1,109 +0,0 @@
- import platform
- import re
- from pathlib import Path
- from urllib.parse import quote, unquote
-
-
- # What characters are forbidden in Windows and Linux directory names?
- # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
-
- INVALID_FILE_PATH_CHARS = [
-     "\\",
-     "/",
-     ":",
-     "*",
-     "?",
-     '"',
-     "<",
-     ">",
-     "|",
-     "\n",
-     "\t",
-     "\r",
-     *[chr(i) for i in range(32)],
- ]
-
- WINDOWS_INVALID_FILE_PATH_NAMES = [
-     "con",
-     "prn",
-     "aux",
-     "nul",
-     *[f"com{i+1}" for i in range(10)],
-     *[f"lpt{i+1}" for i in range(10)],
- ]
-
-
- class FilepathConverter:
-     def __init__(self, parent: str = None):
-         self.output_root = Path(__file__).parents[1] / "files"
-         self.parent = parent
-
-     def preprocess(self, input_string):
-         return input_string
-
-     def validate(self, input_string):
-         if not input_string:
-             return input_string
-         filename = input_string
-         for char in INVALID_FILE_PATH_CHARS:
-             filename = filename.replace(char, "_")
-         if platform.system() == "Windows":
-             filename_base = filename.split(".")[0]
-             if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
-                 filename_base = filename_base + "_"
-             filename = ".".join([filename_base, *filename.split(".")[1:]])
-         return filename
-
-     def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
-         if ext:
-             filename_ext = "." + filename.split(".")[-1]
-             if filename_ext.lower() not in accept_exts:
-                 filename += ext
-         return filename
-
-     def convert(self, input_string, parent=None):
-         filename = self.preprocess(input_string)
-         filename = self.validate(filename)
-         filename = self.append_extension(filename)
-
-         parent = parent or self.parent
-         parent = self.validate(parent)
-         if parent:
-             filepath = self.output_root / parent / filename
-         else:
-             filepath = self.output_root / filename
-
-         self.filename = filename
-         self.filepath = filepath
-
-         return self.filepath
-
-
- class UrlToFilepathConverter(FilepathConverter):
-     def __init__(self, parent: str = None):
-         super().__init__(parent)
-         self.output_root = self.output_root / "urls"
-
-     def preprocess(self, url):
-         filename = unquote(url.split("//")[1])
-         return filename
-
-
- class QueryToFilepathConverter(FilepathConverter):
-     def __init__(self, parent: str = None):
-         super().__init__(parent)
-         self.output_root = self.output_root / "queries"
-
-
- if __name__ == "__main__":
-     query = "python"
-     query_converter = QueryToFilepathConverter()
-     print(query_converter.convert(query))
-
-     # url = "https://trafilatura.readthedocs.io/en/latest/quickstart.html"
-     url = (
-         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
-     )
-
-     url_converter = UrlToFilepathConverter(parent=query)
-     print(url_converter.convert(url))
DeepWEBS/networks/google_searcher.py DELETED
@@ -1,52 +0,0 @@
- import requests
- from pathlib import Path
- from typing import Optional
- import random
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
- from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
-
- class GoogleSearcher:
-     def __init__(self):
-         self.url = "https://www.google.com/search"
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = QueryToFilepathConverter()
-
-     def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
-         params = {
-             "q": query,
-             "num": result_num,
-         }
-         response = requests.get(
-             self.url,
-             headers=REQUESTS_HEADERS,
-             params=params,
-             proxies=self.enver.requests_proxies,
-         )
-         response.raise_for_status()  # Raise an exception for non-2xx status codes
-         return response
-
-     def save_response(self, response: requests.Response, html_path: Path) -> None:
-         html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.note(f"Saving to: [{html_path}]")
-         with html_path.open("wb") as wf:
-             wf.write(response.content)
-
-     def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
-         html_path = self.filepath_converter.convert(query)
-         logger.note(f"Searching: [{query}]")
-
-         if html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: {html_path}")
-         else:
-             response = self.send_request(query, result_num, safe)
-             self.save_response(response, html_path)
-
-         return html_path
-
- if __name__ == "__main__":
-     searcher = GoogleSearcher()
-     html_path = searcher.search("python tutorials")
-     print(f"HTML file saved at: {html_path}")
DeepWEBS/networks/network_configs.py DELETED
@@ -1,30 +0,0 @@
- IGNORE_TAGS = ["script", "style", "button"]
- IGNORE_CLASSES = [
-     # common
-     "sidebar",
-     "footer",
-     "related",
-     "comment",
-     "topbar",
-     "offcanvas",
-     "navbar",
-     # 163.com
-     "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
-     "ntes\-.*nav",
-     "nav\-bottom",
-     # wikipedia.org
-     "language\-list",
-     "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
-     "navbox",
-     "catlinks",
- ]
-
- IGNORE_HOSTS = [
-     "weibo.com",
-     "hymson.com",
-     "yahoo.com",
- ]
-
- REQUESTS_HEADERS = {
-     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
- }
DeepWEBS/networks/webpage_fetcher.py DELETED
@@ -1,95 +0,0 @@
- import concurrent.futures
- import random
- import requests
- import tldextract
- from pathlib import Path
- from typing import List, Tuple, Dict
-
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
- from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
-
- class WebpageFetcher:
-     def __init__(self):
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = UrlToFilepathConverter()
-
-     def is_ignored_host(self, url: str) -> bool:
-         host = tldextract.extract(url).registered_domain
-         return host in IGNORE_HOSTS
-
-     def send_request(self, url: str) -> requests.Response:
-         try:
-             user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
-             response = requests.get(
-                 url=url,
-                 headers={"User-Agent": user_agent},
-                 proxies=self.enver.requests_proxies,
-                 timeout=15,
-             )
-             response.raise_for_status()
-             return response
-         except requests.exceptions.RequestException as e:
-             logger.warn(f"Failed to fetch: [{url}] | {e}")
-             return None
-
-     def save_response(self, response: requests.Response, html_path: Path) -> None:
-         if response is None:
-             return
-
-         html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.success(f"Saving to: [{html_path}]")
-         with html_path.open("wb") as wf:
-             wf.write(response.content)
-
-     def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
-         logger.note(f"Fetching: [{url}]")
-         html_path = self.filepath_converter.convert(url, parent=output_parent)
-
-         if self.is_ignored_host(url):
-             logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
-             return html_path
-
-         if html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: [{html_path}]")
-         else:
-             response = self.send_request(url)
-             self.save_response(response, html_path)
-
-         return html_path
-
- class BatchWebpageFetcher:
-     def __init__(self):
-         self.done_count = 0
-         self.total_count = 0
-         self.url_and_html_path_list: List[Dict[str, str]] = []
-
-     def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
-         webpage_fetcher = WebpageFetcher()
-         html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
-         self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
-         self.done_count += 1
-         logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
-         return url, html_path
-
-     def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
-         self.urls = urls
-         self.total_count = len(self.urls)
-
-         with concurrent.futures.ProcessPoolExecutor() as executor:
-             futures = [
-                 executor.submit(WebpageFetcher().fetch, url, overwrite, output_parent)
-                 for url in urls
-             ]
-             concurrent.futures.wait(futures)
-
-         self.url_and_html_path_list = [
-             {"url": future.result().url, "html_path": str(future.result().html_path)}
-             for future in futures
-         ]
-
-         return self.url_and_html_path_list
-
-
DeepWEBS/utilsdw/__init__.py DELETED
File without changes