py2ls 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/COMMIT_EDITMSG +1 -1
- py2ls/.git/FETCH_HEAD +1 -1
- py2ls/.git/index +0 -0
- py2ls/.git/logs/HEAD +1 -0
- py2ls/.git/logs/refs/heads/main +1 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/logs/refs/remotes/origin/main +1 -0
- py2ls/.git/objects/27/aa6074f652bc6f7078f8647489d9ee8e24f0e2 +0 -0
- py2ls/.git/objects/28/c2969d785c1b892c2a96b3f00eba63a59811b3 +0 -0
- py2ls/.git/objects/2a/fdf45791a26d42ccead35ace76a8f0b2a56561 +0 -0
- py2ls/.git/objects/34/b6f3a2ee84f39bed4eee57f2c0e0afb994feb1 +0 -0
- py2ls/.git/objects/35/1a5f491ab97eee9d1ee699478d75a8bb5d3dc2 +0 -0
- py2ls/.git/objects/39/b13be65125556784e44c7a1d9821703c7ab67e +0 -0
- py2ls/.git/objects/3b/507acc7f23391644cc0b824b1e79fd2677a362 +0 -0
- py2ls/.git/objects/3d/9d10d27724657a436c65a6254bfd213d4b3562 +0 -0
- py2ls/.git/objects/47/6cbd5a7c5e35cddef2f8a38bdc4896d403b095 +0 -0
- py2ls/.git/objects/78/063f4c863fc371ec0313303c0a81283b35d9b6 +0 -0
- py2ls/.git/objects/82/70b319ce4046854fbe7dc41054b6c2d112dab2 +0 -0
- py2ls/.git/objects/85/aee46f478e9afdb84d50a05242c53b04ed2e21 +0 -0
- py2ls/.git/objects/86/e288b46f8fe179907e4413f665aeb5053fddb1 +0 -0
- py2ls/.git/objects/94/f7dbe88e80c4205a901b71eb8f181974376bba +0 -0
- py2ls/.git/objects/9b/ec5ee2236ee2d5532c36bfd132e23c58fdb69c +0 -0
- py2ls/.git/objects/b3/4f7f271c6d6105e35a6556ffda71d03afe8c96 +0 -0
- py2ls/.git/objects/b3/69579064bde9de9a19d114fc33e4e48cc8c0e4 +0 -0
- py2ls/.git/objects/bf/b54d65922ce1dfda1aaa014913a54e7172d0bc +0 -0
- py2ls/.git/objects/c1/397c6ed72c4e20ef6b9ab83163e9a6baba5b45 +0 -0
- py2ls/.git/objects/cc/45df1d317a2eb63ff1ff3a5f3b4a9f98fd92b5 +0 -0
- py2ls/.git/objects/d6/39e8af592cd75a318d8affddd1bcc70c2095f2 +0 -0
- py2ls/.git/objects/db/3f2cd643292057936230b95cf7ec3046affe11 +0 -0
- py2ls/.git/objects/de/214c626ac2dd2685bfaa0bc0fc20f528d014d7 +0 -0
- py2ls/.git/objects/e4/6c715352db9fe3c887a635f1916df4ca1f4ff9 +0 -0
- py2ls/.git/objects/e5/0580a0bd1e1b3d29f834382b80fceb61d5cf0c +0 -0
- py2ls/.git/objects/ec/d980279432b13f0374b90ca439a6329cdece0f +0 -0
- py2ls/.git/objects/ee/cee64eacaff022dcdc509c0c2b1da492f21060 +0 -0
- py2ls/.git/objects/f5/61c3c1bf1c9ea9c9d1f556a7be2869f71f3bdf +0 -0
- py2ls/.git/refs/heads/main +1 -1
- py2ls/.git/refs/remotes/origin/main +1 -1
- py2ls/batman.py +62 -47
- py2ls/ips.py +771 -3
- py2ls/netfinder.py +125 -1
- py2ls/ocr.py +721 -0
- py2ls/plot.py +24 -0
- py2ls/translator.py +470 -119
- {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/METADATA +1 -1
- {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/RECORD +46 -17
- {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/WHEEL +1 -1
py2ls/netfinder.py
CHANGED
@@ -63,6 +63,59 @@ def user_agent(
|
|
63
63
|
return output_ua
|
64
64
|
|
65
65
|
|
66
|
+
def get_tags(content, ascending=True):
    """
    Collect the distinct tag names present in parsed HTML.

    Parameters:
    - content: parsed document exposing ``find_all(True)``, which must yield
      objects with a ``name`` attribute (e.g. a BeautifulSoup object).
    - ascending: ``True`` -> list sorted A-Z (default);
      ``False`` -> list sorted Z-A;
      ``None`` -> the raw, unordered ``set`` of names.

    Returns:
    - A sorted ``list`` of tag names, or a ``set`` when ``ascending`` is None.
    """
    # `True` makes find_all match every tag; the set removes duplicates.
    tag_names = {tag.name for tag in content.find_all(True)}

    if ascending is None:
        return tag_names
    # Previously ascending=False fell through to the unsorted set, so a
    # descending order could never be requested.
    return sorted(tag_names, reverse=not ascending)
|
81
|
+
|
82
|
+
|
83
|
+
def get_attr(content, where=None, attr=None, **kwargs):
    """
    usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")

    Collect the values of one attribute from every matching tag.

    Parameters:
    - content: parsed HTML object exposing ``find_all`` (e.g. BeautifulSoup).
    - where: tag name to look for (e.g. 'time').
    - attr: attribute whose values are wanted (e.g. 'datetime').
    - kwargs: extra filters forwarded unchanged to ``content.find_all``.

    Returns:
    - A non-empty list of attribute values when at least one matching element
      carries ``attr``. In every other case a diagnostic message is printed
      and ``None`` is returned.
    """
    # The tag inventory is gathered up front; it is used both for the
    # membership check and for the diagnostic listing.
    known_tags = get_tags(content)

    if not (where and attr):
        print("Please provide both 'where' (tag name) and 'attr' (attribute).")
        return None

    if where not in known_tags:
        print(f"Cannot find tag '{where}' in the content.")
        print("Available tags:")
        pp(known_tags)  # `pp` is defined elsewhere in this module
        return None

    # With no extra filters, `**kwargs` expands to nothing.
    matches = content.find_all(where, **kwargs)

    values = []
    for element in matches:
        if element.has_attr(attr):
            values.append(element.get(attr))

    if not values:
        print(f"The attribute '{attr}' is not found in the elements.")
        return None
    return values
|
117
|
+
|
118
|
+
|
66
119
|
def extract_text_from_content(
|
67
120
|
content, content_type="text/html", where=None, what=None, extend=True, **kwargs
|
68
121
|
):
|
@@ -128,7 +181,11 @@ def extract_text_from_content(
|
|
128
181
|
result_set = content.find_all(where, **search_kwargs)
|
129
182
|
else:
|
130
183
|
result_set = content.find_all(where, attrs=dict(**search_kwargs))
|
131
|
-
|
184
|
+
if "get" in kwargs:
|
185
|
+
del search_kwargs["get"] # rm 'get' key
|
186
|
+
return get_attr(
|
187
|
+
content, where=where, attr=kwargs["get"], **search_kwargs
|
188
|
+
)
|
132
189
|
if not result_set:
|
133
190
|
print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
|
134
191
|
if extend:
|
@@ -216,6 +273,60 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
|
|
216
273
|
return cookies_dict
|
217
274
|
|
218
275
|
|
276
|
+
### Scroll the page more smoothly, which helps get past anti-scraping checks
|
277
|
+
def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
    """
    Scroll down the page in randomly sized increments.

    Parameters:
    - driver: object exposing ``execute_script`` (e.g. a Selenium WebDriver).
    - scroll_pause: seconds to sleep after each scroll step.
    - min_step / max_step: inclusive bounds, in pixels, for each random step.

    The distance scrolled is tracked by summing the requested steps, and the
    page height is re-read after every step, so a page that grows while being
    scrolled keeps the loop running until the total catches up.
    """

    def page_height():
        # Re-queried on every check because the page may have grown.
        return driver.execute_script("return document.body.scrollHeight")

    scrolled = 0
    while scrolled < page_height():
        delta = random.randint(min_step, max_step)
        driver.execute_script(f"window.scrollBy(0, {delta});")
        time.sleep(scroll_pause)
        scrolled += delta
|
290
|
+
|
291
|
+
|
292
|
+
def scroll_inf2end(driver, scroll_pause=1):
    """
    Keep jumping to the bottom of the page until its height stops changing.

    Parameters:
    - driver: object exposing ``execute_script`` (e.g. a Selenium WebDriver).
    - scroll_pause: seconds to sleep after each jump before the height is
      measured again.
    """
    height_query = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_query)

    reached_end = False
    while not reached_end:
        # Jump straight to the current bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        # An unchanged height means the last jump loaded nothing new.
        current_height = driver.execute_script(height_query)
        reached_end = current_height == previous_height
        previous_height = current_height
|
306
|
+
|
307
|
+
|
308
|
+
def corr_by_kind(wait_until_kind):
    """
    Map the 'wait_until_kind' string to the appropriate Selenium By strategy.

    Matching is by case-insensitive substring, so short spellings ("css",
    "xpath", "class") and Selenium's own constant values ("class name",
    "css selector", "partial link text") are all accepted.

    Parameters:
    - wait_until_kind: string naming the locator strategy.

    Returns:
    - The matching attribute of ``By`` (e.g. ``By.CSS_SELECTOR``).

    Raises:
    - ValueError: if the value is not a string or names no known strategy.
    """
    if not isinstance(wait_until_kind, str):
        raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")

    kind = wait_until_kind.lower()

    # Order matters because matching is by substring:
    #   * "partial" must precede "link"/"text", or "partial link text" would
    #     resolve to the exact-match LINK_TEXT strategy;
    #   * "class" must precede "name", or "class_name" / "class name" would
    #     resolve to By.NAME.
    keyword_to_strategy = (
        ("partial", "PARTIAL_LINK_TEXT"),
        ("tag", "TAG_NAME"),
        ("css", "CSS_SELECTOR"),
        ("id", "ID"),
        ("class", "CLASS_NAME"),
        ("name", "NAME"),
        ("path", "XPATH"),
        ("link", "LINK_TEXT"),
        ("text", "LINK_TEXT"),
    )
    for keyword, strategy in keyword_to_strategy:
        if keyword in kind:
            # `By` is looked up only on a match, so the error path does not
            # depend on it.
            return getattr(By, strategy)

    raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
|
328
|
+
|
329
|
+
|
219
330
|
def fetch_all(
|
220
331
|
url,
|
221
332
|
parser="lxml",
|
@@ -224,6 +335,8 @@ def fetch_all(
|
|
224
335
|
timeout=10,
|
225
336
|
retry=2,
|
226
337
|
wait=0,
|
338
|
+
wait_until=None,
|
339
|
+
wait_until_kind=None,
|
227
340
|
scroll_try=3,
|
228
341
|
login_url=None,
|
229
342
|
username=None,
|
@@ -308,7 +421,10 @@ def fetch_all(
|
|
308
421
|
prefs = {"profile.managed_default_content_settings.images": 2}
|
309
422
|
chrome_options.add_experimental_option("prefs", prefs)
|
310
423
|
# chrome_options.page_load_strategy = capability
|
424
|
+
|
311
425
|
service = Service(ChromeDriverManager().install())
|
426
|
+
# driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
|
427
|
+
# service=Service(executable_path=driver_path)
|
312
428
|
|
313
429
|
driver_ = webdriver.Chrome(service=service, options=chrome_options)
|
314
430
|
|
@@ -323,6 +439,11 @@ def fetch_all(
|
|
323
439
|
wait_ = 0
|
324
440
|
driver_.implicitly_wait(wait_)
|
325
441
|
|
442
|
+
if wait_until is not None and wait_until_kind is not None:
|
443
|
+
strategy = corr_by_kind(wait_until_kind)
|
444
|
+
WebDriverWait(driver_, timeout).until(
|
445
|
+
EC.presence_of_element_located((strategy, wait_until))
|
446
|
+
)
|
326
447
|
if login_url and login_dict:
|
327
448
|
cookies = get_cookies(url=login_url, login=login_dict)
|
328
449
|
driver_.get(url)
|
@@ -358,6 +479,9 @@ def fetch_all(
|
|
358
479
|
# EC.presence_of_element_located((by, where))
|
359
480
|
# )
|
360
481
|
|
482
|
+
# # scroll down the page by a certain number of pixels
|
483
|
+
scroll_smth_steps(driver_)
|
484
|
+
|
361
485
|
# 设置轮询
|
362
486
|
for attempt in range(scroll_try):
|
363
487
|
page_source = driver_.page_source
|