py2ls 0.1.9.9__py3-none-any.whl → 0.1.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/COMMIT_EDITMSG +1 -1
- py2ls/.git/FETCH_HEAD +1 -1
- py2ls/.git/index +0 -0
- py2ls/.git/logs/HEAD +1 -0
- py2ls/.git/logs/refs/heads/main +1 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/logs/refs/remotes/origin/main +1 -0
- py2ls/.git/objects/27/aa6074f652bc6f7078f8647489d9ee8e24f0e2 +0 -0
- py2ls/.git/objects/28/c2969d785c1b892c2a96b3f00eba63a59811b3 +0 -0
- py2ls/.git/objects/2a/fdf45791a26d42ccead35ace76a8f0b2a56561 +0 -0
- py2ls/.git/objects/34/b6f3a2ee84f39bed4eee57f2c0e0afb994feb1 +0 -0
- py2ls/.git/objects/35/1a5f491ab97eee9d1ee699478d75a8bb5d3dc2 +0 -0
- py2ls/.git/objects/39/b13be65125556784e44c7a1d9821703c7ab67e +0 -0
- py2ls/.git/objects/3b/507acc7f23391644cc0b824b1e79fd2677a362 +0 -0
- py2ls/.git/objects/3d/9d10d27724657a436c65a6254bfd213d4b3562 +0 -0
- py2ls/.git/objects/47/6cbd5a7c5e35cddef2f8a38bdc4896d403b095 +0 -0
- py2ls/.git/objects/78/063f4c863fc371ec0313303c0a81283b35d9b6 +0 -0
- py2ls/.git/objects/82/70b319ce4046854fbe7dc41054b6c2d112dab2 +0 -0
- py2ls/.git/objects/85/aee46f478e9afdb84d50a05242c53b04ed2e21 +0 -0
- py2ls/.git/objects/86/e288b46f8fe179907e4413f665aeb5053fddb1 +0 -0
- py2ls/.git/objects/94/f7dbe88e80c4205a901b71eb8f181974376bba +0 -0
- py2ls/.git/objects/9b/ec5ee2236ee2d5532c36bfd132e23c58fdb69c +0 -0
- py2ls/.git/objects/b3/4f7f271c6d6105e35a6556ffda71d03afe8c96 +0 -0
- py2ls/.git/objects/b3/69579064bde9de9a19d114fc33e4e48cc8c0e4 +0 -0
- py2ls/.git/objects/bf/b54d65922ce1dfda1aaa014913a54e7172d0bc +0 -0
- py2ls/.git/objects/c1/397c6ed72c4e20ef6b9ab83163e9a6baba5b45 +0 -0
- py2ls/.git/objects/cc/45df1d317a2eb63ff1ff3a5f3b4a9f98fd92b5 +0 -0
- py2ls/.git/objects/d6/39e8af592cd75a318d8affddd1bcc70c2095f2 +0 -0
- py2ls/.git/objects/db/3f2cd643292057936230b95cf7ec3046affe11 +0 -0
- py2ls/.git/objects/de/214c626ac2dd2685bfaa0bc0fc20f528d014d7 +0 -0
- py2ls/.git/objects/e4/6c715352db9fe3c887a635f1916df4ca1f4ff9 +0 -0
- py2ls/.git/objects/e5/0580a0bd1e1b3d29f834382b80fceb61d5cf0c +0 -0
- py2ls/.git/objects/ec/d980279432b13f0374b90ca439a6329cdece0f +0 -0
- py2ls/.git/objects/ee/cee64eacaff022dcdc509c0c2b1da492f21060 +0 -0
- py2ls/.git/objects/f5/61c3c1bf1c9ea9c9d1f556a7be2869f71f3bdf +0 -0
- py2ls/.git/refs/heads/main +1 -1
- py2ls/.git/refs/remotes/origin/main +1 -1
- py2ls/batman.py +198 -0
- py2ls/ich2ls.py +539 -85
- py2ls/ips.py +1 -1
- py2ls/netfinder.py +105 -3
- py2ls/ocr.py +557 -0
- py2ls/plot.py +68 -11
- {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/METADATA +1 -1
- {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/RECORD +46 -16
- {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -63,6 +63,36 @@ def user_agent(
|
|
63
63
|
return output_ua
|
64
64
|
|
65
65
|
|
66
|
+
def get_tags(content, ascending=True):
    """Return the distinct tag names that occur in a parsed document.

    Args:
        content: Object whose ``find_all(True)`` yields every tag, each
            exposing a ``.name`` attribute (presumably a BeautifulSoup
            document -- confirm against callers).
        ascending: ``True`` returns an alphabetically sorted list,
            ``False`` returns the list in reverse alphabetical order, and
            ``None`` returns the unordered set.

    Returns:
        A ``list`` of tag names when ``ascending`` is ``True`` or ``False``,
        otherwise a ``set``.
    """
    # `True` asks find_all for every tag; the set comprehension de-duplicates.
    tag_names = {tag.name for tag in content.find_all(True)}

    if ascending is None:
        return tag_names
    # A falsy `ascending` yields a descending list rather than the raw set,
    # so the flag actually controls the sort direction.
    return sorted(tag_names, reverse=not ascending)
|
81
|
+
|
82
|
+
|
83
|
+
def get_attr(content, where, attr):
    """Collect the value of attribute ``attr`` from every ``where`` tag.

    Args:
        content: Parsed document exposing ``find_all`` (presumably a
            BeautifulSoup object -- confirm against callers).
        where: Tag name to search for, e.g. ``"a"``.
        attr: Attribute to read from each matching tag, e.g. ``"href"``.

    Returns:
        A list with one value per matching tag that carries ``attr``.
        ``None`` when ``where`` or ``attr`` is empty, or when ``where`` does
        not occur in the document; in the latter case a diagnostic and the
        available tag names are printed.
    """
    all_tags = get_tags(content)
    if not (where and attr):
        return None

    if where not in all_tags:
        print(
            f"cannot find attr {attr} in tag_name{where}\n or possibly cannot find the tag_names:"
        )
        pp(all_tags)
        return None

    # Skip tags that lack the attribute: indexing them raises KeyError and
    # would throw away the values already collected from earlier tags.
    return [
        element[attr]
        for element in content.find_all(where)
        if element.has_attr(attr)
    ]
|
94
|
+
|
95
|
+
|
66
96
|
def extract_text_from_content(
|
67
97
|
content, content_type="text/html", where=None, what=None, extend=True, **kwargs
|
68
98
|
):
|
@@ -128,7 +158,9 @@ def extract_text_from_content(
|
|
128
158
|
result_set = content.find_all(where, **search_kwargs)
|
129
159
|
else:
|
130
160
|
result_set = content.find_all(where, attrs=dict(**search_kwargs))
|
131
|
-
|
161
|
+
if "get" in kwargs:
|
162
|
+
attr = kwargs["get"]
|
163
|
+
return get_attr(content, where, attr)
|
132
164
|
if not result_set:
|
133
165
|
print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
|
134
166
|
if extend:
|
@@ -216,6 +248,60 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
|
|
216
248
|
return cookies_dict
|
217
249
|
|
218
250
|
|
251
|
+
# Scroll in small random increments so the movement looks smoother; the
# original note says this makes it easier to get past anti-scraping checks.
def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
    """Scroll down the page in random-sized steps to trigger lazy loading.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver in
            the visible caller).
        scroll_pause: Seconds to sleep after each scroll step.
        min_step: Smallest scroll distance in pixels (inclusive).
        max_step: Largest scroll distance in pixels (inclusive).

    Returns:
        None. Stops once the accumulated scroll distance reaches the page
        height, which is re-read after every step.
    """
    height_query = "return document.body.scrollHeight"

    scrolled = 0
    page_height = driver.execute_script(height_query)
    while scrolled < page_height:
        delta = random.randint(min_step, max_step)
        driver.execute_script(f"window.scrollBy(0, {delta});")
        time.sleep(scroll_pause)

        # Track how far we have asked the browser to scroll, then re-read
        # the height because lazy-loaded content may have extended the page.
        scrolled += delta
        page_height = driver.execute_script(height_query)
|
265
|
+
|
266
|
+
|
267
|
+
def scroll_inf2end(driver, scroll_pause=1):
    """Keep jumping to the bottom of the page until its height stops changing.

    Args:
        driver: Object exposing ``execute_script`` (presumably a Selenium
            WebDriver -- confirm against callers).
        scroll_pause: Seconds to sleep after each jump before re-measuring.

    Returns:
        None.
    """
    height_query = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_query)
    still_growing = True
    while still_growing:
        # Jump to the current bottom and give the page time to load more.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        # An unchanged height means no new content was appended.
        current_height = driver.execute_script(height_query)
        still_growing = not (current_height == previous_height)
        previous_height = current_height
|
281
|
+
|
282
|
+
|
283
|
+
def corr_by_kind(wait_until_kind):
    """
    Map the 'wait_until_kind' string to the appropriate Selenium By strategy.

    Matching is by keyword containment and ignores case, so "css",
    "CSS_SELECTOR" and "css selector" all select ``By.CSS_SELECTOR``.

    Args:
        wait_until_kind: Description of the locator kind, e.g. "tag", "css",
            "id", "class_name", "name", "xpath", "link text",
            "partial link text".

    Returns:
        The matching attribute of ``By``.

    Raises:
        ValueError: If no known keyword occurs in ``wait_until_kind``.
    """
    if isinstance(wait_until_kind, str):
        kind = wait_until_kind.lower()
    else:
        # Non-string containers keep plain membership semantics.
        kind = wait_until_kind

    # Ordered (keyword, By attribute name) pairs; the first keyword found wins.
    # "class" precedes "name" so "class_name" / "class name" is not mistaken
    # for By.NAME, and "partial" precedes "link"/"text" so partial link text
    # is not collapsed into an exact link-text match.
    rules = (
        ("tag", "TAG_NAME"),
        ("css", "CSS_SELECTOR"),
        ("id", "ID"),
        ("class", "CLASS_NAME"),
        ("name", "NAME"),
        ("path", "XPATH"),
        ("partial", "PARTIAL_LINK_TEXT"),
        ("link", "LINK_TEXT"),
        ("text", "LINK_TEXT"),
    )
    for keyword, by_attr in rules:
        if keyword in kind:
            # Resolve lazily so only the chosen strategy is looked up on By.
            return getattr(By, by_attr)

    raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
|
303
|
+
|
304
|
+
|
219
305
|
def fetch_all(
|
220
306
|
url,
|
221
307
|
parser="lxml",
|
@@ -224,6 +310,8 @@ def fetch_all(
|
|
224
310
|
timeout=10,
|
225
311
|
retry=2,
|
226
312
|
wait=0,
|
313
|
+
wait_until=None,
|
314
|
+
wait_until_kind=None,
|
227
315
|
scroll_try=3,
|
228
316
|
login_url=None,
|
229
317
|
username=None,
|
@@ -308,7 +396,10 @@ def fetch_all(
|
|
308
396
|
prefs = {"profile.managed_default_content_settings.images": 2}
|
309
397
|
chrome_options.add_experimental_option("prefs", prefs)
|
310
398
|
# chrome_options.page_load_strategy = capability
|
399
|
+
|
311
400
|
service = Service(ChromeDriverManager().install())
|
401
|
+
# driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
|
402
|
+
# service=Service(executable_path=driver_path)
|
312
403
|
|
313
404
|
driver_ = webdriver.Chrome(service=service, options=chrome_options)
|
314
405
|
|
@@ -323,6 +414,11 @@ def fetch_all(
|
|
323
414
|
wait_ = 0
|
324
415
|
driver_.implicitly_wait(wait_)
|
325
416
|
|
417
|
+
if wait_until is not None and wait_until_kind is not None:
|
418
|
+
strategy = corr_by_kind(wait_until_kind)
|
419
|
+
WebDriverWait(driver_, timeout).until(
|
420
|
+
EC.presence_of_element_located((strategy, wait_until))
|
421
|
+
)
|
326
422
|
if login_url and login_dict:
|
327
423
|
cookies = get_cookies(url=login_url, login=login_dict)
|
328
424
|
driver_.get(url)
|
@@ -358,6 +454,9 @@ def fetch_all(
|
|
358
454
|
# EC.presence_of_element_located((by, where))
|
359
455
|
# )
|
360
456
|
|
457
|
+
# # scroll down the page by a certain number of pixels
|
458
|
+
scroll_smth_steps(driver_)
|
459
|
+
|
361
460
|
# 设置轮询
|
362
461
|
for attempt in range(scroll_try):
|
363
462
|
page_source = driver_.page_source
|
@@ -671,8 +770,8 @@ def downloader(
|
|
671
770
|
if dir_save:
|
672
771
|
if rm_folder:
|
673
772
|
ips.rm_folder(dir_save)
|
674
|
-
if verbose:
|
675
|
-
|
773
|
+
# if verbose:
|
774
|
+
# print(f"\n... attempting to download to local\n")
|
676
775
|
fnames = [file_link.split("/")[-1] for file_link in file_links_all]
|
677
776
|
|
678
777
|
for idx, file_link in enumerate(file_links_all):
|
@@ -688,6 +787,9 @@ def downloader(
|
|
688
787
|
ext = next(
|
689
788
|
(ftype for ftype in kind if ftype in file_link), None
|
690
789
|
)
|
790
|
+
if ext is None:
|
791
|
+
ext = kind_
|
792
|
+
print("ehereerere", ext)
|
691
793
|
if ext:
|
692
794
|
corrected_fname = fname_corrector(fnames[idx], ext)
|
693
795
|
corrected_fname = check_and_modify_filename(
|