py2ls 0.1.9.9__py3-none-any.whl → 0.1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. py2ls/.git/COMMIT_EDITMSG +1 -1
  2. py2ls/.git/FETCH_HEAD +1 -1
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/HEAD +1 -0
  5. py2ls/.git/logs/refs/heads/main +1 -0
  6. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  7. py2ls/.git/logs/refs/remotes/origin/main +1 -0
  8. py2ls/.git/objects/27/aa6074f652bc6f7078f8647489d9ee8e24f0e2 +0 -0
  9. py2ls/.git/objects/28/c2969d785c1b892c2a96b3f00eba63a59811b3 +0 -0
  10. py2ls/.git/objects/2a/fdf45791a26d42ccead35ace76a8f0b2a56561 +0 -0
  11. py2ls/.git/objects/34/b6f3a2ee84f39bed4eee57f2c0e0afb994feb1 +0 -0
  12. py2ls/.git/objects/35/1a5f491ab97eee9d1ee699478d75a8bb5d3dc2 +0 -0
  13. py2ls/.git/objects/39/b13be65125556784e44c7a1d9821703c7ab67e +0 -0
  14. py2ls/.git/objects/3b/507acc7f23391644cc0b824b1e79fd2677a362 +0 -0
  15. py2ls/.git/objects/3d/9d10d27724657a436c65a6254bfd213d4b3562 +0 -0
  16. py2ls/.git/objects/47/6cbd5a7c5e35cddef2f8a38bdc4896d403b095 +0 -0
  17. py2ls/.git/objects/78/063f4c863fc371ec0313303c0a81283b35d9b6 +0 -0
  18. py2ls/.git/objects/82/70b319ce4046854fbe7dc41054b6c2d112dab2 +0 -0
  19. py2ls/.git/objects/85/aee46f478e9afdb84d50a05242c53b04ed2e21 +0 -0
  20. py2ls/.git/objects/86/e288b46f8fe179907e4413f665aeb5053fddb1 +0 -0
  21. py2ls/.git/objects/94/f7dbe88e80c4205a901b71eb8f181974376bba +0 -0
  22. py2ls/.git/objects/9b/ec5ee2236ee2d5532c36bfd132e23c58fdb69c +0 -0
  23. py2ls/.git/objects/b3/4f7f271c6d6105e35a6556ffda71d03afe8c96 +0 -0
  24. py2ls/.git/objects/b3/69579064bde9de9a19d114fc33e4e48cc8c0e4 +0 -0
  25. py2ls/.git/objects/bf/b54d65922ce1dfda1aaa014913a54e7172d0bc +0 -0
  26. py2ls/.git/objects/c1/397c6ed72c4e20ef6b9ab83163e9a6baba5b45 +0 -0
  27. py2ls/.git/objects/cc/45df1d317a2eb63ff1ff3a5f3b4a9f98fd92b5 +0 -0
  28. py2ls/.git/objects/d6/39e8af592cd75a318d8affddd1bcc70c2095f2 +0 -0
  29. py2ls/.git/objects/db/3f2cd643292057936230b95cf7ec3046affe11 +0 -0
  30. py2ls/.git/objects/de/214c626ac2dd2685bfaa0bc0fc20f528d014d7 +0 -0
  31. py2ls/.git/objects/e4/6c715352db9fe3c887a635f1916df4ca1f4ff9 +0 -0
  32. py2ls/.git/objects/e5/0580a0bd1e1b3d29f834382b80fceb61d5cf0c +0 -0
  33. py2ls/.git/objects/ec/d980279432b13f0374b90ca439a6329cdece0f +0 -0
  34. py2ls/.git/objects/ee/cee64eacaff022dcdc509c0c2b1da492f21060 +0 -0
  35. py2ls/.git/objects/f5/61c3c1bf1c9ea9c9d1f556a7be2869f71f3bdf +0 -0
  36. py2ls/.git/refs/heads/main +1 -1
  37. py2ls/.git/refs/remotes/origin/main +1 -1
  38. py2ls/batman.py +198 -0
  39. py2ls/ich2ls.py +539 -85
  40. py2ls/ips.py +1 -1
  41. py2ls/netfinder.py +105 -3
  42. py2ls/ocr.py +557 -0
  43. py2ls/plot.py +68 -11
  44. {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/METADATA +1 -1
  45. {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/RECORD +46 -16
  46. {py2ls-0.1.9.9.dist-info → py2ls-0.1.10.1.dist-info}/WHEEL +0 -0
py2ls/netfinder.py CHANGED
@@ -63,6 +63,36 @@ def user_agent(
63
63
  return output_ua
64
64
 
65
65
 
66
def get_tags(content, ascending=True):
    """Collect the distinct tag names present in a parsed document.

    Args:
        content: Parsed document object exposing ``find_all(True)`` whose
            results carry a ``name`` attribute (presumably a BeautifulSoup
            object; confirm against callers).
        ascending: ``True`` returns the names sorted A-Z, ``False`` returns
            them sorted Z-A, and ``None`` returns the raw unordered set.

    Returns:
        A sorted ``list`` of tag names, or a ``set`` when ``ascending`` is
        ``None``.
    """
    # `find_all(True)` matches every tag; the set comprehension deduplicates.
    tag_names = {tag.name for tag in content.find_all(True)}

    if ascending is None:
        return tag_names
    # Previously `ascending=False` returned the unsorted set; honor the flag
    # by sorting in descending order instead.
    return sorted(tag_names, reverse=not ascending)
81
+
82
+
83
def get_attr(content, where, attr):
    """Return the values of attribute ``attr`` for every ``where`` tag.

    Args:
        content: Parsed document object exposing ``find_all`` (presumably a
            BeautifulSoup object; confirm against callers).
        where: Tag name to search for, e.g. ``"a"``.
        attr: Attribute name to read from each matching tag, e.g. ``"href"``.

    Returns:
        A list with the attribute value of each matching tag that actually
        carries ``attr``. ``None`` when ``where`` or ``attr`` is empty, or
        when no ``where`` tag exists (a diagnostic listing the available
        tag names is printed in that case).
    """
    if not all([where, attr]):
        return None

    elements = content.find_all(where)
    if elements:
        # Skip tags that lack the attribute instead of raising KeyError on
        # the first one (e.g. an <a> without href).
        return [el[attr] for el in elements if el.has_attr(attr)]

    # Only scan the whole document for tag names when they are needed for
    # the diagnostic output.
    all_tags = get_tags(content)
    print(
        f"cannot find attr {attr} in tag_name{where}\n or possibly cannot find the tag_names:"
    )
    pp(all_tags)
    return None
94
+
95
+
66
96
  def extract_text_from_content(
67
97
  content, content_type="text/html", where=None, what=None, extend=True, **kwargs
68
98
  ):
@@ -128,7 +158,9 @@ def extract_text_from_content(
128
158
  result_set = content.find_all(where, **search_kwargs)
129
159
  else:
130
160
  result_set = content.find_all(where, attrs=dict(**search_kwargs))
131
-
161
+ if "get" in kwargs:
162
+ attr = kwargs["get"]
163
+ return get_attr(content, where, attr)
132
164
  if not result_set:
133
165
  print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
134
166
  if extend:
@@ -216,6 +248,60 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
216
248
  return cookies_dict
217
249
 
218
250
 
251
+ ### Scroll more smoothly, which makes it easier to get past anti-scraping checks
252
def scroll_smth_steps(
    driver, scroll_pause=0.5, min_step=200, max_step=600, max_scrolls=None
):
    """Scroll down the page in random-sized steps to trigger lazy loading.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver in
            the visible call site).
        scroll_pause: Seconds to sleep after each scroll step.
        min_step: Smallest scroll step in pixels (inclusive).
        max_step: Largest scroll step in pixels (inclusive).
        max_scrolls: Optional upper bound on the number of scroll steps.
            ``None`` (the default) keeps scrolling until the accumulated
            distance reaches the page height, which never terminates on
            pages that grow without limit.
    """
    height_js = "return document.body.scrollHeight"
    scrolled = 0
    n_scrolls = 0
    # `or 0` guards against a null result (e.g. no <body> yet), which would
    # otherwise make the comparison below raise TypeError.
    page_height = driver.execute_script(height_js) or 0

    while scrolled < page_height:
        if max_scrolls is not None and n_scrolls >= max_scrolls:
            break
        step = random.randint(min_step, max_step)
        driver.execute_script(f"window.scrollBy(0, {step});")
        time.sleep(scroll_pause)

        scrolled += step
        n_scrolls += 1
        # Re-read the height: lazy-loaded content may have extended the page.
        page_height = driver.execute_script(height_js) or 0
265
+
266
+
267
def scroll_inf2end(driver, scroll_pause=1, max_scrolls=None):
    """Jump to the bottom of the page repeatedly until it stops growing.

    Args:
        driver: Object exposing ``execute_script`` (presumably a Selenium
            WebDriver; confirm against callers).
        scroll_pause: Seconds to sleep after each jump so new content can
            load before the height is re-measured.
        max_scrolls: Optional upper bound on the number of jumps. ``None``
            (the default) loops until the height is unchanged, which never
            terminates on feeds that always load more content.
    """
    height_js = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_js)
    n_scrolls = 0

    while max_scrolls is None or n_scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        n_scrolls += 1

        new_height = driver.execute_script(height_js)
        if new_height == last_height:
            # No new content appeared after the last jump: end reached.
            break
        last_height = new_height
281
+
282
+
283
def corr_by_kind(wait_until_kind):
    """Map a free-form locator-kind string to a Selenium ``By`` strategy.

    Matching is by case-insensitive substring, so ``"css"``,
    ``"CSS selector"`` and ``"css_selector"`` all resolve to
    ``By.CSS_SELECTOR``.

    Args:
        wait_until_kind: Description of the locator kind, e.g. ``"tag"``,
            ``"css"``, ``"id"``, ``"class name"``, ``"name"``, ``"xpath"``,
            ``"partial link text"``, ``"link text"``.

    Returns:
        The matching ``By`` constant.

    Raises:
        ValueError: If ``wait_until_kind`` is not a string or matches no
            supported strategy.
    """
    if not isinstance(wait_until_kind, str):
        raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")

    kind = wait_until_kind.lower()
    if "tag" in kind:
        return By.TAG_NAME
    if "css" in kind:
        return By.CSS_SELECTOR
    if "id" in kind:
        return By.ID
    # "class" must be tested before "name": "class name" / "class_name"
    # contain "name" and would otherwise resolve to By.NAME.
    if "class" in kind:
        return By.CLASS_NAME
    if "name" in kind:
        return By.NAME
    if "path" in kind:
        return By.XPATH
    # "partial" must be tested before "link"/"text" so that
    # "partial link text" is not collapsed into an exact link-text match.
    if "partial" in kind:
        return By.PARTIAL_LINK_TEXT
    if "link" in kind or "text" in kind:
        return By.LINK_TEXT
    raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
303
+
304
+
219
305
  def fetch_all(
220
306
  url,
221
307
  parser="lxml",
@@ -224,6 +310,8 @@ def fetch_all(
224
310
  timeout=10,
225
311
  retry=2,
226
312
  wait=0,
313
+ wait_until=None,
314
+ wait_until_kind=None,
227
315
  scroll_try=3,
228
316
  login_url=None,
229
317
  username=None,
@@ -308,7 +396,10 @@ def fetch_all(
308
396
  prefs = {"profile.managed_default_content_settings.images": 2}
309
397
  chrome_options.add_experimental_option("prefs", prefs)
310
398
  # chrome_options.page_load_strategy = capability
399
+
311
400
  service = Service(ChromeDriverManager().install())
401
+ # driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
402
+ # service=Service(executable_path=driver_path)
312
403
 
313
404
  driver_ = webdriver.Chrome(service=service, options=chrome_options)
314
405
 
@@ -323,6 +414,11 @@ def fetch_all(
323
414
  wait_ = 0
324
415
  driver_.implicitly_wait(wait_)
325
416
 
417
+ if wait_until is not None and wait_until_kind is not None:
418
+ strategy = corr_by_kind(wait_until_kind)
419
+ WebDriverWait(driver_, timeout).until(
420
+ EC.presence_of_element_located((strategy, wait_until))
421
+ )
326
422
  if login_url and login_dict:
327
423
  cookies = get_cookies(url=login_url, login=login_dict)
328
424
  driver_.get(url)
@@ -358,6 +454,9 @@ def fetch_all(
358
454
  # EC.presence_of_element_located((by, where))
359
455
  # )
360
456
 
457
+ # # scroll down the page by a certain number of pixels
458
+ scroll_smth_steps(driver_)
459
+
361
460
  # 设置轮询
362
461
  for attempt in range(scroll_try):
363
462
  page_source = driver_.page_source
@@ -671,8 +770,8 @@ def downloader(
671
770
  if dir_save:
672
771
  if rm_folder:
673
772
  ips.rm_folder(dir_save)
674
- if verbose:
675
- print(f"\n... attempting to download to local\n")
773
+ # if verbose:
774
+ # print(f"\n... attempting to download to local\n")
676
775
  fnames = [file_link.split("/")[-1] for file_link in file_links_all]
677
776
 
678
777
  for idx, file_link in enumerate(file_links_all):
@@ -688,6 +787,9 @@ def downloader(
688
787
  ext = next(
689
788
  (ftype for ftype in kind if ftype in file_link), None
690
789
  )
790
+ if ext is None:
791
+ ext = kind_
792
+ print("ehereerere", ext)
691
793
  if ext:
692
794
  corrected_fname = fname_corrector(fnames[idx], ext)
693
795
  corrected_fname = check_and_modify_filename(