py2ls 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. py2ls/.git/COMMIT_EDITMSG +1 -1
  2. py2ls/.git/FETCH_HEAD +1 -1
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/HEAD +1 -0
  5. py2ls/.git/logs/refs/heads/main +1 -0
  6. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  7. py2ls/.git/logs/refs/remotes/origin/main +1 -0
  8. py2ls/.git/objects/27/aa6074f652bc6f7078f8647489d9ee8e24f0e2 +0 -0
  9. py2ls/.git/objects/28/c2969d785c1b892c2a96b3f00eba63a59811b3 +0 -0
  10. py2ls/.git/objects/2a/fdf45791a26d42ccead35ace76a8f0b2a56561 +0 -0
  11. py2ls/.git/objects/34/b6f3a2ee84f39bed4eee57f2c0e0afb994feb1 +0 -0
  12. py2ls/.git/objects/35/1a5f491ab97eee9d1ee699478d75a8bb5d3dc2 +0 -0
  13. py2ls/.git/objects/39/b13be65125556784e44c7a1d9821703c7ab67e +0 -0
  14. py2ls/.git/objects/3b/507acc7f23391644cc0b824b1e79fd2677a362 +0 -0
  15. py2ls/.git/objects/3d/9d10d27724657a436c65a6254bfd213d4b3562 +0 -0
  16. py2ls/.git/objects/47/6cbd5a7c5e35cddef2f8a38bdc4896d403b095 +0 -0
  17. py2ls/.git/objects/78/063f4c863fc371ec0313303c0a81283b35d9b6 +0 -0
  18. py2ls/.git/objects/82/70b319ce4046854fbe7dc41054b6c2d112dab2 +0 -0
  19. py2ls/.git/objects/85/aee46f478e9afdb84d50a05242c53b04ed2e21 +0 -0
  20. py2ls/.git/objects/86/e288b46f8fe179907e4413f665aeb5053fddb1 +0 -0
  21. py2ls/.git/objects/94/f7dbe88e80c4205a901b71eb8f181974376bba +0 -0
  22. py2ls/.git/objects/9b/ec5ee2236ee2d5532c36bfd132e23c58fdb69c +0 -0
  23. py2ls/.git/objects/b3/4f7f271c6d6105e35a6556ffda71d03afe8c96 +0 -0
  24. py2ls/.git/objects/b3/69579064bde9de9a19d114fc33e4e48cc8c0e4 +0 -0
  25. py2ls/.git/objects/bf/b54d65922ce1dfda1aaa014913a54e7172d0bc +0 -0
  26. py2ls/.git/objects/c1/397c6ed72c4e20ef6b9ab83163e9a6baba5b45 +0 -0
  27. py2ls/.git/objects/cc/45df1d317a2eb63ff1ff3a5f3b4a9f98fd92b5 +0 -0
  28. py2ls/.git/objects/d6/39e8af592cd75a318d8affddd1bcc70c2095f2 +0 -0
  29. py2ls/.git/objects/db/3f2cd643292057936230b95cf7ec3046affe11 +0 -0
  30. py2ls/.git/objects/de/214c626ac2dd2685bfaa0bc0fc20f528d014d7 +0 -0
  31. py2ls/.git/objects/e4/6c715352db9fe3c887a635f1916df4ca1f4ff9 +0 -0
  32. py2ls/.git/objects/e5/0580a0bd1e1b3d29f834382b80fceb61d5cf0c +0 -0
  33. py2ls/.git/objects/ec/d980279432b13f0374b90ca439a6329cdece0f +0 -0
  34. py2ls/.git/objects/ee/cee64eacaff022dcdc509c0c2b1da492f21060 +0 -0
  35. py2ls/.git/objects/f5/61c3c1bf1c9ea9c9d1f556a7be2869f71f3bdf +0 -0
  36. py2ls/.git/refs/heads/main +1 -1
  37. py2ls/.git/refs/remotes/origin/main +1 -1
  38. py2ls/batman.py +62 -47
  39. py2ls/ips.py +771 -3
  40. py2ls/netfinder.py +125 -1
  41. py2ls/ocr.py +721 -0
  42. py2ls/plot.py +24 -0
  43. py2ls/translator.py +470 -119
  44. {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/METADATA +1 -1
  45. {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/RECORD +46 -17
  46. {py2ls-0.1.10.0.dist-info → py2ls-0.1.10.2.dist-info}/WHEEL +1 -1
py2ls/netfinder.py CHANGED
@@ -63,6 +63,59 @@ def user_agent(
63
63
  return output_ua
64
64
 
65
65
 
66
def get_tags(content, ascending=True):
    """Collect the distinct tag names that occur in parsed HTML.

    Parameters:
    - content: parsed document exposing ``find_all`` (e.g. a BeautifulSoup
      object); ``find_all(True)`` is used to visit every tag.
    - ascending: ``True`` (default) returns the names sorted alphabetically,
      ``False`` returns them in reverse alphabetical order, and ``None``
      skips sorting and returns the raw set.

    Returns:
    - A list of tag names, or a set when ``ascending`` is None.
    """
    # A set de-duplicates names that appear many times in the document.
    tag_names = {tag.name for tag in content.find_all(True)}

    if ascending is None:
        return tag_names
    # Previously ascending=False returned the unsorted set, which made a
    # descending order impossible to request.
    return sorted(tag_names, reverse=not ascending)
81
+
82
+
83
def get_attr(content, where=None, attr=None, **kwargs):
    """
    usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")

    Read one attribute from every matching tag in the content.

    Parameters:
    - content: parsed HTML exposing ``find_all`` (e.g. a BeautifulSoup object).
    - where: name of the tag to look for (e.g., 'time').
    - attr: name of the attribute to read (e.g., 'datetime').
    - kwargs: extra filters forwarded unchanged to ``find_all``.

    Returns:
    - A non-empty list of attribute values when at least one matching tag
      carries the attribute; otherwise a diagnostic is printed and None is
      returned.
    """
    # Tag inventory is used both for validation and for the diagnostic below.
    known_tags = get_tags(content)

    if not (where and attr):
        print("Please provide both 'where' (tag name) and 'attr' (attribute).")
        return None

    if where not in known_tags:
        print(f"Cannot find tag '{where}' in the content.")
        print("Available tags:")
        pp(known_tags)
        return None

    # With no extra filters, **kwargs expands to nothing, i.e. find_all(where).
    matches = content.find_all(where, **kwargs)
    values = []
    for element in matches:
        if element.has_attr(attr):
            values.append(element.get(attr))

    if not values:
        print(f"The attribute '{attr}' is not found in the elements.")
        return None
    return values
117
+
118
+
66
119
  def extract_text_from_content(
67
120
  content, content_type="text/html", where=None, what=None, extend=True, **kwargs
68
121
  ):
@@ -128,7 +181,11 @@ def extract_text_from_content(
128
181
  result_set = content.find_all(where, **search_kwargs)
129
182
  else:
130
183
  result_set = content.find_all(where, attrs=dict(**search_kwargs))
131
-
184
+ if "get" in kwargs:
185
+ del search_kwargs["get"] # rm 'get' key
186
+ return get_attr(
187
+ content, where=where, attr=kwargs["get"], **search_kwargs
188
+ )
132
189
  if not result_set:
133
190
  print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
134
191
  if extend:
@@ -216,6 +273,60 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
216
273
  return cookies_dict
217
274
 
218
275
 
276
+ ### Scroll more smoothly; this helps evade anti-scraping detection
277
def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
    """Scroll the page down in random-sized steps until the bottom is reached.

    Parameters:
    - driver: object exposing ``execute_script`` (presumably a Selenium
      WebDriver; confirm against callers).
    - scroll_pause: seconds to sleep after each scroll step.
    - min_step / max_step: inclusive pixel bounds for each random step.

    The page height is re-read after every step, so the loop keeps going
    while the document grows. The offset is the running sum of the requested
    steps, not a value read back from the browser.
    """
    height_query = "return document.body.scrollHeight"

    scrolled = 0
    page_height = driver.execute_script(height_query)
    while scrolled < page_height:
        delta = random.randint(min_step, max_step)
        driver.execute_script(f"window.scrollBy(0, {delta});")
        time.sleep(scroll_pause)

        scrolled += delta
        # Height may have changed while pausing; refresh before re-testing.
        page_height = driver.execute_script(height_query)
290
+
291
+
292
def scroll_inf2end(driver, scroll_pause=1):
    """Jump to the bottom of the page repeatedly until its height stops changing.

    Parameters:
    - driver: object exposing ``execute_script`` (presumably a Selenium
      WebDriver; confirm against callers).
    - scroll_pause: seconds to sleep after each jump before re-measuring.
    """
    height_js = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_js)
    still_growing = True
    while still_growing:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        current_height = driver.execute_script(height_js)
        # An unchanged height after a jump is treated as the end of the page.
        still_growing = current_height != previous_height
        previous_height = current_height
306
+
307
+
308
def corr_by_kind(wait_until_kind):
    """
    Map the 'wait_until_kind' string to the appropriate Selenium By strategy.

    Matching is by case-insensitive substring, so both short forms ("css",
    "id") and Selenium-style names ("css selector", "class_name",
    "partial link text") are accepted.

    Parameters:
    - wait_until_kind: description of the locator kind.

    Returns:
    - The matching ``By`` constant.

    Raises:
    - ValueError: if no known keyword occurs in ``wait_until_kind``.
    """
    kind = str(wait_until_kind).lower()

    # Order matters because keywords overlap: "class_name" contains "name"
    # and "partial link text" contains "link", so the more specific keyword
    # has to be tested first.
    keyword_to_strategy = (
        (("tag",), By.TAG_NAME),
        (("css",), By.CSS_SELECTOR),
        (("class",), By.CLASS_NAME),
        (("id",), By.ID),
        (("name",), By.NAME),
        (("path",), By.XPATH),
        (("partial",), By.PARTIAL_LINK_TEXT),
        (("link", "text"), By.LINK_TEXT),
    )
    for keywords, strategy in keyword_to_strategy:
        if any(keyword in kind for keyword in keywords):
            return strategy

    raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
328
+
329
+
219
330
  def fetch_all(
220
331
  url,
221
332
  parser="lxml",
@@ -224,6 +335,8 @@ def fetch_all(
224
335
  timeout=10,
225
336
  retry=2,
226
337
  wait=0,
338
+ wait_until=None,
339
+ wait_until_kind=None,
227
340
  scroll_try=3,
228
341
  login_url=None,
229
342
  username=None,
@@ -308,7 +421,10 @@ def fetch_all(
308
421
  prefs = {"profile.managed_default_content_settings.images": 2}
309
422
  chrome_options.add_experimental_option("prefs", prefs)
310
423
  # chrome_options.page_load_strategy = capability
424
+
311
425
  service = Service(ChromeDriverManager().install())
426
+ # driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
427
+ # service=Service(executable_path=driver_path)
312
428
 
313
429
  driver_ = webdriver.Chrome(service=service, options=chrome_options)
314
430
 
@@ -323,6 +439,11 @@ def fetch_all(
323
439
  wait_ = 0
324
440
  driver_.implicitly_wait(wait_)
325
441
 
442
+ if wait_until is not None and wait_until_kind is not None:
443
+ strategy = corr_by_kind(wait_until_kind)
444
+ WebDriverWait(driver_, timeout).until(
445
+ EC.presence_of_element_located((strategy, wait_until))
446
+ )
326
447
  if login_url and login_dict:
327
448
  cookies = get_cookies(url=login_url, login=login_dict)
328
449
  driver_.get(url)
@@ -358,6 +479,9 @@ def fetch_all(
358
479
  # EC.presence_of_element_located((by, where))
359
480
  # )
360
481
 
482
+ # # scroll down the page by a certain number of pixels
483
+ scroll_smth_steps(driver_)
484
+
361
485
  # 设置轮询
362
486
  for attempt in range(scroll_try):
363
487
  page_source = driver_.page_source