PyPI - py2ls - Versions diffs - 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl - Mend

py2ls 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

py2ls/netfinder.py CHANGED Viewed

@@ -63,6 +63,59 @@ def user_agent(
     return output_ua
+def get_tags(content, ascending=True):
+    """
+    Collect the distinct tag names that occur in parsed HTML.
+    Parameters:
+    - content: parsed document exposing find_all() (e.g. a BeautifulSoup object).
+    - ascending: True  -> list of names sorted A-Z (default);
+                 False -> list of names sorted Z-A;
+                 None  -> the raw, unordered set of names.
+    Returns:
+    - A sorted list of tag names, or a set when ascending is None.
+    """
+    # find_all(True) matches every tag; the set comprehension de-duplicates names
+    tag_names = {tag.name for tag in content.find_all(True)}
+    if ascending is None:
+        return tag_names
+    # ascending=False yields descending order instead of silently falling
+    # back to the same unordered set that ascending=None returns
+    return sorted(tag_names, reverse=not ascending)
+def get_attr(content, where=None, attr=None, **kwargs):
+    """
+    Collect the values of one attribute from every matching tag.
+    usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")
+    Parameters:
+    - content: BeautifulSoup object of the HTML content.
+    - where: tag name to look for (e.g., 'time').
+    - attr: attribute whose values are wanted (e.g., 'datetime').
+    - kwargs: extra filtering conditions forwarded to find_all.
+    Returns:
+    - A non-empty list of attribute values; otherwise None, after a
+      diagnostic message has been printed.
+    """
+    # Inventory of tag names present in the document, used for validation
+    known_tags = get_tags(content)
+    # Guard: both the tag name and the attribute name are required
+    if not (where and attr):
+        print("Please provide both 'where' (tag name) and 'attr' (attribute).")
+        return None
+    # Guard: the requested tag must occur somewhere in the document
+    if where not in known_tags:
+        print(f"Cannot find tag '{where}' in the content.")
+        print("Available tags:")
+        pp(known_tags)
+        return None
+    # Empty kwargs expand to nothing, so a single call covers both cases
+    matches = content.find_all(where, **kwargs)
+    values = [node.get(attr) for node in matches if node.has_attr(attr)]
+    if not values:
+        print(f"The attribute '{attr}' is not found in the elements.")
+        return None
+    return values
 def extract_text_from_content(
     content, content_type="text/html", where=None, what=None, extend=True, **kwargs
 ):
@@ -128,7 +181,11 @@ def extract_text_from_content(
                 result_set = content.find_all(where, **search_kwargs)
             else:
                 result_set = content.find_all(where, attrs=dict(**search_kwargs))
+            if "get" in kwargs:
+                del search_kwargs["get"]  # rm 'get' key
+                return get_attr(
+                    content, where=where, attr=kwargs["get"], **search_kwargs
+                )
             if not result_set:
                 print("Failed: check the 'attrs' setting:  attrs={'id':'xample'}")
             if extend:
@@ -216,6 +273,60 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
     return cookies_dict
+### Move more smoothly (in small steps), which makes it easier to get past anti-scraping checks
+def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
+    """
+    Scroll down the page in random-sized steps to trigger lazy loading.
+    Parameters:
+    - driver: object exposing execute_script() (the browser driver).
+    - scroll_pause: seconds to sleep after each step.
+    - min_step / max_step: inclusive bounds, in pixels, for each random step.
+    The loop ends once the summed step sizes reach the page height, which is
+    re-read after every step.
+    """
+    height_query = "return document.body.scrollHeight"
+    # Running total of requested pixels, not the browser's real scroll offset
+    scrolled = 0
+    page_height = driver.execute_script(height_query)
+    while scrolled < page_height:
+        delta = random.randint(min_step, max_step)
+        driver.execute_script(f"window.scrollBy(0, {delta});")
+        time.sleep(scroll_pause)
+        scrolled += delta
+        # The page may have grown while scrolling, so measure it again
+        page_height = driver.execute_script(height_query)
+def scroll_inf2end(driver, scroll_pause=1):
+    """
+    Keep jumping to the bottom of the page until its height stops changing.
+    Parameters:
+    - driver: object exposing execute_script() (the browser driver).
+    - scroll_pause: seconds to sleep after each jump before re-measuring.
+    """
+    height_query = "return document.body.scrollHeight"
+    previous_height = driver.execute_script(height_query)
+    still_growing = True
+    while still_growing:
+        # Jump straight to the current bottom of the document
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(scroll_pause)
+        # An unchanged height means no new content was loaded: stop
+        current_height = driver.execute_script(height_query)
+        still_growing = current_height != previous_height
+        previous_height = current_height
+def corr_by_kind(wait_until_kind):
+    """
+    Map the 'wait_until_kind' string to the appropriate Selenium By strategy.
+    Matching is by substring, tested in the order written below; the first
+    keyword found in wait_until_kind wins.
+    Parameters:
+    - wait_until_kind: string naming the locator kind (e.g. 'tag', 'css',
+      'id', 'class', 'name', 'xpath', 'link text').
+    Returns:
+    - The matching attribute of Selenium's By.
+    Raises:
+    - ValueError: if none of the known keywords occurs in wait_until_kind.
+    """
+    if "tag" in wait_until_kind:
+        return By.TAG_NAME
+    elif "css" in wait_until_kind:
+        return By.CSS_SELECTOR
+    elif "id" in wait_until_kind:
+        return By.ID
+    # "class" must be tested before "name": values such as "class name" or
+    # "class_name" contain both keywords and would otherwise map to By.NAME
+    elif "class" in wait_until_kind:
+        return By.CLASS_NAME
+    elif "name" in wait_until_kind:
+        return By.NAME
+    elif "path" in wait_until_kind:
+        return By.XPATH
+    elif "link" in wait_until_kind or "text" in wait_until_kind:
+        return By.LINK_TEXT
+    else:
+        raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
 def fetch_all(
     url,
     parser="lxml",
@@ -224,6 +335,8 @@ def fetch_all(
     timeout=10,
     retry=2,
     wait=0,
+    wait_until=None,
+    wait_until_kind=None,
     scroll_try=3,
     login_url=None,
     username=None,
@@ -308,7 +421,10 @@ def fetch_all(
                 prefs = {"profile.managed_default_content_settings.images": 2}
                 chrome_options.add_experimental_option("prefs", prefs)
             # chrome_options.page_load_strategy = capability
             service = Service(ChromeDriverManager().install())
+            # driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
+            # service=Service(executable_path=driver_path)
             driver_ = webdriver.Chrome(service=service, options=chrome_options)
@@ -323,6 +439,11 @@ def fetch_all(
                 wait_ = 0
             driver_.implicitly_wait(wait_)
+            if wait_until is not None and wait_until_kind is not None:
+                strategy = corr_by_kind(wait_until_kind)
+                WebDriverWait(driver_, timeout).until(
+                    EC.presence_of_element_located((strategy, wait_until))
+                )
             if login_url and login_dict:
                 cookies = get_cookies(url=login_url, login=login_dict)
                 driver_.get(url)
@@ -358,6 +479,9 @@ def fetch_all(
             #     EC.presence_of_element_located((by, where))
             # )
+            # # scroll down the page by a certain number of pixels
+            scroll_smth_steps(driver_)
             # Set up polling
             for attempt in range(scroll_try):
                 page_source = driver_.page_source

py2ls 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

py2ls 0.1.10.0__py3-none-any.whl → 0.1.10.2__py3-none-any.whl