PyPI - parsagon - Versions diffs - 0.8.1__tar.gz → 0.9.0__tar.gz - Mend

parsagon 0.8.1.tar.gz → 0.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{parsagon-0.8.1 → parsagon-0.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parsagon
-Version: 0.8.1
+Version: 0.9.0
 Summary: Allows you to create browser automations with natural language
 Author-email: Sandy Suh <sandy@parsagon.io>
 Project-URL: Homepage, https://parsagon.io

{parsagon-0.8.1 → parsagon-0.9.0}/pyproject.toml RENAMED Viewed

@@ -10,7 +10,7 @@ line-length = 120
 [project]
 name = "parsagon"
-version = "0.8.1"
+version = "0.9.0"
 description = "Allows you to create browser automations with natural language"
 readme = "README.md"
 requires-python = ">=3.8"

{parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/api.py RENAMED Viewed

@@ -80,7 +80,7 @@ def get_interaction_element_id(marked_html, elem_type, description):
     return result
-def scrape_page(url, html, schema):
+def scrape_page(html, schema):
     """
     Scrapes data from the provided page HTML - data will be returned in the schema provided.
     :param url: url of the page to scrape.
@@ -88,7 +88,7 @@ def scrape_page(url, html, schema):
     :param schema: Schema of the data to scrape
     :return: Scraped data, with lists truncated.
     """
-    return _api_call(httpx.post, "/transformers/get-custom-data/", json={"url": url, "html": html, "schema": schema})
+    return _api_call(httpx.post, "/transformers/get-custom-data/", json={"html": html, "schema": schema})
 def create_pipeline(name, description, program_sketch):

{parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/executor.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from urllib.parse import urljoin
 import lxml.html
 from pyvirtualdisplay import Display
@@ -74,11 +75,22 @@ class Executor:
         )
     def _get_cleaned_lxml_root(self):
-        driver = self.driver
-        html = driver.page_source
         parser = lxml.html.HTMLParser(remove_comments=True, remove_pis=True)
-        root = lxml.html.fromstring(html, parser=parser)
+        root = lxml.html.fromstring(self.driver.page_source.replace('&nbsp;', ' '), parser=parser)
+        # make links absolute
+        root.make_links_absolute(self.driver.current_url)
+        for elem in root.xpath('//img[@srcset]'):
+            srcset_list = []
+            for s in elem.get('srcset').split(','):
+                parts = s.strip().split()
+                if not parts:
+                    continue
+                parts[0] = urljoin(self.driver.current_url, parts[0])
+                srcset_list.append(' '.join(parts))
+            elem.set('srcset', ', '.join(srcset_list))
+        # remove unnecessary and bulky elements
         for elem in root.iterfind(".//script"):
             elem.text = ""
         for elem in root.iterfind(".//noscript"):
@@ -188,6 +200,7 @@ class Executor:
             examples=[
                 {
                     "html": html,
+                    "url": self.driver.current_url,
                     "elem_id": elem_id,
                 }
             ],
@@ -222,6 +235,7 @@ class Executor:
             examples=[
                 {
                     "html": html,
+                    "url": self.driver.current_url,
                     "elem_id": elem_id,
                 }
             ],
@@ -260,6 +274,7 @@ class Executor:
             examples=[
                 {
                     "html": html,
+                    "url": self.driver.current_url,
                     "elem_id": elem_id,
                 }
             ],
@@ -287,7 +302,7 @@ class Executor:
         self.driver.switch_to.window(window_id)
         logger.info("Scraping data...")
         html = self.get_scrape_html()
-        result = scrape_page(self.driver.current_url, html, schema)
+        result = scrape_page(html, schema)
         scraped_data = result["data"]
         nodes = result["nodes"]
         if not scraped_data and not nodes:

parsagon-0.9.0/src/parsagon/tests/test_executor.py ADDED Viewed

@@ -0,0 +1,20 @@
+from parsagon.executor import Executor
+class MockDriver:
+    def __init__(self, current_url, page_source):
+        self.current_url = current_url
+        self.page_source = page_source
+class MockExecutor(Executor):
+    def __init__(self, current_url, page_source):
+        self.driver = MockDriver(current_url, page_source)
+        self.max_elem_id = 0
+        self.custom_functions = {}
+def test_makes_links_absolute():
+    executor = MockExecutor("https://example.com/", '<html><body><a href="/stuff">Stuff</a></body></html>')
+    root = executor._get_cleaned_lxml_root()
+    assert root[0][0].get("href") == "https://example.com/stuff"

{parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parsagon
-Version: 0.8.1
+Version: 0.9.0
 Summary: Allows you to create browser automations with natural language
 Author-email: Sandy Suh <sandy@parsagon.io>
 Project-URL: Homepage, https://parsagon.io

{parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/SOURCES.txt RENAMED Viewed

@@ -18,5 +18,6 @@ src/parsagon/tests/__init__.py
 src/parsagon/tests/api_mocks.py
 src/parsagon/tests/cli_mocks.py
 src/parsagon/tests/conftest.py
+src/parsagon/tests/test_executor.py
 src/parsagon/tests/test_invalid_args.py
 src/parsagon/tests/test_pipeline_operations.py