parsagon 0.8.1__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25):
  1. {parsagon-0.8.1 → parsagon-0.9.0}/PKG-INFO +1 -1
  2. {parsagon-0.8.1 → parsagon-0.9.0}/pyproject.toml +1 -1
  3. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/api.py +2 -2
  4. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/executor.py +20 -5
  5. parsagon-0.9.0/src/parsagon/tests/test_executor.py +20 -0
  6. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/PKG-INFO +1 -1
  7. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/SOURCES.txt +1 -0
  8. {parsagon-0.8.1 → parsagon-0.9.0}/README.md +0 -0
  9. {parsagon-0.8.1 → parsagon-0.9.0}/setup.cfg +0 -0
  10. {parsagon-0.8.1 → parsagon-0.9.0}/src/__init__.py +0 -0
  11. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/__init__.py +0 -0
  12. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/custom_function.py +0 -0
  13. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/exceptions.py +0 -0
  14. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/main.py +0 -0
  15. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/settings.py +0 -0
  16. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/__init__.py +0 -0
  17. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/api_mocks.py +0 -0
  18. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/cli_mocks.py +0 -0
  19. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/conftest.py +0 -0
  20. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/test_invalid_args.py +0 -0
  21. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon/tests/test_pipeline_operations.py +0 -0
  22. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/dependency_links.txt +0 -0
  23. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/entry_points.txt +0 -0
  24. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/requires.txt +0 -0
  25. {parsagon-0.8.1 → parsagon-0.9.0}/src/parsagon.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parsagon
3
- Version: 0.8.1
3
+ Version: 0.9.0
4
4
  Summary: Allows you to create browser automations with natural language
5
5
  Author-email: Sandy Suh <sandy@parsagon.io>
6
6
  Project-URL: Homepage, https://parsagon.io
@@ -10,7 +10,7 @@ line-length = 120
10
10
 
11
11
  [project]
12
12
  name = "parsagon"
13
- version = "0.8.1"
13
+ version = "0.9.0"
14
14
  description = "Allows you to create browser automations with natural language"
15
15
  readme = "README.md"
16
16
  requires-python = ">=3.8"
@@ -80,7 +80,7 @@ def get_interaction_element_id(marked_html, elem_type, description):
80
80
  return result
81
81
 
82
82
 
83
- def scrape_page(url, html, schema):
83
+ def scrape_page(html, schema):
84
84
  """
85
85
  Scrapes data from the provided page HTML - data will be returned in the schema provided.
86
86
  :param url: url of the page to scrape.
@@ -88,7 +88,7 @@ def scrape_page(url, html, schema):
88
88
  :param schema: Schema of the data to scrape
89
89
  :return: Scraped data, with lists truncated.
90
90
  """
91
- return _api_call(httpx.post, "/transformers/get-custom-data/", json={"url": url, "html": html, "schema": schema})
91
+ return _api_call(httpx.post, "/transformers/get-custom-data/", json={"html": html, "schema": schema})
92
92
 
93
93
 
94
94
  def create_pipeline(name, description, program_sketch):
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import time
4
+ from urllib.parse import urljoin
4
5
 
5
6
  import lxml.html
6
7
  from pyvirtualdisplay import Display
@@ -74,11 +75,22 @@ class Executor:
74
75
  )
75
76
 
76
77
  def _get_cleaned_lxml_root(self):
77
- driver = self.driver
78
- html = driver.page_source
79
-
80
78
  parser = lxml.html.HTMLParser(remove_comments=True, remove_pis=True)
81
- root = lxml.html.fromstring(html, parser=parser)
79
+ root = lxml.html.fromstring(self.driver.page_source.replace('&nbsp;', ' '), parser=parser)
80
+
81
+ # make links absolute
82
+ root.make_links_absolute(self.driver.current_url)
83
+ for elem in root.xpath('//img[@srcset]'):
84
+ srcset_list = []
85
+ for s in elem.get('srcset').split(','):
86
+ parts = s.strip().split()
87
+ if not parts:
88
+ continue
89
+ parts[0] = urljoin(self.driver.current_url, parts[0])
90
+ srcset_list.append(' '.join(parts))
91
+ elem.set('srcset', ', '.join(srcset_list))
92
+
93
+ # remove unnecessary and bulky elements
82
94
  for elem in root.iterfind(".//script"):
83
95
  elem.text = ""
84
96
  for elem in root.iterfind(".//noscript"):
@@ -188,6 +200,7 @@ class Executor:
188
200
  examples=[
189
201
  {
190
202
  "html": html,
203
+ "url": self.driver.current_url,
191
204
  "elem_id": elem_id,
192
205
  }
193
206
  ],
@@ -222,6 +235,7 @@ class Executor:
222
235
  examples=[
223
236
  {
224
237
  "html": html,
238
+ "url": self.driver.current_url,
225
239
  "elem_id": elem_id,
226
240
  }
227
241
  ],
@@ -260,6 +274,7 @@ class Executor:
260
274
  examples=[
261
275
  {
262
276
  "html": html,
277
+ "url": self.driver.current_url,
263
278
  "elem_id": elem_id,
264
279
  }
265
280
  ],
@@ -287,7 +302,7 @@ class Executor:
287
302
  self.driver.switch_to.window(window_id)
288
303
  logger.info("Scraping data...")
289
304
  html = self.get_scrape_html()
290
- result = scrape_page(self.driver.current_url, html, schema)
305
+ result = scrape_page(html, schema)
291
306
  scraped_data = result["data"]
292
307
  nodes = result["nodes"]
293
308
  if not scraped_data and not nodes:
@@ -0,0 +1,20 @@
1
+ from parsagon.executor import Executor
2
+
3
+
4
+ class MockDriver:
5
+ def __init__(self, current_url, page_source):
6
+ self.current_url = current_url
7
+ self.page_source = page_source
8
+
9
+
10
+ class MockExecutor(Executor):
11
+ def __init__(self, current_url, page_source):
12
+ self.driver = MockDriver(current_url, page_source)
13
+ self.max_elem_id = 0
14
+ self.custom_functions = {}
15
+
16
+
17
+ def test_makes_links_absolute():
18
+ executor = MockExecutor("https://example.com/", '<html><body><a href="/stuff">Stuff</a></body></html>')
19
+ root = executor._get_cleaned_lxml_root()
20
+ assert root[0][0].get("href") == "https://example.com/stuff"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parsagon
3
- Version: 0.8.1
3
+ Version: 0.9.0
4
4
  Summary: Allows you to create browser automations with natural language
5
5
  Author-email: Sandy Suh <sandy@parsagon.io>
6
6
  Project-URL: Homepage, https://parsagon.io
@@ -18,5 +18,6 @@ src/parsagon/tests/__init__.py
18
18
  src/parsagon/tests/api_mocks.py
19
19
  src/parsagon/tests/cli_mocks.py
20
20
  src/parsagon/tests/conftest.py
21
+ src/parsagon/tests/test_executor.py
21
22
  src/parsagon/tests/test_invalid_args.py
22
23
  src/parsagon/tests/test_pipeline_operations.py
File without changes
File without changes
File without changes
File without changes