parsagon 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parsagon/api.py CHANGED
@@ -83,8 +83,9 @@ def get_interaction_element_id(marked_html, elem_type, description):
83
83
  def scrape_page(html, schema):
84
84
  """
85
85
  Scrapes data from the provided page HTML - data will be returned in the schema provided.
86
+ :param url: url of the page to scrape.
86
87
  :param html: HTML of the page to scrape.
87
- :param schema: Schema of the data to scrape, in the format
88
+ :param schema: Schema of the data to scrape
88
89
  :return: Scraped data, with lists truncated.
89
90
  """
90
91
  return _api_call(httpx.post, "/transformers/get-custom-data/", json={"html": html, "schema": schema})
parsagon/executor.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import time
4
+ from urllib.parse import urljoin
4
5
 
5
6
  import lxml.html
6
7
  from pyvirtualdisplay import Display
@@ -74,11 +75,22 @@ class Executor:
74
75
  )
75
76
 
76
77
  def _get_cleaned_lxml_root(self):
77
- driver = self.driver
78
- html = driver.page_source
79
-
80
78
  parser = lxml.html.HTMLParser(remove_comments=True, remove_pis=True)
81
- root = lxml.html.fromstring(html, parser=parser)
79
+ root = lxml.html.fromstring(self.driver.page_source.replace(' ', ' '), parser=parser)
80
+
81
+ # make links absolute
82
+ root.make_links_absolute(self.driver.current_url)
83
+ for elem in root.xpath('//img[@srcset]'):
84
+ srcset_list = []
85
+ for s in elem.get('srcset').split(','):
86
+ parts = s.strip().split()
87
+ if not parts:
88
+ continue
89
+ parts[0] = urljoin(self.driver.current_url, parts[0])
90
+ srcset_list.append(' '.join(parts))
91
+ elem.set('srcset', ', '.join(srcset_list))
92
+
93
+ # remove unnecessary and bulky elements
82
94
  for elem in root.iterfind(".//script"):
83
95
  elem.text = ""
84
96
  for elem in root.iterfind(".//noscript"):
@@ -188,6 +200,7 @@ class Executor:
188
200
  examples=[
189
201
  {
190
202
  "html": html,
203
+ "url": self.driver.current_url,
191
204
  "elem_id": elem_id,
192
205
  }
193
206
  ],
@@ -222,6 +235,7 @@ class Executor:
222
235
  examples=[
223
236
  {
224
237
  "html": html,
238
+ "url": self.driver.current_url,
225
239
  "elem_id": elem_id,
226
240
  }
227
241
  ],
@@ -260,6 +274,7 @@ class Executor:
260
274
  examples=[
261
275
  {
262
276
  "html": html,
277
+ "url": self.driver.current_url,
263
278
  "elem_id": elem_id,
264
279
  }
265
280
  ],
@@ -0,0 +1,20 @@
1
+ from parsagon.executor import Executor
2
+
3
+
4
+ class MockDriver:
5
+ def __init__(self, current_url, page_source):
6
+ self.current_url = current_url
7
+ self.page_source = page_source
8
+
9
+
10
+ class MockExecutor(Executor):
11
+ def __init__(self, current_url, page_source):
12
+ self.driver = MockDriver(current_url, page_source)
13
+ self.max_elem_id = 0
14
+ self.custom_functions = {}
15
+
16
+
17
+ def test_makes_links_absolute():
18
+ executor = MockExecutor("https://example.com/", '<html><body><a href="/stuff">Stuff</a></body></html>')
19
+ root = executor._get_cleaned_lxml_root()
20
+ assert root[0][0].get("href") == "https://example.com/stuff"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parsagon
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: Allows you to create browser automations with natural language
5
5
  Author-email: Sandy Suh <sandy@parsagon.io>
6
6
  Project-URL: Homepage, https://parsagon.io
@@ -1,19 +1,20 @@
1
1
  __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  parsagon/__init__.py,sha256=YRvTX1ekTKcPXTKXLgr6-CrRC0DFc11L3gFSPI_Ohc8,54
3
- parsagon/api.py,sha256=857ArenA4j4VN6ME2LdBkjK2YIFhbDbPUoYcNBY5Qt8,4632
3
+ parsagon/api.py,sha256=aMK4So9bKUFRagjuxHWPWbZ5e8dYE8t-U3m5qnS31yQ,4660
4
4
  parsagon/custom_function.py,sha256=oEj28qItaHUnsvLIHD7kg5QL3J3aO6rW6xKKP-H-Drs,770
5
5
  parsagon/exceptions.py,sha256=NYpFaSLZplBTv9fov_1LKPzDPIqb7Ffe7IunnjntxvA,819
6
- parsagon/executor.py,sha256=_8dXBseurlgY-J_HIuJwuNC3JVH0_2SXT3m14TuaX84,11458
6
+ parsagon/executor.py,sha256=Q01Jbrd2J4o7JZtZ-6VFlPHG7Y8HZh24hpuMYL_sGgg,12172
7
7
  parsagon/main.py,sha256=0BeaWwk07S4JSEGTOiPM-I1fQpRkjislsazSMUooOMM,9044
8
8
  parsagon/settings.py,sha256=s5_MsDMFM5tB8U8tfHaFnKibCoEqPnAu8b_ueg07Ftw,2947
9
9
  parsagon/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  parsagon/tests/api_mocks.py,sha256=M8xhiyPa1dI8Vx-odDk7ETopfFAfcjfAf-ApmSqgvfw,3127
11
11
  parsagon/tests/cli_mocks.py,sha256=Y4W_wgH6ixQRCk8xVdWOwDJ_ChD09XdZEV2xUVXWFiM,327
12
12
  parsagon/tests/conftest.py,sha256=KMlHohc0QT77HzumraIojzKeqroyxarnaT6naJDNvEc,428
13
+ parsagon/tests/test_executor.py,sha256=n3cmh84r74siSeJqUeAIwjjnNzDVPEdxcvYAeJ4hNX8,645
13
14
  parsagon/tests/test_invalid_args.py,sha256=kOjMpbZvviR1CwvXReteZMxBvuhq_rOv5Tm1muBSzNk,676
14
15
  parsagon/tests/test_pipeline_operations.py,sha256=TpBKCuRA8LHYWx3PD_k9mYCSsA_9SZjrOX-rS4mE8XE,1089
15
- parsagon-0.8.0.dist-info/METADATA,sha256=V5kcJsIRxt7fZbSoBGffrts_LVI0eIbj6033_GHmy8M,2253
16
- parsagon-0.8.0.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
17
- parsagon-0.8.0.dist-info/entry_points.txt,sha256=I1UlPUb4oY2k9idkI8kvdkEcrjKGRSOl5pMbA6uu6kw,48
18
- parsagon-0.8.0.dist-info/top_level.txt,sha256=xGJFmxOu0AyQHbFRRB8j6R2WrwpGWPyxtLIPdW9jKA0,18
19
- parsagon-0.8.0.dist-info/RECORD,,
16
+ parsagon-0.9.0.dist-info/METADATA,sha256=ACH3CmOYLdTxlGIlWB7TJ0QDmNEhoYfwgkfg34s2Udo,2253
17
+ parsagon-0.9.0.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
18
+ parsagon-0.9.0.dist-info/entry_points.txt,sha256=I1UlPUb4oY2k9idkI8kvdkEcrjKGRSOl5pMbA6uu6kw,48
19
+ parsagon-0.9.0.dist-info/top_level.txt,sha256=xGJFmxOu0AyQHbFRRB8j6R2WrwpGWPyxtLIPdW9jKA0,18
20
+ parsagon-0.9.0.dist-info/RECORD,,