parsagon 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsagon/api.py +2 -2
- parsagon/executor.py +20 -5
- parsagon/tests/test_executor.py +20 -0
- {parsagon-0.8.1.dist-info → parsagon-0.9.0.dist-info}/METADATA +1 -1
- {parsagon-0.8.1.dist-info → parsagon-0.9.0.dist-info}/RECORD +8 -7
- {parsagon-0.8.1.dist-info → parsagon-0.9.0.dist-info}/WHEEL +0 -0
- {parsagon-0.8.1.dist-info → parsagon-0.9.0.dist-info}/entry_points.txt +0 -0
- {parsagon-0.8.1.dist-info → parsagon-0.9.0.dist-info}/top_level.txt +0 -0
parsagon/api.py
CHANGED
@@ -80,7 +80,7 @@ def get_interaction_element_id(marked_html, elem_type, description):
|
|
80
80
|
return result
|
81
81
|
|
82
82
|
|
83
|
-
def scrape_page(url, html, schema):
|
83
|
+
def scrape_page(html, schema):
|
84
84
|
"""
|
85
85
|
Scrapes data from the provided page HTML - data will be returned in the schema provided.
|
86
86
|
:param url: url of the page to scrape.
|
@@ -88,7 +88,7 @@ def scrape_page(url, html, schema):
|
|
88
88
|
:param schema: Schema of the data to scrape
|
89
89
|
:return: Scraped data, with lists truncated.
|
90
90
|
"""
|
91
|
-
return _api_call(httpx.post, "/transformers/get-custom-data/", json={"
|
91
|
+
return _api_call(httpx.post, "/transformers/get-custom-data/", json={"html": html, "schema": schema})
|
92
92
|
|
93
93
|
|
94
94
|
def create_pipeline(name, description, program_sketch):
|
parsagon/executor.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
3
|
import time
|
4
|
+
from urllib.parse import urljoin
|
4
5
|
|
5
6
|
import lxml.html
|
6
7
|
from pyvirtualdisplay import Display
|
@@ -74,11 +75,22 @@ class Executor:
|
|
74
75
|
)
|
75
76
|
|
76
77
|
def _get_cleaned_lxml_root(self):
|
77
|
-
driver = self.driver
|
78
|
-
html = driver.page_source
|
79
|
-
|
80
78
|
parser = lxml.html.HTMLParser(remove_comments=True, remove_pis=True)
|
81
|
-
root = lxml.html.fromstring(
|
79
|
+
root = lxml.html.fromstring(self.driver.page_source.replace('&nbsp;', ' '), parser=parser)
|
80
|
+
|
81
|
+
# make links absolute
|
82
|
+
root.make_links_absolute(self.driver.current_url)
|
83
|
+
for elem in root.xpath('//img[@srcset]'):
|
84
|
+
srcset_list = []
|
85
|
+
for s in elem.get('srcset').split(','):
|
86
|
+
parts = s.strip().split()
|
87
|
+
if not parts:
|
88
|
+
continue
|
89
|
+
parts[0] = urljoin(self.driver.current_url, parts[0])
|
90
|
+
srcset_list.append(' '.join(parts))
|
91
|
+
elem.set('srcset', ', '.join(srcset_list))
|
92
|
+
|
93
|
+
# remove unnecessary and bulky elements
|
82
94
|
for elem in root.iterfind(".//script"):
|
83
95
|
elem.text = ""
|
84
96
|
for elem in root.iterfind(".//noscript"):
|
@@ -188,6 +200,7 @@ class Executor:
|
|
188
200
|
examples=[
|
189
201
|
{
|
190
202
|
"html": html,
|
203
|
+
"url": self.driver.current_url,
|
191
204
|
"elem_id": elem_id,
|
192
205
|
}
|
193
206
|
],
|
@@ -222,6 +235,7 @@ class Executor:
|
|
222
235
|
examples=[
|
223
236
|
{
|
224
237
|
"html": html,
|
238
|
+
"url": self.driver.current_url,
|
225
239
|
"elem_id": elem_id,
|
226
240
|
}
|
227
241
|
],
|
@@ -260,6 +274,7 @@ class Executor:
|
|
260
274
|
examples=[
|
261
275
|
{
|
262
276
|
"html": html,
|
277
|
+
"url": self.driver.current_url,
|
263
278
|
"elem_id": elem_id,
|
264
279
|
}
|
265
280
|
],
|
@@ -287,7 +302,7 @@ class Executor:
|
|
287
302
|
self.driver.switch_to.window(window_id)
|
288
303
|
logger.info("Scraping data...")
|
289
304
|
html = self.get_scrape_html()
|
290
|
-
result = scrape_page(
|
305
|
+
result = scrape_page(html, schema)
|
291
306
|
scraped_data = result["data"]
|
292
307
|
nodes = result["nodes"]
|
293
308
|
if not scraped_data and not nodes:
|
@@ -0,0 +1,20 @@
|
|
1
|
+
from parsagon.executor import Executor
|
2
|
+
|
3
|
+
|
4
|
+
class MockDriver:
|
5
|
+
def __init__(self, current_url, page_source):
|
6
|
+
self.current_url = current_url
|
7
|
+
self.page_source = page_source
|
8
|
+
|
9
|
+
|
10
|
+
class MockExecutor(Executor):
|
11
|
+
def __init__(self, current_url, page_source):
|
12
|
+
self.driver = MockDriver(current_url, page_source)
|
13
|
+
self.max_elem_id = 0
|
14
|
+
self.custom_functions = {}
|
15
|
+
|
16
|
+
|
17
|
+
def test_makes_links_absolute():
|
18
|
+
executor = MockExecutor("https://example.com/", '<html><body><a href="/stuff">Stuff</a></body></html>')
|
19
|
+
root = executor._get_cleaned_lxml_root()
|
20
|
+
assert root[0][0].get("href") == "https://example.com/stuff"
|
@@ -1,19 +1,20 @@
|
|
1
1
|
__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
parsagon/__init__.py,sha256=YRvTX1ekTKcPXTKXLgr6-CrRC0DFc11L3gFSPI_Ohc8,54
|
3
|
-
parsagon/api.py,sha256=
|
3
|
+
parsagon/api.py,sha256=aMK4So9bKUFRagjuxHWPWbZ5e8dYE8t-U3m5qnS31yQ,4660
|
4
4
|
parsagon/custom_function.py,sha256=oEj28qItaHUnsvLIHD7kg5QL3J3aO6rW6xKKP-H-Drs,770
|
5
5
|
parsagon/exceptions.py,sha256=NYpFaSLZplBTv9fov_1LKPzDPIqb7Ffe7IunnjntxvA,819
|
6
|
-
parsagon/executor.py,sha256=
|
6
|
+
parsagon/executor.py,sha256=Q01Jbrd2J4o7JZtZ-6VFlPHG7Y8HZh24hpuMYL_sGgg,12172
|
7
7
|
parsagon/main.py,sha256=0BeaWwk07S4JSEGTOiPM-I1fQpRkjislsazSMUooOMM,9044
|
8
8
|
parsagon/settings.py,sha256=s5_MsDMFM5tB8U8tfHaFnKibCoEqPnAu8b_ueg07Ftw,2947
|
9
9
|
parsagon/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
parsagon/tests/api_mocks.py,sha256=M8xhiyPa1dI8Vx-odDk7ETopfFAfcjfAf-ApmSqgvfw,3127
|
11
11
|
parsagon/tests/cli_mocks.py,sha256=Y4W_wgH6ixQRCk8xVdWOwDJ_ChD09XdZEV2xUVXWFiM,327
|
12
12
|
parsagon/tests/conftest.py,sha256=KMlHohc0QT77HzumraIojzKeqroyxarnaT6naJDNvEc,428
|
13
|
+
parsagon/tests/test_executor.py,sha256=n3cmh84r74siSeJqUeAIwjjnNzDVPEdxcvYAeJ4hNX8,645
|
13
14
|
parsagon/tests/test_invalid_args.py,sha256=kOjMpbZvviR1CwvXReteZMxBvuhq_rOv5Tm1muBSzNk,676
|
14
15
|
parsagon/tests/test_pipeline_operations.py,sha256=TpBKCuRA8LHYWx3PD_k9mYCSsA_9SZjrOX-rS4mE8XE,1089
|
15
|
-
parsagon-0.
|
16
|
-
parsagon-0.
|
17
|
-
parsagon-0.
|
18
|
-
parsagon-0.
|
19
|
-
parsagon-0.
|
16
|
+
parsagon-0.9.0.dist-info/METADATA,sha256=ACH3CmOYLdTxlGIlWB7TJ0QDmNEhoYfwgkfg34s2Udo,2253
|
17
|
+
parsagon-0.9.0.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
|
18
|
+
parsagon-0.9.0.dist-info/entry_points.txt,sha256=I1UlPUb4oY2k9idkI8kvdkEcrjKGRSOl5pMbA6uu6kw,48
|
19
|
+
parsagon-0.9.0.dist-info/top_level.txt,sha256=xGJFmxOu0AyQHbFRRB8j6R2WrwpGWPyxtLIPdW9jKA0,18
|
20
|
+
parsagon-0.9.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|