parsagon 0.13.0__tar.gz → 0.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsagon-0.13.0 → parsagon-0.14.0}/PKG-INFO +1 -1
- {parsagon-0.13.0 → parsagon-0.14.0}/pyproject.toml +1 -1
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/api.py +31 -6
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/assistant.py +4 -4
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/create.py +2 -2
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/executor.py +1 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/main.py +23 -22
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/runs.py +25 -18
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/api_mocks.py +1 -1
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/test_invalid_args.py +2 -12
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/test_pipeline_operations.py +2 -17
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/PKG-INFO +1 -1
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/SOURCES.txt +0 -1
- parsagon-0.13.0/src/parsagon/tests/cli_mocks.py +0 -16
- {parsagon-0.13.0 → parsagon-0.14.0}/README.md +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/setup.cfg +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/__init__.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/__init__.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/custom_function.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/exceptions.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/highlights.js +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/print.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/secrets.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/settings.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/__init__.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/conftest.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/test_executor.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon/tests/test_secrets.py +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/dependency_links.txt +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/entry_points.txt +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/requires.txt +0 -0
- {parsagon-0.13.0 → parsagon-0.14.0}/src/parsagon.egg-info/top_level.txt +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
from json import JSONDecodeError
|
2
2
|
import time
|
3
|
+
from urllib.parse import quote
|
3
4
|
|
4
5
|
import httpx
|
5
6
|
|
@@ -166,7 +167,13 @@ def create_pipeline(name, description, program_sketch, pseudocode, secrets):
|
|
166
167
|
return _api_call(
|
167
168
|
httpx.post,
|
168
169
|
"/pipelines/",
|
169
|
-
json={
|
170
|
+
json={
|
171
|
+
"name": name,
|
172
|
+
"description": description,
|
173
|
+
"program_sketch": program_sketch,
|
174
|
+
"pseudocode": pseudocode,
|
175
|
+
"secrets": secrets,
|
176
|
+
},
|
170
177
|
)
|
171
178
|
|
172
179
|
|
@@ -195,11 +202,21 @@ def add_examples_to_custom_function(pipeline_id, call_id, custom_function, remov
|
|
195
202
|
)
|
196
203
|
|
197
204
|
|
205
|
+
def double_quote(string):
|
206
|
+
"""
|
207
|
+
A bug in Django disallows URLs even with quoted slashes as in:
|
208
|
+
/api/pipelines/name/stripe%2Fstuff/code/ HTTP/1.1" 405
|
209
|
+
Therefore we must double quote
|
210
|
+
"""
|
211
|
+
return quote(quote(string, safe=""), safe="")
|
212
|
+
|
213
|
+
|
198
214
|
def get_pipeline(pipeline_name):
|
215
|
+
escaped_pipeline_name = double_quote(pipeline_name)
|
199
216
|
with RaiseProgramNotFound(pipeline_name):
|
200
217
|
return _api_call(
|
201
218
|
httpx.get,
|
202
|
-
f"/pipelines/name/{
|
219
|
+
f"/pipelines/name/{escaped_pipeline_name}/?double_escaped=True",
|
203
220
|
)
|
204
221
|
|
205
222
|
|
@@ -207,14 +224,16 @@ def get_pipelines():
|
|
207
224
|
return _api_call(httpx.get, f"/pipelines/")
|
208
225
|
|
209
226
|
|
210
|
-
def get_pipeline_code(pipeline_name, variables, headless):
|
227
|
+
def get_pipeline_code(pipeline_name, variables, headless, use_uc):
|
228
|
+
escaped_pipeline_name = double_quote(pipeline_name)
|
211
229
|
with RaiseProgramNotFound(pipeline_name):
|
212
230
|
return _api_call(
|
213
231
|
httpx.post,
|
214
|
-
f"/pipelines/name/{
|
232
|
+
f"/pipelines/name/{escaped_pipeline_name}/code/?double_escaped=True",
|
215
233
|
json={
|
216
234
|
"variables": variables,
|
217
235
|
"headless": headless,
|
236
|
+
"use_uc": use_uc,
|
218
237
|
},
|
219
238
|
)
|
220
239
|
|
@@ -246,11 +265,17 @@ def get_run(run_id):
|
|
246
265
|
|
247
266
|
|
248
267
|
def send_assistant_message(message, thread_id=None):
|
249
|
-
return _api_call(
|
268
|
+
return _api_call(
|
269
|
+
httpx.post, "/transformers/send-assistant-message/", json={"message": message, "thread_id": thread_id}
|
270
|
+
)
|
250
271
|
|
251
272
|
|
252
273
|
def send_assistant_function_outputs(outputs, thread_id, run_id):
|
253
|
-
return _api_call(
|
274
|
+
return _api_call(
|
275
|
+
httpx.post,
|
276
|
+
"/transformers/send-assistant-function-outputs/",
|
277
|
+
json={"outputs": outputs, "thread_id": thread_id, "run_id": run_id},
|
278
|
+
)
|
254
279
|
|
255
280
|
|
256
281
|
def poll_extract(url, page_type):
|
@@ -4,10 +4,10 @@ from parsagon.create import create_program
|
|
4
4
|
from parsagon.executor import Executor
|
5
5
|
from parsagon.print import assistant_print, assistant_spinner, browser_print, error_print
|
6
6
|
from rich.prompt import Prompt
|
7
|
-
from parsagon.runs import run, batch_runs
|
7
|
+
from parsagon.runs import run, batch_runs
|
8
8
|
|
9
9
|
|
10
|
-
def assist(
|
10
|
+
def assist(verbose=False):
|
11
11
|
task = Prompt.ask("Type what do you want to do")
|
12
12
|
with assistant_spinner():
|
13
13
|
response = send_assistant_message(task)
|
@@ -37,11 +37,11 @@ def assist(headless=False, infer=False, verbose=False):
|
|
37
37
|
output["output"] = html
|
38
38
|
outputs.append(output)
|
39
39
|
elif name == "create_program":
|
40
|
-
result = create_program(args["description"]
|
40
|
+
result = create_program(args["description"])
|
41
41
|
output["output"] = json.dumps(result)
|
42
42
|
outputs.append(output)
|
43
43
|
elif name == "run_program":
|
44
|
-
result =
|
44
|
+
result = run(**args)
|
45
45
|
output["output"] = json.dumps(result)
|
46
46
|
outputs.append(output)
|
47
47
|
elif name == "batch_runs":
|
@@ -6,7 +6,7 @@ from parsagon.secrets import extract_secrets
|
|
6
6
|
from rich.prompt import Prompt
|
7
7
|
|
8
8
|
|
9
|
-
def create_program(task, headless=False, infer=False):
|
9
|
+
def create_program(task, headless=False, infer=False, undetected=False):
|
10
10
|
assistant_print("Creating a program based on your specifications...")
|
11
11
|
task, secrets = extract_secrets(task)
|
12
12
|
program_sketches = get_program_sketches(task)
|
@@ -28,7 +28,7 @@ def create_program(task, headless=False, infer=False):
|
|
28
28
|
abridged_program += f"\n\noutput = func({args})\n" # Make the program runnable
|
29
29
|
|
30
30
|
# Execute the abridged program to gather examples
|
31
|
-
executor = Executor(headless=headless, infer=infer)
|
31
|
+
executor = Executor(headless=headless, infer=infer, use_uc=undetected)
|
32
32
|
executor.execute(abridged_program)
|
33
33
|
|
34
34
|
# The user must select a name
|
@@ -17,28 +17,18 @@ from parsagon.assistant import assist
|
|
17
17
|
from parsagon.create import create_program
|
18
18
|
from parsagon.exceptions import ParsagonException
|
19
19
|
from parsagon.executor import Executor, custom_functions_to_descriptions
|
20
|
-
from parsagon.runs import
|
20
|
+
from parsagon.runs import run
|
21
21
|
from parsagon.settings import get_api_key, save_setting, configure_logging
|
22
22
|
|
23
23
|
console = Console()
|
24
24
|
logger = logging.getLogger(__name__)
|
25
25
|
|
26
26
|
|
27
|
-
def get_args():
|
27
|
+
def get_args(argv):
|
28
28
|
parser = argparse.ArgumentParser(
|
29
29
|
prog="parsagon", description="Scrapes and interacts with web pages based on natural language.", add_help=False
|
30
30
|
)
|
31
31
|
parser.add_argument("-v", "--verbose", action="store_true", help="run the task in verbose mode")
|
32
|
-
parser.add_argument(
|
33
|
-
"--headless",
|
34
|
-
action="store_true",
|
35
|
-
help="run the browser in headless mode",
|
36
|
-
)
|
37
|
-
parser.add_argument(
|
38
|
-
"--infer",
|
39
|
-
action="store_true",
|
40
|
-
help="let Parsagon infer all elements to be scraped",
|
41
|
-
)
|
42
32
|
subparsers = parser.add_subparsers()
|
43
33
|
|
44
34
|
# Create
|
@@ -53,6 +43,11 @@ def get_args():
|
|
53
43
|
action="store_true",
|
54
44
|
help="let Parsagon infer all elements to be scraped",
|
55
45
|
)
|
46
|
+
parser_create.add_argument(
|
47
|
+
"--undetected",
|
48
|
+
action="store_true",
|
49
|
+
help="run in undetected mode",
|
50
|
+
)
|
56
51
|
parser_create.set_defaults(func=create_cli)
|
57
52
|
|
58
53
|
# Detail
|
@@ -133,7 +128,17 @@ def get_args():
|
|
133
128
|
action="store_true",
|
134
129
|
help="output log data from the run",
|
135
130
|
)
|
136
|
-
parser_run.
|
131
|
+
parser_run.add_argument(
|
132
|
+
"--output_file",
|
133
|
+
type=str,
|
134
|
+
help="write the data to the given file path",
|
135
|
+
)
|
136
|
+
parser_run.add_argument(
|
137
|
+
"--undetected",
|
138
|
+
action="store_true",
|
139
|
+
help="run in undetected mode",
|
140
|
+
)
|
141
|
+
parser_run.set_defaults(func=run)
|
137
142
|
|
138
143
|
# Delete
|
139
144
|
parser_delete = subparsers.add_parser(
|
@@ -164,20 +169,16 @@ def get_args():
|
|
164
169
|
)
|
165
170
|
parser_help.set_defaults(func=help, parser=parser)
|
166
171
|
|
167
|
-
args = parser.parse_args()
|
172
|
+
args = parser.parse_args(argv)
|
168
173
|
kwargs = vars(args)
|
169
174
|
return kwargs, parser
|
170
175
|
|
171
176
|
|
172
|
-
def main():
|
173
|
-
kwargs, parser = get_args()
|
177
|
+
def main(argv=None):
|
178
|
+
kwargs, parser = get_args(argv)
|
174
179
|
func = kwargs.pop("func", None)
|
175
180
|
if func is None:
|
176
181
|
func = assist
|
177
|
-
else:
|
178
|
-
# Pop assist-only arguments
|
179
|
-
kwargs.pop("infer")
|
180
|
-
kwargs.pop("headless")
|
181
182
|
verbose = kwargs["verbose"]
|
182
183
|
configure_logging(verbose)
|
183
184
|
|
@@ -188,9 +189,9 @@ def main():
|
|
188
189
|
logger.error(error_message)
|
189
190
|
|
190
191
|
|
191
|
-
def create_cli(headless=False, infer=False, verbose=False):
|
192
|
+
def create_cli(headless=False, infer=False, undetected=False, verbose=False):
|
192
193
|
task = Prompt.ask("Enter a detailed scraping task")
|
193
|
-
create_program(task, headless=headless, infer=infer)
|
194
|
+
create_program(task, headless=headless, infer=infer, undetected=undetected)
|
194
195
|
|
195
196
|
|
196
197
|
def update(program_name, variables={}, headless=False, infer=False, replace=False, verbose=False):
|
@@ -1,5 +1,4 @@
|
|
1
1
|
import datetime
|
2
|
-
import datetime
|
3
2
|
import json
|
4
3
|
import logging.config
|
5
4
|
import time
|
@@ -24,18 +23,7 @@ console = Console()
|
|
24
23
|
logger = logging.getLogger(__name__)
|
25
24
|
|
26
25
|
|
27
|
-
def
|
28
|
-
dump_path = Prompt.ask("Please enter a path/filename to save the output (in JSON format)")
|
29
|
-
if not dump_path.endswith(".json"):
|
30
|
-
dump_path += ".json"
|
31
|
-
result = run(*args, **kwargs)
|
32
|
-
with open(dump_path, "w") as f:
|
33
|
-
json.dump(result, f, indent=4)
|
34
|
-
print(f"Output saved to {dump_path}")
|
35
|
-
return result
|
36
|
-
|
37
|
-
|
38
|
-
def run(program_name, variables={}, headless=False, remote=False, output_log=False, verbose=False):
|
26
|
+
def run(program_name, variables={}, headless=False, remote=False, output_log=False, output_file=None, undetected=False, verbose=False):
|
39
27
|
"""
|
40
28
|
Executes pipeline code
|
41
29
|
"""
|
@@ -56,7 +44,13 @@ def run(program_name, variables={}, headless=False, remote=False, output_log=Fal
|
|
56
44
|
status = run["status"]
|
57
45
|
|
58
46
|
if output_log and status in ("FINISHED", "ERROR"):
|
59
|
-
|
47
|
+
result = {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
48
|
+
if output_file:
|
49
|
+
with open(output_file, "w") as f:
|
50
|
+
json.dump(result, f, indent=4)
|
51
|
+
return
|
52
|
+
else:
|
53
|
+
return result
|
60
54
|
|
61
55
|
if status == "FINISHED":
|
62
56
|
if verbose:
|
@@ -64,7 +58,13 @@ def run(program_name, variables={}, headless=False, remote=False, output_log=Fal
|
|
64
58
|
for warning in run["warnings"]:
|
65
59
|
logger.warning(warning)
|
66
60
|
logger.info("Program finished running.")
|
67
|
-
|
61
|
+
result = run["output"]
|
62
|
+
if output_file:
|
63
|
+
with open(output_file, "w") as f:
|
64
|
+
json.dump(result, f, indent=4)
|
65
|
+
return
|
66
|
+
else:
|
67
|
+
return result
|
68
68
|
elif status == "ERROR":
|
69
69
|
raise ParsagonException(f"Program failed to run: {run['error']}")
|
70
70
|
elif status == "CANCELED":
|
@@ -73,7 +73,7 @@ def run(program_name, variables={}, headless=False, remote=False, output_log=Fal
|
|
73
73
|
time.sleep(5)
|
74
74
|
|
75
75
|
run = create_pipeline_run(pipeline_id, variables, True)
|
76
|
-
code = get_pipeline_code(program_name, variables, headless)["code"]
|
76
|
+
code = get_pipeline_code(program_name, variables, headless, undetected)["code"]
|
77
77
|
start_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
78
78
|
run_data = {"start_time": start_time}
|
79
79
|
|
@@ -107,11 +107,18 @@ def run(program_name, variables={}, headless=False, remote=False, output_log=Fal
|
|
107
107
|
continue
|
108
108
|
run = update_pipeline_run(run["id"], run_data)
|
109
109
|
logger.info("Done.")
|
110
|
+
|
111
|
+
result = globals_locals["output"]
|
110
112
|
if output_log:
|
111
113
|
if "error" not in run_data:
|
112
114
|
run["output"] = globals_locals["output"]
|
113
|
-
|
114
|
-
|
115
|
+
result = {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
116
|
+
if output_file:
|
117
|
+
with open(output_file, "w") as f:
|
118
|
+
json.dump(result, f, indent=4)
|
119
|
+
return
|
120
|
+
else:
|
121
|
+
return result
|
115
122
|
|
116
123
|
|
117
124
|
def batch_runs(
|
@@ -53,7 +53,7 @@ def mock_httpx_method_func(*args, **kwargs):
|
|
53
53
|
},
|
54
54
|
)
|
55
55
|
|
56
|
-
if match := re.search(r"/pipelines/name/(.+)
|
56
|
+
if match := re.search(r"/pipelines/name/(.+)/", url):
|
57
57
|
assert method == "get"
|
58
58
|
pipeline_name = match.group(1)
|
59
59
|
if pipeline_name == not_found_pipeline_name:
|
@@ -1,23 +1,13 @@
|
|
1
1
|
import pytest
|
2
2
|
|
3
|
-
from parsagon import
|
4
|
-
from parsagon.tests.cli_mocks import call_cli
|
3
|
+
from parsagon.main import main
|
5
4
|
|
6
5
|
|
7
6
|
def test_headless_remote_run_invalid(mocker, debug_logs):
|
8
7
|
"""
|
9
8
|
Tests that we are unable to run a program in headless mode when the environment is remote, and that this is logged to the user.
|
10
9
|
"""
|
11
|
-
|
12
|
-
mocker,
|
13
|
-
{
|
14
|
-
"func": run,
|
15
|
-
"program_name": "test_program",
|
16
|
-
"headless": True,
|
17
|
-
"remote": True,
|
18
|
-
"verbose": False,
|
19
|
-
},
|
20
|
-
)
|
10
|
+
main(["run", "test_program", "--headless", "--remote"])
|
21
11
|
debug_logs_lower = debug_logs.text.lower()
|
22
12
|
assert "error" in debug_logs_lower
|
23
13
|
assert "headless" in debug_logs_lower
|
@@ -6,7 +6,6 @@ import pytest
|
|
6
6
|
from parsagon import delete, run
|
7
7
|
from parsagon.main import main
|
8
8
|
from parsagon.tests.api_mocks import install_api_mocks, not_found_pipeline_name
|
9
|
-
from parsagon.tests.cli_mocks import call_cli
|
10
9
|
|
11
10
|
|
12
11
|
def test_pipeline_delete(mocker):
|
@@ -18,24 +17,10 @@ def test_pipeline_not_found(mocker, debug_logs):
|
|
18
17
|
install_api_mocks(mocker, {"code_to_return": 'raise Exception("Should not exec this code if pipeline not found.")'})
|
19
18
|
|
20
19
|
# On delete
|
21
|
-
|
22
|
-
mocker,
|
23
|
-
{
|
24
|
-
"func": delete,
|
25
|
-
"program_name": not_found_pipeline_name,
|
26
|
-
"verbose": False,
|
27
|
-
},
|
28
|
-
)
|
20
|
+
main(["delete", not_found_pipeline_name, "-y"])
|
29
21
|
assert f"A program with name {not_found_pipeline_name} does not exist." in debug_logs.text
|
30
22
|
debug_logs.clear()
|
31
23
|
|
32
24
|
# On attempted run
|
33
|
-
|
34
|
-
mocker,
|
35
|
-
{
|
36
|
-
"func": run,
|
37
|
-
"program_name": not_found_pipeline_name,
|
38
|
-
"verbose": False,
|
39
|
-
},
|
40
|
-
)
|
25
|
+
main(["run", not_found_pipeline_name])
|
41
26
|
assert f"A program with name {not_found_pipeline_name} does not exist." in debug_logs.text
|
@@ -22,7 +22,6 @@ src/parsagon.egg-info/requires.txt
|
|
22
22
|
src/parsagon.egg-info/top_level.txt
|
23
23
|
src/parsagon/tests/__init__.py
|
24
24
|
src/parsagon/tests/api_mocks.py
|
25
|
-
src/parsagon/tests/cli_mocks.py
|
26
25
|
src/parsagon/tests/conftest.py
|
27
26
|
src/parsagon/tests/test_executor.py
|
28
27
|
src/parsagon/tests/test_invalid_args.py
|
@@ -1,16 +0,0 @@
|
|
1
|
-
from parsagon.main import main
|
2
|
-
|
3
|
-
|
4
|
-
def call_cli(mocker, args):
|
5
|
-
"""
|
6
|
-
Uses the mocker to pretend that the args passed are coming from argparse, then calls the main function.
|
7
|
-
"""
|
8
|
-
|
9
|
-
mocker.patch(
|
10
|
-
"parsagon.main.get_args",
|
11
|
-
lambda: (
|
12
|
-
args,
|
13
|
-
None,
|
14
|
-
),
|
15
|
-
)
|
16
|
-
return main()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|