parsagon 0.12.4__tar.gz → 0.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsagon-0.12.4 → parsagon-0.14.0}/PKG-INFO +1 -1
- {parsagon-0.12.4 → parsagon-0.14.0}/pyproject.toml +1 -1
- parsagon-0.14.0/src/parsagon/__init__.py +3 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/api.py +31 -6
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/assistant.py +13 -2
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/create.py +2 -2
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/executor.py +1 -0
- parsagon-0.14.0/src/parsagon/main.py +300 -0
- parsagon-0.14.0/src/parsagon/runs.py +227 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/settings.py +5 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/api_mocks.py +1 -1
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/test_invalid_args.py +2 -12
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/test_pipeline_operations.py +2 -17
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/PKG-INFO +1 -1
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/SOURCES.txt +1 -1
- parsagon-0.12.4/src/parsagon/__init__.py +0 -1
- parsagon-0.12.4/src/parsagon/main.py +0 -454
- parsagon-0.12.4/src/parsagon/tests/cli_mocks.py +0 -16
- {parsagon-0.12.4 → parsagon-0.14.0}/README.md +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/setup.cfg +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/__init__.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/custom_function.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/exceptions.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/highlights.js +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/print.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/secrets.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/__init__.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/conftest.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/test_executor.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon/tests/test_secrets.py +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/dependency_links.txt +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/entry_points.txt +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/requires.txt +0 -0
- {parsagon-0.12.4 → parsagon-0.14.0}/src/parsagon.egg-info/top_level.txt +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
from json import JSONDecodeError
|
2
2
|
import time
|
3
|
+
from urllib.parse import quote
|
3
4
|
|
4
5
|
import httpx
|
5
6
|
|
@@ -166,7 +167,13 @@ def create_pipeline(name, description, program_sketch, pseudocode, secrets):
|
|
166
167
|
return _api_call(
|
167
168
|
httpx.post,
|
168
169
|
"/pipelines/",
|
169
|
-
json={
|
170
|
+
json={
|
171
|
+
"name": name,
|
172
|
+
"description": description,
|
173
|
+
"program_sketch": program_sketch,
|
174
|
+
"pseudocode": pseudocode,
|
175
|
+
"secrets": secrets,
|
176
|
+
},
|
170
177
|
)
|
171
178
|
|
172
179
|
|
@@ -195,11 +202,21 @@ def add_examples_to_custom_function(pipeline_id, call_id, custom_function, remov
|
|
195
202
|
)
|
196
203
|
|
197
204
|
|
205
|
+
def double_quote(string):
|
206
|
+
"""
|
207
|
+
A bug in Django disallows URLs even with quoted slashes as in:
|
208
|
+
/api/pipelines/name/stripe%2Fstuff/code/ HTTP/1.1" 405
|
209
|
+
Therefore we must double quote
|
210
|
+
"""
|
211
|
+
return quote(quote(string, safe=""), safe="")
|
212
|
+
|
213
|
+
|
198
214
|
def get_pipeline(pipeline_name):
|
215
|
+
escaped_pipeline_name = double_quote(pipeline_name)
|
199
216
|
with RaiseProgramNotFound(pipeline_name):
|
200
217
|
return _api_call(
|
201
218
|
httpx.get,
|
202
|
-
f"/pipelines/name/{
|
219
|
+
f"/pipelines/name/{escaped_pipeline_name}/?double_escaped=True",
|
203
220
|
)
|
204
221
|
|
205
222
|
|
@@ -207,14 +224,16 @@ def get_pipelines():
|
|
207
224
|
return _api_call(httpx.get, f"/pipelines/")
|
208
225
|
|
209
226
|
|
210
|
-
def get_pipeline_code(pipeline_name, variables, headless):
|
227
|
+
def get_pipeline_code(pipeline_name, variables, headless, use_uc):
|
228
|
+
escaped_pipeline_name = double_quote(pipeline_name)
|
211
229
|
with RaiseProgramNotFound(pipeline_name):
|
212
230
|
return _api_call(
|
213
231
|
httpx.post,
|
214
|
-
f"/pipelines/name/{
|
232
|
+
f"/pipelines/name/{escaped_pipeline_name}/code/?double_escaped=True",
|
215
233
|
json={
|
216
234
|
"variables": variables,
|
217
235
|
"headless": headless,
|
236
|
+
"use_uc": use_uc,
|
218
237
|
},
|
219
238
|
)
|
220
239
|
|
@@ -246,11 +265,17 @@ def get_run(run_id):
|
|
246
265
|
|
247
266
|
|
248
267
|
def send_assistant_message(message, thread_id=None):
|
249
|
-
return _api_call(
|
268
|
+
return _api_call(
|
269
|
+
httpx.post, "/transformers/send-assistant-message/", json={"message": message, "thread_id": thread_id}
|
270
|
+
)
|
250
271
|
|
251
272
|
|
252
273
|
def send_assistant_function_outputs(outputs, thread_id, run_id):
|
253
|
-
return _api_call(
|
274
|
+
return _api_call(
|
275
|
+
httpx.post,
|
276
|
+
"/transformers/send-assistant-function-outputs/",
|
277
|
+
json={"outputs": outputs, "thread_id": thread_id, "run_id": run_id},
|
278
|
+
)
|
254
279
|
|
255
280
|
|
256
281
|
def poll_extract(url, page_type):
|
@@ -4,9 +4,11 @@ from parsagon.create import create_program
|
|
4
4
|
from parsagon.executor import Executor
|
5
5
|
from parsagon.print import assistant_print, assistant_spinner, browser_print, error_print
|
6
6
|
from rich.prompt import Prompt
|
7
|
+
from parsagon.runs import run, batch_runs
|
7
8
|
|
8
9
|
|
9
|
-
def assist(
|
10
|
+
def assist(verbose=False):
|
11
|
+
task = Prompt.ask("Type what do you want to do")
|
10
12
|
with assistant_spinner():
|
11
13
|
response = send_assistant_message(task)
|
12
14
|
while True:
|
@@ -35,7 +37,16 @@ def assist(task, headless, infer):
|
|
35
37
|
output["output"] = html
|
36
38
|
outputs.append(output)
|
37
39
|
elif name == "create_program":
|
38
|
-
result = create_program(args["description"]
|
40
|
+
result = create_program(args["description"])
|
41
|
+
output["output"] = json.dumps(result)
|
42
|
+
outputs.append(output)
|
43
|
+
elif name == "run_program":
|
44
|
+
result = run(**args)
|
45
|
+
output["output"] = json.dumps(result)
|
46
|
+
outputs.append(output)
|
47
|
+
elif name == "batch_runs":
|
48
|
+
batch_name = input("Please enter a name for the batch run (for saving of intermediate results): ")
|
49
|
+
result = batch_runs(batch_name, **args)
|
39
50
|
output["output"] = json.dumps(result)
|
40
51
|
outputs.append(output)
|
41
52
|
with assistant_spinner():
|
@@ -6,7 +6,7 @@ from parsagon.secrets import extract_secrets
|
|
6
6
|
from rich.prompt import Prompt
|
7
7
|
|
8
8
|
|
9
|
-
def create_program(task, headless=False, infer=False):
|
9
|
+
def create_program(task, headless=False, infer=False, undetected=False):
|
10
10
|
assistant_print("Creating a program based on your specifications...")
|
11
11
|
task, secrets = extract_secrets(task)
|
12
12
|
program_sketches = get_program_sketches(task)
|
@@ -28,7 +28,7 @@ def create_program(task, headless=False, infer=False):
|
|
28
28
|
abridged_program += f"\n\noutput = func({args})\n" # Make the program runnable
|
29
29
|
|
30
30
|
# Execute the abridged program to gather examples
|
31
|
-
executor = Executor(headless=headless, infer=infer)
|
31
|
+
executor = Executor(headless=headless, infer=infer, use_uc=undetected)
|
32
32
|
executor.execute(abridged_program)
|
33
33
|
|
34
34
|
# The user must select a name
|
@@ -0,0 +1,300 @@
|
|
1
|
+
import argparse
|
2
|
+
import json
|
3
|
+
import logging.config
|
4
|
+
import time
|
5
|
+
|
6
|
+
from rich.console import Console
|
7
|
+
from rich.prompt import Prompt
|
8
|
+
|
9
|
+
from parsagon.api import (
|
10
|
+
delete_pipeline,
|
11
|
+
add_examples_to_custom_function,
|
12
|
+
get_pipeline,
|
13
|
+
get_pipelines,
|
14
|
+
poll_extract
|
15
|
+
)
|
16
|
+
from parsagon.assistant import assist
|
17
|
+
from parsagon.create import create_program
|
18
|
+
from parsagon.exceptions import ParsagonException
|
19
|
+
from parsagon.executor import Executor, custom_functions_to_descriptions
|
20
|
+
from parsagon.runs import run
|
21
|
+
from parsagon.settings import get_api_key, save_setting, configure_logging
|
22
|
+
|
23
|
+
console = Console()
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
def get_args(argv):
|
28
|
+
parser = argparse.ArgumentParser(
|
29
|
+
prog="parsagon", description="Scrapes and interacts with web pages based on natural language.", add_help=False
|
30
|
+
)
|
31
|
+
parser.add_argument("-v", "--verbose", action="store_true", help="run the task in verbose mode")
|
32
|
+
subparsers = parser.add_subparsers()
|
33
|
+
|
34
|
+
# Create
|
35
|
+
parser_create = subparsers.add_parser("create", description="Creates a program.")
|
36
|
+
parser_create.add_argument(
|
37
|
+
"--headless",
|
38
|
+
action="store_true",
|
39
|
+
help="run the browser in headless mode",
|
40
|
+
)
|
41
|
+
parser_create.add_argument(
|
42
|
+
"--infer",
|
43
|
+
action="store_true",
|
44
|
+
help="let Parsagon infer all elements to be scraped",
|
45
|
+
)
|
46
|
+
parser_create.add_argument(
|
47
|
+
"--undetected",
|
48
|
+
action="store_true",
|
49
|
+
help="run in undetected mode",
|
50
|
+
)
|
51
|
+
parser_create.set_defaults(func=create_cli)
|
52
|
+
|
53
|
+
# Detail
|
54
|
+
parser_detail = subparsers.add_parser(
|
55
|
+
"detail",
|
56
|
+
description="Outputs details of a created program.",
|
57
|
+
)
|
58
|
+
parser_detail.add_argument(
|
59
|
+
"-p",
|
60
|
+
"--program",
|
61
|
+
dest="program_name",
|
62
|
+
type=str,
|
63
|
+
help="the name of the program",
|
64
|
+
)
|
65
|
+
parser_detail.set_defaults(func=detail)
|
66
|
+
|
67
|
+
# Update
|
68
|
+
parser_update = subparsers.add_parser(
|
69
|
+
"update",
|
70
|
+
description="Updates a created program.",
|
71
|
+
)
|
72
|
+
parser_update.add_argument(
|
73
|
+
"program_name",
|
74
|
+
type=str,
|
75
|
+
help="the name of the program to update",
|
76
|
+
)
|
77
|
+
parser_update.add_argument(
|
78
|
+
"--variables",
|
79
|
+
type=json.loads,
|
80
|
+
default="{}",
|
81
|
+
help="a JSON object mapping variables to values",
|
82
|
+
)
|
83
|
+
parser_update.add_argument(
|
84
|
+
"--headless",
|
85
|
+
action="store_true",
|
86
|
+
help="run the browser in headless mode",
|
87
|
+
)
|
88
|
+
parser_update.add_argument(
|
89
|
+
"--infer",
|
90
|
+
action="store_true",
|
91
|
+
help="let Parsagon infer all elements to be scraped",
|
92
|
+
)
|
93
|
+
parser_update.add_argument(
|
94
|
+
"--replace",
|
95
|
+
action="store_true",
|
96
|
+
help="remove old example data while updating the program",
|
97
|
+
)
|
98
|
+
parser_update.set_defaults(func=update)
|
99
|
+
|
100
|
+
# Run
|
101
|
+
parser_run = subparsers.add_parser(
|
102
|
+
"run",
|
103
|
+
description="Runs a created program.",
|
104
|
+
)
|
105
|
+
parser_run.add_argument(
|
106
|
+
"program_name",
|
107
|
+
type=str,
|
108
|
+
help="the name of the program to run",
|
109
|
+
)
|
110
|
+
parser_run.add_argument(
|
111
|
+
"--variables",
|
112
|
+
type=json.loads,
|
113
|
+
default="{}",
|
114
|
+
help="a JSON object mapping variables to values",
|
115
|
+
)
|
116
|
+
parser_run.add_argument(
|
117
|
+
"--headless",
|
118
|
+
action="store_true",
|
119
|
+
help="run the browser in headless mode",
|
120
|
+
)
|
121
|
+
parser_run.add_argument(
|
122
|
+
"--remote",
|
123
|
+
action="store_true",
|
124
|
+
help="run the program in the cloud",
|
125
|
+
)
|
126
|
+
parser_run.add_argument(
|
127
|
+
"--output_log",
|
128
|
+
action="store_true",
|
129
|
+
help="output log data from the run",
|
130
|
+
)
|
131
|
+
parser_run.add_argument(
|
132
|
+
"--output_file",
|
133
|
+
type=str,
|
134
|
+
help="write the data to the given file path",
|
135
|
+
)
|
136
|
+
parser_run.add_argument(
|
137
|
+
"--undetected",
|
138
|
+
action="store_true",
|
139
|
+
help="run in undetected mode",
|
140
|
+
)
|
141
|
+
parser_run.set_defaults(func=run)
|
142
|
+
|
143
|
+
# Delete
|
144
|
+
parser_delete = subparsers.add_parser(
|
145
|
+
"delete",
|
146
|
+
description="Deletes a program.",
|
147
|
+
)
|
148
|
+
parser_delete.add_argument(
|
149
|
+
"program_name",
|
150
|
+
type=str,
|
151
|
+
help="the name of the program to run",
|
152
|
+
)
|
153
|
+
parser_delete.add_argument(
|
154
|
+
"-y", "--yes", dest="confirm_with_user", action="store_false", help="auto-confirm option"
|
155
|
+
)
|
156
|
+
parser_delete.set_defaults(func=delete)
|
157
|
+
|
158
|
+
# Setup
|
159
|
+
parser_setup = subparsers.add_parser(
|
160
|
+
"setup",
|
161
|
+
description="Interactively sets up Parsagon with an API key.",
|
162
|
+
)
|
163
|
+
parser_setup.set_defaults(func=setup)
|
164
|
+
|
165
|
+
# Help
|
166
|
+
parser_help = subparsers.add_parser(
|
167
|
+
"help",
|
168
|
+
description="Shows help.",
|
169
|
+
)
|
170
|
+
parser_help.set_defaults(func=help, parser=parser)
|
171
|
+
|
172
|
+
args = parser.parse_args(argv)
|
173
|
+
kwargs = vars(args)
|
174
|
+
return kwargs, parser
|
175
|
+
|
176
|
+
|
177
|
+
def main(argv=None):
|
178
|
+
kwargs, parser = get_args(argv)
|
179
|
+
func = kwargs.pop("func", None)
|
180
|
+
if func is None:
|
181
|
+
func = assist
|
182
|
+
verbose = kwargs["verbose"]
|
183
|
+
configure_logging(verbose)
|
184
|
+
|
185
|
+
try:
|
186
|
+
return func(**kwargs)
|
187
|
+
except ParsagonException as e:
|
188
|
+
error_message = "Error:\n" + e.to_string(verbose)
|
189
|
+
logger.error(error_message)
|
190
|
+
|
191
|
+
|
192
|
+
def create_cli(headless=False, infer=False, undetected=False, verbose=False):
|
193
|
+
task = Prompt.ask("Enter a detailed scraping task")
|
194
|
+
create_program(task, headless=headless, infer=infer, undetected=undetected)
|
195
|
+
|
196
|
+
|
197
|
+
def update(program_name, variables={}, headless=False, infer=False, replace=False, verbose=False):
|
198
|
+
configure_logging(verbose)
|
199
|
+
|
200
|
+
pipeline = get_pipeline(program_name)
|
201
|
+
abridged_program = pipeline["abridged_sketch"]
|
202
|
+
# Make the program runnable
|
203
|
+
variables_str = ", ".join(f"{k}={repr(v)}" for k, v in variables.items())
|
204
|
+
abridged_program += f"\n\noutput = func({variables_str})\n"
|
205
|
+
|
206
|
+
# Execute the abridged program to gather examples
|
207
|
+
executor = Executor(headless=headless, infer=infer)
|
208
|
+
executor.execute(abridged_program)
|
209
|
+
|
210
|
+
while True:
|
211
|
+
program_name_input = input(
|
212
|
+
f'Type "{program_name}" to update this program, or press enter without typing a name to CANCEL: '
|
213
|
+
)
|
214
|
+
if not program_name_input:
|
215
|
+
logger.info("Canceled update.")
|
216
|
+
return
|
217
|
+
if program_name_input == program_name:
|
218
|
+
break
|
219
|
+
|
220
|
+
pipeline_id = pipeline["id"]
|
221
|
+
try:
|
222
|
+
for call_id, custom_function in executor.custom_functions.items():
|
223
|
+
debug_suffix = f" ({custom_function.name})"
|
224
|
+
description = custom_functions_to_descriptions.get(custom_function.name)
|
225
|
+
description = " to " + description if description else ""
|
226
|
+
if verbose:
|
227
|
+
description += debug_suffix
|
228
|
+
logger.info(f" Saving function{description}...")
|
229
|
+
add_examples_to_custom_function(pipeline_id, call_id, custom_function, replace)
|
230
|
+
logger.info(f"Saved.")
|
231
|
+
except Exception as e:
|
232
|
+
logger.error(f"An error occurred while saving the program. The program was not updated.")
|
233
|
+
|
234
|
+
|
235
|
+
def detail(program_name=None, verbose=False):
|
236
|
+
if program_name:
|
237
|
+
data = [get_pipeline(program_name)]
|
238
|
+
else:
|
239
|
+
data = get_pipelines()
|
240
|
+
for pipeline in data:
|
241
|
+
print(
|
242
|
+
f"Program: {pipeline['name']}\nDescription: {pipeline['description']}\nVariables: {pipeline['variables']}\n"
|
243
|
+
)
|
244
|
+
|
245
|
+
|
246
|
+
def delete(program_name, verbose=False, confirm_with_user=False):
|
247
|
+
if (
|
248
|
+
confirm_with_user
|
249
|
+
and input(f"Are you sure you want to delete program with name {program_name}? (y/N) ").lower().strip() != "y"
|
250
|
+
):
|
251
|
+
logger.error("Cancelled operation.")
|
252
|
+
return
|
253
|
+
logger.info("Preparing to delete program %s", program_name)
|
254
|
+
pipeline_id = get_pipeline(program_name)["id"]
|
255
|
+
logger.info("Deleting program...")
|
256
|
+
delete_pipeline(pipeline_id)
|
257
|
+
logger.info("Done.")
|
258
|
+
|
259
|
+
|
260
|
+
def setup(verbose=False):
|
261
|
+
try:
|
262
|
+
old_api_key = get_api_key()
|
263
|
+
except ParsagonException:
|
264
|
+
old_api_key = None
|
265
|
+
try:
|
266
|
+
save_setting("api_key", None)
|
267
|
+
get_api_key(interactive=True)
|
268
|
+
except KeyboardInterrupt:
|
269
|
+
save_setting("api_key", old_api_key)
|
270
|
+
logger.error("\nCancelled operation.")
|
271
|
+
return
|
272
|
+
logger.info("Setup complete.")
|
273
|
+
|
274
|
+
|
275
|
+
def help(parser, verbose):
|
276
|
+
parser.print_help()
|
277
|
+
|
278
|
+
|
279
|
+
def _get_data(url, page_type, timeout):
|
280
|
+
start_time = time.time()
|
281
|
+
with console.status("Extracting data...") as status:
|
282
|
+
while time.time() - start_time <= timeout:
|
283
|
+
result = poll_extract(url, page_type)
|
284
|
+
if result["done"]:
|
285
|
+
return result["result"]
|
286
|
+
time.sleep(15)
|
287
|
+
logger.info("No data found")
|
288
|
+
return None
|
289
|
+
|
290
|
+
|
291
|
+
def get_product(url, timeout=300):
|
292
|
+
return _get_data(url, "PRODUCT_DETAIL", timeout)
|
293
|
+
|
294
|
+
|
295
|
+
def get_review_article(url, timeout=300):
|
296
|
+
return _get_data(url, "REVIEW_ARTICLE_DETAIL", timeout)
|
297
|
+
|
298
|
+
|
299
|
+
def get_article_list(url, timeout=300):
|
300
|
+
return _get_data(url, "ARTICLE_LIST", timeout)
|
@@ -0,0 +1,227 @@
|
|
1
|
+
import datetime
|
2
|
+
import json
|
3
|
+
import logging.config
|
4
|
+
import time
|
5
|
+
import traceback
|
6
|
+
|
7
|
+
import psutil
|
8
|
+
from rich.console import Console
|
9
|
+
from rich.progress import Progress
|
10
|
+
from rich.prompt import Prompt
|
11
|
+
|
12
|
+
from parsagon.api import (
|
13
|
+
create_pipeline_run,
|
14
|
+
update_pipeline_run,
|
15
|
+
get_pipeline,
|
16
|
+
get_pipeline_code,
|
17
|
+
get_run,
|
18
|
+
)
|
19
|
+
from parsagon.exceptions import ParsagonException, RunFailedException
|
20
|
+
from parsagon.settings import get_api_key
|
21
|
+
|
22
|
+
console = Console()
|
23
|
+
logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
def run(program_name, variables={}, headless=False, remote=False, output_log=False, output_file=None, undetected=False, verbose=False):
|
27
|
+
"""
|
28
|
+
Executes pipeline code
|
29
|
+
"""
|
30
|
+
if headless and remote:
|
31
|
+
raise ParsagonException("Cannot run a program remotely in headless mode")
|
32
|
+
|
33
|
+
if not isinstance(variables, dict):
|
34
|
+
raise ParsagonException("Variables must be a dictionary")
|
35
|
+
|
36
|
+
logger.info("Preparing to run program %s", program_name)
|
37
|
+
pipeline_id = get_pipeline(program_name)["id"]
|
38
|
+
|
39
|
+
if remote:
|
40
|
+
result = create_pipeline_run(pipeline_id, variables, False)
|
41
|
+
with console.status("Program running remotely...") as status:
|
42
|
+
while True:
|
43
|
+
run = get_run(result["id"])
|
44
|
+
status = run["status"]
|
45
|
+
|
46
|
+
if output_log and status in ("FINISHED", "ERROR"):
|
47
|
+
result = {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
48
|
+
if output_file:
|
49
|
+
with open(output_file, "w") as f:
|
50
|
+
json.dump(result, f, indent=4)
|
51
|
+
return
|
52
|
+
else:
|
53
|
+
return result
|
54
|
+
|
55
|
+
if status == "FINISHED":
|
56
|
+
if verbose:
|
57
|
+
logger.info(run["log"])
|
58
|
+
for warning in run["warnings"]:
|
59
|
+
logger.warning(warning)
|
60
|
+
logger.info("Program finished running.")
|
61
|
+
result = run["output"]
|
62
|
+
if output_file:
|
63
|
+
with open(output_file, "w") as f:
|
64
|
+
json.dump(result, f, indent=4)
|
65
|
+
return
|
66
|
+
else:
|
67
|
+
return result
|
68
|
+
elif status == "ERROR":
|
69
|
+
raise ParsagonException(f"Program failed to run: {run['error']}")
|
70
|
+
elif status == "CANCELED":
|
71
|
+
raise ParsagonException("Program execution was canceled")
|
72
|
+
|
73
|
+
time.sleep(5)
|
74
|
+
|
75
|
+
run = create_pipeline_run(pipeline_id, variables, True)
|
76
|
+
code = get_pipeline_code(program_name, variables, headless, undetected)["code"]
|
77
|
+
start_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
78
|
+
run_data = {"start_time": start_time}
|
79
|
+
|
80
|
+
logger.info("Running program...")
|
81
|
+
globals_locals = {"PARSAGON_API_KEY": get_api_key()}
|
82
|
+
try:
|
83
|
+
exec(code, globals_locals, globals_locals)
|
84
|
+
run_data["status"] = "FINISHED"
|
85
|
+
except:
|
86
|
+
run_data["status"] = "ERROR"
|
87
|
+
run_data["error"] = str(traceback.format_exc())
|
88
|
+
if not output_log:
|
89
|
+
raise
|
90
|
+
finally:
|
91
|
+
end_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
92
|
+
run_data["end_time"] = end_time
|
93
|
+
if "driver" in globals_locals:
|
94
|
+
globals_locals["driver"].quit()
|
95
|
+
if "display" in globals_locals:
|
96
|
+
globals_locals["display"].stop()
|
97
|
+
if "parsagon_log" in globals_locals:
|
98
|
+
run_data["log"] = "\n".join(globals_locals["parsagon_log"])
|
99
|
+
logger.info(run_data["log"])
|
100
|
+
if "parsagon_warnings" in globals_locals:
|
101
|
+
run_data["warnings"] = globals_locals["parsagon_warnings"]
|
102
|
+
for proc in psutil.process_iter():
|
103
|
+
try:
|
104
|
+
if proc.name() == "chromedriver":
|
105
|
+
proc.kill()
|
106
|
+
except psutil.NoSuchProcess:
|
107
|
+
continue
|
108
|
+
run = update_pipeline_run(run["id"], run_data)
|
109
|
+
logger.info("Done.")
|
110
|
+
|
111
|
+
result = globals_locals["output"]
|
112
|
+
if output_log:
|
113
|
+
if "error" not in run_data:
|
114
|
+
run["output"] = globals_locals["output"]
|
115
|
+
result = {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
116
|
+
if output_file:
|
117
|
+
with open(output_file, "w") as f:
|
118
|
+
json.dump(result, f, indent=4)
|
119
|
+
return
|
120
|
+
else:
|
121
|
+
return result
|
122
|
+
|
123
|
+
|
124
|
+
def batch_runs(
|
125
|
+
batch_name,
|
126
|
+
program_name,
|
127
|
+
runs,
|
128
|
+
headless=False,
|
129
|
+
ignore_errors=False,
|
130
|
+
error_value=None,
|
131
|
+
rerun_warnings=False,
|
132
|
+
rerun_warning_types=[],
|
133
|
+
rerun_errors=False,
|
134
|
+
verbose=False,
|
135
|
+
):
|
136
|
+
# Validate runs
|
137
|
+
if not all(isinstance(run_, dict) for run_ in runs):
|
138
|
+
raise ParsagonException("Runs must be a list of dictionaries")
|
139
|
+
|
140
|
+
save_file = f"{batch_name}.json"
|
141
|
+
try:
|
142
|
+
with open(save_file) as f:
|
143
|
+
outputs = json.load(f)
|
144
|
+
except FileNotFoundError:
|
145
|
+
outputs = []
|
146
|
+
metadata_file = f"{batch_name}_metadata.json"
|
147
|
+
try:
|
148
|
+
with open(metadata_file) as f:
|
149
|
+
metadata = json.load(f)
|
150
|
+
except FileNotFoundError:
|
151
|
+
metadata = []
|
152
|
+
|
153
|
+
num_initial_results = len(outputs)
|
154
|
+
error = None
|
155
|
+
variables = None
|
156
|
+
try:
|
157
|
+
default_desc = f'Running program "{program_name}"'
|
158
|
+
with Progress() as progress:
|
159
|
+
task = progress.add_task(default_desc, total=len(runs))
|
160
|
+
for i, variables in progress.track(enumerate(runs), task_id=task):
|
161
|
+
if i < num_initial_results:
|
162
|
+
if rerun_errors and metadata[i]["status"] == "ERROR":
|
163
|
+
pass
|
164
|
+
elif rerun_warnings and metadata[i]["warnings"]:
|
165
|
+
if not rerun_warning_types or any(
|
166
|
+
warning["type"] in rerun_warning_types for warning in metadata[i]["warnings"]
|
167
|
+
):
|
168
|
+
pass
|
169
|
+
else:
|
170
|
+
continue
|
171
|
+
else:
|
172
|
+
continue
|
173
|
+
for j in range(3):
|
174
|
+
result = run(program_name, variables, headless, output_log=True)
|
175
|
+
if result["status"] != "ERROR":
|
176
|
+
output = result.pop("output")
|
177
|
+
if i < num_initial_results:
|
178
|
+
outputs[i] = output
|
179
|
+
metadata[i] = result
|
180
|
+
else:
|
181
|
+
outputs.append(output)
|
182
|
+
metadata.append(result)
|
183
|
+
break
|
184
|
+
else:
|
185
|
+
error = result["error"].strip().split("\n")[-1]
|
186
|
+
if j < 2:
|
187
|
+
progress.update(
|
188
|
+
task,
|
189
|
+
description=f"An error occurred: {error} - Waiting 60s before retrying (Attempt {j+2}/3)",
|
190
|
+
)
|
191
|
+
time.sleep(60)
|
192
|
+
progress.update(task, description=default_desc)
|
193
|
+
error = None
|
194
|
+
continue
|
195
|
+
else:
|
196
|
+
if ignore_errors:
|
197
|
+
error = None
|
198
|
+
if i < num_initial_results:
|
199
|
+
outputs[i] = error_value
|
200
|
+
else:
|
201
|
+
outputs.append(error_value)
|
202
|
+
break
|
203
|
+
else:
|
204
|
+
raise RunFailedException
|
205
|
+
except RunFailedException:
|
206
|
+
pass
|
207
|
+
except Exception as e:
|
208
|
+
error = repr(e)
|
209
|
+
finally:
|
210
|
+
if error:
|
211
|
+
logger.error(
|
212
|
+
f"Unresolvable error occurred on run with variables {variables}: {error} - Data has been saved to {save_file}. Rerun your command to resume."
|
213
|
+
)
|
214
|
+
with open(save_file, "w") as f:
|
215
|
+
json.dump(outputs, f)
|
216
|
+
with open(metadata_file, "w") as f:
|
217
|
+
json.dump(metadata, f)
|
218
|
+
num_warnings = 0
|
219
|
+
num_runs_with_warnings = 0
|
220
|
+
for m in metadata:
|
221
|
+
if m["warnings"]:
|
222
|
+
num_warnings += len(m["warnings"])
|
223
|
+
num_runs_with_warnings += 1
|
224
|
+
logger.info(
|
225
|
+
f"\nSummary: {len(outputs)} runs made; {num_warnings} warnings encountered across {num_runs_with_warnings} runs. See {metadata_file} for logs.\n"
|
226
|
+
)
|
227
|
+
return None if error else outputs
|
@@ -3,6 +3,7 @@ import logging
|
|
3
3
|
import sys
|
4
4
|
from os import environ
|
5
5
|
from pathlib import Path
|
6
|
+
import logging.config
|
6
7
|
|
7
8
|
from parsagon.exceptions import ParsagonException
|
8
9
|
|
@@ -114,3 +115,7 @@ def get_logging_config(log_level="INFO"):
|
|
114
115
|
},
|
115
116
|
},
|
116
117
|
}
|
118
|
+
|
119
|
+
|
120
|
+
def configure_logging(verbose):
|
121
|
+
logging.config.dictConfig(get_logging_config("DEBUG" if verbose else "INFO"))
|
@@ -53,7 +53,7 @@ def mock_httpx_method_func(*args, **kwargs):
|
|
53
53
|
},
|
54
54
|
)
|
55
55
|
|
56
|
-
if match := re.search(r"/pipelines/name/(.+)
|
56
|
+
if match := re.search(r"/pipelines/name/(.+)/", url):
|
57
57
|
assert method == "get"
|
58
58
|
pipeline_name = match.group(1)
|
59
59
|
if pipeline_name == not_found_pipeline_name:
|
@@ -1,23 +1,13 @@
|
|
1
1
|
import pytest
|
2
2
|
|
3
|
-
from parsagon import
|
4
|
-
from parsagon.tests.cli_mocks import call_cli
|
3
|
+
from parsagon.main import main
|
5
4
|
|
6
5
|
|
7
6
|
def test_headless_remote_run_invalid(mocker, debug_logs):
|
8
7
|
"""
|
9
8
|
Tests that we are unable to run a program in headless mode when the environment is remote, and that this is logged to the user.
|
10
9
|
"""
|
11
|
-
|
12
|
-
mocker,
|
13
|
-
{
|
14
|
-
"func": run,
|
15
|
-
"program_name": "test_program",
|
16
|
-
"headless": True,
|
17
|
-
"remote": True,
|
18
|
-
"verbose": False,
|
19
|
-
},
|
20
|
-
)
|
10
|
+
main(["run", "test_program", "--headless", "--remote"])
|
21
11
|
debug_logs_lower = debug_logs.text.lower()
|
22
12
|
assert "error" in debug_logs_lower
|
23
13
|
assert "headless" in debug_logs_lower
|
@@ -6,7 +6,6 @@ import pytest
|
|
6
6
|
from parsagon import delete, run
|
7
7
|
from parsagon.main import main
|
8
8
|
from parsagon.tests.api_mocks import install_api_mocks, not_found_pipeline_name
|
9
|
-
from parsagon.tests.cli_mocks import call_cli
|
10
9
|
|
11
10
|
|
12
11
|
def test_pipeline_delete(mocker):
|
@@ -18,24 +17,10 @@ def test_pipeline_not_found(mocker, debug_logs):
|
|
18
17
|
install_api_mocks(mocker, {"code_to_return": 'raise Exception("Should not exec this code if pipeline not found.")'})
|
19
18
|
|
20
19
|
# On delete
|
21
|
-
|
22
|
-
mocker,
|
23
|
-
{
|
24
|
-
"func": delete,
|
25
|
-
"program_name": not_found_pipeline_name,
|
26
|
-
"verbose": False,
|
27
|
-
},
|
28
|
-
)
|
20
|
+
main(["delete", not_found_pipeline_name, "-y"])
|
29
21
|
assert f"A program with name {not_found_pipeline_name} does not exist." in debug_logs.text
|
30
22
|
debug_logs.clear()
|
31
23
|
|
32
24
|
# On attempted run
|
33
|
-
|
34
|
-
mocker,
|
35
|
-
{
|
36
|
-
"func": run,
|
37
|
-
"program_name": not_found_pipeline_name,
|
38
|
-
"verbose": False,
|
39
|
-
},
|
40
|
-
)
|
25
|
+
main(["run", not_found_pipeline_name])
|
41
26
|
assert f"A program with name {not_found_pipeline_name} does not exist." in debug_logs.text
|
@@ -11,6 +11,7 @@ src/parsagon/executor.py
|
|
11
11
|
src/parsagon/highlights.js
|
12
12
|
src/parsagon/main.py
|
13
13
|
src/parsagon/print.py
|
14
|
+
src/parsagon/runs.py
|
14
15
|
src/parsagon/secrets.py
|
15
16
|
src/parsagon/settings.py
|
16
17
|
src/parsagon.egg-info/PKG-INFO
|
@@ -21,7 +22,6 @@ src/parsagon.egg-info/requires.txt
|
|
21
22
|
src/parsagon.egg-info/top_level.txt
|
22
23
|
src/parsagon/tests/__init__.py
|
23
24
|
src/parsagon/tests/api_mocks.py
|
24
|
-
src/parsagon/tests/cli_mocks.py
|
25
25
|
src/parsagon/tests/conftest.py
|
26
26
|
src/parsagon/tests/test_executor.py
|
27
27
|
src/parsagon/tests/test_invalid_args.py
|
@@ -1 +0,0 @@
|
|
1
|
-
from parsagon.main import create, update, detail, run, batch_runs, delete, get_product, get_review_article, get_article_list
|
@@ -1,454 +0,0 @@
|
|
1
|
-
import argparse
|
2
|
-
import datetime
|
3
|
-
import json
|
4
|
-
import logging
|
5
|
-
import logging.config
|
6
|
-
import psutil
|
7
|
-
import time
|
8
|
-
import traceback
|
9
|
-
|
10
|
-
from rich.console import Console
|
11
|
-
from rich.progress import Progress
|
12
|
-
from rich.prompt import Prompt
|
13
|
-
|
14
|
-
from parsagon.api import (
|
15
|
-
get_program_sketches,
|
16
|
-
create_pipeline,
|
17
|
-
delete_pipeline,
|
18
|
-
add_examples_to_custom_function,
|
19
|
-
create_pipeline_run,
|
20
|
-
update_pipeline_run,
|
21
|
-
get_pipeline,
|
22
|
-
get_pipelines,
|
23
|
-
get_pipeline_code,
|
24
|
-
get_run,
|
25
|
-
poll_extract,
|
26
|
-
)
|
27
|
-
from parsagon.assistant import assist
|
28
|
-
from parsagon.create import create_program
|
29
|
-
from parsagon.exceptions import ParsagonException, RunFailedException
|
30
|
-
from parsagon.executor import Executor, custom_functions_to_descriptions
|
31
|
-
from parsagon.settings import get_api_key, get_settings, clear_settings, save_setting, get_logging_config
|
32
|
-
|
33
|
-
console = Console()
|
34
|
-
logger = logging.getLogger(__name__)
|
35
|
-
|
36
|
-
|
37
|
-
def configure_logging(verbose):
|
38
|
-
logging.config.dictConfig(get_logging_config("DEBUG" if verbose else "INFO"))
|
39
|
-
|
40
|
-
|
41
|
-
def get_args():
|
42
|
-
parser = argparse.ArgumentParser(
|
43
|
-
prog="parsagon", description="Scrapes and interacts with web pages based on natural language.", add_help=False
|
44
|
-
)
|
45
|
-
parser.add_argument("-v", "--verbose", action="store_true", help="run the task in verbose mode")
|
46
|
-
subparsers = parser.add_subparsers()
|
47
|
-
|
48
|
-
# Create
|
49
|
-
parser_create = subparsers.add_parser("create", description="Creates a program.")
|
50
|
-
parser_create.add_argument(
|
51
|
-
"--headless",
|
52
|
-
action="store_true",
|
53
|
-
help="run the browser in headless mode",
|
54
|
-
)
|
55
|
-
parser_create.add_argument(
|
56
|
-
"--infer",
|
57
|
-
action="store_true",
|
58
|
-
help="let Parsagon infer all elements to be scraped",
|
59
|
-
)
|
60
|
-
parser_create.add_argument(
|
61
|
-
"--no_assistant",
|
62
|
-
action="store_true",
|
63
|
-
help="disable the Parsagon assistant",
|
64
|
-
)
|
65
|
-
parser_create.set_defaults(func=create)
|
66
|
-
|
67
|
-
# Detail
|
68
|
-
parser_detail = subparsers.add_parser(
|
69
|
-
"detail",
|
70
|
-
description="Outputs details of a created program.",
|
71
|
-
)
|
72
|
-
parser_detail.add_argument(
|
73
|
-
"-p",
|
74
|
-
"--program",
|
75
|
-
dest="program_name",
|
76
|
-
type=str,
|
77
|
-
help="the name of the program",
|
78
|
-
)
|
79
|
-
parser_detail.set_defaults(func=detail)
|
80
|
-
|
81
|
-
# Update
|
82
|
-
parser_update = subparsers.add_parser(
|
83
|
-
"update",
|
84
|
-
description="Updates a created program.",
|
85
|
-
)
|
86
|
-
parser_update.add_argument(
|
87
|
-
"program_name",
|
88
|
-
type=str,
|
89
|
-
help="the name of the program to update",
|
90
|
-
)
|
91
|
-
parser_update.add_argument(
|
92
|
-
"--variables",
|
93
|
-
type=json.loads,
|
94
|
-
default="{}",
|
95
|
-
help="a JSON object mapping variables to values",
|
96
|
-
)
|
97
|
-
parser_update.add_argument(
|
98
|
-
"--headless",
|
99
|
-
action="store_true",
|
100
|
-
help="run the browser in headless mode",
|
101
|
-
)
|
102
|
-
parser_update.add_argument(
|
103
|
-
"--infer",
|
104
|
-
action="store_true",
|
105
|
-
help="let Parsagon infer all elements to be scraped",
|
106
|
-
)
|
107
|
-
parser_update.add_argument(
|
108
|
-
"--replace",
|
109
|
-
action="store_true",
|
110
|
-
help="remove old example data while updating the program",
|
111
|
-
)
|
112
|
-
parser_update.set_defaults(func=update)
|
113
|
-
|
114
|
-
# Run
|
115
|
-
parser_run = subparsers.add_parser(
|
116
|
-
"run",
|
117
|
-
description="Runs a created program.",
|
118
|
-
)
|
119
|
-
parser_run.add_argument(
|
120
|
-
"program_name",
|
121
|
-
type=str,
|
122
|
-
help="the name of the program to run",
|
123
|
-
)
|
124
|
-
parser_run.add_argument(
|
125
|
-
"--variables",
|
126
|
-
type=json.loads,
|
127
|
-
default="{}",
|
128
|
-
help="a JSON object mapping variables to values",
|
129
|
-
)
|
130
|
-
parser_run.add_argument(
|
131
|
-
"--headless",
|
132
|
-
action="store_true",
|
133
|
-
help="run the browser in headless mode",
|
134
|
-
)
|
135
|
-
parser_run.add_argument(
|
136
|
-
"--remote",
|
137
|
-
action="store_true",
|
138
|
-
help="run the program in the cloud",
|
139
|
-
)
|
140
|
-
parser_run.add_argument(
|
141
|
-
"--output_log",
|
142
|
-
action="store_true",
|
143
|
-
help="output log data from the run",
|
144
|
-
)
|
145
|
-
parser_run.set_defaults(func=run)
|
146
|
-
|
147
|
-
# Delete
|
148
|
-
parser_delete = subparsers.add_parser(
|
149
|
-
"delete",
|
150
|
-
description="Deletes a program.",
|
151
|
-
)
|
152
|
-
parser_delete.add_argument(
|
153
|
-
"program_name",
|
154
|
-
type=str,
|
155
|
-
help="the name of the program to run",
|
156
|
-
)
|
157
|
-
parser_delete.add_argument(
|
158
|
-
"-y", "--yes", dest="confirm_with_user", action="store_false", help="auto-confirm option"
|
159
|
-
)
|
160
|
-
parser_delete.set_defaults(func=delete)
|
161
|
-
|
162
|
-
# Setup
|
163
|
-
parser_setup = subparsers.add_parser(
|
164
|
-
"setup",
|
165
|
-
description="Interactively sets up Parsagon with an API key.",
|
166
|
-
)
|
167
|
-
parser_setup.set_defaults(func=setup)
|
168
|
-
|
169
|
-
args = parser.parse_args()
|
170
|
-
kwargs = vars(args)
|
171
|
-
return kwargs, parser
|
172
|
-
|
173
|
-
|
174
|
-
def main():
|
175
|
-
kwargs, parser = get_args()
|
176
|
-
func = kwargs.pop("func")
|
177
|
-
verbose = kwargs["verbose"]
|
178
|
-
configure_logging(verbose)
|
179
|
-
|
180
|
-
if func:
|
181
|
-
try:
|
182
|
-
return func(**kwargs)
|
183
|
-
except ParsagonException as e:
|
184
|
-
error_message = "Error:\n" + e.to_string(verbose)
|
185
|
-
logger.error(error_message)
|
186
|
-
else:
|
187
|
-
parser.print_help()
|
188
|
-
|
189
|
-
|
190
|
-
def create(headless=False, infer=False, no_assistant=False, verbose=False):
|
191
|
-
task = Prompt.ask("Type what do you want to do")
|
192
|
-
if no_assistant:
|
193
|
-
create_program(task, headless=headless, infer=infer)
|
194
|
-
else:
|
195
|
-
assist(task, headless=headless, infer=infer)
|
196
|
-
|
197
|
-
|
198
|
-
def update(program_name, variables={}, headless=False, infer=False, replace=False, verbose=False):
|
199
|
-
configure_logging(verbose)
|
200
|
-
|
201
|
-
pipeline = get_pipeline(program_name)
|
202
|
-
abridged_program = pipeline["abridged_sketch"]
|
203
|
-
# Make the program runnable
|
204
|
-
variables_str = ", ".join(f"{k}={repr(v)}" for k, v in variables.items())
|
205
|
-
abridged_program += f"\n\noutput = func({variables_str})\n"
|
206
|
-
|
207
|
-
# Execute the abridged program to gather examples
|
208
|
-
executor = Executor(headless=headless, infer=infer)
|
209
|
-
executor.execute(abridged_program)
|
210
|
-
|
211
|
-
while True:
|
212
|
-
program_name_input = input(f"Type \"{program_name}\" to update this program, or press enter without typing a name to CANCEL: ")
|
213
|
-
if not program_name_input:
|
214
|
-
logger.info("Canceled update.")
|
215
|
-
return
|
216
|
-
if program_name_input == program_name:
|
217
|
-
break
|
218
|
-
|
219
|
-
pipeline_id = pipeline["id"]
|
220
|
-
try:
|
221
|
-
for call_id, custom_function in executor.custom_functions.items():
|
222
|
-
debug_suffix = f" ({custom_function.name})"
|
223
|
-
description = custom_functions_to_descriptions.get(custom_function.name)
|
224
|
-
description = " to " + description if description else ""
|
225
|
-
if verbose:
|
226
|
-
description += debug_suffix
|
227
|
-
logger.info(f" Saving function{description}...")
|
228
|
-
add_examples_to_custom_function(pipeline_id, call_id, custom_function, replace)
|
229
|
-
logger.info(f"Saved.")
|
230
|
-
except Exception as e:
|
231
|
-
logger.error(f"An error occurred while saving the program. The program was not updated.")
|
232
|
-
|
233
|
-
|
234
|
-
def detail(program_name=None, verbose=False):
|
235
|
-
if program_name:
|
236
|
-
data = [get_pipeline(program_name)]
|
237
|
-
else:
|
238
|
-
data = get_pipelines()
|
239
|
-
for pipeline in data:
|
240
|
-
print(
|
241
|
-
f"Program: {pipeline['name']}\nDescription: {pipeline['description']}\nVariables: {pipeline['variables']}\n"
|
242
|
-
)
|
243
|
-
|
244
|
-
|
245
|
-
def run(program_name, variables={}, headless=False, remote=False, output_log=False, verbose=False):
|
246
|
-
"""
|
247
|
-
Executes pipeline code
|
248
|
-
"""
|
249
|
-
if headless and remote:
|
250
|
-
raise ParsagonException("Cannot run a program remotely in headless mode")
|
251
|
-
|
252
|
-
logger.info("Preparing to run program %s", program_name)
|
253
|
-
pipeline_id = get_pipeline(program_name)["id"]
|
254
|
-
|
255
|
-
if remote:
|
256
|
-
result = create_pipeline_run(pipeline_id, variables, False)
|
257
|
-
with console.status("Program running remotely...") as status:
|
258
|
-
while True:
|
259
|
-
run = get_run(result["id"])
|
260
|
-
status = run["status"]
|
261
|
-
|
262
|
-
if output_log and status in ("FINISHED", "ERROR"):
|
263
|
-
return {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
264
|
-
|
265
|
-
if status == "FINISHED":
|
266
|
-
if verbose:
|
267
|
-
logger.info(run["log"])
|
268
|
-
for warning in run["warnings"]:
|
269
|
-
logger.warning(warning)
|
270
|
-
logger.info("Program finished running.")
|
271
|
-
return run["output"]
|
272
|
-
elif status == "ERROR":
|
273
|
-
raise ParsagonException(f"Program failed to run: {run['error']}")
|
274
|
-
elif status == "CANCELED":
|
275
|
-
raise ParsagonException("Program execution was canceled")
|
276
|
-
|
277
|
-
time.sleep(5)
|
278
|
-
|
279
|
-
run = create_pipeline_run(pipeline_id, variables, True)
|
280
|
-
code = get_pipeline_code(program_name, variables, headless)["code"]
|
281
|
-
start_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
282
|
-
run_data = {"start_time": start_time}
|
283
|
-
|
284
|
-
logger.info("Running program...")
|
285
|
-
globals_locals = {"PARSAGON_API_KEY": get_api_key()}
|
286
|
-
try:
|
287
|
-
exec(code, globals_locals, globals_locals)
|
288
|
-
run_data["status"] = "FINISHED"
|
289
|
-
except:
|
290
|
-
run_data["status"] = "ERROR"
|
291
|
-
run_data["error"] = str(traceback.format_exc())
|
292
|
-
if not output_log:
|
293
|
-
raise
|
294
|
-
finally:
|
295
|
-
end_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
296
|
-
run_data["end_time"] = end_time
|
297
|
-
if "driver" in globals_locals:
|
298
|
-
globals_locals["driver"].quit()
|
299
|
-
if "display" in globals_locals:
|
300
|
-
globals_locals["display"].stop()
|
301
|
-
if "parsagon_log" in globals_locals:
|
302
|
-
run_data["log"] = "\n".join(globals_locals["parsagon_log"])
|
303
|
-
logger.info(run_data["log"])
|
304
|
-
if "parsagon_warnings" in globals_locals:
|
305
|
-
run_data["warnings"] = globals_locals["parsagon_warnings"]
|
306
|
-
for proc in psutil.process_iter():
|
307
|
-
try:
|
308
|
-
if proc.name() == "chromedriver":
|
309
|
-
proc.kill()
|
310
|
-
except psutil.NoSuchProcess:
|
311
|
-
continue
|
312
|
-
run = update_pipeline_run(run["id"], run_data)
|
313
|
-
logger.info("Done.")
|
314
|
-
if output_log:
|
315
|
-
if "error" not in run_data:
|
316
|
-
run["output"] = globals_locals["output"]
|
317
|
-
return {k: v for k, v in run.items() if k in ("output", "status", "log", "warnings", "error")}
|
318
|
-
return globals_locals["output"]
|
319
|
-
|
320
|
-
|
321
|
-
def batch_runs(batch_name, program_name, runs, headless=False, ignore_errors=False, error_value=None, rerun_warnings=False, rerun_warning_types=[], rerun_errors=False, verbose=False):
|
322
|
-
save_file = f"{batch_name}.json"
|
323
|
-
try:
|
324
|
-
with open(save_file) as f:
|
325
|
-
outputs = json.load(f)
|
326
|
-
except FileNotFoundError:
|
327
|
-
outputs = []
|
328
|
-
metadata_file = f"{batch_name}_metadata.json"
|
329
|
-
try:
|
330
|
-
with open(metadata_file) as f:
|
331
|
-
metadata = json.load(f)
|
332
|
-
except FileNotFoundError:
|
333
|
-
metadata = []
|
334
|
-
|
335
|
-
num_initial_results = len(outputs)
|
336
|
-
error = None
|
337
|
-
variables = None
|
338
|
-
try:
|
339
|
-
default_desc = f'Running program "{program_name}"'
|
340
|
-
with Progress() as progress:
|
341
|
-
task = progress.add_task(default_desc, total=len(runs))
|
342
|
-
for i, variables in progress.track(enumerate(runs), task_id=task):
|
343
|
-
if i < num_initial_results:
|
344
|
-
if rerun_errors and metadata[i]["status"] == "ERROR":
|
345
|
-
pass
|
346
|
-
elif rerun_warnings and metadata[i]["warnings"]:
|
347
|
-
if not rerun_warning_types or any(warning["type"] in rerun_warning_types for warning in metadata[i]["warnings"]):
|
348
|
-
pass
|
349
|
-
else:
|
350
|
-
continue
|
351
|
-
else:
|
352
|
-
continue
|
353
|
-
for j in range(3):
|
354
|
-
result = run(program_name, variables, headless, output_log=True)
|
355
|
-
if result["status"] != "ERROR":
|
356
|
-
output = result.pop("output")
|
357
|
-
if i < num_initial_results:
|
358
|
-
outputs[i] = output
|
359
|
-
metadata[i] = result
|
360
|
-
else:
|
361
|
-
outputs.append(output)
|
362
|
-
metadata.append(result)
|
363
|
-
break
|
364
|
-
else:
|
365
|
-
error = result["error"].strip().split("\n")[-1]
|
366
|
-
if j < 2:
|
367
|
-
progress.update(task, description=f"An error occurred: {error} - Waiting 60s before retrying (Attempt {j+2}/3)")
|
368
|
-
time.sleep(60)
|
369
|
-
progress.update(task, description=default_desc)
|
370
|
-
error = None
|
371
|
-
continue
|
372
|
-
else:
|
373
|
-
if ignore_errors:
|
374
|
-
error = None
|
375
|
-
if i < num_initial_results:
|
376
|
-
outputs[i] = error_value
|
377
|
-
else:
|
378
|
-
outputs.append(error_value)
|
379
|
-
break
|
380
|
-
else:
|
381
|
-
raise RunFailedException
|
382
|
-
except RunFailedException:
|
383
|
-
pass
|
384
|
-
except Exception as e:
|
385
|
-
error = repr(e)
|
386
|
-
finally:
|
387
|
-
configure_logging(verbose)
|
388
|
-
if error:
|
389
|
-
logger.error(f"Unresolvable error occurred on run with variables {variables}: {error} - Data has been saved to {save_file}. Rerun your command to resume.")
|
390
|
-
with open(save_file, "w") as f:
|
391
|
-
json.dump(outputs, f)
|
392
|
-
with open(metadata_file, "w") as f:
|
393
|
-
json.dump(metadata, f)
|
394
|
-
num_warnings = 0
|
395
|
-
num_runs_with_warnings = 0
|
396
|
-
for m in metadata:
|
397
|
-
if m["warnings"]:
|
398
|
-
num_warnings += len(m["warnings"])
|
399
|
-
num_runs_with_warnings += 1
|
400
|
-
logger.info(f"\nSummary: {len(outputs)} runs made; {num_warnings} warnings encountered across {num_runs_with_warnings} runs. See {metadata_file} for logs.\n")
|
401
|
-
return None if error else outputs
|
402
|
-
|
403
|
-
|
404
|
-
def delete(program_name, verbose=False, confirm_with_user=False):
|
405
|
-
if (
|
406
|
-
confirm_with_user
|
407
|
-
and input(f"Are you sure you want to delete program with name {program_name}? (y/N) ").lower().strip() != "y"
|
408
|
-
):
|
409
|
-
logger.error("Cancelled operation.")
|
410
|
-
return
|
411
|
-
logger.info("Preparing to delete program %s", program_name)
|
412
|
-
pipeline_id = get_pipeline(program_name)["id"]
|
413
|
-
logger.info("Deleting program...")
|
414
|
-
delete_pipeline(pipeline_id)
|
415
|
-
logger.info("Done.")
|
416
|
-
|
417
|
-
|
418
|
-
def setup(verbose=False):
|
419
|
-
try:
|
420
|
-
old_api_key = get_api_key()
|
421
|
-
except ParsagonException:
|
422
|
-
old_api_key = None
|
423
|
-
try:
|
424
|
-
save_setting("api_key", None)
|
425
|
-
get_api_key(interactive=True)
|
426
|
-
except KeyboardInterrupt:
|
427
|
-
save_setting("api_key", old_api_key)
|
428
|
-
logger.error("\nCancelled operation.")
|
429
|
-
return
|
430
|
-
logger.info("Setup complete.")
|
431
|
-
|
432
|
-
|
433
|
-
def _get_data(url, page_type, timeout):
|
434
|
-
start_time = time.time()
|
435
|
-
with console.status("Extracting data...") as status:
|
436
|
-
while time.time() - start_time <= timeout:
|
437
|
-
result = poll_extract(url, page_type)
|
438
|
-
if result["done"]:
|
439
|
-
return result["result"]
|
440
|
-
time.sleep(15)
|
441
|
-
logger.info("No data found")
|
442
|
-
return None
|
443
|
-
|
444
|
-
|
445
|
-
def get_product(url, timeout=300):
|
446
|
-
return _get_data(url, "PRODUCT_DETAIL", timeout)
|
447
|
-
|
448
|
-
|
449
|
-
def get_review_article(url, timeout=300):
|
450
|
-
return _get_data(url, "REVIEW_ARTICLE_DETAIL", timeout)
|
451
|
-
|
452
|
-
|
453
|
-
def get_article_list(url, timeout=300):
|
454
|
-
return _get_data(url, "ARTICLE_LIST", timeout)
|
@@ -1,16 +0,0 @@
|
|
1
|
-
from parsagon.main import main
|
2
|
-
|
3
|
-
|
4
|
-
def call_cli(mocker, args):
|
5
|
-
"""
|
6
|
-
Uses the mocker to pretend that the args passed are coming from argparse, then calls the main function.
|
7
|
-
"""
|
8
|
-
|
9
|
-
mocker.patch(
|
10
|
-
"parsagon.main.get_args",
|
11
|
-
lambda: (
|
12
|
-
args,
|
13
|
-
None,
|
14
|
-
),
|
15
|
-
)
|
16
|
-
return main()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|