PyPI - itp-interface - Versions diffs - 1.0.0__py3-none-any.whl - Mend

itp-interface 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (485) hide show

itp_interface/pisa/src/main/python/legacy/create_finetune_tfrecords.py ADDED Viewed

@@ -0,0 +1,311 @@
+import argparse
+import os
+import random
+from pathlib import Path
+import numpy as np
+import tensorflow as tf
+from lm_dataformat import Reader
+from tqdm import tqdm
+from mesh_transformer_utils.tokenization import TokenizerWrapper
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Unrecognised arguments do not abort the run: they are collected by
    ``parse_known_args`` and echoed to stdout.

    Returns:
        argparse.Namespace: the recognised options.
    """
    description = """
    Converts a text dataset into the training data format expected by the model.
    Adapted from the script create_tfrecords.py in the gpt-neo repo.
    - Your text dataset:
        - can be provided as .txt files, or as an archive (.tar.gz, .xz, jsonl.zst).
        - can be one file or multiple
            - using a single large file may use too much memory and crash - if this occurs, split the file up into a few files
        - the model's end-of-text separator is added between the contents of each file
        - if the string '<|endoftext|>' appears inside a file, it is treated as the model's end-of-text separator (not the actual string '<|endoftext|>')
            - this behavior can be disabled with --treat-eot-as-text
    This script creates a single .tfrecords file as output
        - Why: the model's data loader ignores "trailing" data (< 1 batch) at the end of a .tfrecords file
            - this causes data loss if you have many .tfrecords files
        - This is probably not appropriate for very large datasets
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Input / output locations.
    parser.add_argument(
        "--input-dir", type=str, default=None,
        help="Path to where your files are located.")
    parser.add_argument(
        "--name", type=str, default=None,
        help="Name of output file will be {name}_{seqnum}.tfrecords, where seqnum is total sequence count")
    parser.add_argument(
        "--output-dir", type=str, default="",
        help="Output directory (default: current directory)")
    parser.add_argument(
        "--tokenizer-path", type=str, default=None,
        help="Path to a custom BPE tokenizer (default: None, gpt2 tokenizer)")

    # Data cleaning.
    cleaning_group = parser.add_argument_group('data cleaning arguments')
    cleaning_group.add_argument(
        "--normalize-with-ftfy", action="store_true",
        help="Normalize text with ftfy")
    cleaning_group.add_argument(
        "--normalize-with-wikitext-detokenize", action="store_true",
        help="Use wikitext detokenizer")
    min_unique_help = (
        "Exclude repetitive documents made up of < MIN_UNIQUE_TOKENS unique tokens. These can produce large gradients."
        " Set <= 0 to disable. If enabled, 200 is a good default value. (Default: 0)"
    )
    cleaning_group.add_argument(
        "--min-unique-tokens", type=int, default=0, help=min_unique_help)

    # Shuffling / packing.
    packing_group = parser.add_argument_group(
        'data shuffling/packing arguments')
    repack_help = "Repeat the data N_REPACK_EPOCHS times, shuffled differently in each repetition. Recommended for multi-epoch training (set this to your intended number of epochs)."
    packing_group.add_argument(
        "--n-repack-epochs", type=int, default=1, help=repack_help)
    packing_group.add_argument(
        "--seed", type=int, default=10,
        help="random seed for shuffling data (default: 10)")
    packing_group.add_argument(
        "--preserve-data-order", default=False, action="store_true",
        help="Disables shuffling, so the input and output data have the same order.")

    # Miscellaneous.
    misc_group = parser.add_argument_group('miscellaneous arguments')
    misc_group.add_argument(
        "--verbose", default=False, action="store_true",
        help="Prints extra information, such as the text removed by --min-unique-tokens")

    known, unknown = parser.parse_known_args()
    print(f'Unknown args: {unknown}')
    return known
def get_files(input_dir):
    """Return the paths of all supported dataset files directly inside *input_dir*.

    Files are grouped by suffix in the order ``jsonl.zst``, ``.txt``, ``.xz``,
    ``.tar.gz``; within a group the order is whatever ``Path.glob`` yields.
    The search is not recursive. Paths are returned as strings.
    """
    root = Path(input_dir)
    found = []
    for suffix in ("jsonl.zst", ".txt", ".xz", ".tar.gz"):
        found.extend(str(match) for match in root.glob(f"*{suffix}"))
    return found
def _int64_feature(value):
    """Wrap *value* in a ``tf.train.Feature`` holding an ``Int64List``.

    *value* is passed straight through as the ``value`` field of the
    ``Int64List``, so it is expected to be an iterable of integers.
    """
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)
def write_to_file(writer, data):
    """Serialise one token sequence as a ``tf.train.Example`` and write it.

    The example has a single int64 feature named ``"text"`` containing *data*.
    """
    features = tf.train.Features(feature={"text": _int64_feature(data)})
    example = tf.train.Example(features=features)
    writer.write(example.SerializeToString())
def write_tfrecord(sequences, fp):
    """Write every sequence in *sequences* as one record of the TFRecord file *fp*."""
    record_writer = tf.io.TFRecordWriter(fp)
    with record_writer:
        for token_ids in sequences:
            write_to_file(record_writer, token_ids)
def split_list(l, n):
    """Cut the sliceable *l* (list or string) into consecutive pieces of length *n*.

    The last piece may be shorter. An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(l), n):
        pieces.append(l[start:start + n])
    return pieces
def enforce_min_unique(seqs, min_unique_tokens, enc, verbose=False):
    """Yield only the sequences that contain at least *min_unique_tokens* distinct tokens.

    Rejected sequences are dropped; when *verbose* is true their decoded text
    (via ``enc.decode``) is printed together with the distinct-token count.
    Progress is reported through ``tqdm``.
    """
    for candidate in tqdm(seqs, mininterval=1, smoothing=0):
        n_unique = len(set(candidate))
        if n_unique >= min_unique_tokens:
            yield candidate
            continue
        if verbose:
            text = enc.decode(candidate)
            print(
                f"excluding with {n_unique} unique tokens:\n\n{repr(text)}\n\n")
def eot_splitting_generator(string_iterable, encoder: TokenizerWrapper):
    """Split each input string on the encoder's end-of-text marker.

    Every document is split on ``encoder.eos_token_str``; pieces that are
    empty or whitespace-only are discarded, the rest are yielded unchanged
    (not stripped), in order.
    """
    separator = encoder.eos_token_str
    for document in string_iterable:
        yield from (piece for piece in document.split(separator)
                    if piece.strip())
def prep_and_tokenize_generator(string_iterable, encoder: TokenizerWrapper,
                                normalize_with_ftfy,
                                normalize_with_wikitext_detokenize):
    """Tokenize each document and yield its token ids as a ``uint16`` array.

    Each document is encoded with ``encoder.encode`` and terminated with
    ``encoder.eos_token_id``.

    Args:
        string_iterable: iterable of document strings.
        encoder: tokenizer exposing ``encode`` and ``eos_token_id``.
        normalize_with_ftfy: accepted for interface compatibility; no ftfy
            normalization is implemented here.
        normalize_with_wikitext_detokenize: accepted for interface
            compatibility; no detokenization is implemented here.

    Yields:
        numpy.ndarray: one ``uint16`` array per document.
    """
    import warnings  # local import keeps the file's import block untouched

    # Both flags used to be dropped silently, so a user who asked for cleaning
    # got uncleaned data without any hint. Make that visible.
    if normalize_with_ftfy or normalize_with_wikitext_detokenize:
        warnings.warn(
            "text normalization was requested but is not implemented in this "
            "script; documents are tokenized as-is",
            RuntimeWarning,
        )
    eos_id = encoder.eos_token_id
    for doc in string_iterable:
        tokens = encoder.encode(doc) + [eos_id]
        yield np.array(tokens, dtype=np.uint16)
def file_to_tokenized_docs_generator(file_path, encoder, args):
    """Stream a dataset file as tokenized documents.

    The file is read with ``lm_dataformat.Reader`` (non-threaded), each
    document is split on the end-of-text marker, and every piece is tokenized.

    Returns:
        A generator of token arrays of arbitrary, unequal length.
    """
    raw_documents = Reader(file_path).stream_data(threaded=False)
    split_documents = eot_splitting_generator(raw_documents, encoder)
    return prep_and_tokenize_generator(
        split_documents,
        encoder,
        normalize_with_ftfy=args.normalize_with_ftfy,
        normalize_with_wikitext_detokenize=args.normalize_with_wikitext_detokenize,
    )
def read_files_to_tokenized_docs(files, args, encoder):
    """Tokenize every file and return all documents as one list.

    With ``args.preserve_data_order`` the files are processed in sorted order
    and documents keep their order; otherwise both the file order and the
    final document order are shuffled with the global ``random`` state.

    Args:
        files: iterable of file paths. It is never modified.
        args: namespace providing ``preserve_data_order`` and the options
            read by ``file_to_tokenized_docs_generator``.
        encoder: tokenizer forwarded to the per-file generator.

    Returns:
        list: token arrays, one per document.
    """
    if args.preserve_data_order:
        ordered_files = sorted(files)
    else:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the list object owned by the caller.
        ordered_files = list(files)
        random.shuffle(ordered_files)

    docs = []
    for path in tqdm(ordered_files, mininterval=10, smoothing=0):
        docs.extend(file_to_tokenized_docs_generator(path, encoder, args))

    if not args.preserve_data_order:
        # shuffle at individual document level
        random.shuffle(docs)
    return docs
def arrays_to_sequences(token_list_iterable, sequence_length=2049):
    """Re-pack token arrays of arbitrary length into fixed-length sequences.

    Tokens are concatenated in order and cut every *sequence_length* tokens.
    Yields the equal-length sequences followed by one final sequence holding
    the remaining tokens (between 1 and *sequence_length* long). Nothing is
    yielded for empty input.
    """
    print('Chunking in standard LM mode')
    pending = []
    for token_array in token_list_iterable:
        pending.extend(token_array)
        if len(pending) <= sequence_length:
            continue
        pieces = [pending[start:start + sequence_length]
                  for start in range(0, len(pending), sequence_length)]
        # Everything but the last piece is complete; the last piece is carried
        # over even when it happens to be exactly sequence_length long.
        *complete, pending = pieces
        yield from complete
    if pending:
        yield pending
def _check_sep_eos_balance(tokens, sep_token_id, eos_token_id):
    """Assert that *tokens* holds as many separator tokens as end-of-text tokens."""
    n_sep_tokens = sum(x == sep_token_id for x in tokens)
    n_eos_tokens = sum(x == eos_token_id for x in tokens)
    assert n_sep_tokens == n_eos_tokens, (
        f"separator/end-of-text count mismatch: "
        f"{n_sep_tokens} separator vs {n_eos_tokens} end-of-text tokens")


def arrays_to_sequences_pad(token_list_iterable, pad_token_id,
                            sequence_length=2049,
                            sep_token_id=None,
                            eos_token_id=None):
    """Pack whole examples into padded sequences without splitting any example.

    Examples are appended to a running buffer; when the next example would not
    fit, the buffer is right-padded with *pad_token_id* to *sequence_length*
    and yielded. Examples longer than *sequence_length* are discarded and
    counted. The final buffer is yielded last and is NOT padded.

    Args:
        token_list_iterable: iterable of objects with a ``tolist()`` method
            (e.g. numpy arrays), one per example.
        pad_token_id: token used for right-padding.
        sequence_length: length of every padded sequence.
        sep_token_id: separator token id used for the balance check.
        eos_token_id: end-of-text token id used for the balance check.

    Yields:
        list: token id lists.

    Raises:
        AssertionError: if an example or a packed sequence contains a
            different number of separator and end-of-text tokens.
    """
    print('Chunking in seq2seq mode')
    accum = []
    too_long = 0
    for chunk in tqdm(token_list_iterable):
        chunk = chunk.tolist()
        _check_sep_eos_balance(chunk, sep_token_id, eos_token_id)
        if len(chunk) > sequence_length:
            too_long += 1
        elif len(accum) + len(chunk) > sequence_length:
            res = accum + [pad_token_id] * (sequence_length - len(accum))
            _check_sep_eos_balance(res, sep_token_id, eos_token_id)
            yield res
            accum = chunk
        else:
            accum.extend(chunk)
    print(f'Discarded {too_long} examples longer than {sequence_length}')
    if len(accum) > 0:
        yield accum
def chunk_and_finalize(arrays, args, encoder):
    """Chunk tokenized documents into sequences and separate the trailing one.

    In seq2seq mode (``args.seq2seq``, default True when the attribute is
    absent) examples are packed and padded; otherwise tokens are concatenated
    and cut into fixed-length ``uint16`` arrays. The last sequence produced is
    treated as trailing data and returned separately. The remaining sequences
    are optionally filtered by ``args.min_unique_tokens`` and shuffled unless
    ``args.preserve_data_order`` is set.

    Returns:
        tuple: ``(full_seqs, trailing_data)``. Both are empty lists when the
        input produced no sequences at all.
    """
    seq2seq = getattr(args, 'seq2seq', True)
    if seq2seq:
        sequences = list(
            arrays_to_sequences_pad(arrays, pad_token_id=encoder.pad_token_id,
                                    sep_token_id=encoder.sep_token_id,
                                    eos_token_id=encoder.eos_token_id))
    else:
        sequences = [np.array(x, dtype=np.uint16)
                     for x in arrays_to_sequences(arrays)]

    # No documents at all: indexing sequences[-1] would raise IndexError.
    if not sequences:
        return [], []

    full_seqs, trailing_data = sequences[:-1], sequences[-1]
    if args.min_unique_tokens > 0:
        full_seqs = list(
            enforce_min_unique(full_seqs, args.min_unique_tokens, encoder,
                               args.verbose))
    if not args.preserve_data_order:
        random.shuffle(full_seqs)
    return full_seqs, trailing_data
def create_tfrecords(files, args):
    """Tokenize *files*, chunk them into sequences and write one TFRecord file.

    The output is ``{args.name}_{N}.tfrecords`` inside ``args.output_dir``,
    where ``N`` is the number of sequences written. The trailing sequence
    returned by ``chunk_and_finalize`` is not written.

    Raises:
        AssertionError: in seq2seq mode, if a sequence does not decode to text
            starting with ``<``, has no non-padding token, does not end (before
            padding) with the end-of-text token, or has unbalanced separator
            and end-of-text counts.
    """
    encoder = TokenizerWrapper.from_file_or_gpt(args.tokenizer_path)
    random.seed(args.seed)
    docs = read_files_to_tokenized_docs(files, args, encoder)
    full_seqs, trailing_data = chunk_and_finalize(docs, args, encoder)
    if getattr(args, 'seq2seq', True):
        # Seq2seq sanity checks
        assert all(
            encoder.decode(x[:20]).strip().startswith('<') for x in full_seqs), \
            "every seq2seq sequence must decode to text starting with '<'"
        sep_id = encoder.sep_token_id
        eos_id = encoder.eos_token_id
        pad_id = encoder.pad_token_id
        for seq in full_seqs:
            # Scan from the end: the padding is on the right, so the first
            # non-pad token found this way is the last real token.
            last_non_pad_idx = next(
                (i for i in reversed(range(len(seq))) if seq[i] != pad_id),
                None)
            assert last_non_pad_idx is not None, \
                "sequence consists of padding only"
            assert seq[last_non_pad_idx] == eos_id, \
                "last non-padding token is not the end-of-text token"
            n_sep_tokens = sum(x == sep_id for x in seq)
            n_eos_tokens = sum(x == eos_id for x in seq)
            assert n_sep_tokens == n_eos_tokens, (
                f"separator/end-of-text count mismatch: "
                f"{n_sep_tokens} separator vs {n_eos_tokens} end-of-text tokens")
    # final
    print(f"dropped {len(trailing_data)} tokens of trailing data")
    total_sequence_len = len(full_seqs)
    fp = os.path.join(args.output_dir,
                      f"{args.name}_{total_sequence_len}.tfrecords")
    write_tfrecord(full_seqs, fp)
def create_finetune_tfrecords(**kwargs):
    """Entry point: parse CLI options, apply overrides and build the TFRecord file.

    Args:
        **kwargs: option overrides, set as attributes on the parsed namespace
            (e.g. ``input_dir="data"``, ``name="run"``, ``seq2seq=False``).

    Raises:
        ValueError: if no input directory was given on the command line or
            through *kwargs*.
    """
    args = parse_args()
    # Update by kwargs
    for key, value in kwargs.items():
        setattr(args, key, value)

    # --input-dir defaults to None; fail with a clear message instead of an
    # AttributeError. An empty string would turn into "/" and glob the root.
    if not args.input_dir:
        raise ValueError(
            "an input directory is required: pass --input-dir or input_dir=...")
    if not args.input_dir.endswith("/"):
        args.input_dir = args.input_dir + "/"
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)
    files = get_files(args.input_dir)
    create_tfrecords(files, args)


if __name__ == "__main__":
    create_finetune_tfrecords()

itp_interface/pisa/src/main/python/legacy/demo.py ADDED Viewed

@@ -0,0 +1,49 @@
+from PisaFlexibleClient import initialise_env
# Demo of the PISA flexible client.
# Run a server on port 8000 first,
# i.e. do a 'sbt "runMain pisa.server.PisaOneStageServer8000"'
env = initialise_env(
    8000,
    working_directory="/private/home/aqj/afp-2021-10-22/thys/FunWithFunctions",
    isa_path="/private/home/aqj/Isabelle2021",
    theory_file_path="/private/home/aqj/afp-2021-10-22/thys/FunWithFunctions/FunWithFunctions.thy",
)

# Suppose you have a list of theorems that you want to try on.
# Raw strings: Isabelle symbols such as \<Rightarrow> contain a backslash that
# must reach the server literally; in a normal literal "\<" is an invalid
# escape sequence.
theorems = [
    r'theorem identity1: fixes f :: "nat \<Rightarrow> nat" assumes fff: "\<And>n. f(f(n)) < f(Suc(n))" shows "f(n) = n"',
    r'theorem ifac_neg0: fixes ifac :: "int \<Rightarrow> int" assumes ifac_rec: "\<And>i. ifac i = (if i=0 then 1 else i*ifac(i - 1))" shows "i<0 \<Longrightarrow> ifac i = 0"',
]
# And the corresponding scripts
scripts = [
    "sorry",
    "bad script",
]

env.post("<initialise>")
for theorem, script in zip(theorems, scripts):
    # Execute everything in the theory file that comes before the theorem
    env.post(f"<accumulative step before> {theorem}")

    # Create an experimental state named after the script (its last 10
    # characters), cloned from the default state
    name = script[-10:]
    env.post(f"<clone> default <clone> {name}")

    # Execute the theorem declaration in the experimental state
    env.post(
        f"<apply to top level state> {name} <apply to top level state> {theorem} <apply to top level state> {name}"
    )

    # Execute the script and get the proof level
    response = env.post(
        f"<apply to top level state> {name} <apply to top level state> {script} <apply to top level state> {name}"
    )
    print(f"script execution response: {response}")

    level = env.post(f"<get_proof_level> {name}")
    # If level = 0, succeed, otherwise fail
    print(level)

itp_interface/pisa/src/main/python/legacy/evaluate.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os
+import json
+import grpc
+import argparse
+import server_pb2
+import server_pb2_grpc
+MAX_MESSAGE_LENGTH = 10485760
def stack_lines(input_string):
    """Collapse every run of whitespace (newlines included) into a single space.

    Leading and trailing whitespace is removed.
    """
    words = input_string.split()
    return " ".join(words)
def evaluate_single_problem(isa_path, theory_file_path, working_directory, theorem_name, model, mode_of_proving,
                       maximum_number_of_steps=100, port=9000):
    """Try to prove one theorem by repeatedly querying *model* for the next step.

    A gRPC connection to ``localhost:<port>`` is opened, Isabelle is
    initialised, the theory is advanced to *theorem_name*, and then up to
    *maximum_number_of_steps* predicted commands are sent.

    Args:
        isa_path: path sent to the server's ``InitialiseIsabelle`` call.
        theory_file_path: theory file sent as the Isabelle context.
        working_directory: path sent as the Isabelle working directory.
        theorem_name: theorem statement; whitespace is normalised first.
        model: object with ``predict(input_string) -> str``.
        mode_of_proving: ``"proof"``, ``"state"`` or ``"proof_and_state"``.
        maximum_number_of_steps: cap on the number of predicted steps.
        port: server port.

    Returns:
        int: 1 if a returned state no longer contains ``"proof"``, 0 if the
        step budget ran out or an exception occurred while stepping.

    Raises:
        AssertionError: if *mode_of_proving* is not a supported mode.
    """
    # Validate before talking to the server so that a bad mode does not leave
    # a half-initialised Isabelle session behind.
    if mode_of_proving not in ["proof", "state", "proof_and_state"]:
        raise AssertionError(
            "unsupported mode_of_proving: {!r}".format(mode_of_proving))

    channel = grpc.insecure_channel('localhost:{}'.format(port),
                                    options=[('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
                                             ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH)])
    try:
        stub = server_pb2_grpc.ServerStub(channel)
        stub.InitialiseIsabelle(server_pb2.IsaPath(path=isa_path))
        stub.IsabelleWorkingDirectory(server_pb2.IsaPath(path=working_directory))
        stub.IsabelleContext(server_pb2.IsaContext(context=theory_file_path))
        theorem_name = stack_lines(theorem_name)
        state = stub.IsabelleCommand(
            server_pb2.IsaCommand(command="proceed:" + theorem_name)).state

        previous_proof_segment = theorem_name
        try:
            for _ in range(maximum_number_of_steps):
                state = stack_lines(state)
                input_string = ""
                if mode_of_proving == "state":
                    input_string += "State: {}".format(state)
                if mode_of_proving == "proof_and_state":
                    # Only the separator is emitted in this mode.
                    input_string += " <PS_SEP> "
                if mode_of_proving == "proof":
                    input_string += "Proof: {}".format(previous_proof_segment)
                    # TODO: previous proof segment unfinished
                output_string = model.predict(input_string)
                state = stub.IsabelleCommand(server_pb2.IsaCommand(command=output_string)).state
                if "proof" not in state:
                    stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
                    return 1
        except Exception as e:
            # Best effort: a failing step counts as "not proved".
            print(e)
        stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
        return 0
    finally:
        # The channel was previously never closed.
        channel.close()
class DummyProver:
    """Lookup-table "model" built from the seq2seq dataset files.

    Every source line of the train/val/test splits is mapped to its target
    line (both whitespace-normalised with ``stack_lines``), so ``predict``
    simply replays the recorded step.
    """

    def __init__(self, seq2seq_repo):
        """Load ``{train,val,test}.{src,tgt}`` from the directory *seq2seq_repo*.

        Raises:
            AssertionError: if the source and target files do not contain the
                same total number of lines.
        """
        src_list = []
        tgt_list = []
        for split in ("train", "val", "test"):
            src_list.extend(self._read_lines(seq2seq_repo, split + ".src"))
            tgt_list.extend(self._read_lines(seq2seq_repo, split + ".tgt"))
        assert len(src_list) == len(tgt_list), \
            "source and target files have different numbers of lines"
        # Later duplicates of a source line overwrite earlier ones.
        self.prover_dict = {
            stack_lines(src): stack_lines(tgt)
            for src, tgt in zip(src_list, tgt_list)
        }

    @staticmethod
    def _read_lines(directory, file_name):
        """Return all lines of ``directory/file_name``, closing the file afterwards."""
        with open(os.path.join(directory, file_name), "r") as handle:
            return handle.readlines()

    def predict(self, input_string):
        """Return the recorded target for *input_string*.

        Raises:
            KeyError: if *input_string* was never seen in the dataset.
        """
        return self.prover_dict[input_string]
if __name__ == "__main__":
    # Evaluate the dummy prover on the first few training problems.
    parser = argparse.ArgumentParser(description='Extracting an Isabelle theory file.')
    parser.add_argument('--isa-path', help='The path to the Isabelle executable',
                        default="/Applications/Isabelle2020.app/Isabelle")
    parser.add_argument('--working-directory', '-wd', help='Path to the AFP project')
    parser.add_argument('--theory-file-path', '-tfp', help='Path to the file to parse')
    parser.add_argument('--theorem-name', '-tn', help='Name of the theorem to prove')
    parser.add_argument('--mode-of-proving', '-mop',
                        help='Mode of proving, could be "state", "proof", or "proof_and_state"')
    parser.add_argument('--port', '-p', help='Port to use to communicate', default=9000, type=int)
    args = parser.parse_args()

    dummy_prover = DummyProver("/Users/qj213/Projects/PISA/fs_with_state")

    # Close the file once it is parsed instead of leaking the handle.
    with open("fs_with_state/problem_names_split.json") as names_file:
        problem_names = json.load(names_file)
    train_names = problem_names["train"]

    # Each entry is (theory file path, theorem statement). Slicing keeps the
    # run working when fewer than five problems are available.
    for entry in train_names[:5]:
        theory_file_path = entry[0].replace("/home/ywu/afp-2021-02-11", "/Users/qj213/Projects/afp-2021-02-11")
        print(evaluate_single_problem(isa_path=args.isa_path,
                                      theory_file_path=theory_file_path,
                                      working_directory="/".join(theory_file_path.split("/")[:-1]),
                                      theorem_name=entry[1],
                                      port=args.port, model=dummy_prover, mode_of_proving="state"))

itp_interface/pisa/src/main/python/legacy/extract_first_step.py ADDED Viewed

@@ -0,0 +1,25 @@
+import os
+import json
+from tqdm import tqdm
# Extract the first proof step of every problem: for each JSON-lines file in
# proof_and_state_dir, write "<split>.src" / "<split>.tgt" into first_step_dir.
proof_and_state_dir = "/home/qj213/proof_and_state"
first_step_dir = "/home/qj213/first_step"

for file in os.listdir(proof_and_state_dir):
    split_name = file.split(".")[0]
    input_path = os.path.join(proof_and_state_dir, file)
    src_path = os.path.join(first_step_dir, f"{split_name}.src")
    tgt_path = os.path.join(first_step_dir, f"{split_name}.tgt")
    with open(input_path) as fhand, \
            open(src_path, "w") as src_out, \
            open(tgt_path, "w") as tgt_out:
        for line in tqdm(fhand.readlines()):
            record = json.loads(line.strip())
            # The source has the form "<proof so far> <PS_SEP> <proof state>".
            segments = record["source"].split("<PS_SEP>")
            proof_step_string = segments[0].strip()
            proof_state_string = segments[1].strip()
            target = record["target"]
            # A literal backslash-n in the proof text means earlier steps
            # exist, so this record is not a first step.
            if "\\n" in proof_step_string:
                continue
            src_out.write(f"<ISA_OBS> {proof_state_string}\n")
            tgt_out.write(f"{target}\n")

itp_interface/pisa/src/main/python/legacy/get_global_facts.py ADDED Viewed

@@ -0,0 +1,35 @@
+from PisaFlexibleClient import initialise_env
+import os
+import pickle
def match_names_single_file_to_data_play_szymon(
        port, working_directory, isa_path, theory_file_path, out_dir, error_log_dir):
    """Dump the global facts of one theory file as a pickled ``{name: definition}`` dict.

    The server reply to ``<get global facts from file>`` is split on ``<SEP>``
    into entries, each of which is split on ``<DEF>`` into a name and a
    definition. The dictionary is pickled to
    ``out_dir/dict_<theory path with "/" replaced by "_">``.

    Any exception raised while querying, parsing or writing is not propagated:
    its message is written to ``error_log_dir/error_log_<flattened path>.txt``.
    """
    env = initialise_env(
        port=port,
        working_directory=working_directory,
        isa_path=isa_path,
        theory_file_path=theory_file_path
    )
    flat_name = theory_file_path.replace('/', '_')
    try:
        output_string = env.post("<get global facts from file>")
        global_fact_dict = {}
        for element in output_string.split("<SEP>"):
            name, definition = element.split("<DEF>")
            global_fact_dict[name.strip()] = definition.strip()
        # Context manager: the handle used to be left open after the dump.
        with open(os.path.join(out_dir, f"dict_{flat_name}"), "wb") as dict_out:
            pickle.dump(global_fact_dict, dict_out)
    except Exception as e:
        with open(os.path.join(error_log_dir, f"error_log_{flat_name}.txt"), "w") as fout:
            fout.write(str(e))
if __name__ == "__main__":
    # Example run against a local AFP checkout; the output and the error log
    # share one directory.
    example_settings = {
        "port": 8000,
        "working_directory": "/home/qj213/afp-2021-10-22/thys/FunWithFunctions",
        "isa_path": "/home/qj213/Isabelle2021",
        "theory_file_path": "/home/qj213/afp-2021-10-22/thys/FunWithFunctions/FunWithFunctions.thy",
        "out_dir": "/home/qj213/out_stuff",
        "error_log_dir": "/home/qj213/out_stuff",
    }
    match_names_single_file_to_data_play_szymon(**example_settings)

itp_interface/pisa/src/main/python/legacy/mix_data.py ADDED Viewed

@@ -0,0 +1,19 @@
+import argparse
+import os
if __name__ == "__main__":
    # Concatenate the split files of several input directories into one
    # output directory.
    parser = argparse.ArgumentParser(description="Mix the data from multiple forms of input")
    parser.add_argument("--input", type=str, nargs="+", help="Input files")
    parser.add_argument("--output-path", "-op", type=str, help="Output file")
    args = parser.parse_args()

    split_file_names = ("train.src", "train.tgt", "val.src", "val.tgt", "test.src", "test.tgt")

    # Start from a clean slate: the files are opened in append mode below.
    for file_name in split_file_names:
        stale_path = os.path.join(args.output_path, file_name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    for input_path in args.input:
        for file_name in split_file_names:
            destination = os.path.join(args.output_path, file_name)
            origin = os.path.join(input_path, file_name)
            with open(destination, "a") as output_file, \
                    open(origin, "r") as input_file:
                output_file.write(input_file.read())

itp_interface/pisa/src/main/python/legacy/one_stage_extraction.py ADDED Viewed

@@ -0,0 +1,111 @@
+import os
+import json
+import grpc
+import argparse
+from copy import copy
+from func_timeout import func_set_timeout, FunctionTimedOut
+import server_pb2
+import server_pb2_grpc
+MAX_MESSAGE_LENGTH = 10485760
def analyse_whole_file(whole_file_string, use_sledgehammer=False):
    """Parse the raw extraction output of a theory file into proof transitions.

    The input is a sequence of transitions separated by ``<\\TRANSEP>``; each
    transition holds ``state``, ``action``, ``proof_level`` (and, with
    *use_sledgehammer*, ``hammer_results``) separated by ``<\\STATESEP>``.

    A transition is recorded when its action starts a ``lemma``/``theorem``
    or while a proof is open. A proof is considered closed once the previous
    state mentioned ``subgoal`` and the current one does not.

    Args:
        whole_file_string: raw string returned by the extraction command.
        use_sledgehammer: whether each transition carries a fourth field.

    Returns:
        dict: ``"problem_names"`` (list of lemma/theorem actions) and
        ``"translations"`` (list of ``(state, action, proof_level,
        hammer_results)`` tuples; ``hammer_results`` is ``"NA"`` without
        sledgehammer).

    Raises:
        ValueError: if a transition does not have the expected number of
            fields or its proof level is not an integer.
    """
    # Raw strings: "\T" and "\S" are invalid escape sequences in normal
    # literals. The runtime values are unchanged.
    transition_sep = r"<\TRANSEP>"
    field_sep = r"<\STATESEP>"

    state_action_proof_level_tuples = []
    problem_names = []
    proof_open = False
    last_state = ""
    for transition in whole_file_string.split(transition_sep):
        if not transition:
            continue
        if use_sledgehammer:
            state, action, proof_level, hammer_results = transition.split(field_sep)
        else:
            state, action, proof_level = transition.split(field_sep)
            hammer_results = "NA"
        state = state.strip()
        action = action.strip()
        proof_level = int(proof_level.strip())

        if action.startswith(("lemma", "theorem")):
            problem_names.append(action)
            state_action_proof_level_tuples.append((state, action, proof_level, hammer_results))
            proof_open = True
        elif proof_open:
            state_action_proof_level_tuples.append((state, action, proof_level, hammer_results))

        if "subgoal" in last_state and "subgoal" not in state:
            proof_open = False
        # Remember this state for the next iteration. Without this update the
        # check above could never fire and a proof, once opened, never closed.
        last_state = state
    return {
        "problem_names": problem_names,
        "translations": state_action_proof_level_tuples
    }
@func_set_timeout(12000)
def isa_step(stub, theory_file_path, use_sledgehammer=False):
    """Load *theory_file_path* as the Isabelle context and run the extraction command.

    Returns the ``state`` field of the server's reply. The call is wrapped by
    ``func_set_timeout(12000)``.
    """
    stub.IsabelleContext(server_pb2.IsaContext(context=theory_file_path))
    if use_sledgehammer:
        extraction_command = "PISA extract data with hammer"
    else:
        extraction_command = "PISA extract data"
    reply = stub.IsabelleCommand(server_pb2.IsaCommand(command=extraction_command))
    return reply.state
def extract_file(isa_path, theory_file_path, working_directory, saving_directory, port=9000, use_sledgehammer=False):
    """Extract the proof transitions of one theory file and save them as JSON.

    On success the analysis is written to
    ``saving_directory/<theory path with "/" -> "_", ".thy" removed>_ground_truth.json``.
    If the extraction fails or times out, the error message is written to
    ``project_<project>_file_<file>_bug_report.txt`` in *saving_directory*,
    the server is told to exit, and no JSON file is produced.

    Args:
        isa_path: path sent to the server's ``InitialiseIsabelle`` call.
        theory_file_path: theory file to extract.
        working_directory: path sent as the Isabelle working directory.
        saving_directory: output directory; created if missing.
        port: server port on localhost.
        use_sledgehammer: request sledgehammer results and parse them.
    """
    channel = grpc.insecure_channel('localhost:{}'.format(port),
                                    options=[('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
                                             ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH)])
    try:
        stub = server_pb2_grpc.ServerStub(channel)
        stub.InitialiseIsabelle(server_pb2.IsaPath(path=isa_path))
        stub.IsabelleWorkingDirectory(server_pb2.IsaPath(path=working_directory))
        os.makedirs(saving_directory, exist_ok=True)

        try:
            whole_file_parsed = isa_step(stub, theory_file_path, use_sledgehammer=use_sledgehammer)
            stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
        except (Exception, FunctionTimedOut) as e:
            # FunctionTimedOut is listed separately, as in the original code.
            report_name = "project_{}_file_{}_bug_report.txt".format(
                working_directory.split("/")[-1], theory_file_path.split("/")[-1])
            with open(os.path.join(saving_directory, report_name), "w") as fout:
                fout.write(str(e))
            stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
            # Nothing was extracted: stop here instead of touching the
            # unbound result below.
            return

        # Forward the flag: with sledgehammer every transition has a fourth
        # field that the analyser must be told to expect.
        file_analysis = analyse_whole_file(whole_file_parsed, use_sledgehammer=use_sledgehammer)
        file_info = {
            "file_name": theory_file_path,
            "working_directory": working_directory,
            **file_analysis,
            "raw_parsed_string": whole_file_parsed
        }
        output_name = "_".join(theory_file_path.split(".thy")[0].split("/")) + "_ground_truth.json"
        with open(os.path.join(saving_directory, output_name), "w") as json_out:
            json.dump(file_info, json_out)
    finally:
        channel.close()
if __name__ == "__main__":
    # Command-line entry point: extract a single theory file.
    parser = argparse.ArgumentParser(description='Extracting an Isabelle theory file.')
    parser.add_argument('--isa-path', help='The path to the Isabelle executable',
                        default="/Applications/Isabelle2020.app/Isabelle")
    parser.add_argument('--working-directory', '-wd', help='Path to the AFP project')
    parser.add_argument('--theory-file-path', '-tfp', help='Path to the file to parse')
    parser.add_argument('--saving-directory', '-sd', help='Where the save the parsed json files')
    parser.add_argument('--port', '-p', help='Port to use to communicate', default=9000, type=int)
    parser.add_argument('--use-sledgehammer', '-us', help='Whether to use sledgehammer',
                        action='store_true')
    parser.set_defaults(use_sledgehammer=False)
    args = parser.parse_args()

    # Only the file given by --theory-file-path is processed; the working
    # directory is not scanned for further .thy files.
    extract_file(
        isa_path=args.isa_path,
        theory_file_path=args.theory_file_path,
        working_directory=args.working_directory,
        saving_directory=args.saving_directory,
        port=args.port,
        use_sledgehammer=args.use_sledgehammer,
    )