itp-interface 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (485) hide show
  1. itp_interface/__init__.py +0 -0
  2. itp_interface/agent/__init__.py +0 -0
  3. itp_interface/agent/simple_proof_agent.py +100 -0
  4. itp_interface/coq_ser_api/__init__.py +165 -0
  5. itp_interface/coq_ser_api/contexts.py +283 -0
  6. itp_interface/coq_ser_api/coq_agent.py +459 -0
  7. itp_interface/coq_ser_api/coq_backend.py +135 -0
  8. itp_interface/coq_ser_api/coq_util.py +839 -0
  9. itp_interface/coq_ser_api/example.py +67 -0
  10. itp_interface/coq_ser_api/lsp_backend.py +375 -0
  11. itp_interface/coq_ser_api/py.typed +0 -0
  12. itp_interface/coq_ser_api/serapi_backend.py +841 -0
  13. itp_interface/coq_ser_api/util.py +145 -0
  14. itp_interface/coq_ser_api_old/__init__.py +2583 -0
  15. itp_interface/coq_ser_api_old/contexts.py +172 -0
  16. itp_interface/coq_ser_api_old/util.py +146 -0
  17. itp_interface/lean_server/__init__.py +0 -0
  18. itp_interface/lean_server/commands.py +484 -0
  19. itp_interface/lean_server/lean3_search_tool.py +358 -0
  20. itp_interface/lean_server/lean4_repl_interface.py +151 -0
  21. itp_interface/lean_server/lean4_utils.py +255 -0
  22. itp_interface/lean_server/lean_cmd_server.py +111 -0
  23. itp_interface/lean_server/lean_context.py +60 -0
  24. itp_interface/lean_server/lean_sync_server.py +174 -0
  25. itp_interface/lean_server/lean_utils.py +199 -0
  26. itp_interface/lean_server/py.typed +1 -0
  27. itp_interface/main/__init__.py +0 -0
  28. itp_interface/main/config/afp_data_gen.yaml +14 -0
  29. itp_interface/main/config/benchmark/CompCert.yaml +366 -0
  30. itp_interface/main/config/benchmark/GeoCoq.yaml +930 -0
  31. itp_interface/main/config/benchmark/UniMath.yaml +2690 -0
  32. itp_interface/main/config/benchmark/afp_isabelle.yaml +29200 -0
  33. itp_interface/main/config/benchmark/agent_proverbot_hard.yaml +247 -0
  34. itp_interface/main/config/benchmark/category-theory.yaml +470 -0
  35. itp_interface/main/config/benchmark/compcert_118_subset.yaml +148 -0
  36. itp_interface/main/config/benchmark/compcert_benchmark.yaml +36 -0
  37. itp_interface/main/config/benchmark/compcert_benchmark_hard.yaml +498 -0
  38. itp_interface/main/config/benchmark/compcert_benchmark_hard_1.yaml +55 -0
  39. itp_interface/main/config/benchmark/compcert_benchmark_hard_2.yaml +24 -0
  40. itp_interface/main/config/benchmark/compcert_benchmark_hard_3.yaml +95 -0
  41. itp_interface/main/config/benchmark/compcert_benchmark_hard_7_per_cent.yaml +78 -0
  42. itp_interface/main/config/benchmark/compcert_benchmark_test.yaml +38 -0
  43. itp_interface/main/config/benchmark/compcert_benchmark_train.yaml +340 -0
  44. itp_interface/main/config/benchmark/leandojo_novel_premises_test.yaml +2908 -0
  45. itp_interface/main/config/benchmark/leandojo_novel_premises_train.yaml +98645 -0
  46. itp_interface/main/config/benchmark/leandojo_novel_premises_val.yaml +2912 -0
  47. itp_interface/main/config/benchmark/leandojo_random.yaml +2889 -0
  48. itp_interface/main/config/benchmark/leandojo_random_test.yaml +2421 -0
  49. itp_interface/main/config/benchmark/leandojo_random_train.yaml +62729 -0
  50. itp_interface/main/config/benchmark/leandojo_random_val.yaml +2504 -0
  51. itp_interface/main/config/benchmark/math-comp.yaml +200 -0
  52. itp_interface/main/config/benchmark/miniF2F_test.yaml +12 -0
  53. itp_interface/main/config/benchmark/miniF2F_test_aime.yaml +27 -0
  54. itp_interface/main/config/benchmark/miniF2F_test_algebra.yaml +30 -0
  55. itp_interface/main/config/benchmark/miniF2F_test_amc12.yaml +57 -0
  56. itp_interface/main/config/benchmark/miniF2F_test_few_shot_hard.yaml +231 -0
  57. itp_interface/main/config/benchmark/miniF2F_test_imo.yaml +32 -0
  58. itp_interface/main/config/benchmark/miniF2F_test_induction.yaml +20 -0
  59. itp_interface/main/config/benchmark/miniF2F_test_mathd_algebra.yaml +82 -0
  60. itp_interface/main/config/benchmark/miniF2F_test_mathd_algebra_hard.yaml +72 -0
  61. itp_interface/main/config/benchmark/miniF2F_test_mathd_numbertheory.yaml +72 -0
  62. itp_interface/main/config/benchmark/miniF2F_test_numbertheory.yaml +20 -0
  63. itp_interface/main/config/benchmark/minicompcert_benchmark_1.yaml +14 -0
  64. itp_interface/main/config/benchmark/proverbot_hard.yaml +104 -0
  65. itp_interface/main/config/benchmark/re_prover.yaml +66 -0
  66. itp_interface/main/config/benchmark/re_prover_hard.yaml +41 -0
  67. itp_interface/main/config/benchmark/re_prover_very_hard.yaml +22 -0
  68. itp_interface/main/config/benchmark/reprover_with_retrieval.yaml +73 -0
  69. itp_interface/main/config/benchmark/reprover_with_retrieval_hard.yaml +30 -0
  70. itp_interface/main/config/benchmark/reprover_with_retrieval_neg.yaml +195 -0
  71. itp_interface/main/config/benchmark/simple_benchmark_1.yaml +24 -0
  72. itp_interface/main/config/benchmark/simple_benchmark_8.yaml +50 -0
  73. itp_interface/main/config/benchmark/simple_benchmark_9.yaml +65 -0
  74. itp_interface/main/config/benchmark/simple_benchmark_isabelle.yaml +18 -0
  75. itp_interface/main/config/benchmark/simple_benchmark_lean.yaml +12 -0
  76. itp_interface/main/config/benchmark/simple_benchmark_lean_training_data.yaml +12 -0
  77. itp_interface/main/config/benchmark/simple_rl_benchmark_lean.yaml +14 -0
  78. itp_interface/main/config/benchmark/stack_machine.yaml +13 -0
  79. itp_interface/main/config/benchmark/stack_machine_hard.yaml +15 -0
  80. itp_interface/main/config/category_theory_data_gen.yaml +14 -0
  81. itp_interface/main/config/category_theory_data_gen_random.yaml +16 -0
  82. itp_interface/main/config/compcert_data_gen_test.yaml +10 -0
  83. itp_interface/main/config/compcert_data_gen_train.yaml +7 -0
  84. itp_interface/main/config/env_settings/bm25_retrieval.yaml +2 -0
  85. itp_interface/main/config/env_settings/bm25_retrieval_no_dfns.yaml +2 -0
  86. itp_interface/main/config/env_settings/bm25_retrieval_only_local_no_dfns.yaml +2 -0
  87. itp_interface/main/config/env_settings/bm25_retrieval_with_print.yaml +2 -0
  88. itp_interface/main/config/env_settings/bm25_retrieval_with_print_only_local.yaml +2 -0
  89. itp_interface/main/config/env_settings/bm25_retrieval_with_print_only_local_no_dfns.yaml +2 -0
  90. itp_interface/main/config/env_settings/no_retrieval.yaml +2 -0
  91. itp_interface/main/config/experiments.yaml +12 -0
  92. itp_interface/main/config/geo_coq_data_gen.yaml +14 -0
  93. itp_interface/main/config/geo_coq_data_gen_random.yaml +16 -0
  94. itp_interface/main/config/leandojo_random_data_gen.yaml +16 -0
  95. itp_interface/main/config/math_comp_data_gen.yaml +14 -0
  96. itp_interface/main/config/math_comp_data_gen_random.yaml +16 -0
  97. itp_interface/main/config/mathlib_data_gen.yaml +14 -0
  98. itp_interface/main/config/repo/coq_repos.yaml +191 -0
  99. itp_interface/main/config/run_settings/default_coq_data_generation_transforms.yaml +24 -0
  100. itp_interface/main/config/run_settings/default_isabelle_data_generation_transforms.yaml +24 -0
  101. itp_interface/main/config/run_settings/default_lean4_data_generation_transforms.yaml +24 -0
  102. itp_interface/main/config/run_settings/default_lean_data_generation_transforms.yaml +24 -0
  103. itp_interface/main/config/simple_coq_data_gen.yaml +12 -0
  104. itp_interface/main/config/simple_coq_data_gen_random.yaml +17 -0
  105. itp_interface/main/config/simple_lean_data_gen.yaml +12 -0
  106. itp_interface/main/config/simple_rl_lean_data_gen.yaml +12 -0
  107. itp_interface/main/config/uni_math_data_gen.yaml +14 -0
  108. itp_interface/main/config.py +192 -0
  109. itp_interface/main/extract_benchmark_dataset.py +106 -0
  110. itp_interface/main/filter_dataset.py +107 -0
  111. itp_interface/main/install.py +92 -0
  112. itp_interface/main/merge_dataset.py +96 -0
  113. itp_interface/main/run_tool.py +444 -0
  114. itp_interface/pisa/.git +1 -0
  115. itp_interface/pisa/.gitignore +125 -0
  116. itp_interface/pisa/.idea/.gitignore +8 -0
  117. itp_interface/pisa/.idea/ClojureProjectResolveSettings.xml +6 -0
  118. itp_interface/pisa/.idea/codeStyles/Project.xml +7 -0
  119. itp_interface/pisa/.idea/codeStyles/codeStyleConfig.xml +5 -0
  120. itp_interface/pisa/.idea/inspectionProfiles/Project_Default.xml +16 -0
  121. itp_interface/pisa/.idea/libraries/sbt__com_google_android_annotations_4_1_1_4_jar.xml +13 -0
  122. itp_interface/pisa/.idea/libraries/sbt__com_google_api_grpc_proto_google_common_protos_1_17_0_jar.xml +13 -0
  123. itp_interface/pisa/.idea/libraries/sbt__com_google_code_findbugs_jsr305_3_0_2_jar.xml +13 -0
  124. itp_interface/pisa/.idea/libraries/sbt__com_google_code_gson_gson_2_8_6_jar.xml +13 -0
  125. itp_interface/pisa/.idea/libraries/sbt__com_google_errorprone_error_prone_annotations_2_3_4_jar.xml +13 -0
  126. itp_interface/pisa/.idea/libraries/sbt__com_google_guava_failureaccess_1_0_1_jar.xml +13 -0
  127. itp_interface/pisa/.idea/libraries/sbt__com_google_guava_guava_30_0_jre_jar.xml +13 -0
  128. itp_interface/pisa/.idea/libraries/sbt__com_google_guava_listenablefuture_9999_0_empty_to_avoid_conflict_with_guava_jar.xml +9 -0
  129. itp_interface/pisa/.idea/libraries/sbt__com_google_j2objc_j2objc_annotations_1_3_jar.xml +13 -0
  130. itp_interface/pisa/.idea/libraries/sbt__com_google_protobuf_protobuf_java_3_12_0_jar.xml +13 -0
  131. itp_interface/pisa/.idea/libraries/sbt__com_google_protobuf_protobuf_java_util_3_12_0_jar.xml +13 -0
  132. itp_interface/pisa/.idea/libraries/sbt__com_lihaoyi_fastparse_2_13_2_3_0_jar.xml +13 -0
  133. itp_interface/pisa/.idea/libraries/sbt__com_lihaoyi_geny_2_13_0_6_0_jar.xml +13 -0
  134. itp_interface/pisa/.idea/libraries/sbt__com_lihaoyi_sourcecode_2_13_0_2_1_jar.xml +13 -0
  135. itp_interface/pisa/.idea/libraries/sbt__com_thesamet_scalapb_lenses_2_13_0_10_9_jar.xml +13 -0
  136. itp_interface/pisa/.idea/libraries/sbt__com_thesamet_scalapb_scalapb_runtime_2_13_0_10_9_jar.xml +13 -0
  137. itp_interface/pisa/.idea/libraries/sbt__com_thesamet_scalapb_scalapb_runtime_grpc_2_13_0_10_9_jar.xml +13 -0
  138. itp_interface/pisa/.idea/libraries/sbt__com_thesamet_scalapb_zio_grpc_zio_grpc_core_2_13_0_4_2_jar.xml +13 -0
  139. itp_interface/pisa/.idea/libraries/sbt__com_thoughtworks_paranamer_paranamer_2_8_jar.xml +13 -0
  140. itp_interface/pisa/.idea/libraries/sbt__commons_io_commons_io_2_8_0_jar.xml +13 -0
  141. itp_interface/pisa/.idea/libraries/sbt__de_unruh_java_patterns_0_1_0_jar.xml +13 -0
  142. itp_interface/pisa/.idea/libraries/sbt__de_unruh_scala_isabelle_2_13_master_SNAPSHOT_jar.xml +13 -0
  143. itp_interface/pisa/.idea/libraries/sbt__dev_zio_izumi_reflect_2_13_1_0_0_M9_jar.xml +13 -0
  144. itp_interface/pisa/.idea/libraries/sbt__dev_zio_izumi_reflect_thirdparty_boopickle_shaded_2_13_1_0_0_M9_jar.xml +13 -0
  145. itp_interface/pisa/.idea/libraries/sbt__dev_zio_zio_2_13_1_0_3_jar.xml +13 -0
  146. itp_interface/pisa/.idea/libraries/sbt__dev_zio_zio_stacktracer_2_13_1_0_3_jar.xml +13 -0
  147. itp_interface/pisa/.idea/libraries/sbt__dev_zio_zio_streams_2_13_1_0_3_jar.xml +13 -0
  148. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_api_1_34_0_jar.xml +13 -0
  149. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_context_1_34_0_jar.xml +13 -0
  150. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_core_1_34_0_jar.xml +13 -0
  151. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_netty_1_34_0_jar.xml +13 -0
  152. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_protobuf_1_34_0_jar.xml +13 -0
  153. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_protobuf_lite_1_34_0_jar.xml +13 -0
  154. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_services_1_34_0_jar.xml +13 -0
  155. itp_interface/pisa/.idea/libraries/sbt__io_grpc_grpc_stub_1_34_0_jar.xml +13 -0
  156. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_buffer_4_1_51_Final_jar.xml +13 -0
  157. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_codec_4_1_51_Final_jar.xml +13 -0
  158. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_codec_http2_4_1_51_Final_jar.xml +13 -0
  159. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_codec_http_4_1_51_Final_jar.xml +13 -0
  160. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_codec_socks_4_1_51_Final_jar.xml +13 -0
  161. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_common_4_1_51_Final_jar.xml +13 -0
  162. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_handler_4_1_51_Final_jar.xml +13 -0
  163. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_handler_proxy_4_1_51_Final_jar.xml +13 -0
  164. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_resolver_4_1_51_Final_jar.xml +13 -0
  165. itp_interface/pisa/.idea/libraries/sbt__io_netty_netty_transport_4_1_51_Final_jar.xml +13 -0
  166. itp_interface/pisa/.idea/libraries/sbt__io_perfmark_perfmark_api_0_19_0_jar.xml +13 -0
  167. itp_interface/pisa/.idea/libraries/sbt__net_java_dev_jna_jna_5_3_1_jar.xml +13 -0
  168. itp_interface/pisa/.idea/libraries/sbt__net_liftweb_lift_json_2_13_3_4_3_jar.xml +13 -0
  169. itp_interface/pisa/.idea/libraries/sbt__org_apache_commons_commons_lang3_3_11_jar.xml +13 -0
  170. itp_interface/pisa/.idea/libraries/sbt__org_apache_commons_commons_text_1_9_jar.xml +13 -0
  171. itp_interface/pisa/.idea/libraries/sbt__org_checkerframework_checker_qual_3_5_0_jar.xml +13 -0
  172. itp_interface/pisa/.idea/libraries/sbt__org_codehaus_mojo_animal_sniffer_annotations_1_18_jar.xml +13 -0
  173. itp_interface/pisa/.idea/libraries/sbt__org_jetbrains_annotations_20_1_0_jar.xml +13 -0
  174. itp_interface/pisa/.idea/libraries/sbt__org_jline_jline_3_16_0_jar.xml +13 -0
  175. itp_interface/pisa/.idea/libraries/sbt__org_log4s_log4s_2_13_1_9_0_jar.xml +13 -0
  176. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_modules_scala_collection_compat_2_13_2_1_6_jar.xml +13 -0
  177. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_modules_scala_xml_2_13_1_3_0_jar.xml +13 -0
  178. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_scala_compiler_2_13_4_jar.xml +13 -0
  179. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_scala_library_2_13_4_jar.xml +23 -0
  180. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_scala_reflect_2_13_4_jar.xml +13 -0
  181. itp_interface/pisa/.idea/libraries/sbt__org_scala_lang_scalap_2_13_4_jar.xml +13 -0
  182. itp_interface/pisa/.idea/libraries/sbt__org_scalaz_scalaz_core_2_13_7_3_2_jar.xml +13 -0
  183. itp_interface/pisa/.idea/libraries/sbt__org_slf4j_slf4j_api_1_7_30_jar.xml +13 -0
  184. itp_interface/pisa/.idea/libraries/sbt__org_slf4j_slf4j_simple_1_7_30_jar.xml +13 -0
  185. itp_interface/pisa/.idea/misc.xml +7 -0
  186. itp_interface/pisa/.idea/modules/PISA-build.iml +127 -0
  187. itp_interface/pisa/.idea/modules/PISA.iml +94 -0
  188. itp_interface/pisa/.idea/modules.xml +9 -0
  189. itp_interface/pisa/.idea/other.xml +6 -0
  190. itp_interface/pisa/.idea/sbt.xml +20 -0
  191. itp_interface/pisa/.idea/scala_compiler.xml +6 -0
  192. itp_interface/pisa/.idea/uiDesigner.xml +124 -0
  193. itp_interface/pisa/.idea/vcs.xml +6 -0
  194. itp_interface/pisa/.scalafmt.conf +2 -0
  195. itp_interface/pisa/LICENSE +29 -0
  196. itp_interface/pisa/README.md +262 -0
  197. itp_interface/pisa/build.sbt +49 -0
  198. itp_interface/pisa/build.sh +26 -0
  199. itp_interface/pisa/command_generation/close_gaps.py +44 -0
  200. itp_interface/pisa/command_generation/conjecture_normal_order.py +62 -0
  201. itp_interface/pisa/command_generation/conjecturer_command_generator.py +36 -0
  202. itp_interface/pisa/command_generation/create_dirs.py +11 -0
  203. itp_interface/pisa/command_generation/find_std.py +67 -0
  204. itp_interface/pisa/command_generation/generate_build_commands_afp.py +15 -0
  205. itp_interface/pisa/command_generation/generate_build_commands_std.py +15 -0
  206. itp_interface/pisa/command_generation/generate_commands_afp.py +103 -0
  207. itp_interface/pisa/command_generation/generate_commands_mini.py +73 -0
  208. itp_interface/pisa/command_generation/generate_commands_std.py +69 -0
  209. itp_interface/pisa/command_generation/generate_hammer_extraction_text.py +5 -0
  210. itp_interface/pisa/command_generation/hammer_command_generator.py +40 -0
  211. itp_interface/pisa/command_generation/hp_search_command_generator.py +63 -0
  212. itp_interface/pisa/command_generation/oracle_command_generator.py +56 -0
  213. itp_interface/pisa/command_generation/search_command_generator.py +69 -0
  214. itp_interface/pisa/command_generation/summarise_problem_names.py +45 -0
  215. itp_interface/pisa/command_generation/tpu_hp_search.py +75 -0
  216. itp_interface/pisa/docker/Dockerfile +34 -0
  217. itp_interface/pisa/docker/docker_tutorial.md +64 -0
  218. itp_interface/pisa/eval_setup/copy_isabelle.py +42 -0
  219. itp_interface/pisa/eval_setup/copy_pisa_jars.py +18 -0
  220. itp_interface/pisa/mesh_transformer_utils/tokenization.py +86 -0
  221. itp_interface/pisa/project/build.properties +1 -0
  222. itp_interface/pisa/project/plugins.sbt +5 -0
  223. itp_interface/pisa/requirements.txt +4 -0
  224. itp_interface/pisa/scripts/extract_last_k_steps.py +28 -0
  225. itp_interface/pisa/scripts/extract_proof_corpus.py +26 -0
  226. itp_interface/pisa/scripts/gather_hammer_results.py +27 -0
  227. itp_interface/pisa/scripts/length_in_char_stats.py +20 -0
  228. itp_interface/pisa/scripts/mix.py +127 -0
  229. itp_interface/pisa/scripts/results_stat.py +52 -0
  230. itp_interface/pisa/scripts/test_array_job.sh +34 -0
  231. itp_interface/pisa/setup.sh +25 -0
  232. itp_interface/pisa/src/main/protobuf/server.proto +60 -0
  233. itp_interface/pisa/src/main/python/.idea/.gitignore +8 -0
  234. itp_interface/pisa/src/main/python/.idea/inspectionProfiles/Project_Default.xml +18 -0
  235. itp_interface/pisa/src/main/python/.idea/inspectionProfiles/profiles_settings.xml +6 -0
  236. itp_interface/pisa/src/main/python/.idea/misc.xml +4 -0
  237. itp_interface/pisa/src/main/python/.idea/modules.xml +8 -0
  238. itp_interface/pisa/src/main/python/.idea/python.iml +12 -0
  239. itp_interface/pisa/src/main/python/.idea/vcs.xml +6 -0
  240. itp_interface/pisa/src/main/python/conjecturing_parsing/conjecturer_postprocessing.py +59 -0
  241. itp_interface/pisa/src/main/python/data_extraction/extract_data.py +184 -0
  242. itp_interface/pisa/src/main/python/data_extraction/find_premises.py +221 -0
  243. itp_interface/pisa/src/main/python/data_extraction/process_data.py +129 -0
  244. itp_interface/pisa/src/main/python/legacy/PisaFlexibleClient.py +167 -0
  245. itp_interface/pisa/src/main/python/legacy/autof_test.py +74 -0
  246. itp_interface/pisa/src/main/python/legacy/cmd_client.py +23 -0
  247. itp_interface/pisa/src/main/python/legacy/convert_scala_dump_to_test_name_jsons.py +14 -0
  248. itp_interface/pisa/src/main/python/legacy/create_data_txt.py +72 -0
  249. itp_interface/pisa/src/main/python/legacy/create_finetune_tfrecords.py +311 -0
  250. itp_interface/pisa/src/main/python/legacy/demo.py +49 -0
  251. itp_interface/pisa/src/main/python/legacy/evaluate.py +108 -0
  252. itp_interface/pisa/src/main/python/legacy/extract_first_step.py +25 -0
  253. itp_interface/pisa/src/main/python/legacy/get_global_facts.py +35 -0
  254. itp_interface/pisa/src/main/python/legacy/mix_data.py +19 -0
  255. itp_interface/pisa/src/main/python/legacy/one_stage_extraction.py +111 -0
  256. itp_interface/pisa/src/main/python/legacy/prepare_episodic_transitions.py +137 -0
  257. itp_interface/pisa/src/main/python/legacy/prepare_translation_pairs.py +277 -0
  258. itp_interface/pisa/src/main/python/pisa_client.py +322 -0
  259. itp_interface/pisa/src/main/python/server_pb2.py +394 -0
  260. itp_interface/pisa/src/main/python/server_pb2_grpc.py +230 -0
  261. itp_interface/pisa/src/main/python/test_client.py +17 -0
  262. itp_interface/pisa/src/main/python/test_client2.py +79 -0
  263. itp_interface/pisa/src/main/python/utils/filters.py +59 -0
  264. itp_interface/pisa/src/main/python/utils/pisa_server_control.py +29 -0
  265. itp_interface/pisa/src/main/scala/pisa/agent/CheckSyntax.scala +257 -0
  266. itp_interface/pisa/src/main/scala/pisa/agent/DepThms.scala +29 -0
  267. itp_interface/pisa/src/main/scala/pisa/agent/PisaStat.scala +46 -0
  268. itp_interface/pisa/src/main/scala/pisa/agent/RefactorTest.scala +40 -0
  269. itp_interface/pisa/src/main/scala/pisa/agent/RepHammer.scala +95 -0
  270. itp_interface/pisa/src/main/scala/pisa/server/HammFacts.scala +63 -0
  271. itp_interface/pisa/src/main/scala/pisa/server/PisaOS.scala +881 -0
  272. itp_interface/pisa/src/main/scala/pisa/server/PisaOneStage.scala +540 -0
  273. itp_interface/pisa/src/main/scala/pisa/server/PisaOneStageServers.scala +1048 -0
  274. itp_interface/pisa/src/main/scala/pisa/utils/TheoryManager.scala +95 -0
  275. itp_interface/pisa/src/test/python/analyse_debug.py +33 -0
  276. itp_interface/pisa/src/test/python/extract_test_seq2seq.py +53 -0
  277. itp_interface/pisa/src/test/python/extract_test_theorem_ground_truth_indices.py +31 -0
  278. itp_interface/pisa/src/test/python/proof_originality.py +24 -0
  279. itp_interface/pisa/src/test/python/test_command_generator.py +25 -0
  280. itp_interface/pisa/src/test/python/test_model_sequence_accuracy.py +70 -0
  281. itp_interface/pisa/src/test/scala/pisa/Easy.scala +26 -0
  282. itp_interface/pisa/src/test/scala/pisa/TestCurl.scala +82 -0
  283. itp_interface/pisa/src/test/scala/pisa/TestIsa.scala +27 -0
  284. itp_interface/pisa/test.sh +19 -0
  285. itp_interface/pisa/universal_test_theorems.tar.gz +0 -0
  286. itp_interface/repo/build.py +78 -0
  287. itp_interface/repo/clone.py +79 -0
  288. itp_interface/repo/dataset_discovery.py +99 -0
  289. itp_interface/retrieval/__init__.py +0 -0
  290. itp_interface/retrieval/abstraction.py +35 -0
  291. itp_interface/retrieval/coq_bm25_reranker.py +153 -0
  292. itp_interface/retrieval/isabelle_bm25_reranker.py +86 -0
  293. itp_interface/retrieval/lean3_bm25_reranker.py +86 -0
  294. itp_interface/rl/__init__.py +0 -0
  295. itp_interface/rl/abstraction.py +168 -0
  296. itp_interface/rl/proof_action.py +172 -0
  297. itp_interface/rl/proof_state.py +149 -0
  298. itp_interface/rl/proof_tree.py +109 -0
  299. itp_interface/rl/simpl_proof_env_pool.py +16 -0
  300. itp_interface/rl/simple_proof_env.py +713 -0
  301. itp_interface/rl/simple_proof_env_pool.py +591 -0
  302. itp_interface/scripts/setup.sh +228 -0
  303. itp_interface/tools/__init__.py +0 -0
  304. itp_interface/tools/basic_utils.py +172 -0
  305. itp_interface/tools/bin_packing.py +61 -0
  306. itp_interface/tools/cache.py +93 -0
  307. itp_interface/tools/coq_build_spec.py +31 -0
  308. itp_interface/tools/coq_build_tool.py +319 -0
  309. itp_interface/tools/coq_context_helper.py +354 -0
  310. itp_interface/tools/coq_executor.py +508 -0
  311. itp_interface/tools/coq_local_data_generation_transform.py +158 -0
  312. itp_interface/tools/coq_parse_utils.py +154 -0
  313. itp_interface/tools/coq_raw_proofs.py +193 -0
  314. itp_interface/tools/coq_theorem_proof_pair_generation_transform.py +146 -0
  315. itp_interface/tools/coq_training_data_generator.py +76 -0
  316. itp_interface/tools/dynamic_coq_proof_exec.py +220 -0
  317. itp_interface/tools/dynamic_isabelle_proof_exec.py +229 -0
  318. itp_interface/tools/dynamic_lean4_proof_exec.py +236 -0
  319. itp_interface/tools/dynamic_lean_proof_exec.py +228 -0
  320. itp_interface/tools/isabelle_context_helper.py +66 -0
  321. itp_interface/tools/isabelle_executor.py +862 -0
  322. itp_interface/tools/isabelle_local_data_generation_transform.py +149 -0
  323. itp_interface/tools/isabelle_parse_utils.py +131 -0
  324. itp_interface/tools/isabelle_server.py +106 -0
  325. itp_interface/tools/lean4_context_helper.py +72 -0
  326. itp_interface/tools/lean4_local_data_generation_transform.py +122 -0
  327. itp_interface/tools/lean4_sync_executor.py +1193 -0
  328. itp_interface/tools/lean_cmd_executor.py +804 -0
  329. itp_interface/tools/lean_context_helper.py +327 -0
  330. itp_interface/tools/lean_dojo_data_generation_transform.py +206 -0
  331. itp_interface/tools/lean_executor.py +687 -0
  332. itp_interface/tools/lean_local_data_generation_transform.py +136 -0
  333. itp_interface/tools/lean_parse_utils.py +32 -0
  334. itp_interface/tools/log_utils.py +20 -0
  335. itp_interface/tools/proof_exec_callback.py +76 -0
  336. itp_interface/tools/ray_utils.py +265 -0
  337. itp_interface/tools/repl/.git +1 -0
  338. itp_interface/tools/repl/.github/workflows/ci.yml +24 -0
  339. itp_interface/tools/repl/.gitignore +7 -0
  340. itp_interface/tools/repl/.vscode/copyright.code-snippets +13 -0
  341. itp_interface/tools/repl/.vscode/extensions.json +13 -0
  342. itp_interface/tools/repl/.vscode/module-docstring.code-snippets +35 -0
  343. itp_interface/tools/repl/.vscode/settings.json +11 -0
  344. itp_interface/tools/repl/README.md +174 -0
  345. itp_interface/tools/repl/REPL/Frontend.lean +47 -0
  346. itp_interface/tools/repl/REPL/JSON.lean +186 -0
  347. itp_interface/tools/repl/REPL/Lean/ContextInfo.lean +9 -0
  348. itp_interface/tools/repl/REPL/Lean/Environment.lean +31 -0
  349. itp_interface/tools/repl/REPL/Lean/InfoTree/ToJson.lean +114 -0
  350. itp_interface/tools/repl/REPL/Lean/InfoTree.lean +272 -0
  351. itp_interface/tools/repl/REPL/Main.lean +323 -0
  352. itp_interface/tools/repl/REPL/Snapshots.lean +306 -0
  353. itp_interface/tools/repl/REPL/Util/Path.lean +36 -0
  354. itp_interface/tools/repl/REPL/Util/Pickle.lean +44 -0
  355. itp_interface/tools/repl/REPL.lean +4 -0
  356. itp_interface/tools/repl/lake-manifest.json +5 -0
  357. itp_interface/tools/repl/lakefile.lean +15 -0
  358. itp_interface/tools/repl/lean-toolchain +1 -0
  359. itp_interface/tools/repl/test/Mathlib/.gitignore +5 -0
  360. itp_interface/tools/repl/test/Mathlib/H20231110.sh +2 -0
  361. itp_interface/tools/repl/test/Mathlib/ReplMathlibTests.lean +1 -0
  362. itp_interface/tools/repl/test/Mathlib/lake-manifest.json +68 -0
  363. itp_interface/tools/repl/test/Mathlib/lakefile.lean +11 -0
  364. itp_interface/tools/repl/test/Mathlib/lean-toolchain +1 -0
  365. itp_interface/tools/repl/test/Mathlib/test/20240209.expected.out +20 -0
  366. itp_interface/tools/repl/test/Mathlib/test/20240209.in +3 -0
  367. itp_interface/tools/repl/test/Mathlib/test/20240209.lean +4 -0
  368. itp_interface/tools/repl/test/Mathlib/test/H20231020.expected.out +8 -0
  369. itp_interface/tools/repl/test/Mathlib/test/H20231020.in +8 -0
  370. itp_interface/tools/repl/test/Mathlib/test/H20231020.lean +22 -0
  371. itp_interface/tools/repl/test/Mathlib/test/H20231110.expected.out +4 -0
  372. itp_interface/tools/repl/test/Mathlib/test/H20231110.in +4 -0
  373. itp_interface/tools/repl/test/Mathlib/test/H20231115.expected.out +19 -0
  374. itp_interface/tools/repl/test/Mathlib/test/H20231115.in +5 -0
  375. itp_interface/tools/repl/test/Mathlib/test/H20231115_2.expected.out +18 -0
  376. itp_interface/tools/repl/test/Mathlib/test/H20231115_2.in +4 -0
  377. itp_interface/tools/repl/test/Mathlib/test/H20231115_3.expected.out +10 -0
  378. itp_interface/tools/repl/test/Mathlib/test/H20231115_3.in +4 -0
  379. itp_interface/tools/repl/test/Mathlib/test/H20231214.in +9 -0
  380. itp_interface/tools/repl/test/Mathlib/test/H20231214.lean +30 -0
  381. itp_interface/tools/repl/test/Mathlib/test/H20231215.expected.out +4 -0
  382. itp_interface/tools/repl/test/Mathlib/test/H20231215.in +4 -0
  383. itp_interface/tools/repl/test/Mathlib/test/H20231215_2.expected.out +14 -0
  384. itp_interface/tools/repl/test/Mathlib/test/H20231215_2.in +3 -0
  385. itp_interface/tools/repl/test/Mathlib/test/exact.expected.out +37 -0
  386. itp_interface/tools/repl/test/Mathlib/test/exact.in +10 -0
  387. itp_interface/tools/repl/test/Mathlib/test/import_Mathlib.lean +1 -0
  388. itp_interface/tools/repl/test/Mathlib/test/induction.expected.out +29 -0
  389. itp_interface/tools/repl/test/Mathlib/test/induction.in +10 -0
  390. itp_interface/tools/repl/test/Mathlib/test/induction.lean +6 -0
  391. itp_interface/tools/repl/test/Mathlib/test/on_goal.expected.out +22 -0
  392. itp_interface/tools/repl/test/Mathlib/test/on_goal.in +5 -0
  393. itp_interface/tools/repl/test/Mathlib/test/pickle.expected.out +16 -0
  394. itp_interface/tools/repl/test/Mathlib/test/pickle.in +6 -0
  395. itp_interface/tools/repl/test/Mathlib/test/pickle_2.expected.out +4 -0
  396. itp_interface/tools/repl/test/Mathlib/test/pickle_2.in +4 -0
  397. itp_interface/tools/repl/test/Mathlib/test.sh +41 -0
  398. itp_interface/tools/repl/test/all_tactics.expected.out +13 -0
  399. itp_interface/tools/repl/test/all_tactics.in +1 -0
  400. itp_interface/tools/repl/test/by_cases.expected.out +25 -0
  401. itp_interface/tools/repl/test/by_cases.in +8 -0
  402. itp_interface/tools/repl/test/by_cases.lean +4 -0
  403. itp_interface/tools/repl/test/calc.expected.out +32 -0
  404. itp_interface/tools/repl/test/calc.in +1 -0
  405. itp_interface/tools/repl/test/def_eval.expected.out +9 -0
  406. itp_interface/tools/repl/test/def_eval.in +3 -0
  407. itp_interface/tools/repl/test/enableInitializersExecution.expected.out +2 -0
  408. itp_interface/tools/repl/test/enableInitializersExecution.in +1 -0
  409. itp_interface/tools/repl/test/file.expected.out +8 -0
  410. itp_interface/tools/repl/test/file.in +1 -0
  411. itp_interface/tools/repl/test/file.lean +5 -0
  412. itp_interface/tools/repl/test/have_by_sorry.expected.out +28 -0
  413. itp_interface/tools/repl/test/have_by_sorry.in +6 -0
  414. itp_interface/tools/repl/test/import_lean.in +1 -0
  415. itp_interface/tools/repl/test/incomplete.expected.out +18 -0
  416. itp_interface/tools/repl/test/incomplete.in +3 -0
  417. itp_interface/tools/repl/test/incomplete.lean +0 -0
  418. itp_interface/tools/repl/test/infotree.expected.out +20 -0
  419. itp_interface/tools/repl/test/infotree.in +2 -0
  420. itp_interface/tools/repl/test/invalid_tactic.expected.out +20 -0
  421. itp_interface/tools/repl/test/invalid_tactic.in +3 -0
  422. itp_interface/tools/repl/test/name_generator.expected.out +53 -0
  423. itp_interface/tools/repl/test/name_generator.in +18 -0
  424. itp_interface/tools/repl/test/no_goal_sorry.expected.out +11 -0
  425. itp_interface/tools/repl/test/no_goal_sorry.in +1 -0
  426. itp_interface/tools/repl/test/no_goal_sorry_2.expected.out +12 -0
  427. itp_interface/tools/repl/test/no_goal_sorry_2.in +1 -0
  428. itp_interface/tools/repl/test/options.expected.out +17 -0
  429. itp_interface/tools/repl/test/options.in +6 -0
  430. itp_interface/tools/repl/test/pickle_environment.expected.out +8 -0
  431. itp_interface/tools/repl/test/pickle_environment.in +7 -0
  432. itp_interface/tools/repl/test/pickle_environment_with_imports.expected.out +10 -0
  433. itp_interface/tools/repl/test/pickle_environment_with_imports.in +9 -0
  434. itp_interface/tools/repl/test/pickle_open.expected.out +8 -0
  435. itp_interface/tools/repl/test/pickle_open.in +7 -0
  436. itp_interface/tools/repl/test/pickle_open_2.expected.out +4 -0
  437. itp_interface/tools/repl/test/pickle_open_2.in +3 -0
  438. itp_interface/tools/repl/test/pickle_open_scoped.expected.out +18 -0
  439. itp_interface/tools/repl/test/pickle_open_scoped.in +8 -0
  440. itp_interface/tools/repl/test/pickle_open_scoped_2.expected.out +14 -0
  441. itp_interface/tools/repl/test/pickle_open_scoped_2.in +3 -0
  442. itp_interface/tools/repl/test/pickle_proof_state_1.expected.out +26 -0
  443. itp_interface/tools/repl/test/pickle_proof_state_1.in +15 -0
  444. itp_interface/tools/repl/test/pickle_proof_state_2.expected.out +4 -0
  445. itp_interface/tools/repl/test/pickle_proof_state_2.in +3 -0
  446. itp_interface/tools/repl/test/pickle_proof_state_env.expected.out +26 -0
  447. itp_interface/tools/repl/test/pickle_proof_state_env.in +15 -0
  448. itp_interface/tools/repl/test/pickle_scoped_notation.in +16 -0
  449. itp_interface/tools/repl/test/pickle_scoped_notation_2.in +3 -0
  450. itp_interface/tools/repl/test/proof_step.expected.out +18 -0
  451. itp_interface/tools/repl/test/proof_step.in +7 -0
  452. itp_interface/tools/repl/test/readme.expected.out +16 -0
  453. itp_interface/tools/repl/test/readme.in +5 -0
  454. itp_interface/tools/repl/test/sorry_hypotheses.expected.out +16 -0
  455. itp_interface/tools/repl/test/sorry_hypotheses.in +4 -0
  456. itp_interface/tools/repl/test/synthesize_placeholder.expected.out +7 -0
  457. itp_interface/tools/repl/test/synthesize_placeholder.in +1 -0
  458. itp_interface/tools/repl/test/tactic_mode_sorry.expected.out +14 -0
  459. itp_interface/tools/repl/test/tactic_mode_sorry.in +3 -0
  460. itp_interface/tools/repl/test/tactic_sorry.expected.out +12 -0
  461. itp_interface/tools/repl/test/tactic_sorry.in +1 -0
  462. itp_interface/tools/repl/test/term_sorry.expected.out +12 -0
  463. itp_interface/tools/repl/test/term_sorry.in +1 -0
  464. itp_interface/tools/repl/test/trace_simp.expected.out +41 -0
  465. itp_interface/tools/repl/test/trace_simp.in +15 -0
  466. itp_interface/tools/repl/test/unfinished_tactic_block.expected.out +11 -0
  467. itp_interface/tools/repl/test/unfinished_tactic_block.in +1 -0
  468. itp_interface/tools/repl/test/unknown_environment.expected.out +2 -0
  469. itp_interface/tools/repl/test/unknown_environment.in +1 -0
  470. itp_interface/tools/repl/test/unknown_proof_state.expected.out +14 -0
  471. itp_interface/tools/repl/test/unknown_proof_state.in +3 -0
  472. itp_interface/tools/repl/test/unknown_tactic.expected.out +14 -0
  473. itp_interface/tools/repl/test/unknown_tactic.in +3 -0
  474. itp_interface/tools/repl/test/variables.expected.out +26 -0
  475. itp_interface/tools/repl/test/variables.in +5 -0
  476. itp_interface/tools/repl/test.sh +43 -0
  477. itp_interface/tools/run_data_generation_transforms.py +350 -0
  478. itp_interface/tools/theorem_details.py +25 -0
  479. itp_interface/tools/training_data.py +358 -0
  480. itp_interface/tools/training_data_format.py +599 -0
  481. itp_interface-1.0.0.dist-info/METADATA +78 -0
  482. itp_interface-1.0.0.dist-info/RECORD +485 -0
  483. itp_interface-1.0.0.dist-info/WHEEL +4 -0
  484. itp_interface-1.0.0.dist-info/entry_points.txt +3 -0
  485. itp_interface-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,311 @@
1
+ import argparse
2
+ import os
3
+ import random
4
+
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import tensorflow as tf
9
+ from lm_dataformat import Reader
10
+ from tqdm import tqdm
11
+
12
+ from mesh_transformer_utils.tokenization import TokenizerWrapper
13
+
14
+
15
def parse_args():
    """Parse the command-line options of the tfrecords conversion script.

    Arguments that the parser does not recognise are not an error: they are
    collected by ``parse_known_args`` and echoed to stdout.

    Returns:
        argparse.Namespace: the recognised options.
    """
    description = """
    Converts a text dataset into the training data format expected by the model.

    Adapted from the script create_tfrecords.py in the gpt-neo repo.

    - Your text dataset:
        - can be provided as .txt files, or as an archive (.tar.gz, .xz, jsonl.zst).
        - can be one file or multiple
            - using a single large file may use too much memory and crash - if this occurs, split the file up into a few files
        - the model's end-of-text separator is added between the contents of each file
        - if the string '<|endoftext|>' appears inside a file, it is treated as the model's end-of-text separator (not the actual string '<|endoftext|>')
            - this behavior can be disabled with --treat-eot-as-text

    This script creates a single .tfrecords file as output
        - Why: the model's data loader ignores "trailing" data (< 1 batch) at the end of a .tfrecords file
            - this causes data loss if you have many .tfrecords files
        - This is probably not appropriate for very large datasets
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Input / output locations and tokenizer selection.
    parser.add_argument(
        "--input-dir", type=str, default=None,
        help="Path to where your files are located.")
    parser.add_argument(
        "--name", type=str, default=None,
        help="Name of output file will be {name}_{seqnum}.tfrecords, where seqnum is total sequence count")
    parser.add_argument(
        "--output-dir", type=str, default="",
        help="Output directory (default: current directory)")
    parser.add_argument(
        "--tokenizer-path", type=str, default=None,
        help="Path to a custom BPE tokenizer (default: None, gpt2 tokenizer)")

    # Data cleaning options.
    cleaning_args = parser.add_argument_group('data cleaning arguments')
    cleaning_args.add_argument(
        "--normalize-with-ftfy", action="store_true",
        help="Normalize text with ftfy")
    cleaning_args.add_argument(
        "--normalize-with-wikitext-detokenize", action="store_true",
        help="Use wikitext detokenizer")
    minu_help = (
        "Exclude repetitive documents made up of < MIN_UNIQUE_TOKENS unique tokens. These can produce large gradients."
        " Set <= 0 to disable. If enabled, 200 is a good default value. (Default: 0)"
    )
    cleaning_args.add_argument(
        "--min-unique-tokens", type=int, default=0, help=minu_help)

    # Shuffling / packing options.
    shuffle_pack_args = parser.add_argument_group(
        'data shuffling/packing arguments')
    repack_ep_help = "Repeat the data N_REPACK_EPOCHS times, shuffled differently in each repetition. Recommended for multi-epoch training (set this to your intended number of epochs)."
    shuffle_pack_args.add_argument(
        "--n-repack-epochs", type=int, default=1, help=repack_ep_help)
    shuffle_pack_args.add_argument(
        "--seed", type=int, default=10,
        help="random seed for shuffling data (default: 10)")
    shuffle_pack_args.add_argument(
        "--preserve-data-order", default=False, action="store_true",
        help="Disables shuffling, so the input and output data have the same order.")

    # Miscellaneous options.
    misc_args = parser.add_argument_group('miscellaneous arguments')
    misc_args.add_argument(
        "--verbose", default=False, action="store_true",
        help="Prints extra information, such as the text removed by --min-unique-tokens")

    known, unknown = parser.parse_known_args()
    print(f'Unknown args: {unknown}')
    return known
77
+
78
+
79
def get_files(input_dir):
    """Return the supported dataset files found directly inside ``input_dir``.

    Files are matched by name suffix, one suffix at a time, so the result is
    grouped by suffix in the order listed below.

    Returns:
        list[str]: matching paths converted to strings.
    """
    suffixes = ("jsonl.zst", ".txt", ".xz", ".tar.gz")
    root = Path(input_dir)
    found = []
    for suffix in suffixes:
        found.extend(str(match) for match in root.glob(f"*{suffix}"))
    return found
84
+
85
+
86
def _int64_feature(value):
    """
    Wrap ``value`` in a ``tf.train.Int64List`` and return it as a ``tf.train.Feature``.

    ``value`` is forwarded unchanged as the list contents.
    """
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)
91
+
92
+
93
def write_to_file(writer, data):
    """
    Serialize ``data`` as a single ``tf.train.Example`` and write it with ``writer``.

    The example has one feature, stored under the key "text".
    """
    features = tf.train.Features(feature={"text": _int64_feature(data)})
    example = tf.train.Example(features=features)
    writer.write(example.SerializeToString())
102
+
103
+
104
def write_tfrecord(sequences, fp):
    """Write every item of ``sequences`` as one record to the TFRecord file at ``fp``."""
    with tf.io.TFRecordWriter(fp) as record_writer:
        for sequence in sequences:
            write_to_file(record_writer, sequence)
108
+
109
+
110
def split_list(l, n):
    """Split a list or string into consecutive slices of length ``n``.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    An empty input gives an empty list.
    """
    pieces = []
    for start in range(0, len(l), n):
        pieces.append(l[start:start + n])
    return pieces
113
+
114
+
115
def enforce_min_unique(seqs, min_unique_tokens, enc, verbose=False):
    """Yield only the sequences that contain at least ``min_unique_tokens`` distinct tokens.

    When ``verbose`` is true, each rejected sequence is decoded with ``enc``
    and printed together with its distinct-token count.
    """
    for seq in tqdm(seqs, mininterval=1, smoothing=0):
        n_unique = len(set(seq))
        if n_unique >= min_unique_tokens:
            yield seq
            continue
        if verbose:
            text = enc.decode(seq)
            print(
                f"excluding with {n_unique} unique tokens:\n\n{repr(text)}\n\n")
123
+
124
+
125
def eot_splitting_generator(string_iterable, encoder: TokenizerWrapper):
    """
    Split each incoming string on ``encoder.eos_token_str`` and yield the pieces.

    Pieces that are empty or whitespace-only are skipped; the remaining
    pieces are yielded unmodified (not stripped).
    """
    for doc in string_iterable:
        pieces = doc.split(encoder.eos_token_str)
        yield from (piece for piece in pieces if piece.strip())
133
+
134
+
135
def prep_and_tokenize_generator(string_iterable, encoder: TokenizerWrapper,
                                normalize_with_ftfy,
                                normalize_with_wikitext_detokenize):
    """
    Tokenize each string and yield its tokens as a ``np.uint16`` array.

    ``encoder.eos_token_id`` is appended after the tokens of every document.

    NOTE(review): ``normalize_with_ftfy`` and
    ``normalize_with_wikitext_detokenize`` are accepted but never read in this
    function, so no normalization is applied here.
    """
    for doc in string_iterable:
        token_ids = encoder.encode(doc)
        token_ids = token_ids + [encoder.eos_token_id]
        yield np.array(token_ids, dtype=np.uint16)
144
+
145
+
146
def file_to_tokenized_docs_generator(file_path, encoder, args):
    """
    Build the lazy pipeline that reads ``file_path`` and tokenizes its documents.

    The file is streamed with ``Reader``, split on the end-of-text string and
    then tokenized. The returned generator yields token arrays of arbitrary,
    unequal length.
    """
    raw_docs = Reader(file_path).stream_data(threaded=False)
    split_docs = eot_splitting_generator(raw_docs, encoder)
    return prep_and_tokenize_generator(
        split_docs,
        encoder,
        normalize_with_ftfy=args.normalize_with_ftfy,
        normalize_with_wikitext_detokenize=args.normalize_with_wikitext_detokenize,
    )
162
+
163
+
164
def read_files_to_tokenized_docs(files, args, encoder):
    """Tokenize every file and return all resulting documents in one list.

    With ``args.preserve_data_order`` the files are processed in sorted order
    and the documents keep that order. Otherwise the file list is shuffled in
    place first, and the collected documents are shuffled again at the end.
    """
    if args.preserve_data_order:
        files = sorted(files)
    else:
        # In-place shuffle: the caller's list is reordered too.
        random.shuffle(files)

    docs = []
    for path in tqdm(files, mininterval=10, smoothing=0):
        for doc in file_to_tokenized_docs_generator(path, encoder, args):
            docs.append(doc)

    if not args.preserve_data_order:
        # shuffle at individual document level
        random.shuffle(docs)

    return docs
180
+
181
+
182
def arrays_to_sequences(token_list_iterable, sequence_length=2049):
    """
    Concatenate token arrays and re-cut them into chunks of ``sequence_length`` tokens.

    Yields the full-length chunks first and finally, if any tokens are left
    over, one trailing chunk that may be shorter.
    """
    print('Chunking in standard LM mode')
    buffer = []

    for tokens in token_list_iterable:
        buffer.extend(tokens)
        if len(buffer) <= sequence_length:
            continue
        # More than one chunk is buffered: emit all but the last, keep the rest.
        *complete, buffer = split_list(buffer, sequence_length)
        yield from complete

    if buffer:
        yield buffer
202
+
203
+
204
def arrays_to_sequences_pad(token_list_iterable, pad_token_id,
                            sequence_length=2049,
                            sep_token_id=None,
                            eos_token_id=None):
    """
    Pack whole examples into padded sequences of ``sequence_length`` tokens.

    Examples are never split. Consecutive examples are appended to the current
    sequence while they fit; when the next one does not fit, the current
    sequence is padded with ``pad_token_id`` and yielded. Examples longer than
    ``sequence_length`` are discarded and counted. After the input is
    exhausted the remaining tokens, if any, are yielded without padding.

    Args:
        token_list_iterable: iterable of objects exposing ``.tolist()``.
        pad_token_id: token used to fill a sequence up to ``sequence_length``.
        sequence_length: length of every padded sequence.
        sep_token_id: separator token; must occur as often as ``eos_token_id``.
        eos_token_id: end-of-sequence token.

    Raises:
        AssertionError: if an example or a packed sequence contains a
            different number of separator and end-of-sequence tokens.
    """
    print('Chunking in seq2seq mode')
    accum = []
    too_long = 0
    for chunk in tqdm(token_list_iterable):
        chunk = chunk.tolist()
        n_sep_tokens = sum(x == sep_token_id for x in chunk)
        n_eos_tokens = sum(x == eos_token_id for x in chunk)
        # Use a real message: `assert cond, print(...)` would raise with None.
        assert n_sep_tokens == n_eos_tokens, (
            f"example has {n_sep_tokens} separator tokens "
            f"but {n_eos_tokens} end-of-sequence tokens")
        if len(chunk) > sequence_length:
            too_long += 1
        elif len(accum) + len(chunk) > sequence_length:
            res = accum + [pad_token_id] * (sequence_length - len(accum))
            n_sep_tokens = sum(x == sep_token_id for x in res)
            n_eos_tokens = sum(x == eos_token_id for x in res)
            assert n_sep_tokens == n_eos_tokens, (
                f"packed sequence has {n_sep_tokens} separator tokens "
                f"but {n_eos_tokens} end-of-sequence tokens")
            yield res
            accum = chunk
        else:
            accum.extend(chunk)

    print(f'Discarded {too_long} examples longer than {sequence_length}')
    if len(accum) > 0:
        yield accum
233
+
234
+
235
def chunk_and_finalize(arrays, args, encoder):
    """Turn tokenized documents into training sequences plus trailing data.

    In seq2seq mode (``args.seq2seq``, treated as true when the attribute is
    absent) documents are packed and padded; otherwise they are concatenated
    and cut into fixed-length ``np.uint16`` arrays. The last produced sequence
    is always split off as trailing data.

    Returns:
        tuple: ``(full_seqs, trailing_data)``. Both are empty lists when no
        sequence was produced at all.
    """
    seq2seq = getattr(args, 'seq2seq', True)

    if seq2seq:
        sequences = list(
            arrays_to_sequences_pad(arrays, pad_token_id=encoder.pad_token_id,
                                    sep_token_id=encoder.sep_token_id,
                                    eos_token_id=encoder.eos_token_id))
    else:
        sequences = list(map(lambda x: np.array(x, dtype=np.uint16),
                             arrays_to_sequences(arrays)))

    if not sequences:
        # Nothing to split: avoid IndexError on sequences[-1].
        return [], []

    full_seqs, trailing_data = sequences[:-1], sequences[-1]

    if args.min_unique_tokens > 0:
        full_seqs = list(
            enforce_min_unique(full_seqs, args.min_unique_tokens, encoder,
                               args.verbose))

    if not args.preserve_data_order:
        random.shuffle(full_seqs)

    return full_seqs, trailing_data
258
+
259
+
260
def create_tfrecords(files, args):
    """Tokenize ``files``, pack them into sequences and write one ``.tfrecords`` file.

    The output is written to ``args.output_dir`` and named
    ``{args.name}_{number of sequences}.tfrecords``. The last (trailing)
    sequence is not written; its token count is printed instead.

    Raises:
        AssertionError: in seq2seq mode, if a packed sequence fails one of the
            sanity checks below.
    """
    encoder = TokenizerWrapper.from_file_or_gpt(args.tokenizer_path)

    # Seed before reading so that every shuffle below is reproducible.
    random.seed(args.seed)

    docs = read_files_to_tokenized_docs(files, args, encoder)

    full_seqs, trailing_data = chunk_and_finalize(docs, args, encoder)

    if getattr(args, 'seq2seq', True):
        # Seq2seq sanity checks
        assert all(
            encoder.decode(x[:20]).strip().startswith('<') for x in full_seqs), \
            "every packed sequence is expected to decode to text starting with '<'"
        sep_id = encoder.sep_token_id
        eos_id = encoder.eos_token_id
        pad_id = encoder.pad_token_id
        for seq in full_seqs:
            last_non_pad_idx = max(
                i for i in range(len(seq)) if seq[i] != pad_id)
            assert seq[last_non_pad_idx] == eos_id, \
                "last non-padding token of a packed sequence is not the end-of-sequence token"
            n_sep_tokens = sum(x == sep_id for x in seq)
            n_eos_tokens = sum(x == eos_id for x in seq)
            # Use a real message: `assert cond, print(...)` would raise with None.
            assert n_sep_tokens == n_eos_tokens, (
                f"packed sequence has {n_sep_tokens} separator tokens "
                f"but {n_eos_tokens} end-of-sequence tokens")

    # final
    print(f"dropped {len(trailing_data)} tokens of trailing data")

    total_sequence_len = len(full_seqs)

    fp = os.path.join(args.output_dir,
                      f"{args.name}_{total_sequence_len}.tfrecords")
    write_tfrecord(full_seqs, fp)
293
+
294
+
295
def create_finetune_tfrecords(**kwargs):
    """Entry point: parse the command line, apply overrides and write the tfrecords file.

    Args:
        **kwargs: option overrides; each key is set as an attribute on the
            parsed arguments, replacing the command-line value.

    Raises:
        ValueError: if no input directory was given on the command line or in
            ``kwargs``.
    """
    args = parse_args()

    # Update by kwargs
    for k, v in kwargs.items():
        setattr(args, k, v)

    if args.input_dir is None:
        # --input-dir defaults to None; fail clearly rather than on .endswith.
        raise ValueError(
            "an input directory is required (--input-dir or input_dir=...)")
    if not args.input_dir.endswith("/"):
        args.input_dir = args.input_dir + "/"

    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)
    files = get_files(args.input_dir)
    create_tfrecords(files, args)


if __name__ == "__main__":
    create_finetune_tfrecords()
@@ -0,0 +1,49 @@
1
+ from PisaFlexibleClient import initialise_env
2
+
3
# Example client session against a PISA server.
# Run a server on port 8000
# i.e. do a 'sbt "runMain pisa.server.PisaOneStageServer8000"'


env = initialise_env(
    8000,
    working_directory="/private/home/aqj/afp-2021-10-22/thys/FunWithFunctions",
    isa_path="/private/home/aqj/Isabelle2021",
    theory_file_path="/private/home/aqj/afp-2021-10-22/thys/FunWithFunctions/FunWithFunctions.thy"
)


# Suppose you have a list of theorems that you want to try on.
# Raw strings keep the Isabelle symbols (\<Rightarrow> etc.) verbatim without
# relying on Python's handling of the invalid escape sequence "\<".
theorems = [
    r'theorem identity1: fixes f :: "nat \<Rightarrow> nat" assumes fff: "\<And>n. f(f(n)) < f(Suc(n))" shows "f(n) = n"',
    r'theorem ifac_neg0: fixes ifac :: "int \<Rightarrow> int" assumes ifac_rec: "\<And>i. ifac i = (if i=0 then 1 else i*ifac(i - 1))" shows "i<0 \<Longrightarrow> ifac i = 0"'
]
# And the corresponding scripts
scripts = [
    "sorry",
    "bad script"
]

env.post("<initialise>")
for theorem, script in zip(theorems, scripts):
    # Execute before the theorem
    env.post(
        f"<accumulative step before> {theorem}"
    )

    # Create an experimental state with a name e.g. script[-10:]
    # Execute the theorem declaration
    name = script[-10:]
    env.post(
        f"<clone> default <clone> {name}"
    )
    env.post(
        f"<apply to top level state> {name} <apply to top level state> {theorem} <apply to top level state> {name}"
    )

    # Execute the script and get the proof level
    response = env.post(
        f"<apply to top level state> {name} <apply to top level state> {script} <apply to top level state> {name}"
    )
    print(f"script execution response: {response}")
    level = env.post(f"<get_proof_level> {name}")
    # If level = 0, succeed, otherwise fail
    print(level)
@@ -0,0 +1,108 @@
1
+ import os
2
+ import json
3
+ import grpc
4
+ import argparse
5
+
6
+ import server_pb2
7
+ import server_pb2_grpc
8
+
9
+ MAX_MESSAGE_LENGTH = 10485760
10
+
11
+
12
def stack_lines(input_string):
    """Collapse every run of whitespace, newlines included, to one space and trim both ends."""
    words = input_string.split()
    return " ".join(words)
14
+
15
+
16
def evaluate_single_problem(isa_path, theory_file_path, working_directory, theorem_name, model, mode_of_proving,
                            maximum_number_of_steps=100, port=9000):
    """Try to prove one theorem by repeatedly sending the model's output to the server.

    Args:
        isa_path: path passed to the server's ``InitialiseIsabelle`` call.
        theory_file_path: theory file passed as the Isabelle context.
        working_directory: directory passed as the Isabelle working directory.
        theorem_name: theorem text; whitespace is collapsed before use.
        model: object with a ``predict(input_string)`` method returning a command.
        mode_of_proving: one of "proof", "state" or "proof_and_state".
        maximum_number_of_steps: upper bound on model/server round trips.
        port: port of the gRPC server on localhost.

    Returns:
        int: 1 as soon as a returned state no longer contains the substring
        "proof"; 0 if the step budget is exhausted or a step raises.

    Raises:
        AssertionError: if ``mode_of_proving`` is not a supported mode.
    """
    # Validate first so an invalid mode does not leave a server session open.
    if mode_of_proving not in ["proof", "state", "proof_and_state"]:
        raise AssertionError("unsupported mode_of_proving: {!r}".format(mode_of_proving))

    channel = grpc.insecure_channel('localhost:{}'.format(port),
                                    options=[('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
                                             ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH)])
    try:
        stub = server_pb2_grpc.ServerStub(channel)
        stub.InitialiseIsabelle(server_pb2.IsaPath(path=isa_path))
        stub.IsabelleWorkingDirectory(server_pb2.IsaPath(path=working_directory))
        stub.IsabelleContext(server_pb2.IsaContext(context=theory_file_path))

        theorem_name = stack_lines(theorem_name)
        state = stub.IsabelleCommand(server_pb2.IsaCommand(command="proceed:" + theorem_name)).state

        previous_proof_segment = theorem_name
        try:
            for _ in range(maximum_number_of_steps):
                state = stack_lines(state)
                input_string = ""
                if mode_of_proving == "state":
                    input_string += "State: {}".format(state)
                if mode_of_proving == "proof_and_state":
                    input_string += " <PS_SEP> "
                if mode_of_proving == "proof":
                    input_string += "Proof: {}".format(previous_proof_segment)
                    # TODO: previous proof segment unfinished

                output_string = model.predict(input_string)
                state = stub.IsabelleCommand(server_pb2.IsaCommand(command=output_string)).state
                if "proof" not in state:
                    stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
                    return 1
        except Exception as e:
            # Best effort: a failing step counts as an unsuccessful attempt.
            print(e)
        stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
        return 0
    finally:
        # Release the connection on every path, including errors.
        channel.close()
60
+
61
+
62
class DummyProver:
    """Lookup-table "model" that replays the targets of a seq2seq dataset.

    Sources and targets are read from the ``train``, ``val`` and ``test``
    splits of ``seq2seq_repo`` and stored in a dict keyed by the
    whitespace-collapsed source line.
    """

    def __init__(self, seq2seq_repo):
        """Load ``{split}.src`` / ``{split}.tgt`` for all three splits.

        Raises:
            AssertionError: if the total numbers of source and target lines differ.
        """
        src_list = []
        tgt_list = []
        for split in ("train", "val", "test"):
            # Context managers close each file as soon as it has been read.
            with open(os.path.join(seq2seq_repo, "{}.src".format(split)), "r") as src_file:
                src_list.extend(src_file.readlines())
            with open(os.path.join(seq2seq_repo, "{}.tgt".format(split)), "r") as tgt_file:
                tgt_list.extend(tgt_file.readlines())
        self.prover_dict = dict()
        assert len(src_list) == len(tgt_list)
        for src_line, tgt_line in zip(src_list, tgt_list):
            # A repeated source keeps the target seen last.
            self.prover_dict[stack_lines(src_line)] = stack_lines(tgt_line)

    def predict(self, input_string):
        """Return the stored target for ``input_string``.

        Raises:
            KeyError: if ``input_string`` is not a known source.
        """
        return self.prover_dict[input_string]
79
+
80
+
81
if __name__ == "__main__":
    # Evaluate the lookup-table prover on the first five training problems.
    parser = argparse.ArgumentParser(description='Extracting an Isabelle theory file.')
    parser.add_argument('--isa-path', help='The path to the Isabelle executable',
                        default="/Applications/Isabelle2020.app/Isabelle")
    parser.add_argument('--working-directory', '-wd', help='Path to the AFP project')
    parser.add_argument('--theory-file-path', '-tfp', help='Path to the file to parse')
    parser.add_argument('--theorem-name', '-tn', help='Name of the theorem to prove')
    parser.add_argument('--mode-of-proving', '-mop',
                        help='Mode of proving, could be "state", "proof", or "proof_and_state"')
    parser.add_argument('--port', '-p', help='Port to use to communicate', default=9000, type=int)
    args = parser.parse_args()

    dummy_prover = DummyProver("/Users/qj213/Projects/PISA/fs_with_state")

    # Read the problem list inside a context manager so the file is closed.
    with open("fs_with_state/problem_names_split.json") as names_file:
        problem_names = json.load(names_file)
    train_names = problem_names["train"]
    for i in range(0, 5):
        # Map the recorded path prefix onto the local checkout of the archive.
        theory_file_path = train_names[i][0].replace("/home/ywu/afp-2021-02-11", "/Users/qj213/Projects/afp-2021-02-11")
        print(evaluate_single_problem(isa_path=args.isa_path,
                                      theory_file_path=theory_file_path,
                                      working_directory="/".join(theory_file_path.split("/")[:-1]),
                                      theorem_name=train_names[i][1],
                                      port=args.port, model=dummy_prover, mode_of_proving="state"))
@@ -0,0 +1,25 @@
1
+ import os
2
+ import json
3
+
4
+ from tqdm import tqdm
5
+
6
# Source directory of JSON-lines files and destination for the extracted pairs.
proof_and_state_dir = "/home/qj213/proof_and_state"
first_step_dir = "/home/qj213/first_step"


# For every input file, write the examples whose proof-step part contains no
# literal backslash-n to "<split>.src" / "<split>.tgt".
for file in os.listdir(proof_and_state_dir):
    split_name = file.split(".")[0]
    with open(os.path.join(proof_and_state_dir, file)) as fhand, \
            open(os.path.join(first_step_dir, f"{split_name}.src"), "w") as src_out, \
            open(os.path.join(first_step_dir, f"{split_name}.tgt"), "w") as tgt_out:
        for line in tqdm(fhand.readlines()):
            line_json = json.loads(line.strip())
            source_parts = line_json["source"].split("<PS_SEP>")
            proof_step_string = source_parts[0].strip()
            proof_state_string = source_parts[1].strip()
            target = line_json["target"]
            if "\\n" in proof_step_string:
                continue
            # This is the first step
            src_out.write(f"<ISA_OBS> {proof_state_string}\n")
            tgt_out.write(f"{target}\n")
25
+
@@ -0,0 +1,35 @@
1
+ from PisaFlexibleClient import initialise_env
2
+ import os
3
+ import pickle
4
+
5
+
6
def match_names_single_file_to_data_play_szymon(
        port, working_directory, isa_path, theory_file_path, out_dir, error_log_dir):
    """Fetch the global facts of one theory file and pickle them as a dict.

    The server reply is split on "<SEP>" into entries and each entry on
    "<DEF>" into a name and a definition. The resulting ``{name: definition}``
    dict is pickled into ``out_dir``. If anything in that process raises, the
    exception text is written to a log file in ``error_log_dir`` instead.
    """
    env = initialise_env(
        port=port,
        working_directory=working_directory,
        isa_path=isa_path,
        theory_file_path=theory_file_path
    )
    # The theory path with "/" replaced by "_" serves as the file-name stem.
    flat_name = theory_file_path.replace('/', '_')
    try:
        output_string = env.post("<get global facts from file>")
        list_of_string_tuples = output_string.split("<SEP>")
        global_fact_dict = {}
        for element in list_of_string_tuples:
            name, definition = element.split("<DEF>")
            global_fact_dict[name.strip()] = definition.strip()
        # Context manager guarantees the pickle file is flushed and closed.
        with open(os.path.join(out_dir, f"dict_{flat_name}"), "wb") as dict_out:
            pickle.dump(global_fact_dict, dict_out)
    except Exception as e:
        with open(os.path.join(error_log_dir, f"error_log_{flat_name}.txt"), "w") as fout:
            fout.write(str(e))


if __name__ == "__main__":
    match_names_single_file_to_data_play_szymon(
        port=8000,
        working_directory="/home/qj213/afp-2021-10-22/thys/FunWithFunctions",
        isa_path="/home/qj213/Isabelle2021",
        theory_file_path="/home/qj213/afp-2021-10-22/thys/FunWithFunctions/FunWithFunctions.thy",
        out_dir="/home/qj213/out_stuff",
        error_log_dir="/home/qj213/out_stuff"
    )
@@ -0,0 +1,19 @@
1
+ import argparse
2
+ import os
3
+
4
+
5
if __name__ == "__main__":
    # Concatenate the split files of several input directories into one
    # output directory, starting from fresh output files.
    parser = argparse.ArgumentParser(description="Mix the data from multiple forms of input")
    parser.add_argument("--input", type=str, nargs="+", help="Input files")
    parser.add_argument("--output-path", "-op", type=str, help="Output file")
    args = parser.parse_args()

    split_file_names = ["train.src", "train.tgt", "val.src", "val.tgt", "test.src", "test.tgt"]

    # Remove stale outputs first, because the files are opened in append mode below.
    for output_file_name in split_file_names:
        stale_path = os.path.join(args.output_path, output_file_name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    for input_path in args.input:
        for output_file_name in split_file_names:
            destination = os.path.join(args.output_path, output_file_name)
            origin = os.path.join(input_path, output_file_name)
            with open(destination, "a") as output_file, open(origin, "r") as input_file:
                output_file.write(input_file.read())
@@ -0,0 +1,111 @@
1
+ import os
2
+ import json
3
+ import grpc
4
+ import argparse
5
+
6
+ from copy import copy
7
+ from func_timeout import func_set_timeout, FunctionTimedOut
8
+
9
+ import server_pb2
10
+ import server_pb2_grpc
11
+
12
+
13
+ MAX_MESSAGE_LENGTH = 10485760
14
+
15
+
16
def analyse_whole_file(whole_file_string, use_sledgehammer=False):
    r"""Parse the server's extraction output into problem names and transitions.

    The input consists of transitions separated by ``<\TRANSEP>``; each
    transition consists of fields separated by ``<\STATESEP>``: state, action
    and proof level, plus hammer results when ``use_sledgehammer`` is true.
    Empty transitions are skipped.

    Recording starts at the first action beginning with "lemma" or "theorem";
    every such action is also added to the problem names.

    Args:
        whole_file_string: raw extraction output.
        use_sledgehammer: whether each transition carries a fourth field.

    Returns:
        dict: ``{"problem_names": [...], "translations": [...]}`` where each
        translation is ``(state, action, proof_level, hammer_results)`` and
        ``hammer_results`` is "NA" without sledgehammer.

    Raises:
        ValueError: if a transition has the wrong number of fields or a
            non-integer proof level.
    """
    # Escaped backslashes give the same separators without invalid "\T"/"\S" escapes.
    transition_separator = "<\\TRANSEP>"
    field_separator = "<\\STATESEP>"

    transitions = whole_file_string.split(transition_separator)
    state_action_proof_level_tuples = list()
    problem_names = list()
    proof_open = False
    last_state = ""
    for transition in transitions:
        if not transition:
            continue
        if use_sledgehammer:
            state, action, proof_level, hammer_results = transition.split(field_separator)
        else:
            state, action, proof_level = transition.split(field_separator)
            hammer_results = "NA"
        state = state.strip()
        action = action.strip()
        proof_level = int(proof_level.strip())
        if action.startswith("lemma") or action.startswith("theorem"):
            problem_names.append(action)
            state_action_proof_level_tuples.append((state, action, proof_level, hammer_results))
            proof_open = True
        elif proof_open:
            state_action_proof_level_tuples.append((state, action, proof_level, hammer_results))

        # NOTE(review): last_state is never reassigned, so this condition is
        # always false and proof_open is never reset once set. Left unchanged
        # because updating it would alter the extracted data; confirm intent.
        if "subgoal" in last_state and "subgoal" not in state:
            proof_open = False
    return {
        "problem_names": problem_names,
        "translations": state_action_proof_level_tuples
    }
46
+
47
+
48
@func_set_timeout(12000)
def isa_step(stub, theory_file_path, use_sledgehammer=False):
    """Set the theory file as context, run the extraction command and return the resulting state.

    The command is "PISA extract data with hammer" when ``use_sledgehammer``
    is true, otherwise "PISA extract data". The call is wrapped in
    ``func_set_timeout(12000)``.
    """
    stub.IsabelleContext(server_pb2.IsaContext(context=theory_file_path))
    if use_sledgehammer:
        extraction_command = "PISA extract data with hammer"
    else:
        extraction_command = "PISA extract data"
    reply = stub.IsabelleCommand(server_pb2.IsaCommand(command=extraction_command))
    return reply.state
53
+
54
+
55
def extract_file(isa_path, theory_file_path, working_directory, saving_directory, port=9000, use_sledgehammer=False):
    """Extract one theory file through the server and save the result as JSON.

    On success a ``*_ground_truth.json`` file holding the file name, working
    directory, parsed analysis and raw server output is written to
    ``saving_directory``. If extraction fails or times out, the exception text
    is written to a ``*_bug_report.txt`` file there instead.

    Args:
        isa_path: path passed to the server's ``InitialiseIsabelle`` call.
        theory_file_path: theory file to extract.
        working_directory: directory passed as the Isabelle working directory.
        saving_directory: output directory; created if missing.
        port: port of the gRPC server on localhost.
        use_sledgehammer: request hammer results and parse the extra field.
    """
    channel = grpc.insecure_channel('localhost:{}'.format(port),
                                    options=[('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
                                             ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH)])
    try:
        stub = server_pb2_grpc.ServerStub(channel)

        stub.InitialiseIsabelle(server_pb2.IsaPath(path=isa_path))
        stub.IsabelleWorkingDirectory(server_pb2.IsaPath(path=working_directory))

        os.makedirs(saving_directory, exist_ok=True)

        # Stays None when extraction itself fails, so nothing is parsed below.
        whole_file_parsed = None
        close_program = False
        try:
            whole_file_parsed = isa_step(stub, theory_file_path, use_sledgehammer=use_sledgehammer)
            stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
        except (Exception, FunctionTimedOut) as e:
            close_program = True
            report_name = "project_{}_file_{}_bug_report.txt".format(
                working_directory.split("/")[-1], theory_file_path.split("/")[-1])
            with open(os.path.join(saving_directory, report_name), "w") as fout:
                fout.write(str(e))

        if whole_file_parsed is not None:
            # Forward the flag so the four-field hammer format parses correctly.
            file_analysis = analyse_whole_file(whole_file_parsed, use_sledgehammer=use_sledgehammer)
            file_info = {
                "file_name": theory_file_path,
                "working_directory": working_directory,
                **file_analysis,
                "raw_parsed_string": whole_file_parsed
            }

            output_name = "_".join(theory_file_path.split(".thy")[0].split("/")) + "_ground_truth.json"
            with open(os.path.join(saving_directory, output_name), "w") as json_out:
                json.dump(file_info, json_out)

        if close_program:
            stub.IsabelleCommand(server_pb2.IsaCommand(command="exit"))
    finally:
        # Release the connection on every path, including errors.
        channel.close()
92
+
93
+
94
if __name__ == "__main__":
    # Command-line front end: extract a single theory file.
    parser = argparse.ArgumentParser(description='Extracting an Isabelle theory file.')
    parser.add_argument(
        '--isa-path',
        help='The path to the Isabelle executable',
        default="/Applications/Isabelle2020.app/Isabelle",
    )
    parser.add_argument('--working-directory', '-wd', help='Path to the AFP project')
    parser.add_argument('--theory-file-path', '-tfp', help='Path to the file to parse')
    parser.add_argument('--saving-directory', '-sd', help='Where the save the parsed json files')
    parser.add_argument('--port', '-p', help='Port to use to communicate', default=9000, type=int)
    parser.add_argument(
        '--use-sledgehammer', '-us',
        help='Whether to use sledgehammer',
        action='store_true',
    )
    parser.set_defaults(use_sledgehammer=False)
    args = parser.parse_args()

    # Only the file named by --theory-file-path is processed.
    extract_file(
        args.isa_path,
        args.theory_file_path,
        args.working_directory,
        args.saving_directory,
        args.port,
        args.use_sledgehammer,
    )