sdg-hub 0.1.4-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
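The listing above replaces the 0.1.x flat layout (sdg_hub.blocks, sdg_hub.flow, sdg_hub.sdg, sdg_hub.pipeline, ...) with a new sdg_hub.core package split into blocks, flow, and utils. A minimal sketch of how downstream code might detect which layout is installed, assuming only the module paths visible in this diff (the class names exported by the new modules are not shown here, so none are imported):

import importlib.util

def blocks_package() -> str:
    """Return the dotted path where block implementations live, based only on the
    module paths in this diff: sdg_hub.core.blocks (0.2.x) vs sdg_hub.blocks (0.1.x)."""
    try:
        if importlib.util.find_spec("sdg_hub.core.blocks") is not None:
            return "sdg_hub.core.blocks"  # layout added in 0.2.1
    except ModuleNotFoundError:  # sdg_hub missing, or no core subpackage
        pass
    return "sdg_hub.blocks"  # flat layout removed after 0.1.4

print(blocks_package())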
sdg_hub-0.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,68 @@
+ sdg_hub/__init__.py,sha256=Tw-6R5a8_W1kJcTAsW3R9ltBDP1dy5-fe7Tvt3cSyCQ,550
+ sdg_hub/_version.py,sha256=UoNvMtd4wCG76RwoSpNCUtaFyTwakGcZolfjXzNVSMY,511
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/core/__init__.py,sha256=NwqB4fwhC29W50VW7QXZssLxx122YvgO9LHDLdgAnrI,496
+ sdg_hub/core/blocks/__init__.py,sha256=9sCkCvDQzJGSedaePVlEIpbNwrkBz_K500VW_6FLhuE,1601
+ sdg_hub/core/blocks/base.py,sha256=TrzUAkG7Tiquk0Z3SOFsb5mRnHd1IbHH6gFPVH1P7T8,10424
+ sdg_hub/core/blocks/registry.py,sha256=U__75QrxFpRaJlt36mOd26dgOqBeePs-ZX0Rnutp6r0,9782
+ sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
+ sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
+ sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
+ sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
+ sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
+ sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
+ sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
+ sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
+ sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=ABcXZrqEMsgKfdGAkSo2plMp4LsZSqPhEQugoDEYm1I,2950
+ sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
+ sdg_hub/core/blocks/evaluation/__init__.py,sha256=kFXee-vsVVdU2XtLio9qHgPx_a0zoB_rQr509EKBGJc,357
+ sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py,sha256=ZuQ8jq2JwTdslUJtFi1E9NXebCWFZS8isXOafcJ_CMU,23026
+ sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py,sha256=ieQRwl4bx5EQ3m7Wa2P3pHLUPQY7HuwNWjHUCo98u6g,22832
+ sdg_hub/core/blocks/evaluation/verify_question_block.py,sha256=fSNbW1KpdfVE0fQsm4Y8QfVk6A3J5H3C0dtGn49t8tM,22853
+ sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
+ sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=H8Gif0q9Wc_d1TnVow8Zpsg7blJOFGN1EZmV6OPpkcg,5971
+ sdg_hub/core/blocks/llm/__init__.py,sha256=N6-Prgd4X85oWbMQzhYMrq7OX-NTJm57cghowK-val0,844
+ sdg_hub/core/blocks/llm/client_manager.py,sha256=vaoPoTITJ9IlooeVRfu6M4WBc08mp4aJZ5tvnl2fMv8,12309
+ sdg_hub/core/blocks/llm/config.py,sha256=TmbfqxPHH3mShTK2EuCX2AGKtDvl0aSvihsaqgzABtM,11266
+ sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
+ sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=3o2oV_ecWsEHFp5FWPIpBT-yJ1imJmeZy2b9GZL-T54,20121
+ sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=mMmifTC-sRUhdxuLRRtAMhQC7r7NOyTAfBx-xTzLzTc,19669
+ sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
+ sdg_hub/core/blocks/llm/text_parser_block.py,sha256=vQgUaeYJI9HuxDPRjII-NIOsR01JA-sBBGl05623L8I,14391
+ sdg_hub/core/blocks/transform/__init__.py,sha256=Y_3izPCtgnMbFK-gBMeLHZspSrNLgbGheAJXU57XfFw,746
+ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
+ sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=mGup5agvDf9kAFSvXE5X6Puo6CQc9UOdFdbhdFWJjwk,8225
+ sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
+ sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
+ sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
+ sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
+ sdg_hub/core/flow/__init__.py,sha256=N2NZGngvd7qpT5FI_knKukUFM0IkD9K5jdTi-gDeUI4,475
+ sdg_hub/core/flow/base.py,sha256=Jm90xQ1ns0ArEiqkceSME6phzBtkw6nthjSJNTU3IkQ,45530
+ sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
+ sdg_hub/core/flow/metadata.py,sha256=h9jpvAzWsF5n4ztZMzwa9ZNgnzKTHmFWdn7YbyJLHCw,12977
+ sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
+ sdg_hub/core/flow/registry.py,sha256=DzCqEEgwhvwnCBAGLogoMVdwXh4pCHrxOWqoxam7O8I,12162
+ sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
+ sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
+ sdg_hub/core/utils/datautils.py,sha256=QnzMl7nOp0crNJEWgAqurOuuAyz0SnvAjLiKzvG0uds,1933
+ sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
+ sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
+ sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
+ sdg_hub/core/utils/logger_config.py,sha256=MPYdpyNXh_pxFUOAvSCHa98LGjxjaLXoUoqWekqTG4s,422
+ sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
+ sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml,sha256=xgUNY793y4lcpdtuWm5Ah1CmbU2gvvPQCpZMMa6kPXU,2447
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml,sha256=_vF-AzjC8d6wqAle5pkQ103EW-BbAhNA0qllk3ojUZc,353
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml,sha256=GiIipXrjm7btghvpgFUoTZYAJRyu7yE-WEi5yDLxjY4,3032
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=Rrl9eve9QsGLojAkflgKTHyUgUawKfvhEVAnAxBLZJ8,6307
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
+ sdg_hub-0.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sdg_hub-0.2.1.dist-info/METADATA,sha256=0Si2PZotpwtUI2Pg2cc3uSZIJtS12jF4VInJSTyBngA,8606
+ sdg_hub-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ sdg_hub-0.2.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+ sdg_hub-0.2.1.dist-info/RECORD,,
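Each RECORD entry above follows the standard wheel RECORD format: the file path, then "sha256=" plus the urlsafe-base64 SHA-256 digest of the file with trailing "=" padding stripped, then the file size in bytes. A small sketch of how one such entry can be reproduced from an unpacked wheel; this is generic wheel tooling, not something shipped by sdg-hub:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Rebuild a RECORD line ("path,sha256=<digest>,<size>") for a file on disk."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# Run against e.g. sdg_hub/_version.py inside the unpacked 0.2.1 wheel, the output
# should match the corresponding line shown above.
print(record_entry(__file__))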
sdg_hub/blocks/__init__.py DELETED
@@ -1,42 +0,0 @@
- """Block implementations for SDG Hub.
-
- This package provides various block implementations for data generation, processing, and transformation.
- """
-
- # Local
- from .block import Block
- from .llmblock import LLMBlock, ConditionalLLMBlock
- from .openaichatblock import (
-     OpenAIChatBlock,
-     OpenAIAsyncChatBlock
- )
- from .utilblocks import (
-     SamplePopulatorBlock,
-     SelectorBlock,
-     CombineColumnsBlock,
-     FlattenColumnsBlock,
-     DuplicateColumns,
-     RenameColumns,
-     SetToMajorityValue,
-     FilterByValueBlock,
-     IterBlock,
- )
- from ..registry import BlockRegistry
-
- __all__ = [
-     "Block",
-     "FilterByValueBlock",
-     "IterBlock",
-     "LLMBlock",
-     "ConditionalLLMBlock",
-     "SamplePopulatorBlock",
-     "SelectorBlock",
-     "CombineColumnsBlock",
-     "FlattenColumnsBlock",
-     "DuplicateColumns",
-     "RenameColumns",
-     "SetToMajorityValue",
-     "BlockRegistry",
-     "OpenAIChatBlock",
-     "OpenAIAsyncChatBlock"
- ]
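Judging only by the filenames in the RECORD above, most of the classes removed from this __init__ appear to have counterparts under sdg_hub/core/blocks/deprecated_blocks/ (combine_columns, filter_by_value, llmblock, sample_populator, selector, and so on); what those modules actually export is not visible in this diff. A hypothetical check against an installed 0.2.x tree, using module paths only:

import importlib.util

deprecated = (
    "combine_columns", "duplicate_columns", "filter_by_value", "flatten_columns",
    "llmblock", "rename_columns", "sample_populator", "selector", "set_to_majority_value",
)
for name in deprecated:
    try:
        spec = importlib.util.find_spec(f"sdg_hub.core.blocks.deprecated_blocks.{name}")
    except ModuleNotFoundError:  # sdg_hub (or its 0.2.x layout) is not installed
        spec = None
    print(f"{name}: {'present' if spec else 'absent'}")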
sdg_hub/blocks/block.py DELETED
@@ -1,96 +0,0 @@
- # SPDX-License-Identifier: Apache-2.0
- """Base block implementation for the SDG Hub system.
-
- This module provides the abstract base class for all blocks in the system,
- including functionality for template validation and configuration management.
- """
-
- # Standard
- from abc import ABC
- from collections import ChainMap
- from typing import Any, Dict, Optional
-
- # Third Party
- from jinja2 import Template, UndefinedError
- import yaml
-
- # Local
- from ..registry import BlockRegistry
- from ..logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("Block")
- class Block(ABC):
-     """Base abstract class for all blocks in the system.
-
-     This class provides common functionality for block validation and configuration loading.
-     All specific block implementations should inherit from this class.
-     """
-
-     def __init__(self, block_name: str) -> None:
-         self.block_name = block_name
-
-     @staticmethod
-     def _validate(prompt_template: Template, input_dict: Dict[str, Any]) -> bool:
-         """Validate the input data for this block.
-
-         This method validates whether all required variables in the Jinja template are provided in the input_dict.
-
-         Parameters
-         ----------
-         prompt_template : Template
-             The Jinja2 template object.
-         input_dict : Dict[str, Any]
-             A dictionary of input values to check against the template.
-
-         Returns
-         -------
-         bool
-             True if the input data is valid (i.e., no missing variables), False otherwise.
-         """
-
-         class Default(dict):
-             def __missing__(self, key: str) -> None:
-                 raise KeyError(key)
-
-         try:
-             # Try rendering the template with the input_dict
-             prompt_template.render(ChainMap(input_dict, Default()))
-             return True
-         except UndefinedError as e:
-             logger.error(f"Missing key: {e}")
-             return False
-
-     def _load_config(self, config_path: str) -> Optional[Dict[str, Any]]:
-         """Load the configuration file for this block.
-
-         Parameters
-         ----------
-         config_path : str
-             The path to the configuration file.
-
-         Returns
-         -------
-         Optional[Dict[str, Any]]
-             The loaded configuration. Returns None if file cannot be read or parsed.
-
-         Raises
-         ------
-         FileNotFoundError
-             If the configuration file does not exist.
-         """
-         try:
-             with open(config_path, "r", encoding="utf-8") as config_file:
-                 try:
-                     return yaml.safe_load(config_file)
-                 except yaml.YAMLError as e:
-                     logger.error(f"Error parsing YAML from {config_path}: {e}")
-                     return None
-         except FileNotFoundError:
-             logger.error(f"Configuration file not found: {config_path}")
-             raise
-         except Exception as e:
-             logger.error(f"Unexpected error reading config file {config_path}: {e}")
-             return None
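The deleted Block._validate above detects missing template variables by rendering the Jinja template against a ChainMap whose fallback mapping raises KeyError. A self-contained sketch of the same check done another way, using jinja2.meta to list the variables a template needs; this illustrates the idea and is not the package's own implementation:

from jinja2 import Environment, meta

def missing_variables(template_source: str, sample: dict) -> set:
    """Return the template variables that the sample does not provide."""
    env = Environment()
    required = meta.find_undeclared_variables(env.parse(template_source))
    return required - sample.keys()

print(missing_variables("{{ system }}\n{{ question }}", {"system": "You are helpful."}))
# -> {'question'}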
sdg_hub/blocks/llmblock.py DELETED
@@ -1,375 +0,0 @@
- # SPDX-License-Identifier: Apache-2.0
- """LLM-based blocks for text generation and processing.
-
- This module provides blocks for interacting with language models.
- """
-
- # Standard
- from typing import Any, Dict, List, Optional, Union
- import json
- import re
-
- # Third Party
- from datasets import Dataset
- from jinja2 import Template
- import openai
-
- # Local
- from .block import Block
- from ..logger_config import setup_logger
- from ..registry import BlockRegistry, PromptRegistry
-
- logger = setup_logger(__name__)
-
-
- def server_supports_batched(client: openai.OpenAI, model_id: str) -> bool:
-     """Check if the server supports batched inputs.
-
-     This function checks if the server supports batched inputs by making a test call to the server.
-
-     Parameters
-     ----------
-     client : openai.OpenAI
-         The client to use to make the test call.
-     model_id : str
-         The model ID to use for the test call.
-     """
-     supported = getattr(client, "server_supports_batched", None)
-     if supported is not None:
-         return supported
-     try:
-         # Make a test call to the server to determine whether it supports
-         # multiple input prompts per request and also the n parameter
-         response = client.completions.create(
-             model=model_id, prompt=["test1", "test2"], max_tokens=1, n=3
-         )
-         # Number outputs should be 2 * 3 = 6
-         supported = len(response.choices) == 6
-     except openai.InternalServerError:
-         supported = False
-     client.server_supports_batched = supported
-     logger.info(f"LLM server supports batched inputs: {client.server_supports_batched}")
-     return supported
-
-
- @BlockRegistry.register("LLMBlock")
- class LLMBlock(Block):
-     """Block for generating text using language models.
-
-     This block handles text generation, prompt formatting, and output parsing
-     for language model interactions.
-
-     Parameters
-     ----------
-     block_name : str
-         Name of the block.
-     config_path : str
-         Path to the configuration file.
-     client : openai.OpenAI
-         OpenAI client instance.
-     output_cols : List[str]
-         List of output column names.
-     parser_kwargs : Dict[str, Any], optional
-         Keyword arguments for the parser, by default {}.
-     model_prompt : str, optional
-         Template string for model prompt, by default "{prompt}".
-     model_id : Optional[str], optional
-         Model ID to use, by default None.
-     **batch_kwargs : Dict[str, Any]
-         Additional keyword arguments for batch processing.
-     """
-
-     # pylint: disable=too-many-instance-attributes
-     def __init__(
-         self,
-         block_name: str,
-         config_path: str,
-         client: openai.OpenAI,
-         output_cols: List[str],
-         parser_kwargs: Dict[str, Any] = {},
-         model_prompt: str = "{prompt}",
-         model_id: Optional[str] = None,
-         **batch_kwargs: Dict[str, Any],
-     ) -> None:
-         super().__init__(block_name)
-         self.block_config = self._load_config(config_path)
-         self.prompt_struct = (
-             """{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
-         )
-         filtered_config = {
-             k: (v if v is not None else "") for k, v in self.block_config.items()
-         }
-         self.prompt_template = Template(self.prompt_struct.format(**filtered_config))
-         self.client = client
-         if model_id:
-             self.model = model_id
-         else:
-             # get the default model id from client
-             self.model = self.client.models.list().data[0].id
-
-         self.model_prompt = model_prompt
-         self.output_cols = output_cols
-         self.batch_params = batch_kwargs.get("batch_kwargs", {})
-         self.parser_name = parser_kwargs.get("parser_name", None)
-         self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
-         self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
-         self.defaults = {
-             "model": self.model,
-             "temperature": 0,
-             "max_tokens": 4096,
-         }
-
-         # Whether the LLM server supports a list of input prompts
-         # and supports the n parameter to generate n outputs per input
-         self.server_supports_batched = server_supports_batched(client, self.model)
-
-     def _extract_matches(
-         self, text: str, start_tag: Optional[str], end_tag: Optional[str]
-     ) -> List[str]:
-         if not text:
-             return []
-         if not start_tag and not end_tag:
-             return [text.strip()]
-
-         pattern = ""
-         if start_tag:
-             pattern += re.escape(start_tag)
-         pattern += r"(.*?)"
-         if end_tag:
-             pattern += re.escape(end_tag)
-         elif start_tag:
-             # Enforce matching till end of string when only start_tag is provided.
-             pattern += "$"
-
-         return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
-
-     def _parse(self, generated_string: str) -> dict:
-         matches = {}
-
-         if self.parser_name is not None and self.parser_name == "custom":
-             pattern = re.compile(self.parsing_pattern, re.DOTALL)
-             all_matches = pattern.findall(generated_string)
-             matches = {column_name: [] for column_name in self.output_cols}
-             if all_matches and isinstance(all_matches[0], tuple):
-                 for match in all_matches:
-                     for column_name, value in zip(self.output_cols, match):
-                         value = value.strip()
-                         for clean_tag in self.parser_cleanup_tags:
-                             value = value.replace(clean_tag, "")
-                         matches[column_name].append(value)
-             else:
-                 matches[self.output_cols[0]] = (
-                     [match.strip() for match in all_matches] if all_matches else []
-                 )
-         else:
-             for start_tag, end_tag, output_col in zip(
-                 self.block_config.get("start_tags", []),
-                 self.block_config.get("end_tags", []),
-                 self.output_cols,
-             ):
-                 matches[output_col] = self._extract_matches(
-                     generated_string, start_tag, end_tag
-                 )
-
-         return matches
-
-     def _format_prompt(self, sample: Dict) -> str:
-         prompt_templated_str = self.prompt_template.render(sample).strip()
-         return PromptRegistry.render_template(
-             self.model_prompt, prompt_templated_str, add_generation_prompt=True
-         ).strip()
-
-     def _generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> list:
-         prompts = [self._format_prompt(sample) for sample in samples]
-         logger.debug("Prompt: %s", prompts[0])
-         generate_args = {**self.defaults, **gen_kwargs}
-
-         if self.server_supports_batched:
-             response = self.client.completions.create(prompt=prompts, **generate_args)
-             # if stop is provided, then we need to add the stop token to the generated text,
-             # this is because the stop token is not included in the generated text - this is a limitation of the openai api
-             # we need to add the stop token to the generated text to make it consistent for the parser
-             if "stop" in generate_args:
-                 return [
-                     choice.text.strip() + "".join(generate_args["stop"])
-                     for choice in response.choices
-                 ]
-             return [choice.text.strip() for choice in response.choices]
-
-         n = gen_kwargs.get("n", 1)
-         results = []
-         for prompt in prompts:
-             for _ in range(n):
-                 response = self.client.completions.create(
-                     prompt=prompt, **generate_args
-                 )
-                 if "stop" in generate_args:
-                     results.append(
-                         response.choices[0].text.strip()
-                         + "".join(generate_args["stop"])
-                     )
-                 results.append(response.choices[0].text.strip())
-         return results
-
-     def generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> Dataset:
-         """Generate the output from the block.
-
-         This method should first validate the input data,
-         then generate the output, and finally parse the generated output before returning it.
-
-         Returns
-         -------
-         Dataset
-             The parsed output after generation.
-         """
-         num_samples = self.block_config.get("num_samples", None)
-         logger.debug("Generating outputs for {} samples".format(len(samples)))
-
-         if (num_samples is not None) and ("num_samples" not in samples.column_names):
-             samples = samples.add_column("num_samples", [num_samples] * len(samples))
-
-         # validate each sample
-         # Log errors and remove invalid samples
-         valid_samples = []
-
-         for sample in samples:
-             if self._validate(self.prompt_template, sample):
-                 valid_samples.append(sample)
-             else:
-                 logger.warning(
-                     f"Sample failed validation: {sample}"
-                 )  # Log details of the failed sample
-
-         samples = valid_samples
-
-         if len(samples) == 0:
-             logger.warning(
-                 "No valid samples to generate outputs for, returning empty dataset"
-             )
-             return Dataset.from_list([])
-
-         # generate the output
-
-         outputs = self._generate(samples, **gen_kwargs)
-
-         logger.debug("Generated outputs: %s", outputs)
-
-         num_parallel_samples = gen_kwargs.get("n", 1)
-         extended_samples = []
-
-         # Duplicate each input sample n times, where n is the number
-         # of output sequences generated per input, so that we can
-         # pair up the inputs and outputs.
-         for item in samples:
-             extended_samples.extend([item] * num_parallel_samples)
-
-         new_data = []
-         for sample, output in zip(extended_samples, outputs):
-             parsed_outputs = self._parse(output)
-             max_length = max(len(value) for value in parsed_outputs.values())
-             for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
-                 new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
-
-         return Dataset.from_list(new_data)
-
-
- @BlockRegistry.register("ConditionalLLMBlock")
- class ConditionalLLMBlock(LLMBlock):
-     """Block for conditional text generation using language models.
-
-     This block selects different prompt templates based on a selector column value.
-
-     Parameters
-     ----------
-     block_name : str
-         Name of the block.
-     config_paths : Dict[str, str]
-         Dictionary mapping selector values to their config file paths.
-     client : openai.OpenAI
-         OpenAI client instance.
-     model_id : str
-         Model ID to use.
-     output_cols : List[str]
-         List of output column names.
-     selector_column_name : str
-         Name of the column used to select the prompt template.
-     model_prompt : str, optional
-         Template string for model prompt, by default "{prompt}".
-     **batch_kwargs : Dict[str, Any]
-         Additional keyword arguments for batch processing.
-     """
-
-     def __init__(
-         self,
-         block_name: str,
-         config_paths: Dict[str, str],
-         client: openai.OpenAI,
-         model_id: str,
-         output_cols: List[str],
-         selector_column_name: str,
-         model_prompt: str = "{prompt}",
-         **batch_kwargs: Dict[str, Any],
-     ) -> None:
-         super().__init__(
-             block_name=block_name,
-             config_path=list(config_paths.values())[0],
-             client=client,
-             model_id=model_id,
-             output_cols=output_cols,
-             model_prompt=model_prompt,
-             **batch_kwargs,
-         )
-         self.selector_column_name = selector_column_name
-         self.prompt_template = {}
-         if "All" in config_paths:
-             self.prompt_template = self.prompt_struct.format(**self.block_config)
-         else:
-             for config_key, config in config_paths.items():
-                 filtered_config = {
-                     k: (v if v is not None else "")
-                     for k, v in self.block_config.items()
-                 }
-                 self.prompt_template[config_key] = Template(
-                     self.prompt_struct.format(**self._load_config(config))
-                 )
-
-     def _format_prompt(self, sample: Dict[str, Any]) -> str:
-         """Format the prompt based on the selector column value.
-
-         Parameters
-         ----------
-         sample : Dict[str, Any]
-             Input sample containing the selector column.
-
-         Returns
-         -------
-         str
-             Formatted prompt string.
-         """
-         if isinstance(self.prompt_template, dict):
-             return (
-                 self.prompt_template[sample[self.selector_column_name]]
-                 .render(**sample)
-                 .strip()
-             )
-
-         return self.prompt_template.render(**sample).strip()
-
-     def _validate(self, prompt_template: Union[str, Template], input_dict: Dict[str, Any]) -> bool:
-         """Validate the input data for this block.
-
-         Parameters
-         ----------
-         prompt_template : Union[str, Template]
-             The template to validate against.
-         input_dict : Dict[str, Any]
-             Input data to validate.
-
-         Returns
-         -------
-         bool
-             True if the input data is valid, False otherwise.
-         """
-         if isinstance(prompt_template, dict):
-             prompt_template = prompt_template[input_dict[self.selector_column_name]]
-         return super()._validate(prompt_template, input_dict)
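The deleted _extract_matches/_parse pair above recovers output columns by capturing the text between configured start and end tags. A standalone sketch mirroring that logic outside the class (same regex construction, standard library only):

import re
from typing import List, Optional

def extract_matches(text: str, start_tag: Optional[str], end_tag: Optional[str]) -> List[str]:
    """Capture text between start_tag and end_tag (or to end of string if only start_tag is given)."""
    if not text:
        return []
    if not start_tag and not end_tag:
        return [text.strip()]
    pattern = ""
    if start_tag:
        pattern += re.escape(start_tag)
    pattern += r"(.*?)"
    if end_tag:
        pattern += re.escape(end_tag)
    elif start_tag:
        pattern += "$"  # only a start tag: capture through the end of the string
    return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]

print(extract_matches("[Q] What is SDG? [A] Synthetic data generation.", "[A]", None))
# -> ['Synthetic data generation.']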