sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml DELETED
@@ -1,3 +0,0 @@
- datasets: []
- sys_prompt: |
-   I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
sdg_hub/configs/skills/data_recipe/default_recipe.yaml DELETED
@@ -1,6 +0,0 @@
- datasets:
-   - path: instructlab/InstructLabCommunity
-     sampling_size: 1.0
-
- sys_prompt: |
-   I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
sdg_hub/flows/annotation/emotion/detailed_description.yaml DELETED
@@ -1,19 +0,0 @@
- - block_type: LLMBlock
-   block_config:
-     block_name: gen_responses
-     config_path: configs/annotations/detailed_description.yaml
-     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-     output_cols:
-       - output
-   gen_kwargs:
-     max_tokens: 5
-     temperature: 0
-     extra_body:
-       guided_choice:
-         - "joy"
-         - "sadness"
-         - "anger"
-         - "fear"
-         - "love"
-   drop_duplicates:
-     - prompt
sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml DELETED
@@ -1,19 +0,0 @@
- - block_type: LLMBlock
-   block_config:
-     block_name: gen_responses
-     config_path: configs/annotations/detailed_description_icl.yaml
-     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-     output_cols:
-       - output
-   gen_kwargs:
-     max_tokens: 5
-     temperature: 0
-     extra_body:
-       guided_choice:
-         - "joy"
-         - "sadness"
-         - "anger"
-         - "fear"
-         - "love"
-   drop_duplicates:
-     - prompt
sdg_hub/flows/annotation/emotion/simple.yaml DELETED
@@ -1,19 +0,0 @@
- - block_type: LLMBlock
-   block_config:
-     block_name: gen_responses
-     config_path: configs/annotations/simple.yaml
-     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-     output_cols:
-       - output
-   gen_kwargs:
-     max_tokens: 5
-     temperature: 0
-     extra_body:
-       guided_choice:
-         - "joy"
-         - "sadness"
-         - "anger"
-         - "fear"
-         - "love"
-   drop_duplicates:
-     - prompt
sdg_hub/utils/chunking.py DELETED
@@ -1,73 +0,0 @@
- # SPDX-License-Identifier: Apache-2.0
-
- # Standard
- from typing import List
- import logging
- import re
-
- # Third Party
- from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
-
- _DEFAULT_CHUNK_OVERLAP = 100
-
- logger = logging.getLogger(__name__)
-
-
- def _num_tokens_from_words(num_words) -> int:
-     return int(num_words * 1.3)  # 1 word ~ 1.3 token
-
-
- def _num_chars_from_tokens(num_tokens) -> int:
-     return int(num_tokens * 4)  # 1 token ~ 4 English character
-
-
- def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
-     """
-     Iterates over the documents and splits them into chunks based on the word count provided by the user.
-     Args:
-         documents (list): List of documents retrieved from git (can also consist of a single document).
-         server_ctx_size (int): Context window size of server.
-         chunk_word_count (int): Maximum number of words to chunk a document.
-     Returns:
-         List[str]: List of chunked documents.
-     """
-
-     # Checks for input type error
-     if isinstance(documents, str):
-         documents = [documents]
-
-     elif not isinstance(documents, list):
-         raise TypeError(
-             "Expected: documents to be a list, but got {}".format(type(documents))
-         )
-
-     no_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
-     if no_tokens_per_doc > int(server_ctx_size - 1024):
-         raise ValueError(
-             "Error: {}".format(
-                 str(
-                     f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
-                 )
-             )
-         )
-     # Placeholder for params
-     content = []
-     chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
-     chunk_overlap = _DEFAULT_CHUNK_OVERLAP
-
-     # Using Markdown as default, document-specific chunking will be implemented in a separate PR.
-     text_splitter = RecursiveCharacterTextSplitter.from_language(
-         language=Language.MARKDOWN,
-         chunk_size=chunk_size,
-         chunk_overlap=chunk_overlap,
-     )
-
-     # Determine file type for heuristics, default with markdown
-     for docs in documents:
-         # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
-         docs = re.sub(r"-{2,}\|", "-|", docs)
-         # Remove unnecessary spaces in front of pipe characters in a markdown table.
-         docs = re.sub(r"\ +\|", " |", docs)
-         temp = text_splitter.create_documents([docs])
-         content.extend([item.page_content for item in temp])
-     return content
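Note: with sdg_hub/utils/chunking.py removed in 0.1.2, the chunk_document helper above is no longer importable. For reference, a minimal sketch of how it was called under 0.1.0a4, based only on the signature shown in the deleted file; the example document string and the printed message are hypothetical:

# Sketch only; this import path exists in 0.1.0a4 but not in 0.1.2.
from sdg_hub.utils.chunking import chunk_document

doc = "# Example\n\n" + "Markdown body text. " * 500   # hypothetical input document

chunks = chunk_document(
    [doc],                  # list of documents (a bare string is also accepted)
    server_ctx_size=4096,   # server context window, in tokens
    chunk_word_count=1024,  # max words per chunk (~1.3 tokens/word, ~4 chars/token)
)
print(f"{len(chunks)} chunks produced")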
sdg_hub/utils/docprocessor.py DELETED
@@ -1,357 +0,0 @@
- # Standard
- from pathlib import Path
- import json
-
- # Third Party
- from datasets import Dataset
- from tabulate import tabulate
- from transformers import AutoTokenizer
- import yaml
-
- # First Party
- from sdg_hub.logger_config import setup_logger
-
- # Local
- from .datautils import safe_concatenate_datasets
- from .chunking import chunk_document
-
- logger = setup_logger(__name__)
-
-
- def fuse_texts(text_list, short_length_threshold=100):
-     fused_texts = []
-     previous_long_text = ""
-
-     for text in text_list:
-         word_count = len(text.split())
-
-         if word_count <= short_length_threshold and previous_long_text:
-             # Append the short text to the last long text
-             fused_texts[-1] += "\n\n" + text
-         else:
-             # This is a long text, so add it to the list and remember it
-             fused_texts.append(text)
-             previous_long_text = text
-
-     return fused_texts
-
-
- def handle_footnote(book_element):
-     pass
-
-
- def create_tokenizer():
-     return AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")
-
-
- def get_token_count(text, tokenizer):
-     return len(tokenizer.tokenize(text))
-
-
- def add_heading_formatting(text):
-     text = text.split(".")
-     # TODO: Change this from hardcoded to something that makes sense
-     if len(text) > 1 and len(text[0].split(" ")) < 3:
-         text = f"**{text[0]}**" + ".".join(text[1:])
-     else:
-         text = ".".join(text)
-     return text
-
-
- def generate_table_from_parsed_rep(item):
-     """
-     Generate the table from the parsed representation and return
-     """
-     caption = ""
-     if "text" in item:
-         # print("caption: ", item["text"])
-         caption = item["text"]
-
-     data = item["data"]
-
-     if len(data) <= 1 or len(data[0]) <= 1:
-         return ""
-
-     table = []
-     for i, row in enumerate(data):
-         trow = []
-         for j, cell in enumerate(row):
-             trow.append(cell["text"])
-         table.append(trow)
-
-     table_text = tabulate(table, tablefmt="github")
-     if caption:
-         table_text += f"\nCaption: {caption}\n"
-     return table_text
-
-
- def get_table(json_book, table_ref):
-     parts = table_ref.split("/")
-     table_text = generate_table_from_parsed_rep(json_book[parts[1]][int(parts[2])])
-     return table_text
-
-
- def get_table_page_number(json_book, idx):
-     # Get previous page number
-     prev_page_num, next_page_num = None, None
-     for book_element in json_book["main-text"][idx - 1 :: -1]:
-         if "prov" in book_element:
-             prev_page_num = book_element["prov"][0]["page"]
-             break
-     for book_element in json_book["main-text"][idx:]:
-         if "prov" in book_element:
-             next_page_num = book_element["prov"][0]["page"]
-             break
-     if prev_page_num is not None and next_page_num is not None:
-         if prev_page_num == next_page_num:
-             return prev_page_num
-         else:
-             return next_page_num
-     elif prev_page_num is not None:
-         return prev_page_num
-     elif next_page_num is not None:
-         return next_page_num
-
-
- def build_chunks_from_docling_json(
-     json_book,
-     max_token_per_chunk,
-     tokenizer,
-     keep_same_page_thing_together=False,
-     chunking_criteria=None,
- ):
-     current_buffer = []
-     document_chunks = []
-     prev_page_number = None
-     book_title = None
-
-     for idx, book_element in enumerate(json_book["main-text"]):
-         if book_element["type"] in [
-             "page-footer",
-             "picture",
-             "reference",
-             "meta-data",
-             "figure",
-             "page-header",
-         ]:
-             continue
-         elif book_element["type"] == "footnote":
-             handle_footnote(book_element)
-             current_book_page_number = book_element["prov"][0]["page"]
-         elif book_element["type"] in [
-             "subtitle-level-1",
-             "paragraph",
-             "table",
-             "title",
-             "equation",
-         ]:  # 'page-header',
-             if book_element["type"] == "table":
-                 current_book_page_number = get_table_page_number(json_book, idx)
-             else:
-                 current_book_page_number = book_element["prov"][0]["page"]
-                 book_text = book_element["text"]
-
-             if book_element["type"] == "subtitle-level-1":
-                 if book_title is None:
-                     book_title = book_text
-                     book_text = f"# Title: **{book_text}**"
-                 else:
-                     book_text = f"## **{book_text}**"
-
-             if book_element["type"] == "title":
-                 book_text = f"# **{book_text}**"
-             if book_element["type"] == "page-header":
-                 book_text = f"Page Header: **{book_text}**\n\n"
-
-             if chunking_criteria is not None:
-                 # custom break function that can be used to chunk document
-                 if chunking_criteria(book_text):
-                     document_chunks.append("\n\n".join(current_buffer))
-                     current_buffer = []
-             elif (
-                 prev_page_number is not None
-                 and prev_page_number != current_book_page_number
-             ) and keep_same_page_thing_together:
-                 document_chunks.append("\n\n".join(current_buffer))
-                 current_buffer = []
-             else:
-                 if (
-                     get_token_count("\n\n".join(current_buffer), tokenizer)
-                     >= max_token_per_chunk
-                     and len(current_buffer) > 1
-                 ):
-                     # chunk_text = '\n\n'.join(current_buffer[:-1])
-                     # print(f"Current chunk size {get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}")
-                     document_chunks.append("\n\n".join(current_buffer[:-1]))
-
-                     if (
-                         get_token_count(current_buffer[-1], tokenizer)
-                         >= max_token_per_chunk
-                     ):
-                         # print(f"This is too big document to be left in the current buffer { get_token_count(current_buffer[-1], tokenizer)}")
-                         document_chunks.append(current_buffer[-1])
-                         current_buffer = []
-                     else:
-                         current_buffer = current_buffer[-1:]
-
-             if book_element["type"] == "paragraph":
-                 book_text = add_heading_formatting(book_text)
-             elif book_element["type"] == "table":
-                 book_text = get_table(json_book, book_element["$ref"])
-             if "## References" in book_text or "## Acknowledgements" in book_text:
-                 # For research papers we ignore everything after these sections
-                 break
-             current_buffer.append(book_text)
-
-         try:
-             prev_page_number = current_book_page_number
-         except:
-             logger.error(book_element)
-     if "\n\n".join(current_buffer) not in document_chunks:
-         document_chunks.append("\n\n".join(current_buffer))
-     return document_chunks
-
-
- class DocProcessor:
-     def __init__(
-         self,
-         parsed_doc_dir: Path,
-         tokenizer: str = "instructlab/granite-7b-lab",
-         user_config_path: Path = None,
-     ):
-         self.parsed_doc_dir = self._path_validator(parsed_doc_dir)
-         self.user_config = self._load_user_config(
-             self._path_validator(user_config_path)
-         )
-         self.docling_jsons = list(self.parsed_doc_dir.glob("*.json"))
-         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
-
-     def _path_validator(self, path) -> Path:
-         """
-         Validate the path and return a Path object.
-         Args:
-             path (str): Path to be validated.
-         Returns:
-             `Path`: Path object.
-         """
-         if isinstance(path, str):
-             path = Path(path)
-             if not path.exists():
-                 raise FileNotFoundError(f"{path} does not exist.")
-         return path
-
-     def _load_user_config(self, user_config_path: Path) -> dict:
-         """
-         Load the user config file.
-         Args:
-             user_config_path (Path): Path to the user config file.
-         Returns:
-             dict: User config dictionary.
-         """
-         # load user config as yaml
-         with open(user_config_path, "r", encoding="utf-8") as f:
-             return yaml.safe_load(f)
-
-     def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
-         """
-         Process the parsed docling json file and return a dataset.
-         Args:
-             json_fp (str): Path to the parsed docling json file.
-         Returns:
-             Dataset: Dataset object.
-         """
-         logger.info(f"Processing parsed docling json file: {json_fp}")
-         with open(json_fp, "r", encoding="utf-8") as f:
-             data = json.load(f)
-
-         file_name = json_fp.name.split(".")[0]
-         chunks = build_chunks_from_docling_json(
-             data,
-             max_token_per_chunk=500,
-             tokenizer=self.tokenizer,
-         )
-         chunks = fuse_texts(chunks, 200)
-         return Dataset.from_dict(
-             {
-                 "document": chunks,
-                 "document_outline": [self.user_config["document_outline"]]
-                 * len(chunks),
-                 "document_title": [file_name] * len(chunks),
-                 "domain": [self.user_config["domain"]] * len(chunks),
-             }
-         )
-
-     def _add_icls(self, chunked_document: Dataset) -> Dataset:
-         """
-         Add the ICLS label to the dataset.
-         Args:
-             dataset (Dataset): Dataset object.
-         Returns:
-             Dataset: Dataset object with ICLS label.
-         """
-         icl = self.user_config["seed_examples"]
-         chunked_document_all_icl = []
-         for icl_ in icl:
-             chunked_document_all_icl.append(
-                 chunked_document.map(
-                     lambda x: {
-                         "icl_document": icl_["context"],
-                         "icl_query_1": icl_["questions_and_answers"][0]["question"],
-                         "icl_response_1": icl_["questions_and_answers"][0]["answer"],
-                         "icl_query_2": icl_["questions_and_answers"][1]["question"],
-                         "icl_response_2": icl_["questions_and_answers"][1]["answer"],
-                         "icl_query_3": icl_["questions_and_answers"][2]["question"],
-                         "icl_response_3": icl_["questions_and_answers"][2]["answer"],
-                     }
-                 )
-             )
-         chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
-         chunked_document_all_icl = chunked_document_all_icl.map(
-             lambda x: {
-                 "chunks": chunk_document(
-                     [x["document"]], server_ctx_size=4096, chunk_word_count=1024
-                 )
-                 if get_token_count(x["document"], self.tokenizer) > 1024
-                 else [x["document"]]
-             }
-         )
-         df = chunked_document_all_icl.to_pandas()
-         df_exploded = df.explode("chunks").reset_index(drop=True)
-         new_ds = Dataset.from_pandas(df_exploded)
-         new_ds = new_ds.remove_columns("document").rename_columns(
-             {"chunks": "document"}
-         )
-
-         # Only keep document greater than 100 tokens
-         new_ds = new_ds.filter(
-             lambda x: get_token_count(x["document"], self.tokenizer) > 100
-         )
-         return new_ds
-
-     def get_processed_dataset(self) -> Dataset:
-         """
-         Process all the parsed docling json files and return a dataset.
-         Returns:
-             Dataset: Dataset object.
-         """
-         datasets = []
-         for json_fp in self.docling_jsons:
-             chunk_ds = self._process_parsed_docling_json(json_fp)
-             chunk_ds_with_icls = self._add_icls(chunk_ds)
-             datasets.append(chunk_ds_with_icls)
-         return safe_concatenate_datasets(datasets)
-
-     def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
-         chunks_mds = []
-         for md_file in list_md_files:
-             with open(md_file, "r", encoding="utf-8") as f:
-                 text = f.read()
-             chunks_mds.append({
-                 "document": text,
-                 "document_outline": self.user_config["document_outline"],
-                 "document_title": md_file,
-                 "domain": self.user_config["domain"],
-             })
-         chunk_ds = Dataset.from_list(chunks_mds)
-         chunk_ds_with_icls = self._add_icls(chunk_ds)
-         return chunk_ds_with_icls
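Note: DocProcessor is likewise removed in 0.1.2. A minimal usage sketch against the 0.1.0a4 API shown above; the directory, config path, and output file name are hypothetical, and the user config must supply document_outline, domain, and seed_examples as read by the removed code:

# Sketch only; valid for 0.1.0a4, not available in 0.1.2.
from sdg_hub.utils.docprocessor import DocProcessor

processor = DocProcessor(
    parsed_doc_dir="parsed_docs/",            # hypothetical directory of docling *.json files
    tokenizer="instructlab/granite-7b-lab",   # default tokenizer used by the removed code
    user_config_path="qna.yaml",              # hypothetical config with document_outline, domain, seed_examples
)
dataset = processor.get_processed_dataset()   # datasets.Dataset of chunked, ICL-annotated rows
dataset.to_json("seed_data.jsonl")            # hypothetical output path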