sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/__init__.py +41 -5
- sdg_hub/blocks/block.py +58 -16
- sdg_hub/blocks/llmblock.py +121 -193
- sdg_hub/blocks/openaichatblock.py +556 -0
- sdg_hub/blocks/utilblocks.py +500 -43
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
- sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
- sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
- sdg_hub/configs/skills/contexts.yaml +18 -11
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
- sdg_hub/configs/skills/freeform_questions.yaml +21 -16
- sdg_hub/configs/skills/freeform_responses.yaml +19 -25
- sdg_hub/configs/skills/router.yaml +53 -6
- sdg_hub/flow.py +366 -33
- sdg_hub/flow_runner.py +437 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
- sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
- sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
- sdg_hub/pipeline.py +67 -12
- sdg_hub/prompts.py +52 -0
- sdg_hub/sdg.py +128 -86
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/config_validation.py +91 -0
- sdg_hub/utils/error_handling.py +94 -0
- sdg_hub/utils/path_resolution.py +62 -0
- sdg_hub/utils/validation_result.py +10 -0
- sdg_hub-0.1.2.dist-info/METADATA +190 -0
- sdg_hub-0.1.2.dist-info/RECORD +89 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
- sdg_hub/blocks/filterblock.py +0 -76
- sdg_hub/blocks/iterblock.py +0 -31
- sdg_hub/blocks/rmblocks.py +0 -194
- sdg_hub/configs/annotations/simple.yaml +0 -10
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
- sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
- sdg_hub/utils/chunking.py +0 -73
- sdg_hub/utils/docprocessor.py +0 -357
- sdg_hub/utils/parse_and_convert.py +0 -392
- sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
- sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
- /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
- /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
- /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
- /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
- /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
- /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
- /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
- /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/configs/skills/data_recipe/default_recipe.yaml
DELETED
@@ -1,6 +0,0 @@
-datasets:
-  - path: instructlab/InstructLabCommunity
-    sampling_size: 1.0
-
-sys_prompt: |
-  I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
sdg_hub/flows/annotation/emotion/detailed_description.yaml
DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/detailed_description.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt
sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml
DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/detailed_description_icl.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt
sdg_hub/flows/annotation/emotion/simple.yaml
DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/simple.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt
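The extra_body.guided_choice entries in these removed flows are passed along with the generation kwargs to the serving backend, which constrains the completion to one of the listed emotion labels. A minimal sketch of the same pattern against a vLLM-style OpenAI-compatible endpoint (the server URL and prompt are illustrative, not part of the package):

from openai import OpenAI

# Hypothetical local OpenAI-compatible server that honors guided_choice (e.g. vLLM).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Label the emotion of: 'I can't stop smiling today!'"}],
    max_tokens=5,
    temperature=0,
    extra_body={"guided_choice": ["joy", "sadness", "anger", "fear", "love"]},
)
print(resp.choices[0].message.content)  # one of the five labels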
sdg_hub/utils/chunking.py
DELETED
@@ -1,73 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Standard
-from typing import List
-import logging
-import re
-
-# Third Party
-from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
-
-_DEFAULT_CHUNK_OVERLAP = 100
-
-logger = logging.getLogger(__name__)
-
-
-def _num_tokens_from_words(num_words) -> int:
-    return int(num_words * 1.3)  # 1 word ~ 1.3 token
-
-
-def _num_chars_from_tokens(num_tokens) -> int:
-    return int(num_tokens * 4)  # 1 token ~ 4 English character
-
-
-def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
-    """
-    Iterates over the documents and splits them into chunks based on the word count provided by the user.
-    Args:
-        documents (list): List of documents retrieved from git (can also consist of a single document).
-        server_ctx_size (int): Context window size of server.
-        chunk_word_count (int): Maximum number of words to chunk a document.
-    Returns:
-        List[str]: List of chunked documents.
-    """
-
-    # Checks for input type error
-    if isinstance(documents, str):
-        documents = [documents]
-
-    elif not isinstance(documents, list):
-        raise TypeError(
-            "Expected: documents to be a list, but got {}".format(type(documents))
-        )
-
-    no_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
-    if no_tokens_per_doc > int(server_ctx_size - 1024):
-        raise ValueError(
-            "Error: {}".format(
-                str(
-                    f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
-                )
-            )
-        )
-    # Placeholder for params
-    content = []
-    chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
-    chunk_overlap = _DEFAULT_CHUNK_OVERLAP
-
-    # Using Markdown as default, document-specific chunking will be implemented in seperate pr.
-    text_splitter = RecursiveCharacterTextSplitter.from_language(
-        language=Language.MARKDOWN,
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-    )
-
-    # Determine file type for heuristics, default with markdown
-    for docs in documents:
-        # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
-        docs = re.sub(r"-{2,}\|", "-|", docs)
-        # Remove unnecessary spaces in front of pipe characters in a markdown table.
-        docs = re.sub(r"\ +\|", " |", docs)
-        temp = text_splitter.create_documents([docs])
-        content.extend([item.page_content for item in temp])
-    return content
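For orientation, the removed chunk_document helper took a list of markdown document strings plus the serving context size and a target word count, and returned character-based chunks produced by langchain's RecursiveCharacterTextSplitter. A minimal usage sketch written against the signature shown above (the 4096/1024 values mirror how docprocessor.py invoked it; the sample documents are placeholders):

# Pre-0.1.2 import path; the module is removed in 0.1.2.
from sdg_hub.utils.chunking import chunk_document

docs = [
    "# Release notes\n\nA long markdown document that needs to be split into pieces...",
    "A short standalone note.",
]
# Chunk to roughly 1024 words per piece while respecting a 4096-token server context window.
chunks = chunk_document(docs, server_ctx_size=4096, chunk_word_count=1024)
print(len(chunks), chunks[0][:80])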
sdg_hub/utils/docprocessor.py
DELETED
@@ -1,357 +0,0 @@
-# Standard
-from pathlib import Path
-import json
-
-# Third Party
-from datasets import Dataset
-from tabulate import tabulate
-from transformers import AutoTokenizer
-import yaml
-
-# First Party
-from sdg_hub.logger_config import setup_logger
-
-# Local
-from .datautils import safe_concatenate_datasets
-from .chunking import chunk_document
-
-logger = setup_logger(__name__)
-
-
-def fuse_texts(text_list, short_length_threshold=100):
-    fused_texts = []
-    previous_long_text = ""
-
-    for text in text_list:
-        word_count = len(text.split())
-
-        if word_count <= short_length_threshold and previous_long_text:
-            # Append the short text to the last long text
-            fused_texts[-1] += "\n\n" + text
-        else:
-            # This is a long text, so add it to the list and remember it
-            fused_texts.append(text)
-            previous_long_text = text
-
-    return fused_texts
-
-
-def handle_footnote(book_element):
-    pass
-
-
-def create_tokenizer():
-    return AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")
-
-
-def get_token_count(text, tokenizer):
-    return len(tokenizer.tokenize(text))
-
-
-def add_heading_formatting(text):
-    text = text.split(".")
-    # TODO: Change this from hardcoded to something that makes sense
-    if len(text) > 1 and len(text[0].split(" ")) < 3:
-        text = f"**{text[0]}**" + ".".join(text[1:])
-    else:
-        text = ".".join(text)
-    return text
-
-
-def generate_table_from_parsed_rep(item):
-    """
-    Generate the table from the parsed representation and return
-    """
-    caption = ""
-    if "text" in item:
-        # print("caption: ", item["text"])
-        caption = item["text"]
-
-    data = item["data"]
-
-    if len(data) <= 1 or len(data[0]) <= 1:
-        return ""
-
-    table = []
-    for i, row in enumerate(data):
-        trow = []
-        for j, cell in enumerate(row):
-            trow.append(cell["text"])
-        table.append(trow)
-
-    table_text = tabulate(table, tablefmt="github")
-    if caption:
-        table_text += f"\nCaption: {caption}\n"
-    return table_text
-
-
-def get_table(json_book, table_ref):
-    parts = table_ref.split("/")
-    table_text = generate_table_from_parsed_rep(json_book[parts[1]][int(parts[2])])
-    return table_text
-
-
-def get_table_page_number(json_book, idx):
-    # Get previous page number
-    prev_page_num, next_page_num = None, None
-    for book_element in json_book["main-text"][idx - 1 :: -1]:
-        if "prov" in book_element:
-            prev_page_num = book_element["prov"][0]["page"]
-            break
-    for book_element in json_book["main-text"][idx:]:
-        if "prov" in book_element:
-            next_page_num = book_element["prov"][0]["page"]
-            break
-    if prev_page_num is not None and next_page_num is not None:
-        if prev_page_num == next_page_num:
-            return prev_page_num
-        else:
-            return next_page_num
-    elif prev_page_num is not None:
-        return prev_page_num
-    elif next_page_num is not None:
-        return next_page_num
-
-
-def build_chunks_from_docling_json(
-    json_book,
-    max_token_per_chunk,
-    tokenizer,
-    keep_same_page_thing_together=False,
-    chunking_criteria=None,
-):
-    current_buffer = []
-    document_chunks = []
-    prev_page_number = None
-    book_title = None
-
-    for idx, book_element in enumerate(json_book["main-text"]):
-        if book_element["type"] in [
-            "page-footer",
-            "picture",
-            "reference",
-            "meta-data",
-            "figure",
-            "page-header",
-        ]:
-            continue
-        elif book_element["type"] == "footnote":
-            handle_footnote(book_element)
-            current_book_page_number = book_element["prov"][0]["page"]
-        elif book_element["type"] in [
-            "subtitle-level-1",
-            "paragraph",
-            "table",
-            "title",
-            "equation",
-        ]:  # 'page-header',
-            if book_element["type"] == "table":
-                current_book_page_number = get_table_page_number(json_book, idx)
-            else:
-                current_book_page_number = book_element["prov"][0]["page"]
-                book_text = book_element["text"]
-
-            if book_element["type"] == "subtitle-level-1":
-                if book_title is None:
-                    book_title = book_text
-                    book_text = f"# Title: **{book_text}**"
-                else:
-                    book_text = f"## **{book_text}**"
-
-            if book_element["type"] == "title":
-                book_text = f"# **{book_text}**"
-            if book_element["type"] == "page-header":
-                book_text = f"Page Header: **{book_text}**\n\n"
-
-            if chunking_criteria is not None:
-                # custom break function that can be used to chunk document
-                if chunking_criteria(book_text):
-                    document_chunks.append("\n\n".join(current_buffer))
-                    current_buffer = []
-            elif (
-                prev_page_number is not None
-                and prev_page_number != current_book_page_number
-            ) and keep_same_page_thing_together:
-                document_chunks.append("\n\n".join(current_buffer))
-                current_buffer = []
-            else:
-                if (
-                    get_token_count("\n\n".join(current_buffer), tokenizer)
-                    >= max_token_per_chunk
-                    and len(current_buffer) > 1
-                ):
-                    # chunk_text = '\n\n'.join(current_buffer[:-1])
-                    # print(f"Current chunk size {get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}")
-                    document_chunks.append("\n\n".join(current_buffer[:-1]))
-
-                    if (
-                        get_token_count(current_buffer[-1], tokenizer)
-                        >= max_token_per_chunk
-                    ):
-                        # print(f"This is too big document to be left in the current buffer { get_token_count(current_buffer[-1], tokenizer)}")
-                        document_chunks.append(current_buffer[-1])
-                        current_buffer = []
-                    else:
-                        current_buffer = current_buffer[-1:]
-
-            if book_element["type"] == "paragraph":
-                book_text = add_heading_formatting(book_text)
-            elif book_element["type"] == "table":
-                book_text = get_table(json_book, book_element["$ref"])
-            if "## References" in book_text or "## Acknowledgements" in book_text:
-                # For reasearch papers we ignore everything after this sections
-                break
-            current_buffer.append(book_text)
-
-        try:
-            prev_page_number = current_book_page_number
-        except:
-            logger.error(book_element)
-    if "\n\n".join(current_buffer) not in document_chunks:
-        document_chunks.append("\n\n".join(current_buffer))
-    return document_chunks
-
-
-class DocProcessor:
-    def __init__(
-        self,
-        parsed_doc_dir: Path,
-        tokenizer: str = "instructlab/granite-7b-lab",
-        user_config_path: Path = None,
-    ):
-        self.parsed_doc_dir = self._path_validator(parsed_doc_dir)
-        self.user_config = self._load_user_config(
-            self._path_validator(user_config_path)
-        )
-        self.docling_jsons = list(self.parsed_doc_dir.glob("*.json"))
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
-
-    def _path_validator(self, path) -> Path:
-        """
-        Validate the path and return a Path object.
-        Args:
-            path (str): Path to be validated.
-        Returns:
-            Path`: Path object.
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        if not path.exists():
-            raise FileNotFoundError(f"{path} does not exist.")
-        return path
-
-    def _load_user_config(self, user_config_path: Path) -> dict:
-        """
-        Load the user config file.
-        Args:
-            user_config_path (Path): Path to the user config file.
-        Returns:
-            dict: User config dictionary.
-        """
-        # load user config as yaml
-        with open(user_config_path, "r", encoding="utf-8") as f:
-            return yaml.safe_load(f)
-
-    def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
-        """
-        Process the parsed docling json file and return a dataset.
-        Args:
-            json_fp (str): Path to the parsed docling json file.
-        Returns:
-            Dataset: Dataset object.
-        """
-        logger.info(f"Processing parsed docling json file: {json_fp}")
-        with open(json_fp, "r", encoding="utf-8") as f:
-            data = json.load(f)
-
-        file_name = json_fp.name.split(".")[0]
-        chunks = build_chunks_from_docling_json(
-            data,
-            max_token_per_chunk=500,
-            tokenizer=self.tokenizer,
-        )
-        chunks = fuse_texts(chunks, 200)
-        return Dataset.from_dict(
-            {
-                "document": chunks,
-                "document_outline": [self.user_config["document_outline"]]
-                * len(chunks),
-                "document_title": [file_name] * len(chunks),
-                "domain": [self.user_config["domain"]] * len(chunks),
-            }
-        )
-
-    def _add_icls(self, chunked_document: Dataset) -> Dataset:
-        """
-        Add the ICLS label to the dataset.
-        Args:
-            dataset (Dataset): Dataset object.
-        Returns:
-            Dataset: Dataset object with ICLS label.
-        """
-        icl = self.user_config["seed_examples"]
-        chunked_document_all_icl = []
-        for icl_ in icl:
-            chunked_document_all_icl.append(
-                chunked_document.map(
-                    lambda x: {
-                        "icl_document": icl_["context"],
-                        "icl_query_1": icl_["questions_and_answers"][0]["question"],
-                        "icl_response_1": icl_["questions_and_answers"][0]["answer"],
-                        "icl_query_2": icl_["questions_and_answers"][1]["question"],
-                        "icl_response_2": icl_["questions_and_answers"][1]["answer"],
-                        "icl_query_3": icl_["questions_and_answers"][2]["question"],
-                        "icl_response_3": icl_["questions_and_answers"][2]["answer"],
-                    }
-                )
-            )
-        chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
-        chunked_document_all_icl = chunked_document_all_icl.map(
-            lambda x: {
-                "chunks": chunk_document(
-                    [x["document"]], server_ctx_size=4096, chunk_word_count=1024
-                )
-                if get_token_count(x["document"], self.tokenizer) > 1024
-                else [x["document"]]
-            }
-        )
-        df = chunked_document_all_icl.to_pandas()
-        df_exploded = df.explode("chunks").reset_index(drop=True)
-        new_ds = Dataset.from_pandas(df_exploded)
-        new_ds = new_ds.remove_columns("document").rename_columns(
-            {"chunks": "document"}
-        )
-
-        # Only keep document greater than 100 tokens
-        new_ds = new_ds.filter(
-            lambda x: get_token_count(x["document"], self.tokenizer) > 100
-        )
-        return new_ds
-
-    def get_processed_dataset(self) -> Dataset:
-        """
-        Process all the parsed docling json files and return a dataset.
-        Returns:
-            Dataset: Dataset object.
-        """
-        datasets = []
-        for json_fp in self.docling_jsons:
-            chunk_ds = self._process_parsed_docling_json(json_fp)
-            chunk_ds_with_icls = self._add_icls(chunk_ds)
-            datasets.append(chunk_ds_with_icls)
-        return safe_concatenate_datasets(datasets)
-
-    def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
-        chunks_mds = []
-        for md_file in list_md_files:
-            with open(md_file, "r", encoding="utf-8") as f:
-                text = f.read()
-            chunks_mds.append({
-                "document": text,
-                "document_outline": self.user_config["document_outline"],
-                "document_title": md_file,
-                "domain": self.user_config["domain"],
-            })
-        chunk_ds = Dataset.from_list(chunks_mds)
-        chunk_ds_with_icls = self._add_icls(chunk_ds)
-        return chunk_ds_with_icls
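For anyone migrating off this removed module, DocProcessor was driven roughly as below; the call pattern is inferred from the constructor and get_processed_dataset shown in the diff above, and both paths are placeholders.

# Hypothetical pre-0.1.2 usage of the removed DocProcessor; paths are placeholders.
from sdg_hub.utils.docprocessor import DocProcessor

dp = DocProcessor(
    parsed_doc_dir="path/to/docling_jsons",  # directory containing parsed docling *.json files
    user_config_path="path/to/qna.yaml",     # YAML with domain, document_outline, and seed_examples
)
seed_data = dp.get_processed_dataset()       # datasets.Dataset with document and ICL columns
seed_data.to_json("seed_data.jsonl")         # one JSON record per chunk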