sdg-hub 0.1.0a2.dev0__py3-none-any.whl
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +4 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +6 -0
- sdg_hub/blocks/block.py +54 -0
- sdg_hub/blocks/filterblock.py +76 -0
- sdg_hub/blocks/iterblock.py +31 -0
- sdg_hub/blocks/llmblock.py +430 -0
- sdg_hub/blocks/rmblocks.py +194 -0
- sdg_hub/blocks/utilblocks.py +140 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple.yaml +10 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/data_recipe/__init__.py +0 -0
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +3 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/_A_.yaml +97 -0
- sdg_hub/configs/skills/_B_.yaml +36 -0
- sdg_hub/configs/skills/_C_.yaml +71 -0
- sdg_hub/configs/skills/_D_.yaml +85 -0
- sdg_hub/configs/skills/_E_.yaml +30 -0
- sdg_hub/configs/skills/_F_.yaml +45 -0
- sdg_hub/configs/skills/_G_.yaml +56 -0
- sdg_hub/configs/skills/_H_.yaml +80 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +21 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/data_recipe/__init__.py +0 -0
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +6 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +44 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +46 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +54 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +29 -0
- sdg_hub/configs/skills/freeform_responses.yaml +45 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +12 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +127 -0
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +19 -0
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +19 -0
- sdg_hub/flows/annotation/emotion/simple.yaml +19 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/agentic_improve_skill.yaml +108 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +66 -0
- sdg_hub/prompts.py +17 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +164 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/chunking.py +73 -0
- sdg_hub/utils/datamixing.py +123 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub/utils/docprocessor.py +357 -0
- sdg_hub/utils/json.py +48 -0
- sdg_hub/utils/models.py +31 -0
- sdg_hub/utils/parse_and_convert.py +392 -0
- sdg_hub/utils/taxonomy.py +489 -0
- sdg_hub-0.1.0a2.dev0.dist-info/METADATA +154 -0
- sdg_hub-0.1.0a2.dev0.dist-info/RECORD +94 -0
- sdg_hub-0.1.0a2.dev0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0a2.dev0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0a2.dev0.dist-info/top_level.txt +1 -0
sdg_hub/utils/docprocessor.py
ADDED
@@ -0,0 +1,357 @@
# Standard
from pathlib import Path
import json

# Third Party
from datasets import Dataset
from tabulate import tabulate
from transformers import AutoTokenizer
import yaml

# First Party
from sdg_hub.logger_config import setup_logger

# Local
from .datautils import safe_concatenate_datasets
from .chunking import chunk_document

logger = setup_logger(__name__)


def fuse_texts(text_list, short_length_threshold=100):
    fused_texts = []
    previous_long_text = ""

    for text in text_list:
        word_count = len(text.split())

        if word_count <= short_length_threshold and previous_long_text:
            # Append the short text to the last long text
            fused_texts[-1] += "\n\n" + text
        else:
            # This is a long text, so add it to the list and remember it
            fused_texts.append(text)
            previous_long_text = text

    return fused_texts


def handle_footnote(book_element):
    pass


def create_tokenizer():
    return AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")


def get_token_count(text, tokenizer):
    return len(tokenizer.tokenize(text))


def add_heading_formatting(text):
    text = text.split(".")
    # TODO: Change this from hardcoded to something that makes sense
    if len(text) > 1 and len(text[0].split(" ")) < 3:
        text = f"**{text[0]}**" + ".".join(text[1:])
    else:
        text = ".".join(text)
    return text


def generate_table_from_parsed_rep(item):
    """
    Generate the table from the parsed representation and return
    """
    caption = ""
    if "text" in item:
        # print("caption: ", item["text"])
        caption = item["text"]

    data = item["data"]

    if len(data) <= 1 or len(data[0]) <= 1:
        return ""

    table = []
    for i, row in enumerate(data):
        trow = []
        for j, cell in enumerate(row):
            trow.append(cell["text"])
        table.append(trow)

    table_text = tabulate(table, tablefmt="github")
    if caption:
        table_text += f"\nCaption: {caption}\n"
    return table_text


def get_table(json_book, table_ref):
    parts = table_ref.split("/")
    table_text = generate_table_from_parsed_rep(json_book[parts[1]][int(parts[2])])
    return table_text


def get_table_page_number(json_book, idx):
    # Get previous page number
    prev_page_num, next_page_num = None, None
    for book_element in json_book["main-text"][idx - 1 :: -1]:
        if "prov" in book_element:
            prev_page_num = book_element["prov"][0]["page"]
            break
    for book_element in json_book["main-text"][idx:]:
        if "prov" in book_element:
            next_page_num = book_element["prov"][0]["page"]
            break
    if prev_page_num is not None and next_page_num is not None:
        if prev_page_num == next_page_num:
            return prev_page_num
        else:
            return next_page_num
    elif prev_page_num is not None:
        return prev_page_num
    elif next_page_num is not None:
        return next_page_num


def build_chunks_from_docling_json(
    json_book,
    max_token_per_chunk,
    tokenizer,
    keep_same_page_thing_together=False,
    chunking_criteria=None,
):
    current_buffer = []
    document_chunks = []
    prev_page_number = None
    book_title = None

    for idx, book_element in enumerate(json_book["main-text"]):
        if book_element["type"] in [
            "page-footer",
            "picture",
            "reference",
            "meta-data",
            "figure",
            "page-header",
        ]:
            continue
        elif book_element["type"] == "footnote":
            handle_footnote(book_element)
            current_book_page_number = book_element["prov"][0]["page"]
        elif book_element["type"] in [
            "subtitle-level-1",
            "paragraph",
            "table",
            "title",
            "equation",
        ]:  # 'page-header',
            if book_element["type"] == "table":
                current_book_page_number = get_table_page_number(json_book, idx)
            else:
                current_book_page_number = book_element["prov"][0]["page"]
                book_text = book_element["text"]

            if book_element["type"] == "subtitle-level-1":
                if book_title is None:
                    book_title = book_text
                    book_text = f"# Title: **{book_text}**"
                else:
                    book_text = f"## **{book_text}**"

            if book_element["type"] == "title":
                book_text = f"# **{book_text}**"
            if book_element["type"] == "page-header":
                book_text = f"Page Header: **{book_text}**\n\n"

            if chunking_criteria is not None:
                # custom break function that can be used to chunk document
                if chunking_criteria(book_text):
                    document_chunks.append("\n\n".join(current_buffer))
                    current_buffer = []
            elif (
                prev_page_number is not None
                and prev_page_number != current_book_page_number
            ) and keep_same_page_thing_together:
                document_chunks.append("\n\n".join(current_buffer))
                current_buffer = []
            else:
                if (
                    get_token_count("\n\n".join(current_buffer), tokenizer)
                    >= max_token_per_chunk
                    and len(current_buffer) > 1
                ):
                    # chunk_text = '\n\n'.join(current_buffer[:-1])
                    # print(f"Current chunk size {get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}")
                    document_chunks.append("\n\n".join(current_buffer[:-1]))

                    if (
                        get_token_count(current_buffer[-1], tokenizer)
                        >= max_token_per_chunk
                    ):
                        # print(f"This is too big document to be left in the current buffer { get_token_count(current_buffer[-1], tokenizer)}")
                        document_chunks.append(current_buffer[-1])
                        current_buffer = []
                    else:
                        current_buffer = current_buffer[-1:]

            if book_element["type"] == "paragraph":
                book_text = add_heading_formatting(book_text)
            elif book_element["type"] == "table":
                book_text = get_table(json_book, book_element["$ref"])
if "## References" in book_text or "## Acknowledgements" in book_text:
|
202
|
+
# For reasearch papers we ignore everything after this sections
|
203
|
+
break
            current_buffer.append(book_text)

        try:
            prev_page_number = current_book_page_number
        except:
            logger.error(book_element)
    if "\n\n".join(current_buffer) not in document_chunks:
        document_chunks.append("\n\n".join(current_buffer))
    return document_chunks


class DocProcessor:
    def __init__(
        self,
        parsed_doc_dir: Path,
        tokenizer: str = "instructlab/granite-7b-lab",
        user_config_path: Path = None,
    ):
        self.parsed_doc_dir = self._path_validator(parsed_doc_dir)
        self.user_config = self._load_user_config(
            self._path_validator(user_config_path)
        )
        self.docling_jsons = list(self.parsed_doc_dir.glob("*.json"))
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    def _path_validator(self, path) -> Path:
        """
        Validate the path and return a Path object.
        Args:
            path (str): Path to be validated.
        Returns:
            Path: Path object.
        """
        if isinstance(path, str):
            path = Path(path)
            if not path.exists():
                raise FileNotFoundError(f"{path} does not exist.")
        return path

    def _load_user_config(self, user_config_path: Path) -> dict:
        """
        Load the user config file.
        Args:
            user_config_path (Path): Path to the user config file.
        Returns:
            dict: User config dictionary.
        """
        # load user config as yaml
        with open(user_config_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)

    def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
        """
        Process the parsed docling json file and return a dataset.
        Args:
            json_fp (str): Path to the parsed docling json file.
        Returns:
            Dataset: Dataset object.
        """
        logger.info(f"Processing parsed docling json file: {json_fp}")
        with open(json_fp, "r", encoding="utf-8") as f:
            data = json.load(f)

        file_name = json_fp.name.split(".")[0]
        chunks = build_chunks_from_docling_json(
            data,
            max_token_per_chunk=500,
            tokenizer=self.tokenizer,
        )
        chunks = fuse_texts(chunks, 200)
        return Dataset.from_dict(
            {
                "document": chunks,
                "document_outline": [self.user_config["document_outline"]]
                * len(chunks),
                "document_title": [file_name] * len(chunks),
                "domain": [self.user_config["domain"]] * len(chunks),
            }
        )

    def _add_icls(self, chunked_document: Dataset) -> Dataset:
        """
        Add the ICLS label to the dataset.
        Args:
            dataset (Dataset): Dataset object.
        Returns:
            Dataset: Dataset object with ICLS label.
        """
        icl = self.user_config["seed_examples"]
        chunked_document_all_icl = []
        for icl_ in icl:
            chunked_document_all_icl.append(
                chunked_document.map(
                    lambda x: {
                        "icl_document": icl_["context"],
                        "icl_query_1": icl_["questions_and_answers"][0]["question"],
                        "icl_response_1": icl_["questions_and_answers"][0]["answer"],
                        "icl_query_2": icl_["questions_and_answers"][1]["question"],
                        "icl_response_2": icl_["questions_and_answers"][1]["answer"],
                        "icl_query_3": icl_["questions_and_answers"][2]["question"],
                        "icl_response_3": icl_["questions_and_answers"][2]["answer"],
                    }
                )
            )
        chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
        chunked_document_all_icl = chunked_document_all_icl.map(
            lambda x: {
                "chunks": chunk_document(
                    [x["document"]], server_ctx_size=4096, chunk_word_count=1024
                )
                if get_token_count(x["document"], self.tokenizer) > 1024
                else [x["document"]]
            }
        )
        df = chunked_document_all_icl.to_pandas()
        df_exploded = df.explode("chunks").reset_index(drop=True)
        new_ds = Dataset.from_pandas(df_exploded)
        new_ds = new_ds.remove_columns("document").rename_columns(
            {"chunks": "document"}
        )

        # Only keep document greater than 100 tokens
        new_ds = new_ds.filter(
            lambda x: get_token_count(x["document"], self.tokenizer) > 100
        )
        return new_ds

    def get_processed_dataset(self) -> Dataset:
        """
        Process all the parsed docling json files and return a dataset.
        Returns:
            Dataset: Dataset object.
        """
        datasets = []
        for json_fp in self.docling_jsons:
            chunk_ds = self._process_parsed_docling_json(json_fp)
            chunk_ds_with_icls = self._add_icls(chunk_ds)
            datasets.append(chunk_ds_with_icls)
        return safe_concatenate_datasets(datasets)

    def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
        chunks_mds = []
        for md_file in list_md_files:
            with open(md_file, "r", encoding="utf-8") as f:
                text = f.read()
            chunks_mds.append({
                "document": text,
                "document_outline": self.user_config["document_outline"],
                "document_title": md_file,
                "domain": self.user_config["domain"],
            })
        chunk_ds = Dataset.from_list(chunks_mds)
        chunk_ds_with_icls = self._add_icls(chunk_ds)
        return chunk_ds_with_icls
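Usage note: the snippet below is a minimal sketch, not part of the packaged diff, of how the DocProcessor class above might be driven. The directory and config file names are illustrative; the config YAML is assumed to provide the keys the class reads ("document_outline", "domain", and "seed_examples", each seed example carrying a "context" and at least three "questions_and_answers" entries).

    # Sketch only; paths and the config file are illustrative, not shipped with the wheel.
    from sdg_hub.utils.docprocessor import DocProcessor

    dp = DocProcessor(
        parsed_doc_dir="parsed_docs/",              # directory containing docling *.json files
        tokenizer="instructlab/granite-7b-lab",     # tokenizer used for token counting
        user_config_path="qna_config.yaml",         # YAML with document_outline, domain, seed_examples
    )
    ds = dp.get_processed_dataset()                 # chunked documents with icl_* columns attached
    print(ds.column_names)
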
sdg_hub/utils/json.py
ADDED
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import io
import json
import os


def _make_w_io_base(f, mode: str):
    # pylint: disable=consider-using-with
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode, encoding="utf-8")
    return f


def _make_r_io_base(f, mode: str):
    # pylint: disable=consider-using-with
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode, encoding="utf-8")
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str or dictionary to a file in json format.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.
    """
    with _make_w_io_base(f, mode) as f_:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f_, indent=indent, default=default)
        elif isinstance(obj, str):
            f_.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")


def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    with _make_r_io_base(f, mode) as f_:
        return json.load(f_)
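Usage note: a minimal sketch of the jdump/jload round trip defined above, with an illustrative output path; jdump creates any missing parent directories before writing.

    from sdg_hub.utils.json import jdump, jload

    record = {"domain": "finance", "samples": [1, 2, 3]}
    jdump(record, "output/run/record.json")          # parent directories are created if missing
    assert jload("output/run/record.json") == record
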
sdg_hub/utils/models.py
ADDED
@@ -0,0 +1,31 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import os
import re

# First Party
from sdg_hub.utils import GenerateException

# When otherwise unknown, ilab uses this as the default family
DEFAULT_MODEL_FAMILY = "merlinite"

# Model families understood by ilab
MODEL_FAMILIES = set(("merlinite", "mixtral"))

# Map model names to their family
MODEL_FAMILY_MAPPINGS = {
    "granite": "merlinite",
}


def get_model_family(forced, model_path):
    forced = MODEL_FAMILY_MAPPINGS.get(forced, forced)
    if forced and forced.lower() not in MODEL_FAMILIES:
        raise GenerateException("Unknown model family: %s" % forced)

    # Try to guess the model family based on the model's filename
    guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower()
    guess = MODEL_FAMILY_MAPPINGS.get(guess, guess)

    return guess if guess in MODEL_FAMILIES else DEFAULT_MODEL_FAMILY
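Usage note: a minimal sketch of get_model_family with illustrative model paths. As written above, the forced argument is only validated; the returned family always comes from the filename prefix, falling back to DEFAULT_MODEL_FAMILY when the prefix is unknown.

    from sdg_hub.utils.models import get_model_family

    get_model_family(None, "/models/granite-7b-lab-Q4.gguf")  # "merlinite": "granite" maps via MODEL_FAMILY_MAPPINGS
    get_model_family(None, "/models/mixtral-8x7b.gguf")       # "mixtral": prefix matches a known family
    get_model_family(None, "/models/unknown-model.bin")       # "merlinite": DEFAULT_MODEL_FAMILY fallback
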