dingo-python 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dingo/__init__.py +0 -0
- dingo/config/__init__.py +1 -0
- dingo/config/config.py +47 -0
- dingo/convert/__init__.py +4 -0
- dingo/convert/base.py +147 -0
- dingo/exec/__init__.py +3 -0
- dingo/exec/base.py +54 -0
- dingo/exec/local.py +288 -0
- dingo/exec/spark.py +169 -0
- dingo/io/__init__.py +2 -0
- dingo/io/export.py +0 -0
- dingo/io/input.py +27 -0
- dingo/io/summary.py +28 -0
- dingo/model/__init__.py +3 -0
- dingo/model/llm/__init__.py +0 -0
- dingo/model/llm/base.py +12 -0
- dingo/model/llm/common/__init__.py +0 -0
- dingo/model/llm/common/base_llm.py +395 -0
- dingo/model/llm/common/base_llm_api.py +396 -0
- dingo/model/llm/common/openai_api.py +222 -0
- dingo/model/llm/common/turbomind_api.py +148 -0
- dingo/model/llm/gpt.py +62 -0
- dingo/model/llm/llama3.py +97 -0
- dingo/model/llm/perspective.py +68 -0
- dingo/model/model.py +227 -0
- dingo/model/rule/__init__.py +0 -0
- dingo/model/rule/base.py +14 -0
- dingo/model/rule/common_rule.py +551 -0
- dingo/model/rule/image_rule.py +81 -0
- dingo/model/rule/prompt_rule.py +39 -0
- dingo/model/rule/util.py +282 -0
- dingo/utils/__init__.py +1 -0
- dingo/utils/log_util/__init__.py +32 -0
- dingo/utils/log_util/logger.py +39 -0
- dingo_python-1.0.dist-info/LICENSE +201 -0
- dingo_python-1.0.dist-info/METADATA +221 -0
- dingo_python-1.0.dist-info/RECORD +39 -0
- dingo_python-1.0.dist-info/WHEEL +5 -0
- dingo_python-1.0.dist-info/top_level.txt +1 -0
dingo/exec/spark.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
import json
|
|
3
|
+
import orjson
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Protocol, List, Dict, Any, Callable, Optional
|
|
7
|
+
|
|
8
|
+
from pyspark import SparkConf
|
|
9
|
+
from pyspark.sql import SparkSession, Row, DataFrame
|
|
10
|
+
from pyspark.sql.functions import explode, count, col, format_number
|
|
11
|
+
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType
|
|
12
|
+
|
|
13
|
+
from dingo.model import Model
|
|
14
|
+
from dingo.model.rule.base import BaseRule, ResModel as RuleResModel
|
|
15
|
+
|
|
16
|
+
# Mapping of quality-signal name -> list of rule classes, taken from the
# dingo Model registry; consulted by get_quality_signal() below.
QUALITY_MAP = Model.rule_metric_type_map
|
|
17
|
+
|
|
18
|
+
class SparkExecutor():
    """Run dingo quality rules over a Spark DataFrame of JSON records.

    Typical flow: ``create_spark``/``set_spark`` -> ``set_input_df`` (one
    JSON string per row, in column ``value``) -> ``convert_data`` ->
    ``execute`` -> ``summarize``; read the result via ``get_summary``.
    """

    def __init__(self):
        self.spark: Optional[SparkSession] = None
        self.input_df: Optional[DataFrame] = None    # raw rows, JSON text in .value
        self.convert_df: Optional[DataFrame] = None  # normalized {data_id, prompt, content} rows
        self.output_df: Optional[DataFrame] = None   # rows that failed at least one rule
        # Aggregate statistics, filled in by convert_data/execute/summarize.
        self.summary = {
            'score': 0.0,
            'num_good': 0,
            'num_bad': 0,
            'total': 0,
            'error_ratio': {
                "QUALITY_SIGNAL_EFFECTIVENESS": 0.0,
                "QUALITY_SIGNAL_COMPLETENESS": 0.0,
                "QUALITY_SIGNAL_UNDERSTANDABILITY": 0.0,
                "QUALITY_SIGNAL_SIMILARITY": 0.0,
                "QUALITY_SIGNAL_FLUENCY": 0.0,
                "QUALITY_SIGNAL_RELEVANCE": 0.0,
                "QUALITY_SIGNAL_SECURITY": 0.0
            }
        }

    def set_spark(self, spark: SparkSession):
        self.spark = spark

    def set_input_df(self, df: DataFrame):
        self.input_df = df

    def get_spark(self):
        return self.spark

    def get_input_df(self):
        return self.input_df

    def get_convert_df(self):
        return self.convert_df

    def get_output_df(self):
        return self.output_df

    def get_summary(self):
        return self.summary

    def create_spark(self, conf: SparkConf):
        """Create a SparkSession from *conf*, preferring Hive support.

        Falls back to a plain session when Hive support cannot be enabled.
        """
        try:
            self.spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()  # type: ignore
        except Exception:  # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            self.spark = SparkSession.builder.config(conf=conf).getOrCreate()  # type: ignore

    def convert_data(
        self,
        column_content: List[str],
        column_id: Optional[List[str]] = None,
        column_prompt: Optional[List[str]] = None,
    ):
        """Normalize each input row into a {data_id, prompt, content} JSON string.

        Args:
            column_content: Nested key path to the content field.
            column_id: Key path to the id field; a random UUID is used when None.
            column_prompt: Key path to the prompt field; '' is used when None.
        """
        def func(row: Row) -> Row:
            data = orjson.loads(row.value)
            new_data = {
                'data_id': find_nested_data(data, column_id) if column_id is not None else str(uuid.uuid4()),
                'prompt': find_nested_data(data, column_prompt) if column_prompt is not None else '',
                'content': find_nested_data(data, column_content),
            }
            return Row(value=orjson.dumps(new_data).decode("utf-8"))

        convert_df = self.input_df.rdd.map(func).toDF()
        self.summary['total'] = convert_df.count()
        self.convert_df = convert_df

    def summarize(self):
        """Finalize the summary: good/bad counts, score, per-signal error ratios."""
        self.summary['num_good'] = self.summary['total'] - self.summary['num_bad']
        self.summary['score'] = round(self.summary['num_good'] / self.summary['total'] * 100, 2) if self.summary['total'] != 0 else 0

        def extract_error_info(row):
            data = orjson.loads(row.value)
            # fix: field names must match the schema below; the original
            # Row(id=..., error_functions=...) depended on positional matching
            # (and on pre-3.0 PySpark, alphabetically-sorted kwargs) instead
            # of matching by name.
            return Row(data_id=data['data_id'], quality_signals=data['quality_signals'])

        schema = StructType([
            StructField("data_id", StringType(), True),
            StructField("quality_signals", ArrayType(StringType()), True)
        ])

        df_error_info = self.spark.createDataFrame(self.output_df.rdd.map(extract_error_info), schema=schema)

        df_exploded = df_error_info.select("data_id", explode("quality_signals").alias("quality_signal"))
        df_grouped = df_exploded.groupBy("quality_signal").agg(count("*").alias("count"))
        df_grouped = df_grouped.withColumn("ratio", format_number(col("count") / self.summary["total"], 6))

        rows = df_grouped.collect()
        for row in rows:
            quality_signal = row['quality_signal']
            # fix: format_number yields a string; keep error_ratio values float
            # for consistency with the 0.0 defaults above. The ratio is always
            # in [0, 1] (count <= total), so no thousands separators appear.
            self.summary['error_ratio'][quality_signal] = float(row['ratio'])

    def execute(self, rule_list: List[str]):
        """Apply *rule_list* to every converted record and keep the failures.

        Sets ``output_df`` to the records whose ``error_status`` is True and
        updates ``summary['num_bad']``.
        """
        def func_exec(row: Row):
            data = orjson.loads(row.value)
            new_data = execute_rule(rule_list, data)
            return Row(value=orjson.dumps(new_data).decode("utf-8"))

        def func_filter(row: Row):
            return orjson.loads(row.value)['error_status'] is True

        self.output_df = self.convert_df.rdd.map(func_exec).toDF()
        self.output_df = self.output_df.rdd.filter(func_filter).toDF()
        self.summary['num_bad'] = self.output_df.count()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def find_nested_data(jsn: Dict, levels: List[str]):
    """Walk *jsn* down the chain of keys in *levels* and return the value.

    Fix: the original annotation ``jsn: json`` annotated the parameter with
    the ``json`` *module*, not a type.

    Args:
        jsn: A (possibly nested) mapping parsed from JSON.
        levels: Keys to follow, outermost first; an empty list returns *jsn*.

    Returns:
        The value found at the end of the key path.

    Raises:
        KeyError: If a key in the path is missing.
        TypeError: If an intermediate value is not subscriptable.
    """
    data = jsn
    for key in levels:
        data = data[key]
    return data
|
|
134
|
+
|
|
135
|
+
def get_quality_signal(rule: BaseRule):
    """Return the quality-signal name that *rule* is registered under.

    Scans QUALITY_MAP (quality signal -> rule classes) and matches by class
    name; raises RuntimeError when the rule belongs to no signal.
    """
    for signal, rule_classes in QUALITY_MAP.items():
        if any(rule.__name__ == candidate.__name__ for candidate in rule_classes):
            return signal

    raise RuntimeError('this rule can not find its quality_signal: ' + rule.__name__)
|
|
142
|
+
|
|
143
|
+
def execute_rule(rule_list: List[str], data: json) -> json:
    """Run the named rules against one record, annotating it in place.

    Adds three keys to *data*: ``error_status`` (bool), ``error_functions``
    (names of rule classes that fired) and ``quality_signals`` (deduplicated
    signal names), then returns the same dict.

    Raises:
        KeyError: If a name in *rule_list* is not registered in
            ``Model.rule_name_map``.
    """
    data['error_status'] = False
    data['error_functions'] = []
    data['quality_signals'] = []

    # Resolve rule names to rule classes up front so unknown names fail fast.
    resolved: List[BaseRule] = []
    for name in rule_list:
        assert isinstance(name, str)
        if name not in Model.rule_name_map:
            raise KeyError(f"{name} not in Model.rule_name_map, there are {str(Model.rule_name_map.keys())}")
        resolved.append(Model.rule_name_map[name])

    for rule_cls in resolved:
        rule_name = rule_cls.__name__
        # Prompt-based rules evaluate prompt+content; all others content only.
        if rule_name.startswith('Prompt'):
            res: RuleResModel = rule_cls.eval([data["prompt"], data["content"]])
        else:
            res: RuleResModel = rule_cls.eval([data["content"]])

        if res.error_status:
            data['error_status'] = True
            data['error_functions'].append(rule_name)
            signal = get_quality_signal(rule_cls)
            if signal not in data['quality_signals']:
                data['quality_signals'].append(signal)

    return data
|
dingo/io/__init__.py
ADDED
dingo/io/export.py
ADDED
|
File without changes
|
dingo/io/input.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from typing import Optional, List
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class InputModel(BaseModel):
    """
    Input model, output of converter.

    One normalized record ready for rule evaluation.
    """
    # Unique identifier of the record.
    data_id: str
    # Instruction/prompt paired with the content; the Spark converter emits
    # '' when no prompt column is configured.
    prompt: str
    # The text to be evaluated by the quality rules.
    content: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RawInputModel(BaseModel):
    """
    Dataset model, output of converter.

    Describes where a dataset lives and how to extract fields from it.
    """
    # NOTE(review): this default is evaluated ONCE at import time, so every
    # instance created in the same process shares the same timestamp —
    # confirm this is intended (a default_factory would give per-instance
    # values).
    dataset_id: str = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    # Names of evaluation model configs to run.
    eval_models: List[str] = ['default']
    # Input file and output directory paths.
    input_path: str = "data/inputs/test_data1.json"
    output_path: str = "data/outputs/"
    # Serialization format of the input file.
    data_type: str = "json"
    # Nested key paths used to locate content / id / prompt in each record.
    column_content: List[str] = []
    column_id: List[str] = []
    column_prompt: List[str] = []
    # Optional path to a user-supplied config file.
    custom_config_path: Optional[str] = None
|
dingo/io/summary.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SummaryModel(BaseModel):
    """Evaluation summary for a single dataset run."""

    dataset_id: str
    input_model: str
    input_path: str
    output_path: str
    score: float
    num_good: int
    num_bad: int
    total: int
    error_ratio: Dict[str, float]

    def to_dict(self):
        """Return the summary as a plain dict (field name -> value)."""
        field_names = (
            'dataset_id',
            'input_model',
            'input_path',
            'output_path',
            'score',
            'num_good',
            'num_bad',
            'total',
            'error_ratio',
        )
        return {name: getattr(self, name) for name in field_names}
|
dingo/model/__init__.py
ADDED
|
File without changes
|
dingo/model/llm/base.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from copy import deepcopy
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from dingo.utils import log
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Alias for prompt inputs. In this vendored copy prompts are plain strings
# (parse_template also accepts lists of them); richer PromptList types from
# the upstream project are not included here.
PromptType = str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseLLMModel(ABC):
    """Base class for model wrapper.

    Args:
        path (str): The path to the model.
        max_seq_len (int): The maximum sequence length of the model. Defaults
            to 2048.
        tokenizer_only (bool): If True, only the tokenizer will be initialized.
            Defaults to False.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
    """

    # True for subclasses that talk to a remote API rather than local weights.
    is_api: bool = False

    def __init__(self,
                 path: str,
                 max_seq_len: int = 2048,
                 tokenizer_only: bool = False,
                 meta_template: Optional[Dict] = None):
        # The concrete model object is created by subclasses; stays None here.
        self.model = None
        self.path = path
        self.max_seq_len = max_seq_len
        self.tokenizer_only = tokenizer_only
        # meta template
        self.template_parser = LMTemplateParser(meta_template)
        self.eos_token_id = None
        if meta_template and 'eos_token_id' in meta_template:
            self.eos_token_id = meta_template['eos_token_id']

    @abstractmethod
    def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str]): A list of strings.
            max_out_len (int): The maximum length of the output.

        Returns:
            List[str]: A list of generated strings.
        """

    @abstractmethod
    def get_ppl(self,
                inputs: List[str],
                mask_length: Optional[List[int]] = None) -> List[float]:
        """Get perplexity scores given a list of inputs.

        Args:
            inputs (List[str]): A list of strings.
            mask_length (Optional[List[int]]): A list of mask lengths. If
                provided, the perplexity scores will be calculated with the
                first mask_length[i] tokens masked out. It's okay to skip
                its implementation if advanced features in PPLInfernecer is
                not needed.

        Returns:
            List[float]: A list of perplexity scores.
        """

    @abstractmethod
    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized strings.

        Args:
            prompt (str): Input string.

        Returns:
            int: Length of the input tokens
        """

    def parse_template(self, prompt_template: PromptType, mode: str) -> str:
        """Parse a prompt template, and wrap it with meta template if
        applicable.

        Args:
            prompt_template (List[str or PromptList]): A prompt
                template (potentially before being wrapped by meta template).
            mode (str): Parsing mode. Choices are 'ppl' and 'gen'.

        Returns:
            str: The final string.
        """
        return self.template_parser.parse_template(prompt_template, mode)

    def get_ppl_from_template(self,
                              templates: List[PromptType],
                              mask_length=None):
        """Get perplexity given a list of templates.

        Args:
            templates (List[PromptType]): A list of templates.
            mask_length (List[int]): A list of mask lengths. If provided, the
                perplexity will be calculated only on the unmasked tokens.
        """
        inputs = self.parse_template(templates, mode='ppl')
        return self.get_ppl(inputs, mask_length)

    def generate_from_template(self, templates: List[PromptType],
                               max_out_len: int, **kwargs):
        """Generate completion from a list of templates.

        Args:
            templates (List[PromptType]): A list of templates.
            max_out_len (int): The maximum length of the output.
        """
        inputs = self.parse_template(templates, mode='gen')
        return self.generate(inputs, max_out_len=max_out_len, **kwargs)

    def get_token_len_from_template(
            self,
            templates: Union[PromptType, List[PromptType]],
            mode: str = 'ppl') -> Union[List[int], int]:
        """Get lengths given a list of templates.

        Args:
            templates (Union[List[str], str]): Input template(s).
            mode (str): Parsing mode. Choices are 'ppl' and 'gen'.

        Returns:
            Union[List[int], int]: Length(s) of the input tokens. If the input
            is a list, a list of lengths will be returned. Otherwise, an int
            will be returned.
        """
        prompts = self.parse_template(templates, mode=mode)
        assert isinstance(prompts, (list, str)), 'tokens must be list or str'
        # A list of non-str prompts means batched input; a bare str is single.
        is_batched = isinstance(prompts,
                                list) and not isinstance(prompts, str)
        if not is_batched:
            prompts = [prompts]
        prompts = [str(prompt) for prompt in prompts]
        token_lens = [self.get_token_len(prompt) for prompt in prompts]
        return token_lens[0] if not is_batched else token_lens

    def to(self, device):
        # NOTE(review): assumes a subclass has assigned self.model (it is
        # None after __init__ here) — calling this on the base state raises
        # AttributeError. Confirm subclasses always set self.model first.
        self.model.to(device)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class LMTemplateParser:
    """Intermediate prompt template parser, specifically for language models.

    Args:
        meta_template (Dict): The meta template for the model.
    """

    def __init__(self, meta_template: Optional[Dict] = None):
        self.meta_template = meta_template
        if meta_template:
            assert 'round' in meta_template, 'round is required in meta' \
                ' template'
            assert isinstance(meta_template['round'], list)
            keys_to_check = ['round']

            if 'reserved_roles' in meta_template:
                assert isinstance(meta_template['reserved_roles'], list)
                keys_to_check.append('reserved_roles')

            self.roles: Dict[str, dict] = dict()  # maps role name to config
            for meta_key in keys_to_check:
                for item in meta_template[meta_key]:
                    assert isinstance(item, (str, dict))
                    if isinstance(item, dict):
                        assert item['role'] not in self.roles, \
                            'role in meta prompt must be unique!'
                        self.roles[item['role']] = item.copy()
                        # convert list of string and int into a raw string
                        # for the ease of future prompt processing
                        for key in ['begin', 'end']:
                            value = self.roles[item['role']].get(key, '')
                            if isinstance(value, list):
                                self.roles[item['role']][
                                    key] = self._encode_speical_tokens(value)

    def parse_template(self, prompt_template: PromptType, mode: str) -> str:
        """Parse a prompt template, and wrap it with meta template if
        applicable.

        Args:
            prompt_template (List[str or PromptList]): A prompt
                template (potentially before being wrapped by meta template).
            mode (str): Parsing mode. Choices are 'ppl' and 'gen'.

        Returns:
            str: The final string.
        """
        assert isinstance(prompt_template, (str, list))
        # Lists are handled element-wise by recursion; each element must be
        # a str (per the assert above on the recursive call).
        if not isinstance(prompt_template, (str)):
            return [self.parse_template(p, mode=mode) for p in prompt_template]

        assert mode in ['ppl', 'gen']
        if isinstance(prompt_template, str):
            return prompt_template
        # NOTE(review): in this vendored copy PromptType is plain str and the
        # assert above only admits str/list, both of which return earlier —
        # everything below appears unreachable for those inputs. It likely
        # supported an upstream PromptList type; confirm before relying on it.
        if self.meta_template:

            prompt = ''
            # Whether to keep generating the prompt
            generate = True

            section_stack = []  # stores tuples: (section_name, start_idx)

            for i, item in enumerate(prompt_template):
                if not generate:
                    break
                if isinstance(item, str):
                    prompt += item
                elif isinstance(item, dict) and 'section' in item:
                    if item['pos'] == 'end':
                        section_name, start_idx = section_stack.pop(-1)
                        assert section_name == item['section']
                        if section_name in ['round', 'ice']:
                            dialogue = prompt_template[start_idx:i]
                            round_ranges = self._split_rounds(
                                dialogue, self.meta_template['round'])
                            # Consider inserting multiple round examples into
                            # template
                            for i in range(len(round_ranges) - 1):
                                start = round_ranges[i]
                                end = round_ranges[i + 1]
                                round_template = dialogue[start:end]
                                role_dict = self._update_role_dict(
                                    round_template)
                                new_str, generate = self._prompt2str(
                                    self.meta_template['round'],
                                    role_dict,
                                    # Start generating only when the mode is in
                                    # generation and the template reaches the
                                    # last round
                                    for_gen=mode == 'gen'
                                    and i == len(round_ranges) - 2
                                    and section_name == 'round')
                                prompt += new_str
                    elif item['pos'] == 'begin':
                        assert item['section'] in [
                            'begin', 'round', 'end', 'ice'
                        ]
                        section_stack.append((item['section'], i + 1))
                    else:
                        raise ValueError(f'Invalid pos {item["pos"]}')
                # NOTE(review): the inner loop above reuses the outer loop
                # variable `i`; harmless here because `enumerate` reassigns
                # it each iteration, but easy to break when editing.
                elif section_stack[-1][0] in ['begin', 'end']:
                    role_dict = self._update_role_dict(item)
                    new_str, generate = self._prompt2str(item,
                                                         role_dict,
                                                         for_gen=mode == 'gen')
                    prompt += new_str

            prompt = self.meta_template.get('begin', '') + prompt
            if generate:
                prompt += self.meta_template.get('end', '')

        else:
            # in case the model does not have any meta template
            prompt = ''
            last_sep = ''
            for item in prompt_template:
                # Skip pure section markers ({'section': ..., 'pos': ...}).
                if isinstance(item, dict) and {'section', 'pos'} == set(
                        item.keys()):
                    continue
                if isinstance(item, str):
                    if item:
                        prompt += last_sep + item
                elif item.get('prompt', ''):  # it's a dict
                    prompt += last_sep + item.get('prompt', '')
                last_sep = '\n'
        return prompt

    def _split_rounds(
            self, prompt_template: List[Union[str, Dict]],
            single_round_template: List[Union[str, Dict]]) -> List[int]:
        """Split the prompt template into rounds, based on single round
        template.

        Return the index ranges of each round. Specifically,
        prompt_template[res[i]:res[i+1]] represents the i-th round in the
        template.
        """
        role_idxs = {
            role_cfg['role']: i
            for i, role_cfg in enumerate(single_round_template)
            if not isinstance(role_cfg, str)
        }
        last_role_idx = -1
        cutoff_idxs = [0]
        for idx, template in enumerate(prompt_template):
            if isinstance(template, str):
                continue
            role_idx = role_idxs[template['role']]
            # A role appearing at or before the previous role's position in
            # the single-round template marks the start of a new round.
            if role_idx <= last_role_idx:
                cutoff_idxs.append(idx)
            last_role_idx = role_idx
        cutoff_idxs.append(len(prompt_template))
        return cutoff_idxs

    def _update_role_dict(self, prompt: Union[List, str,
                                              Dict]) -> Dict[str, Dict]:
        """Update the default role dict with the given prompt(s)."""
        assert isinstance(prompt, (str, list, dict))
        role_dict = deepcopy(self.roles)
        if isinstance(prompt, str):
            return role_dict
        if isinstance(prompt, dict):
            prompt = [prompt]
        for p in prompt:
            if isinstance(p, dict):
                role = p['role']
                if role not in self.roles:
                    role = p.get('fallback_role', None)
                    if not role:
                        # NOTE(review): after this log, `role` may be None or
                        # an unknown name, so role_dict[role] below raises
                        # KeyError — confirm whether this path should raise
                        # explicitly instead.
                        log.info(f'{p} neither has an appropriate role nor a fallback role.')
                role_dict[role].update(p)
        return role_dict

    def _prompt2str(self,
                    prompt: Union[List, str, Dict],
                    role_dict: Dict[str, Dict],
                    for_gen: bool = False) -> Tuple[str, bool]:
        """Convert the prompts to a string, given an updated role_dict.

        Args:
            prompts (Union[List, str, dict]): The prompt(s) to be converted.
            role_dict (Dict[str, Dict]): The updated role dict.
            for_gen (bool): If True, the prompts will be converted for
                generation tasks. The conversion stops before the first
                role whose "generate" is set to True.

        Returns:
            Tuple[str, bool]: The converted string, and whether the follow-up
            conversion should be proceeded.
        """
        assert isinstance(prompt, (list, str, dict))

        if isinstance(prompt, str):
            return prompt, True
        if isinstance(prompt, dict):
            return self._role2str(prompt, role_dict, for_gen)

        res = ''
        for p in prompt:
            new_str, cont = self._prompt2str(p, role_dict, for_gen)
            res += new_str
            if not cont:
                break
        # NOTE(review): if `prompt` is an empty list, `cont` is unbound here
        # and this raises UnboundLocalError — confirm empty lists never reach
        # this point.
        return res, cont

    def _role2str(self,
                  role_prompt: Dict,
                  role_dict: Dict[str, Dict],
                  for_gen: bool = False) -> Tuple[str, bool]:
        """Convert a role prompt to a string, given an updated role_dict.

        Args:
            role_prompt (Dict): The role prompt to be converted.
            role_dict (Dict[str, Dict]): The updated role dict.
            for_gen (bool): If True, the prompts will be converted for
                generation tasks. The conversion stops before the first
                role whose "generate" is set to True.

        Returns:
            Tuple[str, bool]: The converted string, and whether the follow-up
            conversion should be proceeded.
        """
        merged_prompt = role_dict.get(
            role_prompt['role'],
            role_dict.get(role_prompt.get('fallback_role')))
        res = merged_prompt.get('begin', '')
        if for_gen and merged_prompt.get('generate', False):
            # Stop before emitting this role's prompt/end: the model is
            # expected to generate the continuation.
            return res, False
        res += merged_prompt.get('prompt', '') + merged_prompt.get('end', '')
        return res, True

    def _encode_speical_tokens(self, prompt: List[Union[str, int]]) -> str:
        """Encode the special tokens in the prompt.

        Now this is left for the future work
        """
        raise NotImplementedError('Using List[str|int] is as the begin or end'
                                  'of a prompt is not supported yet.')
        # NOTE(review): everything below the raise is dead code kept from a
        # planned implementation.
        res = ''
        for item in prompt:
            if isinstance(item, str):
                res += item
            else:
                res += f'<META_TOKEN_{item}>'
        return res
|