dingo-python 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
1
+ import threading
2
+ import os
3
+
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from typing import Dict, List, Optional, Union
6
+
7
+ from dingo.model.llm.common.base_llm_api import BaseLLMModel
8
+ from dingo.model.llm.common.base_llm import LMTemplateParser
9
+
10
+ # from opencompass.utils.logging import get_logger
11
+ # from opencompass.utils.prompt import PromptList
12
+
13
+
14
# typing.Union of a single member collapses to that member, so this alias
# is plain ``str``; keep the name for readability at call sites.
PromptType = str
15
+
16
+
17
def valid_str(string, coding='utf-8'):
    """Decode text according to its encoding type.

    Encodes *string* with *coding*, strips the byte sequence of the Unicode
    replacement character (U+FFFD), then decodes back, ignoring any bytes
    that cannot be decoded.
    """
    replacement_markers = (b'\xef\xbf\xbd',)
    encoded = string.encode(coding)
    for marker in replacement_markers:
        encoded = encoded.replace(marker, b'')
    return encoded.decode(encoding=coding, errors='ignore')
25
+
26
+
27
class TurboMindAPIModel(BaseLLMModel):
    """Model wrapper for lmdeploy api server.

    Args:
        api_addr (str): The address (ip:port format) of lmdeploy's
            api server.
        max_seq_len (int): The maximum allowed sequence length of a model.
            Note that the length of prompt + generated tokens shall not exceed
            this value. Defaults to 2048.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
        end_str (str, optional): Whether to trim generated strings with end_str
            if the model has special ending strings that are not handled well.
            Defaults to None.
    """

    is_api: bool = True

    def __init__(self,
                 api_addr: str = 'http://0.0.0.0:23333',
                 max_seq_len: int = 2048,
                 meta_template: Optional[Dict] = None,
                 end_str: Optional[str] = None,
                 **kwargs):
        super().__init__(path='',
                         max_seq_len=max_seq_len,
                         meta_template=meta_template)
        try:
            from lmdeploy.serve.openai.api_client import APIClient
        except ImportError as e:
            # Chain the original exception so the real import failure stays visible.
            raise ImportError('lmdeploy is not installed, please install lmdeploy first.') from e
        self.chatbot = APIClient(api_addr)
        # Use whichever model the server exposes first.
        self.model_name = self.chatbot.available_models[0]
        self.template_parser = LMTemplateParser(meta_template)
        self.eos_token_id = None
        # NOTE(review): token_bucket is never assigned in this class, so wait()
        # raises AttributeError unless a subclass/caller sets it — confirm.
        self.token_bucket = None
        if meta_template and 'eos_token_id' in meta_template:
            self.eos_token_id = meta_template['eos_token_id']
        self.api_addr = api_addr
        self.end_str = end_str

    def generate(
        self,
        inputs: List[str],
        max_out_len: int = 512,
        temperature: float = 1.0,
    ) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str or PromptList]): A list of strings or PromptDicts.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic. Defaults to 1.0.

        Returns:
            List[str]: A list of generated strings.
        """
        # Fan the prompts out over a thread pool; each worker issues one
        # blocking HTTP completion request.
        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(self._generate, inputs,
                             [max_out_len] * len(inputs),
                             [temperature] * len(inputs),
                             [self.end_str] * len(inputs)))
        return results

    def get_token_len(self, prompt: str) -> int:
        """Return the token count of *prompt* using the server-side tokenizer."""
        _, length = self.chatbot.encode(prompt)
        return length

    def get_ppl(self,
                inputs: List[str],
                mask_length: Optional[List[int]] = None) -> List[float]:
        """Perplexity is not supported by the API backend."""
        raise NotImplementedError('Not implemented in TurboMindAPIModel.')

    def wait(self):
        """Wait till the next query can be sent.

        Applicable in both single-thread and multi-thread environments.
        """
        return self.token_bucket.get_token()

    def _generate(self, prompt: str, max_out_len: int,
                  temperature: float, end_str: str) -> str:
        """Generate a completion for a single prompt.

        Args:
            prompt (str or PromptList): A string or PromptDict.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic.

        Returns:
            str: The generated string.
        """
        assert type(
            prompt) is str, 'We only support string for TurboMind RPC API'

        res = ''
        # The v1 completions endpoint yields output chunks; concatenate them.
        for output in self.chatbot.completions_v1(
                session_id=threading.current_thread().ident,
                prompt=prompt,
                model=self.model_name,
                max_tokens=max_out_len,
                temperature=temperature,
                top_p=1.0,
                top_k=10):
            res += output['choices'][0]['text']
        # Strip U+FFFD replacement characters from partially decoded output.
        res = valid_str(res)
        if end_str:
            # Trim everything after the model's special end marker.
            res = res.split(end_str)[0]
        return res
dingo/model/llm/gpt.py ADDED
@@ -0,0 +1,62 @@
1
+ import json
2
+
3
+ from dingo.model import Model
4
+ from dingo.model.llm.common.openai_api import OpenAI
5
+ from dingo.model.llm.base import BaseLLM, ResModel
6
+ from dingo.utils import log
7
+
8
@Model.llm_register('gpt')
class GPT(BaseLLM):
    """LLM quality rater backed by the OpenAI GPT-4 chat API.

    Sends ``general_filter`` (with the input text substituted) to the model
    and parses the JSON reply into a :class:`ResModel`.
    """

    # API key; populated via Model.apply_config ('key' in llm_config).
    key = ''

    # Lazily-created shared OpenAI client.
    gpt_client = None
    general_filter = """
    Please rate the following sentences based on their fluency, completeness, and level of repetition.
    The scores from low to high indicate the quality of the sentences, with values ranging from 0 to 10 and reasons given.
    Please provide a JSON format reply containing the specified key and value.
    requirement:
    -The returned content must be in JSON format and there should be no extra content.
    -The first key returned is score, which is an integer between 0 and 10.
    -The second key returned is error, with a value of one of the following: unsmooth, incomplete, or repetitive. If the sentence is correct, this value is empty.
    -The third key returned is reason, and the value is the reason for scoring.
    -If the sentence is empty, please give it a score of 0.


    %s

    """

    @classmethod
    def create_client(cls):
        """Create the shared OpenAI client on first use."""
        if cls.gpt_client is None:
            cls.gpt_client = OpenAI('gpt-4', key=cls.key)

    @classmethod
    def check_key(cls, data: dict) -> bool:
        """Return True if *data* contains all required reply keys."""
        key_list = ['score', 'error', 'reason']
        for key in key_list:
            if key not in data:
                return False
        return True

    @classmethod
    def call_api(cls, input_data: str) -> ResModel:
        """Rate *input_data* via GPT-4; fall back to a zero-score result.

        Returns:
            ResModel: Parsed score/error/reason, or an 'API_LOSS' result when
            the model reply is malformed.
        """
        cls.create_client()
        response = cls.gpt_client.generate([cls.general_filter % input_data])
        log.debug(response)
        try:
            response = json.loads(response[0])
            if cls.check_key(response) is False:
                raise RuntimeError('miss key: score, error, reason')

            return ResModel(
                score=response['score'],
                error=response['error'],
                reason=response['reason']
            )
        except (RuntimeError, json.JSONDecodeError):
            # A non-JSON model reply raises JSONDecodeError, which the old
            # `except RuntimeError` did not catch and so crashed the caller;
            # treat it the same as a missing-key reply.
            return ResModel(
                score=0,
                error='API_LOSS',
                reason=''
            )
@@ -0,0 +1,97 @@
1
+ import json
2
+ from pydantic import BaseModel
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+
5
+ from dingo.model import Model
6
+ from dingo.model.llm.base import BaseLLM, ResModel
7
+
8
+ try:
9
+ import torch
10
+ except ImportError as e:
11
+ raise ImportError("You need to install `torch`, try `pip install torch`")
12
+
13
@Model.llm_register('llama3')
class LLaMa3(BaseLLM):
    """LLM quality rater backed by a local LLaMA-3 checkpoint.

    Lazily loads the model/tokenizer from ``path`` and parses the model's
    JSON reply into a :class:`ResModel`.
    """

    # Checkpoint path; populated via Model.apply_config ('path' in llm_config).
    path = ''

    # Lazily-created shared model and tokenizer.
    model = None
    tokenizer = None
    general_filter = """
    Please rate the following sentences based on their fluency, completeness, and level of repetition.
    The scores from low to high indicate the quality of the sentences, with values ranging from 0 to 10 and reasons given.
    Please provide a JSON format reply containing the specified key and value.
    requirement:
    -The returned content must be in JSON format and there should be no extra content.
    -The first key returned is score, which is an integer between 0 and 10.
    -The second key returned is error, with a value of one of the following: unsmooth, incomplete, or repetitive. If the sentence is correct, this value is empty.
    -The third key returned is reason, and the value is the reason for scoring.
    -If the sentence is empty, please give it a score of 0.


    %s

    """

    @classmethod
    def generate_words(cls, input_data: str) -> dict:
        """Run the chat model on *input_data* and parse its reply as JSON.

        Raises:
            json.JSONDecodeError: If the model's reply is not valid JSON.
        """
        if cls.model is None:
            cls.model = AutoModelForCausalLM.from_pretrained(
                cls.path,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
        if cls.tokenizer is None:
            cls.tokenizer = AutoTokenizer.from_pretrained(cls.path)

        messages = [
            {"role": "system", "content": input_data},
        ]

        input_ids = cls.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(cls.model.device)

        # Stop on either the tokenizer EOS or LLaMA-3's end-of-turn token.
        terminators = [
            cls.tokenizer.eos_token_id,
            cls.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        outputs = cls.model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        # Drop the prompt tokens; decode only the newly generated tail.
        response = outputs[0][input_ids.shape[-1]:]
        return json.loads(cls.tokenizer.decode(response, skip_special_tokens=True))

    @classmethod
    def check_key(cls, data: dict) -> bool:
        """Return True if *data* contains all required reply keys."""
        key_list = ['score', 'error', 'reason']
        for key in key_list:
            if key not in data:
                return False
        return True

    @classmethod
    def call_api(cls, input_data: str) -> ResModel:
        """Rate *input_data*; fall back to a zero-score 'API_LOSS' result."""
        try:
            response = cls.generate_words(cls.general_filter % input_data)
            if cls.check_key(response) is False:
                raise RuntimeError('miss key: score, error, reason')

            return ResModel(
                score=response['score'],
                error=response['error'],
                reason=response['reason']
            )
        except (RuntimeError, json.JSONDecodeError):
            # generate_words raises JSONDecodeError on a non-JSON reply; the
            # old `except RuntimeError` let it escape and crash the caller.
            return ResModel(
                score=0,
                error='API_LOSS',
                reason=''
            )
@@ -0,0 +1,68 @@
1
+ import pprint
2
+
3
+ from dingo.model import Model
4
+ from dingo.model.llm.base import BaseLLM, ResModel
5
+
6
+ try:
7
+ from googleapiclient import discovery
8
+ except ImportError:
9
+ raise ImportError('googleapiclient not installed, please install it with `pip install google-api-python-client`')
10
+
11
@Model.llm_register('perspective')
class Perspective(BaseLLM):
    """Toxicity rater backed by Google's Perspective (commentanalyzer) API.

    Flags the text when any requested attribute's summary score exceeds 0.6.
    """

    # API key; populated via Model.apply_config ('key' in llm_config).
    key = ''
    api_url = 'https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1'

    # Lazily-created shared API client.
    client = None

    @classmethod
    def create_client(cls):
        """Build the shared commentanalyzer client on first use."""
        if cls.client is None:
            cls.client = discovery.build(
                "commentanalyzer",
                "v1alpha1",
                developerKey=cls.key,
                discoveryServiceUrl=cls.api_url,
                static_discovery=False,
            )

    @classmethod
    def call_api(cls, input_data: str) -> ResModel:
        """Analyze *input_data*; return score 10 if clean, 0 otherwise.

        Returns:
            ResModel: score 10 for clean text; score 0 with the offending
            attribute names joined in ``reason`` otherwise; an 'API_LOSS'
            result when the API call fails.
        """
        cls.create_client()
        analyze_request = {
            'comment': {'text': input_data},
            'requestedAttributes': {
                'TOXICITY': {},
                'SEVERE_TOXICITY': {},
                'IDENTITY_ATTACK': {},
                'INSULT': {},
                'PROFANITY': {},
                'THREAT': {},
                # 'SEXUALLY_EXPLICIT': {},
                # 'FLIRTATION': {},
            }
        }

        try:
            response = cls.client.comments().analyze(body=analyze_request).execute()
            result = ResModel()
            is_good = True
            error_list = []
            for attr in response['attributeScores']:
                value = response['attributeScores'][attr]['summaryScore']['value']
                if value > 0.6:  # flag attributes scored above this threshold
                    is_good = False
                    error_list.append(attr)
            if is_good is True:
                result.score = 10
            else:
                result.score = 0
                result.error = 'ERROR_PERSPECTIVE'
                result.reason = ",".join(error_list)
            return result
        except Exception:
            # The API raises googleapiclient HttpError / network errors, and a
            # malformed response raises KeyError — none of which are
            # RuntimeError, so the previous `except RuntimeError` was dead
            # code. Treat any failure at this boundary as a best-effort loss.
            return ResModel(
                score=0,
                error='API_LOSS',
                reason=''
            )
dingo/model/model.py ADDED
@@ -0,0 +1,227 @@
1
+ from functools import wraps
2
+ from typing import Dict, List, Callable, Optional
3
+ import os
4
+ import importlib
5
+
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
+ from dingo.config import GlobalConfig
11
+ from dingo.model.llm.base import BaseLLM
12
+ from dingo.model.rule.base import BaseRule
13
+ from dingo.utils import log
14
+ from dingo.model.llm.common.base_llm import BaseLLMModel
15
+
16
+
17
class BaseEvalModel(BaseModel):
    """Descriptor for one evaluation model: its name and its type."""
    name: str  # model identifier; presumably a rule name or llm id — confirm with callers
    type: str  # model category — confirm expected values with callers
20
+
21
+
22
class Model:
    """
    Model configuration class.

    Central registry for rule classes and llm model classes.  Rules register
    through :meth:`rule_register`, llms through :meth:`llm_register`, and
    :meth:`load_model` imports every module under ``dingo/model/rule`` and
    ``dingo/model/llm`` so those decorators run.
    """
    # Guard so load_model's directory scan only runs once per process.
    module_loaded = False
    # Quality-signal metric type -> registered rule classes.
    rule_metric_type_map = {
        'QUALITY_SIGNAL_EFFECTIVENESS': [],  # Effectiveness
        'QUALITY_SIGNAL_COMPLETENESS': [],  # Completeness
        'QUALITY_SIGNAL_UNDERSTANDABILITY': [],  # Understandability
        'QUALITY_SIGNAL_SIMILARITY': [],  # Similarity
        'QUALITY_SIGNAL_FLUENCY': [],  # Fluency
        'QUALITY_SIGNAL_RELEVANCE': [],  # Relevance
        'QUALITY_SIGNAL_SECURITY': [],  # Security
    }
    # Rule group name -> registered rule classes.
    rule_groups = {}
    # Rule class name -> rule class.
    rule_name_map = {}
    # llm id -> llm class.
    llm_models = {}

    def __init__(self):
        return

    @classmethod
    def get_rule_metric_type_map(cls) -> Dict[str, List[Callable]]:
        """
        Returns the rule metric type map.

        Returns:
            Rule metric type map ( { rule_metric_type: [rules] } )
        """
        return cls.rule_metric_type_map

    @classmethod
    def get_rule_group(cls, rule_group_name: str) -> List[Callable]:
        """
        Returns the rule groups by rule_group_name.

        Returns:
            Rule groups ( [rules] ).
        """
        return cls.rule_groups[rule_group_name]

    @classmethod
    def get_rule_groups(cls) -> Dict[str, List[Callable]]:
        """
        Returns the rule groups.

        Returns:
            Rule groups map ( { rule_group_id: [rules] } ).
        """
        return cls.rule_groups

    @classmethod
    def get_rule_by_name(cls, name: str) -> Callable:
        """
        Returns rule by name.

        Returns:
            Rule function.
        """
        return cls.rule_name_map[name]

    @classmethod
    def get_llm_models(cls) -> Dict[str, "BaseLLMModel"]:
        """
        Returns the llm models.

        Returns:
            LLM models class List
        """
        return cls.llm_models

    @classmethod
    def get_llm_model(cls, llm_model_name: str) -> "BaseLLMModel":
        """
        Returns the llm model by llm_model_name.
        Args:
            llm_model_name (str): The name of the llm model.

        Returns:
            LLM model class
        """
        return cls.llm_models[llm_model_name]

    @classmethod
    def print_rule_list(cls) -> None:
        """
        Print the names of all registered rules.
        """
        rule_list = []
        for rule_name in cls.rule_name_map:
            rule_list.append(rule_name)
        print(rule_list)

    @classmethod
    def get_all_info(cls):
        """
        Returns rules' map and llm models' map.

        NOTE: not implemented yet; always raises NotImplementedError.
        """
        raise NotImplementedError()

    @classmethod
    def rule_register(cls, metric_type: str, group: List[str]) -> Callable:
        """
        Register a rule class. (decorator factory)
        Args:
            metric_type (str): The metric type (quality map).
            group (List[str]): The group names.

        Raises:
            KeyError: If metric_type is not a known quality signal.
        """
        def decorator(root_class):
            # Register the rule under every requested group and by its name.
            for group_name in group:
                if group_name not in cls.rule_groups:
                    cls.rule_groups[group_name] = []
                cls.rule_groups[group_name].append(root_class)
            cls.rule_name_map[root_class.__name__] = root_class

            # metric_type must be one of the predefined quality signals.
            if metric_type not in cls.rule_metric_type_map:
                raise KeyError(f'Metric type "{metric_type}" can not be registered.')
            cls.rule_metric_type_map[metric_type].append(root_class)

            @wraps(root_class)
            def wrapped_function(*args, **kwargs):
                return root_class(*args, **kwargs)

            return wrapped_function

        return decorator

    @classmethod
    def llm_register(cls, llm_id: str) -> Callable:
        """
        Register an llm model class. (decorator factory)
        Args:
            llm_id (str): Name of llm model class.
        """
        def decorator(root_method):
            cls.llm_models[llm_id] = root_method

            @wraps(root_method)
            def wrapped_function(*args, **kwargs):
                return root_method(*args, **kwargs)

            return wrapped_function

        return decorator

    @classmethod
    def apply_config(cls, custom_config_path: Optional[str]):
        """
        Read the (optional) custom config file and push configured parameter
        values onto the matching registered rule and llm classes.
        """
        GlobalConfig.read_config_file(custom_config_path)
        if GlobalConfig.config and GlobalConfig.config.rule_config:
            for rule, params in GlobalConfig.config.rule_config.items():
                if rule not in cls.rule_name_map:
                    continue
                assert isinstance(rule, str)
                for param_name in ['threshold', 'pattern', 'key_list', 'file_path']:
                    param_value = getattr(params, param_name)
                    if not param_value:
                        continue
                    log.debug(f"[Rule config]: config {param_name} for {rule}")
                    cls_rule: BaseRule = cls.rule_name_map[rule]
                    setattr(cls_rule, param_name, param_value)
        if GlobalConfig.config and GlobalConfig.config.llm_config:
            for llm, params in GlobalConfig.config.llm_config.items():
                if llm not in cls.llm_models.keys():
                    continue
                assert isinstance(llm, str)
                for param_name in ['path', 'key', 'api_url']:
                    param_value = getattr(params, param_name)
                    if not param_value:
                        continue
                    log.debug(f"[LLM config]: config {param_name} for {llm}")
                    cls_llm: BaseLLM = cls.llm_models[llm]
                    setattr(cls_llm, param_name, param_value)

    @classmethod
    def _import_submodules(cls, sub_module: str) -> None:
        """Import every .py module under dingo/model/<sub_module>.

        Importing runs the register decorators in each module.  Modules that
        fail to import (e.g. optional dependencies missing) are skipped with
        a debug log instead of aborting the scan.
        """
        this_module_directory = os.path.dirname(os.path.abspath(__file__))
        for file in os.listdir(os.path.join(this_module_directory, sub_module)):
            path = os.path.join(this_module_directory, sub_module, file)
            if not (os.path.isfile(path) and file.endswith('.py')) or file == '__init__.py':
                continue
            module_name = file.split('.')[0]
            try:
                importlib.import_module('dingo.model.' + sub_module + '.' + module_name)
            except ImportError as e:  # ModuleNotFoundError is a subclass
                log.debug("=" * 30 + " ImportError " + "=" * 30)
                log.debug(f'module {module_name} not imported because: \n{e}')
                log.debug("=" * 73)

    @classmethod
    def load_model(cls):
        """Scan and import all rule and llm modules exactly once."""
        if cls.module_loaded:
            return
        # rule auto register
        cls._import_submodules('rule')
        # llm auto register
        cls._import_submodules('llm')
        cls.module_loaded = True
File without changes
@@ -0,0 +1,14 @@
1
+ from typing import Protocol, List, Union
2
+ from pydantic import BaseModel
3
+
4
+
5
class ResModel(BaseModel):
    """Result of a rule evaluation."""
    error_status: bool = False  # presumably True when the rule flagged the input — confirm with rule implementations
    error_reason: str = ''  # human-readable explanation accompanying error_status
8
+
9
+
10
class BaseRule(Protocol):
    """Structural (duck-typed) interface that every rule class satisfies."""

    @classmethod
    def eval(cls, input_data: List[str]) -> ResModel:
        """Evaluate *input_data* and report the outcome as a ResModel."""
        ...