dingo_python-1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dingo/__init__.py ADDED
File without changes
@@ -0,0 +1 @@
1
+ from dingo.config.config import GlobalConfig
dingo/config/config.py ADDED
@@ -0,0 +1,47 @@
1
+ from typing import Optional, List, Dict
2
+ import json
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from dingo.utils import log
7
+
8
+
9
class DynamicRuleConfig(BaseModel):
    """Per-rule override options loaded from the user's JSON config file."""

    # Numeric cutoff consumed by threshold-based rules — TODO confirm exact semantics per rule.
    threshold: Optional[float] = None
    # Pattern string — presumably a regex used by pattern-matching rules; verify against rule code.
    pattern: Optional[str] = None
    # Chain of keys — presumably used to locate nested fields in records; confirm with consumers.
    key_list: Optional[List[str]] = None
    # Path to an auxiliary data file a rule may need.
    file_path: Optional[str] = None
14
+
15
class DynamicLLMConfig(BaseModel):
    """Per-LLM override options loaded from the user's JSON config file."""

    # presumably a local model or module path — TODO confirm against the LLM loader.
    path: Optional[str] = None
    # API key / credential — verify against the LLM client code.
    key: Optional[str] = None
    # Endpoint URL used when calling the LLM over HTTP.
    api_url: Optional[str] = None
19
+
20
class Config(BaseModel):
    """Top-level parsed configuration (populated by GlobalConfig.read_config_file)."""

    # Names of custom rules to run; resolved against Model.rule_name_map by the executor.
    custom_rule_list: Optional[List[str]] = []
    # Per-rule overrides keyed by rule name.
    rule_config: Optional[Dict[str, DynamicRuleConfig]] = {}
    # Per-LLM overrides keyed by model name.
    llm_config: Optional[Dict[str, DynamicLLMConfig]] = {}
    # NOTE: mutable defaults are safe on a pydantic BaseModel — pydantic copies
    # field defaults per instance, unlike plain class attributes.
24
+
25
+
26
class GlobalConfig:
    """Process-wide configuration holder, populated from an optional JSON file."""

    # Shared Config instance; None until read_config_file() has been called.
    config = None

    @classmethod
    def read_config_file(cls, custom_config_path: Optional[str]):
        """Load configuration from *custom_config_path* into ``cls.config``.

        Args:
            custom_config_path: Path to a JSON config file, or None for defaults.

        Raises:
            RuntimeError: when the file exists but its contents cannot be
                turned into a valid Config.
        """
        if custom_config_path is None:
            cls.config = Config()
            return
        try:
            with open(custom_config_path, "r") as f:
                rule_json = json.load(f)
        except FileNotFoundError:
            # Bug fix: the original only logged here and fell through, then
            # crashed on the unbound 'rule_json' below (a NameError disguised
            # as "Error loading config"). Log and fall back to defaults,
            # matching the None-path behavior.
            log.error("No config file found, error path.")
            cls.config = Config()
            return

        try:
            cls.config = Config(
                custom_rule_list=rule_json.get('custom_rule_list', []),
                rule_config={i: DynamicRuleConfig(**rule_config) for i, rule_config in rule_json.get('rule_config', {}).items()},
                llm_config={i: DynamicLLMConfig(**llm_config) for i, llm_config in rule_json.get('llm_config', {}).items()},
            )
        except Exception as e:
            raise RuntimeError(f"Error loading config: {e}")
@@ -0,0 +1,4 @@
from typing import Dict, Type

from dingo.convert.base import BaseConverter

# Registry of data_type name -> converter class, populated by
# BaseConverter.register() as converter modules are imported.
# Fix: the registry holds converter *classes* (register() stores the class
# object), so the correct value type is Type[BaseConverter], not instances.
converters: Dict[str, Type[BaseConverter]] = BaseConverter.converters
dingo/convert/base.py ADDED
@@ -0,0 +1,147 @@
1
+ from functools import wraps
2
+ from typing import List, Protocol
3
+ import json
4
+
5
+ from dingo.io import InputModel, RawInputModel
6
+ from dingo.utils import log
7
+
8
+
9
class ConverterProto(Protocol):
    """Structural interface every file-format converter must satisfy."""

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Convert the file referenced by *raw_input* into InputModel items."""
13
+
14
+
15
class BaseConverter(ConverterProto):
    """Base class for file-format converters.

    Concrete converters decorate themselves with ``@BaseConverter.register(name)``
    and are looked up by name through the shared ``converters`` registry.
    """

    # Registry mapping data_type name -> registered converter class.
    converters = {}

    def __init__(self):
        pass

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Convert the file referenced by *raw_input* into InputModel items.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError()

    @classmethod
    def register(cls, type_name: str):
        """Class decorator: record the decorated converter under *type_name*.

        NOTE(review): the decorator returns a wrapping factory function, not
        the class itself, so the module-level name becomes a callable rather
        than the class. The registry keeps the real class; behavior preserved.
        """
        def decorator(root_class):
            cls.converters[type_name] = root_class

            @wraps(root_class)
            def wrapped_function(*args, **kwargs):
                return root_class(*args, **kwargs)

            return wrapped_function

        return decorator

    @classmethod
    def find_levels_data(cls, data: dict, levels: List[str]):
        """Follow the chain of *levels* keys into *data* and return the nested value.

        Fix: the annotation was ``data: json`` — that names the json *module*,
        which is not a type; the argument is parsed JSON (a dict).

        Raises:
            KeyError: when any key in *levels* is missing.
        """
        res = data
        for key in levels:
            res = res[key]
        return res
44
+
45
+
46
@BaseConverter.register('json')
class JsonConverter(BaseConverter):
    """
    Json file converter.

    The whole file is one JSON object: {key: record, ...}.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Parse the file as one JSON object and emit an InputModel per top-level entry."""
        log.debug("Loading data from json file")
        with open(raw_input.input_path, 'r', encoding='utf-8') as fin:
            parsed = json.loads(fin.read())
        models = []
        for key, record in parsed.items():
            # Fall back to the top-level key as id / empty prompt when no
            # column mapping was supplied.
            if raw_input.column_id != []:
                data_id = cls.find_levels_data(record, raw_input.column_id)
            else:
                data_id = str(key)
            if raw_input.column_prompt != []:
                prompt = cls.find_levels_data(record, raw_input.column_prompt)
            else:
                prompt = ''
            models.append(InputModel(
                data_id=data_id,
                prompt=prompt,
                content=cls.find_levels_data(record, raw_input.column_content),
            ))
        return models
70
+
71
+
72
@BaseConverter.register('plaintext')
class PlainConverter(BaseConverter):
    """
    Plain text file converter

    Emits one InputModel per line of the file.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Emit one InputModel per line; data_id is the 0-based line index as a string."""
        log.debug("Loading data from plaintext file")
        with open(raw_input.input_path, 'r', encoding='utf-8') as fin:
            # NOTE: each line keeps its trailing newline, exactly as file
            # iteration yields it.
            return [
                InputModel(data_id=str(line_no), prompt='', content=text)
                for line_no, text in enumerate(fin)
            ]
95
+
96
+
97
@BaseConverter.register('jsonl')
class JsonLineConverter(BaseConverter):
    """
    Json line file converter.

    Each non-empty line of the file is parsed as one JSON record.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Emit one InputModel per JSON line.

        data_id / prompt fall back to the running line counter / empty string
        when no column mapping is configured.
        """
        log.debug("Loading data from jsonl file")
        data_id = 0
        raw_data = []
        with open(raw_input.input_path, 'r', encoding='utf-8') as f:
            for j_l in f:
                # Robustness fix: skip blank lines (e.g. a trailing newline at
                # EOF, common in .jsonl files) instead of crashing in json.loads.
                if not j_l.strip():
                    continue
                j = json.loads(j_l)
                raw_data.append(InputModel(**{
                    'data_id': cls.find_levels_data(j, raw_input.column_id) if raw_input.column_id != [] else str(data_id),
                    'prompt': cls.find_levels_data(j, raw_input.column_prompt) if raw_input.column_prompt != [] else '',
                    'content': cls.find_levels_data(j, raw_input.column_content)
                }))
                data_id += 1
        return raw_data
121
+
122
@BaseConverter.register('listjson')
class ListJsonConverter(BaseConverter):
    """
    List json file converter.

    The whole file is one JSON array of records.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def load_data(cls, raw_input: RawInputModel) -> List[InputModel]:
        """Parse the file as a JSON list and emit one InputModel per element."""
        log.debug("Loading data from list json file")
        with open(raw_input.input_path, 'r', encoding='utf-8') as fin:
            records = json.loads(fin.read())
        models = []
        for idx, record in enumerate(records):
            # Fall back to the element index / empty prompt when no column
            # mapping was supplied.
            use_id_column = raw_input.column_id != []
            use_prompt_column = raw_input.column_prompt != []
            models.append(InputModel(
                data_id=cls.find_levels_data(record, raw_input.column_id) if use_id_column else str(idx),
                prompt=cls.find_levels_data(record, raw_input.column_prompt) if use_prompt_column else '',
                content=cls.find_levels_data(record, raw_input.column_content),
            ))
        return models
dingo/exec/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from dingo.exec.local import LocalExecutor # noqa E402.
2
+
3
+ from dingo.exec.base import Executor, ExecProto # noqa E402.
dingo/exec/base.py ADDED
@@ -0,0 +1,54 @@
1
+ from functools import wraps
2
+ from abc import ABC, abstractmethod
3
+ from typing import Protocol, List, Dict, Any
4
+
5
+ from dingo.io import InputModel, SummaryModel
6
+
7
+
8
class ExecProto(Protocol):
    """Structural interface implemented by evaluation executors."""

    def load_data(self, path: str, data_type: str) -> List[InputModel]:
        """Read raw data from *path* into InputModel items."""

    def evaluate(self) -> List[SummaryModel]:
        """Run the configured models over the loaded data."""

    def summarize(self, inputs: InputModel) -> SummaryModel:
        """Aggregate per-item results into a SummaryModel."""

    def execute(self) -> SummaryModel:
        """Entry point combining load / evaluate / summarize."""
20
+
21
+
22
class Executor(ABC):
    """Abstract evaluation executor.

    Concrete executors register themselves under a name via
    ``@Executor.register(name)`` and are looked up through ``exec_map``.
    """

    # Registry mapping executor name -> registered executor class.
    exec_map: Dict[str, Any] = {}

    @abstractmethod
    def load_data(self, path) -> List[InputModel]:
        """Read raw data from *path* into InputModel items."""
        raise NotImplementedError()

    @abstractmethod
    def evaluate(self) -> List[SummaryModel]:
        """Run the configured models over the loaded data."""
        raise NotImplementedError()

    @abstractmethod
    def summarize(self, inputs: InputModel) -> SummaryModel:
        """Aggregate per-item results into a SummaryModel."""
        raise NotImplementedError()

    @abstractmethod
    def execute(self) -> SummaryModel:
        """Entry point combining load / evaluate / summarize."""
        raise NotImplementedError()

    @classmethod
    def register(cls, exec_name: str):
        """Class decorator: record the decorated executor under *exec_name*.

        Returns a factory that forwards to the class constructor; the registry
        keeps the class object itself.
        """
        def decorator(target):
            cls.exec_map[exec_name] = target

            @wraps(target)
            def factory(*args, **kwargs):
                return target(*args, **kwargs)

            return factory

        return decorator
54
+
dingo/exec/local.py ADDED
@@ -0,0 +1,288 @@
1
+ from typing import Callable, List
2
+ import os
3
+ import time
4
+ import json
5
+ import pprint
6
+
7
+ from dingo.exec.base import Executor
8
+ from dingo.convert import converters
9
+ from dingo.config import GlobalConfig
10
+ from dingo.model import Model
11
+ from dingo.model.llm.base import BaseLLM, ResModel as LLMResModel
12
+ from dingo.model.rule.base import BaseRule, ResModel as RuleResModel
13
+ from dingo.io import RawInputModel, InputModel, SummaryModel
14
+ from dingo.utils import log
15
+
16
+ QUALITY_MAP = Model.rule_metric_type_map
17
+
18
+
19
@Executor.register('local')
class LocalExecutor(Executor):
    """Executor that walks the local filesystem and evaluates datasets with rule or LLM models."""

    def __init__(self, raw_input: RawInputModel):
        # Raw evaluation request: input/output paths, data type, model names,
        # and column mappings for the converters.
        self.raw_input = raw_input

    def load_data(self, path) -> List[InputModel]:
        """
        Reads data from given path.

        Returns:
            List[InputModel]
        """
        # NOTE(review): this aliases (does not copy) self.raw_input, so every
        # call mutates the shared raw_input.input_path — confirm intended.
        new_raw_input = self.raw_input
        new_raw_input.input_path = path
        # Dispatch on the declared data type ('json', 'jsonl', 'plaintext', 'listjson').
        converter = converters[self.raw_input.data_type]
        return converter.load_data(raw_input=new_raw_input)

    def execute(self) -> List[SummaryModel]:
        """
        Executes given input models.

        Returns:

        """
        # Intentionally a no-op here; evaluate() drives the actual work.
        pass

    def summarize(self, input_models: List[InputModel]) -> SummaryModel:
        # Intentionally a no-op; write_data_rule/write_data_llm build summaries.
        pass

    def walk_path(self, path_list, record, model, model_type):
        """
        Walk the path list to get
        Args:

            path_list (list): paths to visit; directories are expanded in place
                (the list is used as a work queue and APPENDED TO while being
                iterated — deliberate, do not convert to a plain copy-iteration).
            record (dict): mutable accumulator of counts and error info.
            model: the rule list or LLM model to apply.
            model_type (str): 'rule' or 'llm'; selects the per-item executor.
        """
        for path in path_list:
            log.debug('[Handling]:' + path)

            if os.path.isdir(path):
                # Enqueue children; they are processed by the same loop.
                for path_child in os.listdir(path):
                    path_child = path + '/' + path_child
                    path_list.append(path_child)
                continue

            if not os.path.isfile(path):
                continue
            dataset = self.load_data(path)
            if len(dataset) == 0:
                continue
            record['total'] += len(dataset)

            log.debug('[Dataset]: ' + str(dataset))
            for data in dataset:
                # executor() returns execute_rule or execute_llm.
                executor(model_type)(record, model, data)
            log.debug('[Record]: ' + str(record))
        # Derive score and per-error ratios from the accumulated counters.
        calculate_ratio(record, model_type)
        log.debug('[Record]: ' + str(record))

    def evaluate(self) -> List[SummaryModel]:
        """Run every requested model over the input path and return one summary per model."""
        current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        input_path = self.raw_input.input_path
        # Results go to a fresh timestamped directory per run.
        output_path = self.raw_input.output_path + current_time
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        record_list = []
        custom = False
        log.debug(str(self.raw_input.eval_models))
        for model_name in self.raw_input.eval_models:
            log.debug(f"[GlobalConfig.config]: {GlobalConfig.config}")
            # Resolve the model: known LLM, known rule group, or a custom
            # rule list from the global config (in that priority order).
            if model_name in Model.llm_models:
                log.debug(f"[Load llm model {model_name}]")
                model = Model.llm_models[model_name]
                model_type = 'llm'
            elif model_name in Model.rule_groups:
                log.debug(f"[Load rule model {model_name}]")
                model: List[BaseRule] = Model.rule_groups[model_name]
                model_type = 'rule'
            elif GlobalConfig.config and GlobalConfig.config.custom_rule_list:
                log.debug("[Load custom rule]")
                custom = True
                model: List[BaseRule] = []
                for rule in GlobalConfig.config.custom_rule_list:
                    assert isinstance(rule, str)
                    if rule not in Model.rule_name_map:
                        raise KeyError(f"{rule} not in Model.rule_name_map, there are {str(Model.rule_name_map.keys())}")
                    model.append(Model.rule_name_map[rule])

                model_type = 'rule'
            else:
                raise KeyError('no such model: ' + model_name)
            log.debug("[ModelType]: " + model_type)
            model_path = output_path + '/' + model_name
            if not os.path.exists(model_path):
                os.makedirs(model_path)

            # Mutable accumulator shared by walk_path / execute_* / calculate_ratio.
            record = {
                'dataset_id': self.raw_input.dataset_id,
                'input_model': model_name,
                'input_path': input_path,
                'output_path': output_path,
                'score': 0,
                'num_good': 0,
                'num_bad': 0,
                'total': 0,
                'error_info': {},
            }
            path_list = [input_path]
            self.walk_path(path_list, record, model, model_type)

            # pprint.pprint(record, sort_dicts=False)
            if model_type == 'rule':
                summary = write_data_rule(record, model_path)
            elif model_type == 'llm':
                summary = write_data_llm(record, model_path)
            else:
                raise KeyError('no such model: ' + model_type)

            record_list.append(summary)
            # A custom rule list applies to the whole run, not per model name.
            if custom:
                break
        log.debug(record_list)
        return record_list
147
+
148
+
149
def get_quality_signal(rule: Callable):
    """Return the quality-signal name whose rule group contains *rule*.

    Matching is by class __name__; raises RuntimeError when no group claims
    the rule.
    """
    for signal, rule_classes in QUALITY_MAP.items():
        if any(rule.__name__ == candidate.__name__ for candidate in rule_classes):
            return signal

    raise RuntimeError('this rule can not find its quality_signal: ' + rule.__name__)
156
+
157
+
158
def write_data_rule(record, path):
    """Serialize rule-evaluation results under *path* and return the SummaryModel.

    Writes one JSON file per (quality signal, rule) pair plus a summary.json.
    """
    summary = SummaryModel(
        dataset_id=record['dataset_id'],
        input_model=record['input_model'],
        input_path=record['input_path'],
        output_path=record['output_path'],
        score=record['score'],
        num_good=record['num_good'],
        num_bad=record['num_bad'],
        total=record['total'],
        error_ratio={},
    )

    for quality_signal, signal_info in record['error_info'].items():
        summary.error_ratio[quality_signal] = 0
        if signal_info['count'] == 0:
            continue

        signal_dir = '{}/{}'.format(path, quality_signal)
        if not os.path.exists(signal_dir):
            os.makedirs(signal_dir)
        summary.error_ratio[quality_signal] = round(signal_info['count'] / record['total'], 6)
        for rule_name, rule_info in signal_info.items():
            # 'count' is the aggregate counter, not a rule entry.
            if rule_name == 'count':
                continue
            with open('{}/{}.json'.format(signal_dir, rule_name), 'w', encoding='utf-8') as out:
                json.dump(rule_info, out, indent=4, ensure_ascii=False)

    with open(path + '/summary.json', 'w', encoding='utf-8') as out:
        json.dump(summary.to_dict(), out, indent=4, ensure_ascii=False)
    return summary
188
+
189
+
190
def write_data_llm(record, path):
    """Serialize LLM-evaluation results under *path* and return the SummaryModel.

    Writes one JSON file per error type plus a summary.json.
    """
    summary = SummaryModel(
        dataset_id=record['dataset_id'],
        input_model=record['input_model'],
        input_path=record['input_path'],
        output_path=record['output_path'],
        score=record['score'],
        num_good=record['num_good'],
        num_bad=record['num_bad'],
        total=record['total'],
        error_ratio={},
    )

    for error_type, error_info in record['error_info'].items():
        summary.error_ratio[error_type] = error_info['ratio']
        with open('{}/{}.json'.format(path, error_type), 'w', encoding='utf-8') as out:
            json.dump(error_info, out, indent=4, ensure_ascii=False)

    with open(path + '/summary.json', 'w', encoding='utf-8') as out:
        json.dump(summary.to_dict(), out, indent=4, ensure_ascii=False)
    return summary
211
+
212
+
213
def execute_rule(record, rule_map, d: InputModel):
    """Run every rule in *rule_map* against one input item, accumulating into *record*.

    Increments num_bad once per bad item and each quality signal's count once
    per item that trips any rule in that signal.
    """
    data_is_good = True
    signal_is_good = {}
    # Ensure every quality signal has a bucket before any rule fires.
    for signal in QUALITY_MAP:
        signal_is_good[signal] = True
        if signal not in record['error_info']:
            record['error_info'][signal] = {'count': 0}
    log.debug("[RuleMap]: " + str(rule_map))
    for rule in rule_map:
        rule_name = rule.__name__
        # Prompt-aware rules receive [prompt, content]; others just [content].
        if rule_name.startswith('Prompt'):
            verdict: RuleResModel = rule.eval([d.prompt, d.content])
        else:
            verdict: RuleResModel = rule.eval([d.content])
        if not verdict.error_status:
            continue
        data_is_good = False
        signal = get_quality_signal(rule)
        signal_is_good[signal] = False
        bucket = record['error_info'][signal]
        if rule_name not in bucket:
            bucket[rule_name] = {'name': rule_name, 'count': 0, 'ratio': 0, 'detail': []}
        bucket[rule_name]['count'] += 1
        bucket[rule_name]['detail'].append(
            {'data_id': d.data_id, 'prompt': d.prompt, 'content': d.content, 'error_reason': verdict.error_reason})

    if not data_is_good:
        record['num_bad'] += 1
    for signal, is_good in signal_is_good.items():
        if not is_good:
            record['error_info'][signal]['count'] += 1
245
+
246
+
247
def execute_llm(record, llm: BaseLLM, d: InputModel):
    """Score one input via the LLM API and record it as an error when the score is low."""
    verdict: LLMResModel = llm.call_api(d.content)
    # Scores above 6 count as good data — TODO confirm threshold provenance.
    if verdict.score > 6:
        return

    record['num_bad'] += 1
    error_name = verdict.error
    if error_name not in record['error_info']:
        record['error_info'][error_name] = {'name': error_name, 'count': 0, 'ratio': 0, 'detail': []}
    bucket = record['error_info'][error_name]
    bucket['count'] += 1
    bucket['detail'].append({
        'api_score': verdict.score,
        'data_id': d.data_id,
        'prompt': d.prompt,
        'content': d.content,
        'error_reason': verdict.reason,
    })
260
+
261
+
262
def calculate_ratio(record, model_type):
    """Derive num_good, overall score, and per-error ratios from raw counters.

    Mutates *record* in place. For 'rule' records, ratios live two levels deep
    (quality signal -> rule name); otherwise one level (error type).

    Args:
        record: accumulator dict with 'total', 'num_bad', 'error_info', etc.
        model_type: 'rule' or anything else (treated as llm-style layout).
    """
    record['num_good'] = record['total'] - record['num_bad']
    if record['total'] == 0:
        # Robustness fix: with no loaded data the original raised
        # ZeroDivisionError; report a zero score instead.
        record['score'] = 0
        return
    record['score'] = round(record['num_good'] / record['total'] * 100, 2)
    if model_type == 'rule':
        for q_s in record['error_info']:
            for r_n in record['error_info'][q_s]:
                # 'count' is the aggregate counter, not a rule entry.
                if r_n in ['count',]:
                    continue
                record['error_info'][q_s][r_n]['ratio'] = round(record['error_info'][q_s][r_n]['count'] / record['total'], 6)
    else:
        for e in record['error_info']:
            record['error_info'][e]['ratio'] = round(record['error_info'][e]['count'] / record['total'], 6)
274
+
275
+
276
def executor(model_type: str) -> Callable:
    """Return the per-item execution callable for *model_type*.

    Raises:
        RuntimeError: for any type other than 'rule' or 'llm'.
    """
    # Kept as lazy branches (not a dict) so only the selected name is resolved.
    if model_type == 'llm':
        return execute_llm
    if model_type == 'rule':
        return execute_rule
    raise RuntimeError(f'Unsupported model type: {model_type}')
282
+
283
+
284
def write_data(model_type: str) -> Callable:
    """Return the result-writer callable for *model_type*.

    Consistency fix: mirrors executor()'s dispatch and raises RuntimeError for
    unknown types instead of silently returning None (which would crash later
    at the call site).

    Raises:
        RuntimeError: for any type other than 'rule' or 'llm'.
    """
    if model_type == 'rule':
        return write_data_rule
    if model_type == 'llm':
        return write_data_llm
    raise RuntimeError(f'Unsupported model type: {model_type}')