dingo-python 1.2__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dingo/data/converter/base.py +2 -2
- dingo/data/dataset/base.py +17 -0
- dingo/data/datasource/base.py +17 -0
- dingo/data/utils/digit.py +17 -42
- dingo/exec/local.py +9 -4
- dingo/exec/spark.py +3 -1
- dingo/io/input/InputArgs.py +10 -6
- dingo/io/output/ResultInfo.py +11 -1
- dingo/model/prompt/prompt_text_quality_v3.py +5 -4
- dingo/model/rule/rule_common.py +106 -32
- dingo/model/rule/utils/detect_lang.py +6 -0
- dingo/model/rule/utils/util.py +12 -1
- dingo/run/cli.py +4 -0
- {dingo_python-1.2.dist-info → dingo_python-1.2.2.dist-info}/METADATA +3 -1
- {dingo_python-1.2.dist-info → dingo_python-1.2.2.dist-info}/RECORD +19 -19
- /dingo/model/rule/utils/{xyz_head_word.py → multi_lan_util.py} +0 -0
- {dingo_python-1.2.dist-info → dingo_python-1.2.2.dist-info}/LICENSE +0 -0
- {dingo_python-1.2.dist-info → dingo_python-1.2.2.dist-info}/WHEEL +0 -0
- {dingo_python-1.2.dist-info → dingo_python-1.2.2.dist-info}/top_level.txt +0 -0
dingo/data/converter/base.py
CHANGED
|
@@ -66,7 +66,7 @@ class JsonConverter(BaseConverter):
|
|
|
66
66
|
'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
|
|
67
67
|
'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
|
|
68
68
|
'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
|
|
69
|
-
'raw_data':
|
|
69
|
+
'raw_data': v
|
|
70
70
|
})
|
|
71
71
|
|
|
72
72
|
return _convert
|
|
@@ -91,7 +91,7 @@ class PlainConverter(BaseConverter):
|
|
|
91
91
|
'data_id': str(cls.data_id),
|
|
92
92
|
'prompt': '',
|
|
93
93
|
'content': raw,
|
|
94
|
-
'raw_data': {'
|
|
94
|
+
'raw_data': {'content': raw}
|
|
95
95
|
})
|
|
96
96
|
cls.data_id += 1
|
|
97
97
|
return data
|
dingo/data/dataset/base.py
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
# This file is modified from:
|
|
2
|
+
# https://github.com/mlflow/mlflow/blob/master/mlflow/data/dataset.py
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2018 Databricks, Inc. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
1
18
|
import json
|
|
2
19
|
from functools import wraps
|
|
3
20
|
from abc import abstractmethod
|
dingo/data/datasource/base.py
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
# This file is modified from:
|
|
2
|
+
# https://github.com/mlflow/mlflow/blob/master/mlflow/data/dataset_source.py
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2018 Databricks, Inc. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
1
18
|
import json
|
|
2
19
|
from functools import wraps
|
|
3
20
|
from abc import abstractmethod
|
dingo/data/utils/digit.py
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
# This file is modified from:
|
|
2
|
+
# https://github.com/mlflow/mlflow/blob/master/mlflow/data/digest_utils.py
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2018 Databricks, Inc. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
1
18
|
import logging
|
|
2
19
|
from typing import Any, List
|
|
3
20
|
|
|
@@ -44,48 +61,6 @@ def compute_pandas_digest(df) -> str:
|
|
|
44
61
|
)
|
|
45
62
|
|
|
46
63
|
|
|
47
|
-
def compute_numpy_digest(features, targets=None) -> str:
|
|
48
|
-
"""Computes a digest for the given numpy array.
|
|
49
|
-
|
|
50
|
-
Args:
|
|
51
|
-
features: A numpy array containing dataset features.
|
|
52
|
-
targets: A numpy array containing dataset targets. Optional.
|
|
53
|
-
|
|
54
|
-
Returns:
|
|
55
|
-
A string digest.
|
|
56
|
-
"""
|
|
57
|
-
import numpy as np
|
|
58
|
-
import pandas as pd
|
|
59
|
-
|
|
60
|
-
hashable_elements = []
|
|
61
|
-
|
|
62
|
-
def hash_array(array):
|
|
63
|
-
flattened_array = array.flatten()
|
|
64
|
-
trimmed_array = flattened_array[0:MAX_ROWS]
|
|
65
|
-
try:
|
|
66
|
-
hashable_elements.append(pd.util.hash_array(trimmed_array))
|
|
67
|
-
except TypeError:
|
|
68
|
-
hashable_elements.append(np.int64(trimmed_array.size))
|
|
69
|
-
|
|
70
|
-
# hash full array dimensions
|
|
71
|
-
for x in array.shape:
|
|
72
|
-
hashable_elements.append(np.int64(x))
|
|
73
|
-
|
|
74
|
-
def hash_dict_of_arrays(array_dict):
|
|
75
|
-
for key in sorted(array_dict.keys()):
|
|
76
|
-
hash_array(array_dict[key])
|
|
77
|
-
|
|
78
|
-
for item in [features, targets]:
|
|
79
|
-
if item is None:
|
|
80
|
-
continue
|
|
81
|
-
if isinstance(item, dict):
|
|
82
|
-
hash_dict_of_arrays(item)
|
|
83
|
-
else:
|
|
84
|
-
hash_array(item)
|
|
85
|
-
|
|
86
|
-
return get_normalized_md5_digest(hashable_elements)
|
|
87
|
-
|
|
88
|
-
|
|
89
64
|
def get_normalized_md5_digest(elements: List[Any]) -> str:
|
|
90
65
|
"""Computes a normalized digest for a list of hashable elements.
|
|
91
66
|
|
dingo/exec/local.py
CHANGED
|
@@ -79,7 +79,6 @@ class LocalExecutor(Executor):
|
|
|
79
79
|
if self.input_args.save_data:
|
|
80
80
|
self.save_data(output_path)
|
|
81
81
|
|
|
82
|
-
log.debug(self.summary)
|
|
83
82
|
return [self.summary]
|
|
84
83
|
|
|
85
84
|
def evaluate(self):
|
|
@@ -89,7 +88,6 @@ class LocalExecutor(Executor):
|
|
|
89
88
|
group (Any): _description_
|
|
90
89
|
group_type (str): _description_
|
|
91
90
|
"""
|
|
92
|
-
log.debug('[get_score]:' + self.input_args.input_path)
|
|
93
91
|
with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as executor:
|
|
94
92
|
data_iter = self.load_data()
|
|
95
93
|
def process_batch(batch: List):
|
|
@@ -178,6 +176,7 @@ class LocalExecutor(Executor):
|
|
|
178
176
|
self.summary.name_ratio[n] += 1
|
|
179
177
|
|
|
180
178
|
self.summary.total += 1
|
|
179
|
+
log.info(f'[Data Number]: {str(self.summary.total)} ')
|
|
181
180
|
|
|
182
181
|
def evaluate_rule(self, group: List[BaseRule], d: MetaData) -> ResultInfo:
|
|
183
182
|
result_info = ResultInfo(data_id=d.data_id, prompt=d.prompt, content=d.content)
|
|
@@ -275,7 +274,10 @@ class LocalExecutor(Executor):
|
|
|
275
274
|
os.makedirs(p_t)
|
|
276
275
|
f_n = os.path.join(path, t, n) + ".jsonl"
|
|
277
276
|
with open(f_n, 'a', encoding='utf-8') as f:
|
|
278
|
-
|
|
277
|
+
if self.input_args.save_raw:
|
|
278
|
+
str_json = json.dumps(result_info.to_raw_dict(), ensure_ascii=False)
|
|
279
|
+
else:
|
|
280
|
+
str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
|
|
279
281
|
f.write(str_json + '\n')
|
|
280
282
|
if self.input_args.save_correct:
|
|
281
283
|
for result_info in self.good_info_list:
|
|
@@ -287,7 +289,10 @@ class LocalExecutor(Executor):
|
|
|
287
289
|
os.makedirs(p_t)
|
|
288
290
|
f_n = os.path.join(path, t, n) + ".jsonl"
|
|
289
291
|
with open(f_n, 'a', encoding='utf-8') as f:
|
|
290
|
-
|
|
292
|
+
if self.input_args.save_raw:
|
|
293
|
+
str_json = json.dumps(result_info.to_raw_dict(), ensure_ascii=False)
|
|
294
|
+
else:
|
|
295
|
+
str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
|
|
291
296
|
f.write(str_json + '\n')
|
|
292
297
|
|
|
293
298
|
with open(path + '/summary.json', 'w', encoding='utf-8') as f:
|
dingo/exec/spark.py
CHANGED
|
@@ -31,6 +31,7 @@ class SparkExecutor(Executor):
|
|
|
31
31
|
spark_conf: SparkConf = None):
|
|
32
32
|
# eval param
|
|
33
33
|
self.llm: Optional[BaseLLM] = None
|
|
34
|
+
self.group: Optional[Dict] = None
|
|
34
35
|
self.summary: Optional[SummaryModel] = None
|
|
35
36
|
self.bad_info_list: Optional[RDD] = None
|
|
36
37
|
self.good_info_list: Optional[RDD] = None
|
|
@@ -72,6 +73,7 @@ class SparkExecutor(Executor):
|
|
|
72
73
|
def execute(self) -> List[SummaryModel]:
|
|
73
74
|
create_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
|
|
74
75
|
Model.apply_config(self.input_args.custom_config, self.input_args.eval_group)
|
|
76
|
+
self.group = Model.get_group(self.input_args.eval_group)
|
|
75
77
|
if GlobalConfig.config and GlobalConfig.config.llm_config:
|
|
76
78
|
for llm_name in GlobalConfig.config.llm_config:
|
|
77
79
|
self.llm = Model.get_llm(llm_name)
|
|
@@ -148,7 +150,7 @@ class SparkExecutor(Executor):
|
|
|
148
150
|
good_name_list = []
|
|
149
151
|
bad_reason_list = []
|
|
150
152
|
good_reason_list = []
|
|
151
|
-
for group_type, group in
|
|
153
|
+
for group_type, group in self.group.items():
|
|
152
154
|
if group_type == 'rule':
|
|
153
155
|
r_i = self.evaluate_rule(group, data)
|
|
154
156
|
elif group_type == 'prompt':
|
dingo/io/input/InputArgs.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
|
-
from pydantic import BaseModel
|
|
5
|
-
|
|
4
|
+
from pydantic import BaseModel, ValidationError
|
|
6
5
|
|
|
7
6
|
class InputArgs(BaseModel):
|
|
8
7
|
"""
|
|
@@ -35,10 +34,15 @@ class InputArgs(BaseModel):
|
|
|
35
34
|
|
|
36
35
|
custom_config: Optional[str | dict] = None
|
|
37
36
|
|
|
38
|
-
|
|
39
|
-
|
|
37
|
+
class Config:
|
|
38
|
+
extra = 'forbid' # Forbid extra parameters
|
|
40
39
|
|
|
41
|
-
|
|
40
|
+
def __init__(self, **kwargs):
|
|
41
|
+
try:
|
|
42
|
+
super().__init__(**kwargs)
|
|
43
|
+
self.check_args()
|
|
44
|
+
except ValidationError as e:
|
|
45
|
+
raise ValueError(f"Invalid input parameters: {e}")
|
|
42
46
|
|
|
43
47
|
def check_args(self):
|
|
44
48
|
# check eval group
|
|
@@ -46,7 +50,7 @@ class InputArgs(BaseModel):
|
|
|
46
50
|
raise ValueError("eval_group cannot be empty.")
|
|
47
51
|
|
|
48
52
|
# check input path
|
|
49
|
-
if self.dataset != 'hugging_face' and
|
|
53
|
+
if self.dataset != 'hugging_face' and not os.path.exists(self.input_path):
|
|
50
54
|
raise FileNotFoundError(f"Input path '{self.input_path}' does not exist.")
|
|
51
55
|
|
|
52
56
|
# check save_data/save_correct
|
dingo/io/output/ResultInfo.py
CHANGED
|
@@ -23,4 +23,14 @@ class ResultInfo(BaseModel):
|
|
|
23
23
|
'name_list': self.name_list,
|
|
24
24
|
'reason_list': self.reason_list,
|
|
25
25
|
'raw_data': self.raw_data
|
|
26
|
-
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
def to_raw_dict(self):
|
|
29
|
+
dingo_result = {
|
|
30
|
+
'error_status': self.error_status,
|
|
31
|
+
'type_list': self.type_list,
|
|
32
|
+
'name_list': self.name_list,
|
|
33
|
+
'reason_list': self.reason_list,
|
|
34
|
+
}
|
|
35
|
+
self.raw_data['dingo_result'] = dingo_result
|
|
36
|
+
return self.raw_data
|
|
@@ -15,11 +15,12 @@ Your primary objective is to assess the suitability of this dataset for training
|
|
|
15
15
|
1. Completeness
|
|
16
16
|
1.1 Error_Formula_Table: If the text contains formulas or tables, then their format or content is incorrect.
|
|
17
17
|
1.2 Error_List_Number: If the text contains list number, numbers in the list number have inconsistent formats and logical discontinuity.
|
|
18
|
-
1.3
|
|
18
|
+
1.3 Error_Line_Segment: The text contains sentences unreasonably divided into multiple lines by line breaks; Or the text contains segments stuck together due to lacking line breaks.
|
|
19
19
|
2. Effectiveness
|
|
20
20
|
2.1 Error_Garbled_Characters: The text contains a large amount of garbled and anti crawler characters.
|
|
21
21
|
2.2 Error_Words_Stuck: The text contains a large number of words that are stuck together without being separated by spaces. Words with hyphens are considered normal and treat newline characters (\n) as spaces.
|
|
22
|
-
2.3 Error_Lack_Punctuation:
|
|
22
|
+
2.3 Error_Lack_Punctuation: The text contains a large number of words piled up, which cannot form a sentence when connected together.
|
|
23
|
+
2.4 Error_Empty_Content: The text contains no other characters except for spaces, line breaks, carriage returns, and tabs.
|
|
23
24
|
3. Similarity
|
|
24
25
|
3.1 Error_Duplicate_Content: The text contains consecutive repeated text and multiple occurrences of characters.
|
|
25
26
|
4. Security
|
|
@@ -31,8 +32,8 @@ Your primary objective is to assess the suitability of this dataset for training
|
|
|
31
32
|
-If the text does not hit any negative criteria above, type must only be 'Good'; otherwise, type must only be one of the list ['Completeness', 'Effectiveness', 'Similarity', 'Security'].
|
|
32
33
|
3. Assign a name to the text.
|
|
33
34
|
-If type is 'Good', name must only be 'None'.
|
|
34
|
-
-If type is "Completeness", name must only be one of the list ["Error_Formula_Table", "Error_List_Number", "
|
|
35
|
-
-If type is "Effectiveness", name must only be one of the list ["Error_Garbled_Characters", "Error_Words_Stuck" or "
|
|
35
|
+
-If type is "Completeness", name must only be one of the list ["Error_Formula_Table", "Error_List_Number", "Error_Line_Segment"]
|
|
36
|
+
-If type is "Effectiveness", name must only be one of the list ["Error_Garbled_Characters", "Error_Words_Stuck", "Error_Lack_Punctuation" or "Error_Empty_Content"]
|
|
36
37
|
-If type is "Similarity", name must only be one of the list ["Error_Duplicate_Content"]
|
|
37
38
|
-If type is "Security", name must only be one of the list ["Error_Political_Content", "Error_Prohibited_Content"]
|
|
38
39
|
4. Assign a score to the text according the type. If the type is "Good", score is 1, otherwise the score is 0.
|
dingo/model/rule/rule_common.py
CHANGED
|
@@ -38,10 +38,12 @@ class RuleAlphaWords(BaseRule):
|
|
|
38
38
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
39
39
|
from nltk.tokenize import word_tokenize
|
|
40
40
|
|
|
41
|
-
from dingo.model.rule.utils.detect_lang import decide_language_by_str
|
|
41
|
+
from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
|
|
42
42
|
|
|
43
43
|
res = ModelRes()
|
|
44
44
|
content = input_data.content
|
|
45
|
+
if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
|
|
46
|
+
set_fasttext(cls.dynamic_config.refer_path[0])
|
|
45
47
|
language = decide_language_by_str(content)
|
|
46
48
|
if language != 'en':
|
|
47
49
|
return res
|
|
@@ -76,6 +78,8 @@ class RuleCapitalWords(BaseRule):
|
|
|
76
78
|
content = input_data.content
|
|
77
79
|
words = WordPunctTokenizer().tokenize(content)
|
|
78
80
|
num_words = len(words)
|
|
81
|
+
if num_words == 0:
|
|
82
|
+
return res
|
|
79
83
|
num_caps_words = sum(map(str.isupper, words))
|
|
80
84
|
ratio = num_caps_words / num_words
|
|
81
85
|
if ratio > cls.dynamic_config.threshold and num_words < 200:
|
|
@@ -149,7 +153,9 @@ class RuleColonEnd(BaseRule):
|
|
|
149
153
|
return res
|
|
150
154
|
|
|
151
155
|
|
|
152
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
156
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
157
|
+
'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi',
|
|
158
|
+
'multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1','pdf'])
|
|
153
159
|
class RuleContentNull(BaseRule):
|
|
154
160
|
"""check whether content is null"""
|
|
155
161
|
|
|
@@ -167,7 +173,7 @@ class RuleContentNull(BaseRule):
|
|
|
167
173
|
return res
|
|
168
174
|
|
|
169
175
|
|
|
170
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all', 'qa_standard_v1'])
|
|
176
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all', 'qa_standard_v1','pdf'])
|
|
171
177
|
class RuleContentShort(BaseRule):
|
|
172
178
|
|
|
173
179
|
dynamic_config = DynamicRuleConfig(threshold = 20)
|
|
@@ -184,7 +190,8 @@ class RuleContentShort(BaseRule):
|
|
|
184
190
|
return res
|
|
185
191
|
|
|
186
192
|
|
|
187
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
|
|
193
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
|
|
194
|
+
'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
|
|
188
195
|
class RuleContentShortMultiLan(BaseRule):
|
|
189
196
|
"""check whether content is too short."""
|
|
190
197
|
|
|
@@ -216,9 +223,11 @@ class RuleCurlyBracket(BaseRule):
|
|
|
216
223
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
217
224
|
res = ModelRes()
|
|
218
225
|
content = input_data.content
|
|
226
|
+
if len(content) == 0:
|
|
227
|
+
return res
|
|
219
228
|
|
|
220
229
|
num = content.count('{') + content.count('}')
|
|
221
|
-
ratio = num / len(content)
|
|
230
|
+
ratio = num / len(content)
|
|
222
231
|
if ratio > cls.dynamic_config.threshold:
|
|
223
232
|
res.error_status = True
|
|
224
233
|
res.type = cls.metric_type
|
|
@@ -227,7 +236,9 @@ class RuleCurlyBracket(BaseRule):
|
|
|
227
236
|
return res
|
|
228
237
|
|
|
229
238
|
|
|
230
|
-
@Model.rule_register('QUALITY_BAD_SIMILARITY', ['default','sft','pretrain','benchmark','text_base_all',
|
|
239
|
+
@Model.rule_register('QUALITY_BAD_SIMILARITY', ['default','sft','pretrain','benchmark','text_base_all',
|
|
240
|
+
'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
|
|
241
|
+
'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','pdf'])
|
|
231
242
|
class RuleDocRepeat(BaseRule):
|
|
232
243
|
"""check whether content repeats"""
|
|
233
244
|
|
|
@@ -247,7 +258,9 @@ class RuleDocRepeat(BaseRule):
|
|
|
247
258
|
return res
|
|
248
259
|
|
|
249
260
|
|
|
250
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
261
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
262
|
+
'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
|
|
263
|
+
'multi_lan_sr', 'qa_standard_v1','pdf'])
|
|
251
264
|
class RuleEnterMore(BaseRule):
|
|
252
265
|
"""check whether content has 8 consecutive carriage returns."""
|
|
253
266
|
|
|
@@ -269,7 +282,9 @@ class RuleEnterMore(BaseRule):
|
|
|
269
282
|
return res
|
|
270
283
|
|
|
271
284
|
|
|
272
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
285
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
286
|
+
'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
|
|
287
|
+
'multi_lan_sr', 'qa_standard_v1','pdf'])
|
|
273
288
|
class RuleEnterRatioMore(BaseRule):
|
|
274
289
|
"""check whether the number of enter / the number of content > 25%"""
|
|
275
290
|
|
|
@@ -299,7 +314,7 @@ class RuleHeadWordAr(BaseRule):
|
|
|
299
314
|
|
|
300
315
|
@classmethod
|
|
301
316
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
302
|
-
from dingo.model.rule.utils.
|
|
317
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
303
318
|
|
|
304
319
|
res = ModelRes()
|
|
305
320
|
keyword = get_xyz_head_word("ar")
|
|
@@ -321,7 +336,7 @@ class RuleHeadWordCs(BaseRule):
|
|
|
321
336
|
|
|
322
337
|
@classmethod
|
|
323
338
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
324
|
-
from dingo.model.rule.utils.
|
|
339
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
325
340
|
|
|
326
341
|
res = ModelRes()
|
|
327
342
|
keyword = get_xyz_head_word("cs")
|
|
@@ -343,7 +358,7 @@ class RuleHeadWordHu(BaseRule):
|
|
|
343
358
|
|
|
344
359
|
@classmethod
|
|
345
360
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
346
|
-
from dingo.model.rule.utils.
|
|
361
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
347
362
|
|
|
348
363
|
res = ModelRes()
|
|
349
364
|
keyword = get_xyz_head_word("hu")
|
|
@@ -354,7 +369,7 @@ class RuleHeadWordHu(BaseRule):
|
|
|
354
369
|
res.type = cls.metric_type
|
|
355
370
|
res.name = cls.__name__
|
|
356
371
|
res.reason = ['Content has irrelevance tail source info.']
|
|
357
|
-
|
|
372
|
+
return res
|
|
358
373
|
|
|
359
374
|
|
|
360
375
|
@Model.rule_register('QUALITY_BAD_RELEVANCE', ['multi_lan_ko'])
|
|
@@ -365,7 +380,7 @@ class RuleHeadWordKo(BaseRule):
|
|
|
365
380
|
|
|
366
381
|
@classmethod
|
|
367
382
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
368
|
-
from dingo.model.rule.utils.
|
|
383
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
369
384
|
|
|
370
385
|
res = ModelRes()
|
|
371
386
|
keyword = get_xyz_head_word("ko")
|
|
@@ -387,7 +402,7 @@ class RuleHeadWordRu(BaseRule):
|
|
|
387
402
|
|
|
388
403
|
@classmethod
|
|
389
404
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
390
|
-
from dingo.model.rule.utils.
|
|
405
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
391
406
|
|
|
392
407
|
res = ModelRes()
|
|
393
408
|
keyword = get_xyz_head_word("ru")
|
|
@@ -409,7 +424,7 @@ class RuleHeadWordSr(BaseRule):
|
|
|
409
424
|
|
|
410
425
|
@classmethod
|
|
411
426
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
412
|
-
from dingo.model.rule.utils.
|
|
427
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
413
428
|
|
|
414
429
|
res = ModelRes()
|
|
415
430
|
keyword = get_xyz_head_word("sr")
|
|
@@ -431,7 +446,7 @@ class RuleHeadWordTh(BaseRule):
|
|
|
431
446
|
|
|
432
447
|
@classmethod
|
|
433
448
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
434
|
-
from dingo.model.rule.utils.
|
|
449
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
435
450
|
|
|
436
451
|
res = ModelRes()
|
|
437
452
|
keyword = get_xyz_head_word("th")
|
|
@@ -453,7 +468,7 @@ class RuleHeadWordVi(BaseRule):
|
|
|
453
468
|
|
|
454
469
|
@classmethod
|
|
455
470
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
456
|
-
from dingo.model.rule.utils.
|
|
471
|
+
from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
|
|
457
472
|
|
|
458
473
|
res = ModelRes()
|
|
459
474
|
keyword = get_xyz_head_word("vi")
|
|
@@ -467,7 +482,9 @@ class RuleHeadWordVi(BaseRule):
|
|
|
467
482
|
return res
|
|
468
483
|
|
|
469
484
|
|
|
470
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
485
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
486
|
+
'multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi',
|
|
487
|
+
'multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1','pdf'])
|
|
471
488
|
class RuleHtmlEntity(BaseRule):
|
|
472
489
|
"""check whether content has html entity"""
|
|
473
490
|
|
|
@@ -491,6 +508,8 @@ class RuleHtmlEntity(BaseRule):
|
|
|
491
508
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
492
509
|
res = ModelRes()
|
|
493
510
|
content = input_data.content
|
|
511
|
+
if len(content) == 0:
|
|
512
|
+
return res
|
|
494
513
|
|
|
495
514
|
entities = cls.dynamic_config.key_list
|
|
496
515
|
full_entities_1 = [f"&{entity};" for entity in entities]
|
|
@@ -520,7 +539,9 @@ class RuleHtmlEntity(BaseRule):
|
|
|
520
539
|
return res
|
|
521
540
|
|
|
522
541
|
|
|
523
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
|
|
542
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
|
|
543
|
+
'multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr',
|
|
544
|
+
'qa_standard_v1','pdf'])
|
|
524
545
|
class RuleHtmlTag(BaseRule):
|
|
525
546
|
"""check whether content has image links or html tags."""
|
|
526
547
|
|
|
@@ -530,6 +551,8 @@ class RuleHtmlTag(BaseRule):
|
|
|
530
551
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
531
552
|
res = ModelRes()
|
|
532
553
|
content = input_data.content
|
|
554
|
+
if len(content) == 0:
|
|
555
|
+
return res
|
|
533
556
|
|
|
534
557
|
matches = re.findall('|'.join(cls.dynamic_config.key_list), content)
|
|
535
558
|
num = len(matches)
|
|
@@ -563,7 +586,9 @@ class RuleIDCard(BaseRule):
|
|
|
563
586
|
return res
|
|
564
587
|
|
|
565
588
|
|
|
566
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
|
|
589
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
|
|
590
|
+
'multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr',
|
|
591
|
+
'qa_standard_v1'])
|
|
567
592
|
class RuleInvisibleChar(BaseRule):
|
|
568
593
|
"""check whether content has invisible chars."""
|
|
569
594
|
|
|
@@ -573,6 +598,8 @@ class RuleInvisibleChar(BaseRule):
|
|
|
573
598
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
574
599
|
res = ModelRes()
|
|
575
600
|
content = input_data.content
|
|
601
|
+
if len(content) == 0:
|
|
602
|
+
return res
|
|
576
603
|
|
|
577
604
|
matches = re.findall(cls.dynamic_config.pattern, content)
|
|
578
605
|
num = len(matches)
|
|
@@ -791,7 +818,9 @@ class RuleMeanWordLength(BaseRule):
|
|
|
791
818
|
return res
|
|
792
819
|
|
|
793
820
|
|
|
794
|
-
@Model.rule_register('QUALITY_BAD_FLUENCY', ['default','sft','pretrain','benchmark','text_base_all',
|
|
821
|
+
@Model.rule_register('QUALITY_BAD_FLUENCY', ['default','sft','pretrain','benchmark','text_base_all',
|
|
822
|
+
'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
|
|
823
|
+
'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
|
|
795
824
|
class RuleNoPunc(BaseRule):
|
|
796
825
|
"""check whether paragraph has no punctuation."""
|
|
797
826
|
|
|
@@ -799,10 +828,12 @@ class RuleNoPunc(BaseRule):
|
|
|
799
828
|
|
|
800
829
|
@classmethod
|
|
801
830
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
802
|
-
from dingo.model.rule.utils.detect_lang import decide_language_by_str
|
|
831
|
+
from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
|
|
803
832
|
|
|
804
833
|
res = ModelRes()
|
|
805
834
|
content = input_data.content
|
|
835
|
+
if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
|
|
836
|
+
set_fasttext(cls.dynamic_config.refer_path[0])
|
|
806
837
|
language = decide_language_by_str(content)
|
|
807
838
|
if language != 'en':
|
|
808
839
|
return res
|
|
@@ -867,7 +898,9 @@ class RuleSentenceNumber(BaseRule):
|
|
|
867
898
|
return res
|
|
868
899
|
|
|
869
900
|
|
|
870
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
901
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
902
|
+
'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
|
|
903
|
+
'multi_lan_sr','qa_standard_v1','pdf'])
|
|
871
904
|
class RuleSpaceMore(BaseRule):
|
|
872
905
|
"""check whether content has 500 spaces."""
|
|
873
906
|
|
|
@@ -887,7 +920,10 @@ class RuleSpaceMore(BaseRule):
|
|
|
887
920
|
return res
|
|
888
921
|
|
|
889
922
|
|
|
890
|
-
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
923
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
|
|
924
|
+
'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
|
|
925
|
+
'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1',
|
|
926
|
+
'pdf'])
|
|
891
927
|
class RuleSpecialCharacter(BaseRule):
|
|
892
928
|
"""check whether content has special characters. """
|
|
893
929
|
|
|
@@ -897,7 +933,8 @@ class RuleSpecialCharacter(BaseRule):
|
|
|
897
933
|
# r"(\\\\;){3,}|(\{\}){3,}|( ){3,}",
|
|
898
934
|
r"÷|\? :",
|
|
899
935
|
r"[�□]|\{\/U\}",
|
|
900
|
-
r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]"
|
|
936
|
+
r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]",
|
|
937
|
+
r"<\|.*?\|>"
|
|
901
938
|
]
|
|
902
939
|
)
|
|
903
940
|
|
|
@@ -905,6 +942,8 @@ class RuleSpecialCharacter(BaseRule):
|
|
|
905
942
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
906
943
|
res = ModelRes()
|
|
907
944
|
content = input_data.content
|
|
945
|
+
if len(content) == 0:
|
|
946
|
+
return res
|
|
908
947
|
|
|
909
948
|
matches = []
|
|
910
949
|
num = 0
|
|
@@ -930,11 +969,13 @@ class RuleStopWord(BaseRule):
|
|
|
930
969
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
931
970
|
from nltk.tokenize import WordPunctTokenizer
|
|
932
971
|
|
|
933
|
-
from dingo.model.rule.utils.detect_lang import decide_language_by_str
|
|
972
|
+
from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
|
|
934
973
|
from dingo.model.rule.utils.util import get_stop_words
|
|
935
974
|
|
|
936
975
|
res = ModelRes()
|
|
937
976
|
raw_content = input_data.content
|
|
977
|
+
if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
|
|
978
|
+
set_fasttext(cls.dynamic_config.refer_path[0])
|
|
938
979
|
language = decide_language_by_str(raw_content)
|
|
939
980
|
if language != 'en':
|
|
940
981
|
return res
|
|
@@ -1018,7 +1059,32 @@ class RuleUniqueWords(BaseRule):
|
|
|
1018
1059
|
return res
|
|
1019
1060
|
|
|
1020
1061
|
|
|
1021
|
-
@Model.rule_register(
|
|
1062
|
+
@Model.rule_register("QUALITY_BAD_SECURITY", [])
|
|
1063
|
+
class RuleUnsafeWords(BaseRule):
|
|
1064
|
+
"""check whether content contains unsafe words."""
|
|
1065
|
+
|
|
1066
|
+
dynamic_config = DynamicRuleConfig(refer_path=[])
|
|
1067
|
+
|
|
1068
|
+
@classmethod
|
|
1069
|
+
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
1070
|
+
from dingo.model.rule.utils.util import get_unsafe_words
|
|
1071
|
+
|
|
1072
|
+
res = ModelRes()
|
|
1073
|
+
content = input_data.content
|
|
1074
|
+
if cls.dynamic_config.key_list is None:
|
|
1075
|
+
cls.dynamic_config.key_list = get_unsafe_words(cls.dynamic_config.refer_path)
|
|
1076
|
+
matches = list(filter(lambda x:x in content, cls.dynamic_config.key_list))
|
|
1077
|
+
if matches:
|
|
1078
|
+
res.error_status = True
|
|
1079
|
+
res.type = cls.metric_type
|
|
1080
|
+
res.name = cls.__name__
|
|
1081
|
+
res.reason = matches
|
|
1082
|
+
return res
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
1086
|
+
'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
|
|
1087
|
+
'multi_lan_sr','qa_standard_v1','pdf'])
|
|
1022
1088
|
class RuleOnlyUrl(BaseRule):
|
|
1023
1089
|
"""check whether content is only an url link."""
|
|
1024
1090
|
|
|
@@ -1027,8 +1093,12 @@ class RuleOnlyUrl(BaseRule):
|
|
|
1027
1093
|
@classmethod
|
|
1028
1094
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
1029
1095
|
res = ModelRes()
|
|
1096
|
+
content = input_data.content
|
|
1097
|
+
if len(content.strip()) == 0:
|
|
1098
|
+
return res
|
|
1030
1099
|
SEARCH_REGEX = re.compile(cls.dynamic_config.pattern)
|
|
1031
|
-
content_without_url = SEARCH_REGEX.sub("",
|
|
1100
|
+
content_without_url = SEARCH_REGEX.sub("", content)
|
|
1101
|
+
print(content_without_url)
|
|
1032
1102
|
if len(content_without_url.strip()) == 0:
|
|
1033
1103
|
res.error_status = True
|
|
1034
1104
|
res.type = cls.metric_type
|
|
@@ -1098,7 +1168,9 @@ class RuleWordSplit(BaseRule):
|
|
|
1098
1168
|
return res
|
|
1099
1169
|
|
|
1100
1170
|
|
|
1101
|
-
@Model.rule_register('QUALITY_BAD_FLUENCY', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
1171
|
+
@Model.rule_register('QUALITY_BAD_FLUENCY', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
|
|
1172
|
+
'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
|
|
1173
|
+
'multi_lan_sr'])
|
|
1102
1174
|
class RuleWordStuck(BaseRule):
|
|
1103
1175
|
"""check whether words are stuck."""
|
|
1104
1176
|
|
|
@@ -1116,11 +1188,13 @@ class RuleWordStuck(BaseRule):
|
|
|
1116
1188
|
def eval(cls, input_data: MetaData) -> ModelRes:
|
|
1117
1189
|
import wordninja
|
|
1118
1190
|
|
|
1119
|
-
from dingo.model.rule.utils.detect_lang import decide_language_by_str
|
|
1191
|
+
from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
|
|
1120
1192
|
from dingo.model.rule.utils.util import is_sha256
|
|
1121
1193
|
|
|
1122
1194
|
res = ModelRes()
|
|
1123
1195
|
content = input_data.content
|
|
1196
|
+
if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
|
|
1197
|
+
set_fasttext(cls.dynamic_config.refer_path[0])
|
|
1124
1198
|
language = decide_language_by_str(content)
|
|
1125
1199
|
if language != 'en':
|
|
1126
1200
|
return res
|
|
@@ -1148,7 +1222,7 @@ if __name__ == '__main__':
|
|
|
1148
1222
|
data = MetaData(
|
|
1149
1223
|
data_id = '',
|
|
1150
1224
|
prompt = '',
|
|
1151
|
-
content = "
|
|
1225
|
+
content = " \n \n"
|
|
1152
1226
|
)
|
|
1153
|
-
tmp =
|
|
1227
|
+
tmp = RuleOnlyUrl().eval(data)
|
|
1154
1228
|
print(tmp)
|
|
@@ -6,9 +6,15 @@ from huggingface_hub import hf_hub_download
|
|
|
6
6
|
from dingo.utils import log
|
|
7
7
|
|
|
8
8
|
_global_lang_detect = []
|
|
9
|
+
_fasttext_path = ''
|
|
9
10
|
|
|
11
|
+
def set_fasttext(path: str):
|
|
12
|
+
global _fasttext_path
|
|
13
|
+
_fasttext_path = path
|
|
10
14
|
|
|
11
15
|
def download_fasttext() -> str:
|
|
16
|
+
if _fasttext_path:
|
|
17
|
+
return _fasttext_path
|
|
12
18
|
file_path = hf_hub_download(repo_id='chupei/fasttext.lib.176.bin', filename='lid.176.bin')
|
|
13
19
|
return file_path
|
|
14
20
|
|
dingo/model/rule/utils/util.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import re
|
|
2
3
|
import os
|
|
3
4
|
import sys
|
|
@@ -6,7 +7,7 @@ import string
|
|
|
6
7
|
import unicodedata
|
|
7
8
|
import zhon.hanzi
|
|
8
9
|
|
|
9
|
-
from typing import Set, Tuple, Callable
|
|
10
|
+
from typing import Set, Tuple, Callable, List
|
|
10
11
|
from collections import Counter
|
|
11
12
|
from zhon.hanzi import punctuation
|
|
12
13
|
|
|
@@ -61,6 +62,16 @@ class TextSlice:
|
|
|
61
62
|
self.end = end
|
|
62
63
|
|
|
63
64
|
|
|
65
|
+
def get_unsafe_words(file_path_list: List[str]) -> List:
|
|
66
|
+
unsafe_words_list = []
|
|
67
|
+
for file_path in file_path_list:
|
|
68
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
69
|
+
for line in f:
|
|
70
|
+
j = json.loads(line)
|
|
71
|
+
word = str(j['word'])
|
|
72
|
+
unsafe_words_list.append(word)
|
|
73
|
+
return unsafe_words_list
|
|
74
|
+
|
|
64
75
|
def split_paragraphs(
|
|
65
76
|
text: str, normalizer: Callable[[str], str], remove_empty: bool = True
|
|
66
77
|
) -> Tuple[TextSlice]:
|
dingo/run/cli.py
CHANGED
|
@@ -25,6 +25,8 @@ def parse_args():
|
|
|
25
25
|
default=None, help="Save data in output path")
|
|
26
26
|
parser.add_argument("--save_correct", type=bool,
|
|
27
27
|
default=None, help="Save correct data in output path")
|
|
28
|
+
parser.add_argument("--save_raw", type=bool,
|
|
29
|
+
default=None, help="Save raw data in output path")
|
|
28
30
|
parser.add_argument("--data_format", type=str,
|
|
29
31
|
default=None, choices=['json', 'jsonl', 'listjson', 'plaintext', 'image', 's3_image'],
|
|
30
32
|
help="Dataset format (in ['json', 'jsonl', 'listjson', 'plaintext', 'image', 's3_image']), default is 'json'")
|
|
@@ -100,6 +102,8 @@ if __name__ == '__main__':
|
|
|
100
102
|
input_data['save_data'] = args.save_data
|
|
101
103
|
if args.save_correct:
|
|
102
104
|
input_data['save_correct'] = args.save_correct
|
|
105
|
+
if args.save_raw:
|
|
106
|
+
input_data['save_raw'] = args.save_raw
|
|
103
107
|
if args.data_format:
|
|
104
108
|
input_data['data_format'] = args.data_format
|
|
105
109
|
if args.dataset:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dingo-python
|
|
3
|
-
Version: 1.2
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: Language quality evaluation tool.
|
|
5
5
|
Home-page: https://github.com/shijinpjlab/Dingo/main
|
|
6
6
|
Author: SH AI Lab
|
|
@@ -16,10 +16,12 @@ Requires-Dist: chardet
|
|
|
16
16
|
Requires-Dist: datasets
|
|
17
17
|
Requires-Dist: fasttext-wheel ==0.9.2
|
|
18
18
|
Requires-Dist: hanziconv
|
|
19
|
+
Requires-Dist: httpx ==0.27.2
|
|
19
20
|
Requires-Dist: huggingface-hub
|
|
20
21
|
Requires-Dist: jieba
|
|
21
22
|
Requires-Dist: jsonlines
|
|
22
23
|
Requires-Dist: langid
|
|
24
|
+
Requires-Dist: openai ==1.56.2
|
|
23
25
|
Requires-Dist: opencv-python
|
|
24
26
|
Requires-Dist: packaging
|
|
25
27
|
Requires-Dist: pandas
|
|
@@ -3,30 +3,30 @@ dingo/config/__init__.py,sha256=8qPvpZTKUBcZqAqu2S5b2P-GRQGMV6VwGYl8bvQDLI0,45
|
|
|
3
3
|
dingo/config/config.py,sha256=EstF5mWkLrA24Eg1SbOir1fVTCN_-9n3w4rz54RjJcA,1883
|
|
4
4
|
dingo/data/__init__.py,sha256=X7ZLiJN8vDpzRufwpJ9E36phqbW9gEpMT68TxzWLDt4,172
|
|
5
5
|
dingo/data/converter/__init__.py,sha256=1MiG4H8Sg2sYHQmYdg0F9_1okP_YoMNHyQorPEAf6zw,91
|
|
6
|
-
dingo/data/converter/base.py,sha256=
|
|
6
|
+
dingo/data/converter/base.py,sha256=hvXPtYtHR97_LeywgKrEwy7Trr6Yx-qL1WCCi4_R_zs,6677
|
|
7
7
|
dingo/data/converter/img_utils.py,sha256=Pjy4Db3bETAuRmkVO5GzUxTE_hNJhnYyQEJXd_nHaXk,3516
|
|
8
8
|
dingo/data/dataset/__init__.py,sha256=AdBLdr3j4NN-wGvQOuPi_jmzkMcggJApdQ24spLN3-U,405
|
|
9
|
-
dingo/data/dataset/base.py,sha256=
|
|
9
|
+
dingo/data/dataset/base.py,sha256=0mr2qXhfu6hhJ0Dz2nk7S_ZQd4k7PfMfxEprRzkJNnE,5518
|
|
10
10
|
dingo/data/dataset/huggingface.py,sha256=kHtfXKSptxv3hQRGv6WNFMyN3m0nr7CECPU8ESLXGfQ,7181
|
|
11
11
|
dingo/data/dataset/local.py,sha256=6HSfF4vGANh1KcxNyjohOqKrnqPzjjC11gHFZK5aITQ,2654
|
|
12
12
|
dingo/data/dataset/spark.py,sha256=lBubZM7lJrPHO6hcnuD39eEtDB7nuLjWNbQi6jQCItI,4119
|
|
13
13
|
dingo/data/datasource/__init__.py,sha256=nr7dX7c2ylLBJVU9gnAcZzqMTYMRTbhopVAO_dOs0Dw,427
|
|
14
|
-
dingo/data/datasource/base.py,sha256=
|
|
14
|
+
dingo/data/datasource/base.py,sha256=T9y8uxMegHgbb6o7aPbmeLIr5xSAOOl7k5Fpab6jZNc,2931
|
|
15
15
|
dingo/data/datasource/huggingface.py,sha256=-0JCr8f1cOAmWIqZnO8E10QEJ4tiWxSftoUOE6woZI4,3744
|
|
16
16
|
dingo/data/datasource/local.py,sha256=GEa3-P5FTdeS-SWRyLRCew9WXfNl8E9I6AUXSU4eJlg,2672
|
|
17
17
|
dingo/data/datasource/s3.py,sha256=5u8TZN67qVjJD3QQSGEeSmldBHY0HeEvm0s3HB3W0BU,2778
|
|
18
18
|
dingo/data/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
-
dingo/data/utils/digit.py,sha256=
|
|
19
|
+
dingo/data/utils/digit.py,sha256=V_Cy8o0t0JdBHOJZmi0A6nSczSfi2AbdE23fcWbTN_s,2415
|
|
20
20
|
dingo/data/utils/insecure_hash.py,sha256=1FnevDyjeOrtsBQVlckJDEbk6mItMvfj07_Ut7oBioo,447
|
|
21
21
|
dingo/exec/__init__.py,sha256=5faQMKKWAx4OFxBNN5FOnJuqjf-iXBEDWGOYxnVogV4,341
|
|
22
22
|
dingo/exec/base.py,sha256=upeqzXVX7IiFWfcx3XFdaVmPmZIKMJ0-EtgCRt_Ws-w,1311
|
|
23
|
-
dingo/exec/local.py,sha256=
|
|
24
|
-
dingo/exec/spark.py,sha256=
|
|
23
|
+
dingo/exec/local.py,sha256=_l3e5mIEWt8YNYAho8cKWw26yUeGj7jODchIc_bJKTA,12623
|
|
24
|
+
dingo/exec/spark.py,sha256=7M-pG78Ugp1Shy20_cTe3-eIl7sTWCQ72KiNE3xoPww,11500
|
|
25
25
|
dingo/io/__init__.py,sha256=XxTZKh8nVsoYjfPriaTvW7Or7lNM_11SjJ8uC-T3kws,196
|
|
26
|
-
dingo/io/input/InputArgs.py,sha256=
|
|
26
|
+
dingo/io/input/InputArgs.py,sha256=v2O8gpR8Jni861eV5eU9Am8d4RgypJuYbPDiKdeOCdQ,2429
|
|
27
27
|
dingo/io/input/MetaData.py,sha256=BhJtPA-tTpN7-RhZF42eHO7e4VY4Bl88fevXABKKXbc,272
|
|
28
28
|
dingo/io/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
-
dingo/io/output/ResultInfo.py,sha256=
|
|
29
|
+
dingo/io/output/ResultInfo.py,sha256=T0BPigEk9hU7wj5UdjAHFxBfPFg0YecaCPMHVxjVd08,995
|
|
30
30
|
dingo/io/output/SummaryModel.py,sha256=sl05AaeT4yTMQrjp4EVETKmEIa5nOgO0ReGKt-x0wXQ,1008
|
|
31
31
|
dingo/io/output/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
32
|
dingo/model/__init__.py,sha256=CULKDg2nazgoRvg8j2Ue8GBzZnTXwztX-t0REyAs8SQ,56
|
|
@@ -57,25 +57,25 @@ dingo/model/prompt/prompt_image.py,sha256=Y35RwTeWxXeEmdEMEz5BWSpxWi5sPUk-iQrPqX
|
|
|
57
57
|
dingo/model/prompt/prompt_text_language.py,sha256=5NNM2rXJk5tJxJXfALjGtxwV7H97et81Xr26xqVK7_M,2870
|
|
58
58
|
dingo/model/prompt/prompt_text_quality_multilan.py,sha256=_xDdz5ytNvZmP8DM7S4c329usPnCi76ftlNMezmCb94,2173
|
|
59
59
|
dingo/model/prompt/prompt_text_quality_v2.py,sha256=F4W-SmFvRiCKRkN4PEb3vVUecUuUHWknQ_K1eN83krY,3565
|
|
60
|
-
dingo/model/prompt/prompt_text_quality_v3.py,sha256=
|
|
60
|
+
dingo/model/prompt/prompt_text_quality_v3.py,sha256=Nkxr5Jz6rpYBfFtSMlSj8zNIO_StM4kFkWKuEWJ8u7M,3520
|
|
61
61
|
dingo/model/rule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
dingo/model/rule/base.py,sha256=9kxxq59oaCadZiIOxZsnhrUcSJgeesNB-iXepdSp0h4,427
|
|
63
|
-
dingo/model/rule/rule_common.py,sha256=
|
|
63
|
+
dingo/model/rule/rule_common.py,sha256=wnVN3Pncg9Mb4DQ9qk--A3cI0AbkvTvrh9iDoaTPuFQ,45633
|
|
64
64
|
dingo/model/rule/rule_image.py,sha256=0vclF5CXUMk25Gs3uWc0YyP91kOtROns7M_fA6wswl4,5766
|
|
65
65
|
dingo/model/rule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
-
dingo/model/rule/utils/detect_lang.py,sha256=
|
|
66
|
+
dingo/model/rule/utils/detect_lang.py,sha256=wuCxR_JuTTG0Jj2A9nqN581h9NL11W92bzdvDEeQ46M,5344
|
|
67
67
|
dingo/model/rule/utils/image_util.py,sha256=YovsL-uLNNpUsY6iLVFwvO6kwNpPXtAm8aoHAWboafw,187
|
|
68
|
-
dingo/model/rule/utils/
|
|
69
|
-
dingo/model/rule/utils/
|
|
68
|
+
dingo/model/rule/utils/multi_lan_util.py,sha256=D2sgNyRQL8JOuD807_kah0NR59PI9mioK3nZBMpoT54,2710
|
|
69
|
+
dingo/model/rule/utils/util.py,sha256=iwCBUcoKhUUZUnVz-jAhoAQT6j5jcYoNfN9XI2v0CMc,13849
|
|
70
70
|
dingo/run/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
-
dingo/run/cli.py,sha256=
|
|
71
|
+
dingo/run/cli.py,sha256=_Ly3AAQm2xsJ4eOFvxoABhUDYXQKSdiY7sTtRIuD_HU,6687
|
|
72
72
|
dingo/run/vsl.py,sha256=ygmlVdKH99mo2JfVDfMpv2UItjkn6S-eoPRosrxOPM4,7341
|
|
73
73
|
dingo/run/web.py,sha256=Dyl97ur92ecmyf-8JgttdvEEXviWqLtm8iJxtVuauWI,1599
|
|
74
74
|
dingo/utils/__init__.py,sha256=masgEgU90tbPMKtZz5NF1oraNMrx1xLpHQ9B8QMPm9o,37
|
|
75
75
|
dingo/utils/log_util/__init__.py,sha256=B4SurbYC7MqlI9ILM2_gS4QPLYj_UbyPRQQSpcGccdI,721
|
|
76
76
|
dingo/utils/log_util/logger.py,sha256=jliGVit4mHB17nBeXOqbLHrlEWwuZJsNu_xBDmxr42I,1424
|
|
77
|
-
dingo_python-1.2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
|
|
78
|
-
dingo_python-1.2.dist-info/METADATA,sha256=
|
|
79
|
-
dingo_python-1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
80
|
-
dingo_python-1.2.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
|
|
81
|
-
dingo_python-1.2.dist-info/RECORD,,
|
|
77
|
+
dingo_python-1.2.2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
|
|
78
|
+
dingo_python-1.2.2.dist-info/METADATA,sha256=cBUwU1lcfSBtUsbotDm0ha_H7Jf7hF8lwlqzWPPTsdo,10027
|
|
79
|
+
dingo_python-1.2.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
80
|
+
dingo_python-1.2.2.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
|
|
81
|
+
dingo_python-1.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|