dingo-python 1.2__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -66,7 +66,7 @@ class JsonConverter(BaseConverter):
66
66
  'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
67
67
  'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
68
68
  'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
69
- 'raw_data': j
69
+ 'raw_data': v
70
70
  })
71
71
 
72
72
  return _convert
@@ -91,7 +91,7 @@ class PlainConverter(BaseConverter):
91
91
  'data_id': str(cls.data_id),
92
92
  'prompt': '',
93
93
  'content': raw,
94
- 'raw_data': {'data_id':str(cls.data_id), 'content': raw}
94
+ 'raw_data': {'content': raw}
95
95
  })
96
96
  cls.data_id += 1
97
97
  return data
@@ -1,3 +1,20 @@
1
+ # This file is modified from:
2
+ # https://github.com/mlflow/mlflow/blob/master/mlflow/data/dataset.py
3
+ #
4
+ # Copyright 2018 Databricks, Inc. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
1
18
  import json
2
19
  from functools import wraps
3
20
  from abc import abstractmethod
@@ -1,3 +1,20 @@
1
+ # This file is modified from:
2
+ # https://github.com/mlflow/mlflow/blob/master/mlflow/data/dataset_source.py
3
+ #
4
+ # Copyright 2018 Databricks, Inc. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
1
18
  import json
2
19
  from functools import wraps
3
20
  from abc import abstractmethod
dingo/data/utils/digit.py CHANGED
@@ -1,3 +1,20 @@
1
+ # This file is modified from:
2
+ # https://github.com/mlflow/mlflow/blob/master/mlflow/data/digest_utils.py
3
+ #
4
+ # Copyright 2018 Databricks, Inc. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
1
18
  import logging
2
19
  from typing import Any, List
3
20
 
@@ -44,48 +61,6 @@ def compute_pandas_digest(df) -> str:
44
61
  )
45
62
 
46
63
 
47
- def compute_numpy_digest(features, targets=None) -> str:
48
- """Computes a digest for the given numpy array.
49
-
50
- Args:
51
- features: A numpy array containing dataset features.
52
- targets: A numpy array containing dataset targets. Optional.
53
-
54
- Returns:
55
- A string digest.
56
- """
57
- import numpy as np
58
- import pandas as pd
59
-
60
- hashable_elements = []
61
-
62
- def hash_array(array):
63
- flattened_array = array.flatten()
64
- trimmed_array = flattened_array[0:MAX_ROWS]
65
- try:
66
- hashable_elements.append(pd.util.hash_array(trimmed_array))
67
- except TypeError:
68
- hashable_elements.append(np.int64(trimmed_array.size))
69
-
70
- # hash full array dimensions
71
- for x in array.shape:
72
- hashable_elements.append(np.int64(x))
73
-
74
- def hash_dict_of_arrays(array_dict):
75
- for key in sorted(array_dict.keys()):
76
- hash_array(array_dict[key])
77
-
78
- for item in [features, targets]:
79
- if item is None:
80
- continue
81
- if isinstance(item, dict):
82
- hash_dict_of_arrays(item)
83
- else:
84
- hash_array(item)
85
-
86
- return get_normalized_md5_digest(hashable_elements)
87
-
88
-
89
64
  def get_normalized_md5_digest(elements: List[Any]) -> str:
90
65
  """Computes a normalized digest for a list of hashable elements.
91
66
 
dingo/exec/local.py CHANGED
@@ -79,7 +79,6 @@ class LocalExecutor(Executor):
79
79
  if self.input_args.save_data:
80
80
  self.save_data(output_path)
81
81
 
82
- log.debug(self.summary)
83
82
  return [self.summary]
84
83
 
85
84
  def evaluate(self):
@@ -89,7 +88,6 @@ class LocalExecutor(Executor):
89
88
  group (Any): _description_
90
89
  group_type (str): _description_
91
90
  """
92
- log.debug('[get_score]:' + self.input_args.input_path)
93
91
  with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as executor:
94
92
  data_iter = self.load_data()
95
93
  def process_batch(batch: List):
@@ -178,6 +176,7 @@ class LocalExecutor(Executor):
178
176
  self.summary.name_ratio[n] += 1
179
177
 
180
178
  self.summary.total += 1
179
+ log.info(f'[Data Number]: {str(self.summary.total)} ')
181
180
 
182
181
  def evaluate_rule(self, group: List[BaseRule], d: MetaData) -> ResultInfo:
183
182
  result_info = ResultInfo(data_id=d.data_id, prompt=d.prompt, content=d.content)
@@ -275,7 +274,10 @@ class LocalExecutor(Executor):
275
274
  os.makedirs(p_t)
276
275
  f_n = os.path.join(path, t, n) + ".jsonl"
277
276
  with open(f_n, 'a', encoding='utf-8') as f:
278
- str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
277
+ if self.input_args.save_raw:
278
+ str_json = json.dumps(result_info.to_raw_dict(), ensure_ascii=False)
279
+ else:
280
+ str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
279
281
  f.write(str_json + '\n')
280
282
  if self.input_args.save_correct:
281
283
  for result_info in self.good_info_list:
@@ -287,7 +289,10 @@ class LocalExecutor(Executor):
287
289
  os.makedirs(p_t)
288
290
  f_n = os.path.join(path, t, n) + ".jsonl"
289
291
  with open(f_n, 'a', encoding='utf-8') as f:
290
- str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
292
+ if self.input_args.save_raw:
293
+ str_json = json.dumps(result_info.to_raw_dict(), ensure_ascii=False)
294
+ else:
295
+ str_json = json.dumps(result_info.to_dict(), ensure_ascii=False)
291
296
  f.write(str_json + '\n')
292
297
 
293
298
  with open(path + '/summary.json', 'w', encoding='utf-8') as f:
dingo/exec/spark.py CHANGED
@@ -31,6 +31,7 @@ class SparkExecutor(Executor):
31
31
  spark_conf: SparkConf = None):
32
32
  # eval param
33
33
  self.llm: Optional[BaseLLM] = None
34
+ self.group: Optional[Dict] = None
34
35
  self.summary: Optional[SummaryModel] = None
35
36
  self.bad_info_list: Optional[RDD] = None
36
37
  self.good_info_list: Optional[RDD] = None
@@ -72,6 +73,7 @@ class SparkExecutor(Executor):
72
73
  def execute(self) -> List[SummaryModel]:
73
74
  create_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
74
75
  Model.apply_config(self.input_args.custom_config, self.input_args.eval_group)
76
+ self.group = Model.get_group(self.input_args.eval_group)
75
77
  if GlobalConfig.config and GlobalConfig.config.llm_config:
76
78
  for llm_name in GlobalConfig.config.llm_config:
77
79
  self.llm = Model.get_llm(llm_name)
@@ -148,7 +150,7 @@ class SparkExecutor(Executor):
148
150
  good_name_list = []
149
151
  bad_reason_list = []
150
152
  good_reason_list = []
151
- for group_type, group in Model.get_group(self.input_args.eval_group).items():
153
+ for group_type, group in self.group.items():
152
154
  if group_type == 'rule':
153
155
  r_i = self.evaluate_rule(group, data)
154
156
  elif group_type == 'prompt':
@@ -1,8 +1,7 @@
1
1
  import os
2
2
  from typing import Optional
3
3
 
4
- from pydantic import BaseModel
5
-
4
+ from pydantic import BaseModel, ValidationError
6
5
 
7
6
  class InputArgs(BaseModel):
8
7
  """
@@ -35,10 +34,15 @@ class InputArgs(BaseModel):
35
34
 
36
35
  custom_config: Optional[str | dict] = None
37
36
 
38
- def __init__(self, **kwargs):
39
- super().__init__(**kwargs)
37
+ class Config:
38
+ extra = 'forbid' # Forbid extra parameters
40
39
 
41
- self.check_args()
40
+ def __init__(self, **kwargs):
41
+ try:
42
+ super().__init__(**kwargs)
43
+ self.check_args()
44
+ except ValidationError as e:
45
+ raise ValueError(f"Invalid input parameters: {e}")
42
46
 
43
47
  def check_args(self):
44
48
  # check eval group
@@ -46,7 +50,7 @@ class InputArgs(BaseModel):
46
50
  raise ValueError("eval_group cannot be empty.")
47
51
 
48
52
  # check input path
49
- if self.dataset != 'hugging_face' and not os.path.exists(self.input_path):
53
+ if self.dataset != 'hugging_face' and not os.path.exists(self.input_path):
50
54
  raise FileNotFoundError(f"Input path '{self.input_path}' does not exist.")
51
55
 
52
56
  # check save_data/save_correct
@@ -23,4 +23,14 @@ class ResultInfo(BaseModel):
23
23
  'name_list': self.name_list,
24
24
  'reason_list': self.reason_list,
25
25
  'raw_data': self.raw_data
26
- }
26
+ }
27
+
28
+ def to_raw_dict(self):
29
+ dingo_result = {
30
+ 'error_status': self.error_status,
31
+ 'type_list': self.type_list,
32
+ 'name_list': self.name_list,
33
+ 'reason_list': self.reason_list,
34
+ }
35
+ self.raw_data['dingo_result'] = dingo_result
36
+ return self.raw_data
@@ -15,11 +15,12 @@ Your primary objective is to assess the suitability of this dataset for training
15
15
  1. Completeness
16
16
  1.1 Error_Formula_Table: If the text contains formulas or tables, then their format or content is incorrect.
17
17
  1.2 Error_List_Number: If the text contains list number, numbers in the list number have inconsistent formats and logical discontinuity.
18
- 1.3 Error_Section_Order: The order of text is cluttered, such as titles embedded in the main text or structures that are difficult to read.
18
+ 1.3 Error_Line_Segment: The text contains sentences unreasonably divided into multiple lines by line breaks; Or the text contains segments stuck together due to lacking line breaks.
19
19
  2. Effectiveness
20
20
  2.1 Error_Garbled_Characters: The text contains a large amount of garbled and anti crawler characters.
21
21
  2.2 Error_Words_Stuck: The text contains a large number of words that are stuck together without being separated by spaces. Words with hyphens are considered normal and treat newline characters (\n) as spaces.
22
- 2.3 Error_Lack_Punctuation: Text contains a large number of words piled up, which cannot form a sentence when connected together.
22
+ 2.3 Error_Lack_Punctuation: The text contains a large number of words piled up, which cannot form a sentence when connected together.
23
+ 2.4 Error_Empty_Content: The text contains no other characters except for spaces, line breaks, carriage returns, and tabs.
23
24
  3. Similarity
24
25
  3.1 Error_Duplicate_Content: The text contains consecutive repeated text and multiple occurrences of characters.
25
26
  4. Security
@@ -31,8 +32,8 @@ Your primary objective is to assess the suitability of this dataset for training
31
32
  -If the text does not hit any negative criteria above, type must only be 'Good'; otherwise, type must only be one of the list ['Completeness', 'Effectiveness', 'Similarity', 'Security'].
32
33
  3. Assign a name to the text.
33
34
  -If type is 'Good', name must only be 'None'.
34
- -If type is "Completeness", name must only be one of the list ["Error_Formula_Table", "Error_List_Number", "Error_Section_Order"]
35
- -If type is "Effectiveness", name must only be one of the list ["Error_Garbled_Characters", "Error_Words_Stuck" or "Error_Lack_Punctuation"]
35
+ -If type is "Completeness", name must only be one of the list ["Error_Formula_Table", "Error_List_Number", "Error_Line_Segment"]
36
+ -If type is "Effectiveness", name must only be one of the list ["Error_Garbled_Characters", "Error_Words_Stuck", "Error_Lack_Punctuation" or "Error_Empty_Content"]
36
37
  -If type is "Similarity", name must only be one of the list ["Error_Duplicate_Content"]
37
38
  -If type is "Security", name must only be one of the list ["Error_Political_Content", "Error_Prohibited_Content"]
38
39
  4. Assign a score to the text according the type. If the type is "Good", score is 1, otherwise the score is 0.
@@ -38,10 +38,12 @@ class RuleAlphaWords(BaseRule):
38
38
  def eval(cls, input_data: MetaData) -> ModelRes:
39
39
  from nltk.tokenize import word_tokenize
40
40
 
41
- from dingo.model.rule.utils.detect_lang import decide_language_by_str
41
+ from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
42
42
 
43
43
  res = ModelRes()
44
44
  content = input_data.content
45
+ if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
46
+ set_fasttext(cls.dynamic_config.refer_path[0])
45
47
  language = decide_language_by_str(content)
46
48
  if language != 'en':
47
49
  return res
@@ -76,6 +78,8 @@ class RuleCapitalWords(BaseRule):
76
78
  content = input_data.content
77
79
  words = WordPunctTokenizer().tokenize(content)
78
80
  num_words = len(words)
81
+ if num_words == 0:
82
+ return res
79
83
  num_caps_words = sum(map(str.isupper, words))
80
84
  ratio = num_caps_words / num_words
81
85
  if ratio > cls.dynamic_config.threshold and num_words < 200:
@@ -149,7 +153,9 @@ class RuleColonEnd(BaseRule):
149
153
  return res
150
154
 
151
155
 
152
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
156
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
157
+ 'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi',
158
+ 'multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1','pdf'])
153
159
  class RuleContentNull(BaseRule):
154
160
  """check whether content is null"""
155
161
 
@@ -167,7 +173,7 @@ class RuleContentNull(BaseRule):
167
173
  return res
168
174
 
169
175
 
170
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all', 'qa_standard_v1'])
176
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all', 'qa_standard_v1','pdf'])
171
177
  class RuleContentShort(BaseRule):
172
178
 
173
179
  dynamic_config = DynamicRuleConfig(threshold = 20)
@@ -184,7 +190,8 @@ class RuleContentShort(BaseRule):
184
190
  return res
185
191
 
186
192
 
187
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
193
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
194
+ 'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
188
195
  class RuleContentShortMultiLan(BaseRule):
189
196
  """check whether content is too short."""
190
197
 
@@ -216,9 +223,11 @@ class RuleCurlyBracket(BaseRule):
216
223
  def eval(cls, input_data: MetaData) -> ModelRes:
217
224
  res = ModelRes()
218
225
  content = input_data.content
226
+ if len(content) == 0:
227
+ return res
219
228
 
220
229
  num = content.count('{') + content.count('}')
221
- ratio = num / len(content) if len(content) !=0 else 0
230
+ ratio = num / len(content)
222
231
  if ratio > cls.dynamic_config.threshold:
223
232
  res.error_status = True
224
233
  res.type = cls.metric_type
@@ -227,7 +236,9 @@ class RuleCurlyBracket(BaseRule):
227
236
  return res
228
237
 
229
238
 
230
- @Model.rule_register('QUALITY_BAD_SIMILARITY', ['default','sft','pretrain','benchmark','text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
239
+ @Model.rule_register('QUALITY_BAD_SIMILARITY', ['default','sft','pretrain','benchmark','text_base_all',
240
+ 'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
241
+ 'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','pdf'])
231
242
  class RuleDocRepeat(BaseRule):
232
243
  """check whether content repeats"""
233
244
 
@@ -247,7 +258,9 @@ class RuleDocRepeat(BaseRule):
247
258
  return res
248
259
 
249
260
 
250
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr', 'qa_standard_v1'])
261
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
262
+ 'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
263
+ 'multi_lan_sr', 'qa_standard_v1','pdf'])
251
264
  class RuleEnterMore(BaseRule):
252
265
  """check whether content has 8 consecutive carriage returns."""
253
266
 
@@ -269,7 +282,9 @@ class RuleEnterMore(BaseRule):
269
282
  return res
270
283
 
271
284
 
272
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr', 'qa_standard_v1'])
285
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
286
+ 'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
287
+ 'multi_lan_sr', 'qa_standard_v1','pdf'])
273
288
  class RuleEnterRatioMore(BaseRule):
274
289
  """check whether the number of enter / the number of content > 25%"""
275
290
 
@@ -299,7 +314,7 @@ class RuleHeadWordAr(BaseRule):
299
314
 
300
315
  @classmethod
301
316
  def eval(cls, input_data: MetaData) -> ModelRes:
302
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
317
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
303
318
 
304
319
  res = ModelRes()
305
320
  keyword = get_xyz_head_word("ar")
@@ -321,7 +336,7 @@ class RuleHeadWordCs(BaseRule):
321
336
 
322
337
  @classmethod
323
338
  def eval(cls, input_data: MetaData) -> ModelRes:
324
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
339
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
325
340
 
326
341
  res = ModelRes()
327
342
  keyword = get_xyz_head_word("cs")
@@ -343,7 +358,7 @@ class RuleHeadWordHu(BaseRule):
343
358
 
344
359
  @classmethod
345
360
  def eval(cls, input_data: MetaData) -> ModelRes:
346
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
361
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
347
362
 
348
363
  res = ModelRes()
349
364
  keyword = get_xyz_head_word("hu")
@@ -354,7 +369,7 @@ class RuleHeadWordHu(BaseRule):
354
369
  res.type = cls.metric_type
355
370
  res.name = cls.__name__
356
371
  res.reason = ['Content has irrelevance tail source info.']
357
- return res
372
+ return res
358
373
 
359
374
 
360
375
  @Model.rule_register('QUALITY_BAD_RELEVANCE', ['multi_lan_ko'])
@@ -365,7 +380,7 @@ class RuleHeadWordKo(BaseRule):
365
380
 
366
381
  @classmethod
367
382
  def eval(cls, input_data: MetaData) -> ModelRes:
368
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
383
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
369
384
 
370
385
  res = ModelRes()
371
386
  keyword = get_xyz_head_word("ko")
@@ -387,7 +402,7 @@ class RuleHeadWordRu(BaseRule):
387
402
 
388
403
  @classmethod
389
404
  def eval(cls, input_data: MetaData) -> ModelRes:
390
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
405
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
391
406
 
392
407
  res = ModelRes()
393
408
  keyword = get_xyz_head_word("ru")
@@ -409,7 +424,7 @@ class RuleHeadWordSr(BaseRule):
409
424
 
410
425
  @classmethod
411
426
  def eval(cls, input_data: MetaData) -> ModelRes:
412
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
427
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
413
428
 
414
429
  res = ModelRes()
415
430
  keyword = get_xyz_head_word("sr")
@@ -431,7 +446,7 @@ class RuleHeadWordTh(BaseRule):
431
446
 
432
447
  @classmethod
433
448
  def eval(cls, input_data: MetaData) -> ModelRes:
434
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
449
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
435
450
 
436
451
  res = ModelRes()
437
452
  keyword = get_xyz_head_word("th")
@@ -453,7 +468,7 @@ class RuleHeadWordVi(BaseRule):
453
468
 
454
469
  @classmethod
455
470
  def eval(cls, input_data: MetaData) -> ModelRes:
456
- from dingo.model.rule.utils.xyz_head_word import get_xyz_head_word
471
+ from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word
457
472
 
458
473
  res = ModelRes()
459
474
  keyword = get_xyz_head_word("vi")
@@ -467,7 +482,9 @@ class RuleHeadWordVi(BaseRule):
467
482
  return res
468
483
 
469
484
 
470
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
485
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
486
+ 'multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi',
487
+ 'multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1','pdf'])
471
488
  class RuleHtmlEntity(BaseRule):
472
489
  """check whether content has html entity"""
473
490
 
@@ -491,6 +508,8 @@ class RuleHtmlEntity(BaseRule):
491
508
  def eval(cls, input_data: MetaData) -> ModelRes:
492
509
  res = ModelRes()
493
510
  content = input_data.content
511
+ if len(content) == 0:
512
+ return res
494
513
 
495
514
  entities = cls.dynamic_config.key_list
496
515
  full_entities_1 = [f"&{entity};" for entity in entities]
@@ -520,7 +539,9 @@ class RuleHtmlEntity(BaseRule):
520
539
  return res
521
540
 
522
541
 
523
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
542
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
543
+ 'multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr',
544
+ 'qa_standard_v1','pdf'])
524
545
  class RuleHtmlTag(BaseRule):
525
546
  """check whether content has image links or html tags."""
526
547
 
@@ -530,6 +551,8 @@ class RuleHtmlTag(BaseRule):
530
551
  def eval(cls, input_data: MetaData) -> ModelRes:
531
552
  res = ModelRes()
532
553
  content = input_data.content
554
+ if len(content) == 0:
555
+ return res
533
556
 
534
557
  matches = re.findall('|'.join(cls.dynamic_config.key_list), content)
535
558
  num = len(matches)
@@ -563,7 +586,9 @@ class RuleIDCard(BaseRule):
563
586
  return res
564
587
 
565
588
 
566
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr', 'qa_standard_v1'])
589
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','multi_lan_ar','multi_lan_ko','multi_lan_ru',
590
+ 'multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr',
591
+ 'qa_standard_v1'])
567
592
  class RuleInvisibleChar(BaseRule):
568
593
  """check whether content has invisible chars."""
569
594
 
@@ -573,6 +598,8 @@ class RuleInvisibleChar(BaseRule):
573
598
  def eval(cls, input_data: MetaData) -> ModelRes:
574
599
  res = ModelRes()
575
600
  content = input_data.content
601
+ if len(content) == 0:
602
+ return res
576
603
 
577
604
  matches = re.findall(cls.dynamic_config.pattern, content)
578
605
  num = len(matches)
@@ -791,7 +818,9 @@ class RuleMeanWordLength(BaseRule):
791
818
  return res
792
819
 
793
820
 
794
- @Model.rule_register('QUALITY_BAD_FLUENCY', ['default','sft','pretrain','benchmark','text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
821
+ @Model.rule_register('QUALITY_BAD_FLUENCY', ['default','sft','pretrain','benchmark','text_base_all',
822
+ 'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
823
+ 'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
795
824
  class RuleNoPunc(BaseRule):
796
825
  """check whether paragraph has no punctuation."""
797
826
 
@@ -799,10 +828,12 @@ class RuleNoPunc(BaseRule):
799
828
 
800
829
  @classmethod
801
830
  def eval(cls, input_data: MetaData) -> ModelRes:
802
- from dingo.model.rule.utils.detect_lang import decide_language_by_str
831
+ from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
803
832
 
804
833
  res = ModelRes()
805
834
  content = input_data.content
835
+ if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
836
+ set_fasttext(cls.dynamic_config.refer_path[0])
806
837
  language = decide_language_by_str(content)
807
838
  if language != 'en':
808
839
  return res
@@ -867,7 +898,9 @@ class RuleSentenceNumber(BaseRule):
867
898
  return res
868
899
 
869
900
 
870
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
901
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
902
+ 'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
903
+ 'multi_lan_sr','qa_standard_v1','pdf'])
871
904
  class RuleSpaceMore(BaseRule):
872
905
  """check whether content has 500 spaces."""
873
906
 
@@ -887,7 +920,10 @@ class RuleSpaceMore(BaseRule):
887
920
  return res
888
921
 
889
922
 
890
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
923
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['default','sft','pretrain','benchmark','text_base_all',
924
+ 'llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th',
925
+ 'multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1',
926
+ 'pdf'])
891
927
  class RuleSpecialCharacter(BaseRule):
892
928
  """check whether content has special characters. """
893
929
 
@@ -897,7 +933,8 @@ class RuleSpecialCharacter(BaseRule):
897
933
  # r"(\\\\;){3,}|(\{\}){3,}|(&nbsp;){3,}",
898
934
  r"&#247;|\? :",
899
935
  r"[�□]|\{\/U\}",
900
- r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]"
936
+ r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]",
937
+ r"<\|.*?\|>"
901
938
  ]
902
939
  )
903
940
 
@@ -905,6 +942,8 @@ class RuleSpecialCharacter(BaseRule):
905
942
  def eval(cls, input_data: MetaData) -> ModelRes:
906
943
  res = ModelRes()
907
944
  content = input_data.content
945
+ if len(content) == 0:
946
+ return res
908
947
 
909
948
  matches = []
910
949
  num = 0
@@ -930,11 +969,13 @@ class RuleStopWord(BaseRule):
930
969
  def eval(cls, input_data: MetaData) -> ModelRes:
931
970
  from nltk.tokenize import WordPunctTokenizer
932
971
 
933
- from dingo.model.rule.utils.detect_lang import decide_language_by_str
972
+ from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
934
973
  from dingo.model.rule.utils.util import get_stop_words
935
974
 
936
975
  res = ModelRes()
937
976
  raw_content = input_data.content
977
+ if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
978
+ set_fasttext(cls.dynamic_config.refer_path[0])
938
979
  language = decide_language_by_str(raw_content)
939
980
  if language != 'en':
940
981
  return res
@@ -1018,7 +1059,32 @@ class RuleUniqueWords(BaseRule):
1018
1059
  return res
1019
1060
 
1020
1061
 
1021
- @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr','qa_standard_v1'])
1062
+ @Model.rule_register("QUALITY_BAD_SECURITY", [])
1063
+ class RuleUnsafeWords(BaseRule):
1064
+ """check whether content contains unsafe words."""
1065
+
1066
+ dynamic_config = DynamicRuleConfig(refer_path=[])
1067
+
1068
+ @classmethod
1069
+ def eval(cls, input_data: MetaData) -> ModelRes:
1070
+ from dingo.model.rule.utils.util import get_unsafe_words
1071
+
1072
+ res = ModelRes()
1073
+ content = input_data.content
1074
+ if cls.dynamic_config.key_list is None:
1075
+ cls.dynamic_config.key_list = get_unsafe_words(cls.dynamic_config.refer_path)
1076
+ matches = list(filter(lambda x:x in content, cls.dynamic_config.key_list))
1077
+ if matches:
1078
+ res.error_status = True
1079
+ res.type = cls.metric_type
1080
+ res.name = cls.__name__
1081
+ res.reason = matches
1082
+ return res
1083
+
1084
+
1085
+ @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
1086
+ 'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
1087
+ 'multi_lan_sr','qa_standard_v1','pdf'])
1022
1088
  class RuleOnlyUrl(BaseRule):
1023
1089
  """check whether content is only an url link."""
1024
1090
 
@@ -1027,8 +1093,12 @@ class RuleOnlyUrl(BaseRule):
1027
1093
  @classmethod
1028
1094
  def eval(cls, input_data: MetaData) -> ModelRes:
1029
1095
  res = ModelRes()
1096
+ content = input_data.content
1097
+ if len(content.strip()) == 0:
1098
+ return res
1030
1099
  SEARCH_REGEX = re.compile(cls.dynamic_config.pattern)
1031
- content_without_url = SEARCH_REGEX.sub("", input_data.content)
1100
+ content_without_url = SEARCH_REGEX.sub("", content)
1101
+ print(content_without_url)
1032
1102
  if len(content_without_url.strip()) == 0:
1033
1103
  res.error_status = True
1034
1104
  res.type = cls.metric_type
@@ -1098,7 +1168,9 @@ class RuleWordSplit(BaseRule):
1098
1168
  return res
1099
1169
 
1100
1170
 
1101
- @Model.rule_register('QUALITY_BAD_FLUENCY', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko','multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu','multi_lan_sr'])
1171
+ @Model.rule_register('QUALITY_BAD_FLUENCY', ['text_base_all','llm_base','multi_lan_ar','multi_lan_ko',
1172
+ 'multi_lan_ru','multi_lan_th','multi_lan_vi','multi_lan_cs','multi_lan_hu',
1173
+ 'multi_lan_sr'])
1102
1174
  class RuleWordStuck(BaseRule):
1103
1175
  """check whether words are stuck."""
1104
1176
 
@@ -1116,11 +1188,13 @@ class RuleWordStuck(BaseRule):
1116
1188
  def eval(cls, input_data: MetaData) -> ModelRes:
1117
1189
  import wordninja
1118
1190
 
1119
- from dingo.model.rule.utils.detect_lang import decide_language_by_str
1191
+ from dingo.model.rule.utils.detect_lang import decide_language_by_str, set_fasttext
1120
1192
  from dingo.model.rule.utils.util import is_sha256
1121
1193
 
1122
1194
  res = ModelRes()
1123
1195
  content = input_data.content
1196
+ if cls.dynamic_config.refer_path is not None and len(cls.dynamic_config.refer_path) != 0:
1197
+ set_fasttext(cls.dynamic_config.refer_path[0])
1124
1198
  language = decide_language_by_str(content)
1125
1199
  if language != 'en':
1126
1200
  return res
@@ -1148,7 +1222,7 @@ if __name__ == '__main__':
1148
1222
  data = MetaData(
1149
1223
  data_id = '',
1150
1224
  prompt = '',
1151
- content = " �FA OR FICTION? WH CA IT DO?{{{{{{{{{{{ "
1225
+ content = " \n \n"
1152
1226
  )
1153
- tmp = RuleSpecialCharacter().eval(data)
1227
+ tmp = RuleOnlyUrl().eval(data)
1154
1228
  print(tmp)
@@ -6,9 +6,15 @@ from huggingface_hub import hf_hub_download
6
6
  from dingo.utils import log
7
7
 
8
8
  _global_lang_detect = []
9
+ _fasttext_path = ''
9
10
 
11
+ def set_fasttext(path: str):
12
+ global _fasttext_path
13
+ _fasttext_path = path
10
14
 
11
15
  def download_fasttext() -> str:
16
+ if _fasttext_path:
17
+ return _fasttext_path
12
18
  file_path = hf_hub_download(repo_id='chupei/fasttext.lib.176.bin', filename='lid.176.bin')
13
19
  return file_path
14
20
 
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import re
2
3
  import os
3
4
  import sys
@@ -6,7 +7,7 @@ import string
6
7
  import unicodedata
7
8
  import zhon.hanzi
8
9
 
9
- from typing import Set, Tuple, Callable
10
+ from typing import Set, Tuple, Callable, List
10
11
  from collections import Counter
11
12
  from zhon.hanzi import punctuation
12
13
 
@@ -61,6 +62,16 @@ class TextSlice:
61
62
  self.end = end
62
63
 
63
64
 
65
+ def get_unsafe_words(file_path_list: List[str]) -> List:
66
+ unsafe_words_list = []
67
+ for file_path in file_path_list:
68
+ with open(file_path, 'r', encoding='utf-8') as f:
69
+ for line in f:
70
+ j = json.loads(line)
71
+ word = str(j['word'])
72
+ unsafe_words_list.append(word)
73
+ return unsafe_words_list
74
+
64
75
  def split_paragraphs(
65
76
  text: str, normalizer: Callable[[str], str], remove_empty: bool = True
66
77
  ) -> Tuple[TextSlice]:
dingo/run/cli.py CHANGED
@@ -25,6 +25,8 @@ def parse_args():
25
25
  default=None, help="Save data in output path")
26
26
  parser.add_argument("--save_correct", type=bool,
27
27
  default=None, help="Save correct data in output path")
28
+ parser.add_argument("--save_raw", type=bool,
29
+ default=None, help="Save raw data in output path")
28
30
  parser.add_argument("--data_format", type=str,
29
31
  default=None, choices=['json', 'jsonl', 'listjson', 'plaintext', 'image', 's3_image'],
30
32
  help="Dataset format (in ['json', 'jsonl', 'listjson', 'plaintext', 'image', 's3_image']), default is 'json'")
@@ -100,6 +102,8 @@ if __name__ == '__main__':
100
102
  input_data['save_data'] = args.save_data
101
103
  if args.save_correct:
102
104
  input_data['save_correct'] = args.save_correct
105
+ if args.save_raw:
106
+ input_data['save_raw'] = args.save_raw
103
107
  if args.data_format:
104
108
  input_data['data_format'] = args.data_format
105
109
  if args.dataset:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dingo-python
3
- Version: 1.2
3
+ Version: 1.2.2
4
4
  Summary: Language quality evaluation tool.
5
5
  Home-page: https://github.com/shijinpjlab/Dingo/main
6
6
  Author: SH AI Lab
@@ -16,10 +16,12 @@ Requires-Dist: chardet
16
16
  Requires-Dist: datasets
17
17
  Requires-Dist: fasttext-wheel ==0.9.2
18
18
  Requires-Dist: hanziconv
19
+ Requires-Dist: httpx ==0.27.2
19
20
  Requires-Dist: huggingface-hub
20
21
  Requires-Dist: jieba
21
22
  Requires-Dist: jsonlines
22
23
  Requires-Dist: langid
24
+ Requires-Dist: openai ==1.56.2
23
25
  Requires-Dist: opencv-python
24
26
  Requires-Dist: packaging
25
27
  Requires-Dist: pandas
@@ -3,30 +3,30 @@ dingo/config/__init__.py,sha256=8qPvpZTKUBcZqAqu2S5b2P-GRQGMV6VwGYl8bvQDLI0,45
3
3
  dingo/config/config.py,sha256=EstF5mWkLrA24Eg1SbOir1fVTCN_-9n3w4rz54RjJcA,1883
4
4
  dingo/data/__init__.py,sha256=X7ZLiJN8vDpzRufwpJ9E36phqbW9gEpMT68TxzWLDt4,172
5
5
  dingo/data/converter/__init__.py,sha256=1MiG4H8Sg2sYHQmYdg0F9_1okP_YoMNHyQorPEAf6zw,91
6
- dingo/data/converter/base.py,sha256=Y81bQtc56hKeziLoB5IIdvKJtqHNs2XNzvC2IWM7QbE,6705
6
+ dingo/data/converter/base.py,sha256=hvXPtYtHR97_LeywgKrEwy7Trr6Yx-qL1WCCi4_R_zs,6677
7
7
  dingo/data/converter/img_utils.py,sha256=Pjy4Db3bETAuRmkVO5GzUxTE_hNJhnYyQEJXd_nHaXk,3516
8
8
  dingo/data/dataset/__init__.py,sha256=AdBLdr3j4NN-wGvQOuPi_jmzkMcggJApdQ24spLN3-U,405
9
- dingo/data/dataset/base.py,sha256=o_gqMquedxl2zWxanCcq25XiqDhtvsjGX0CqSALoMGo,4813
9
+ dingo/data/dataset/base.py,sha256=0mr2qXhfu6hhJ0Dz2nk7S_ZQd4k7PfMfxEprRzkJNnE,5518
10
10
  dingo/data/dataset/huggingface.py,sha256=kHtfXKSptxv3hQRGv6WNFMyN3m0nr7CECPU8ESLXGfQ,7181
11
11
  dingo/data/dataset/local.py,sha256=6HSfF4vGANh1KcxNyjohOqKrnqPzjjC11gHFZK5aITQ,2654
12
12
  dingo/data/dataset/spark.py,sha256=lBubZM7lJrPHO6hcnuD39eEtDB7nuLjWNbQi6jQCItI,4119
13
13
  dingo/data/datasource/__init__.py,sha256=nr7dX7c2ylLBJVU9gnAcZzqMTYMRTbhopVAO_dOs0Dw,427
14
- dingo/data/datasource/base.py,sha256=_Fy92I3LntR0H8Zc-eX6fe3GFhM7de1Lz1--pyOkuQo,2220
14
+ dingo/data/datasource/base.py,sha256=T9y8uxMegHgbb6o7aPbmeLIr5xSAOOl7k5Fpab6jZNc,2931
15
15
  dingo/data/datasource/huggingface.py,sha256=-0JCr8f1cOAmWIqZnO8E10QEJ4tiWxSftoUOE6woZI4,3744
16
16
  dingo/data/datasource/local.py,sha256=GEa3-P5FTdeS-SWRyLRCew9WXfNl8E9I6AUXSU4eJlg,2672
17
17
  dingo/data/datasource/s3.py,sha256=5u8TZN67qVjJD3QQSGEeSmldBHY0HeEvm0s3HB3W0BU,2778
18
18
  dingo/data/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- dingo/data/utils/digit.py,sha256=NyfCdjo6lGro0jILSORrEZLSxn06FhwZ9Nn6o_mKISI,2882
19
+ dingo/data/utils/digit.py,sha256=V_Cy8o0t0JdBHOJZmi0A6nSczSfi2AbdE23fcWbTN_s,2415
20
20
  dingo/data/utils/insecure_hash.py,sha256=1FnevDyjeOrtsBQVlckJDEbk6mItMvfj07_Ut7oBioo,447
21
21
  dingo/exec/__init__.py,sha256=5faQMKKWAx4OFxBNN5FOnJuqjf-iXBEDWGOYxnVogV4,341
22
22
  dingo/exec/base.py,sha256=upeqzXVX7IiFWfcx3XFdaVmPmZIKMJ0-EtgCRt_Ws-w,1311
23
- dingo/exec/local.py,sha256=H4UJrzdHTnFp50p981rN0_Btqqdb0gfUa0sUXFtGS10,12303
24
- dingo/exec/spark.py,sha256=j5af-tdbciTRUQwaC1U2jNb9JntbYZ8WQ1NBqPPwHsQ,11426
23
+ dingo/exec/local.py,sha256=_l3e5mIEWt8YNYAho8cKWw26yUeGj7jODchIc_bJKTA,12623
24
+ dingo/exec/spark.py,sha256=7M-pG78Ugp1Shy20_cTe3-eIl7sTWCQ72KiNE3xoPww,11500
25
25
  dingo/io/__init__.py,sha256=XxTZKh8nVsoYjfPriaTvW7Or7lNM_11SjJ8uC-T3kws,196
26
- dingo/io/input/InputArgs.py,sha256=7yh5XVLXrrAJeDEzbgiSE4vq-7BxfFuYa2Dk4E_0nvM,2223
26
+ dingo/io/input/InputArgs.py,sha256=v2O8gpR8Jni861eV5eU9Am8d4RgypJuYbPDiKdeOCdQ,2429
27
27
  dingo/io/input/MetaData.py,sha256=BhJtPA-tTpN7-RhZF42eHO7e4VY4Bl88fevXABKKXbc,272
28
28
  dingo/io/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
- dingo/io/output/ResultInfo.py,sha256=rpy-ImKV5HfAd65IETrAw4W0X8oSSU2BTqfjBtfVoK0,676
29
+ dingo/io/output/ResultInfo.py,sha256=T0BPigEk9hU7wj5UdjAHFxBfPFg0YecaCPMHVxjVd08,995
30
30
  dingo/io/output/SummaryModel.py,sha256=sl05AaeT4yTMQrjp4EVETKmEIa5nOgO0ReGKt-x0wXQ,1008
31
31
  dingo/io/output/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  dingo/model/__init__.py,sha256=CULKDg2nazgoRvg8j2Ue8GBzZnTXwztX-t0REyAs8SQ,56
@@ -57,25 +57,25 @@ dingo/model/prompt/prompt_image.py,sha256=Y35RwTeWxXeEmdEMEz5BWSpxWi5sPUk-iQrPqX
57
57
  dingo/model/prompt/prompt_text_language.py,sha256=5NNM2rXJk5tJxJXfALjGtxwV7H97et81Xr26xqVK7_M,2870
58
58
  dingo/model/prompt/prompt_text_quality_multilan.py,sha256=_xDdz5ytNvZmP8DM7S4c329usPnCi76ftlNMezmCb94,2173
59
59
  dingo/model/prompt/prompt_text_quality_v2.py,sha256=F4W-SmFvRiCKRkN4PEb3vVUecUuUHWknQ_K1eN83krY,3565
60
- dingo/model/prompt/prompt_text_quality_v3.py,sha256=CcEchxdCYzs1gHd8Aj0tFJVugF9__KldzK7XVubFLV0,3325
60
+ dingo/model/prompt/prompt_text_quality_v3.py,sha256=Nkxr5Jz6rpYBfFtSMlSj8zNIO_StM4kFkWKuEWJ8u7M,3520
61
61
  dingo/model/rule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
62
  dingo/model/rule/base.py,sha256=9kxxq59oaCadZiIOxZsnhrUcSJgeesNB-iXepdSp0h4,427
63
- dingo/model/rule/rule_common.py,sha256=I5Z9eivaIq4H6Z9iNlf2EWrCIwASpGrTK5rmAbaVqng,42375
63
+ dingo/model/rule/rule_common.py,sha256=wnVN3Pncg9Mb4DQ9qk--A3cI0AbkvTvrh9iDoaTPuFQ,45633
64
64
  dingo/model/rule/rule_image.py,sha256=0vclF5CXUMk25Gs3uWc0YyP91kOtROns7M_fA6wswl4,5766
65
65
  dingo/model/rule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
- dingo/model/rule/utils/detect_lang.py,sha256=IcUgchI2cKZis6qqSj8YOwggTm75Mh2JGEhXxEdNUUo,5190
66
+ dingo/model/rule/utils/detect_lang.py,sha256=wuCxR_JuTTG0Jj2A9nqN581h9NL11W92bzdvDEeQ46M,5344
67
67
  dingo/model/rule/utils/image_util.py,sha256=YovsL-uLNNpUsY6iLVFwvO6kwNpPXtAm8aoHAWboafw,187
68
- dingo/model/rule/utils/util.py,sha256=SUuho9NmUU6gIaDOgroGXCFAhZFejFEH4u7SUI-LYEg,13473
69
- dingo/model/rule/utils/xyz_head_word.py,sha256=D2sgNyRQL8JOuD807_kah0NR59PI9mioK3nZBMpoT54,2710
68
+ dingo/model/rule/utils/multi_lan_util.py,sha256=D2sgNyRQL8JOuD807_kah0NR59PI9mioK3nZBMpoT54,2710
69
+ dingo/model/rule/utils/util.py,sha256=iwCBUcoKhUUZUnVz-jAhoAQT6j5jcYoNfN9XI2v0CMc,13849
70
70
  dingo/run/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
- dingo/run/cli.py,sha256=fL1Vo8nd4UAPtDvzZi4exPorbz1Fyx0Wvej2dFvEyE0,6486
71
+ dingo/run/cli.py,sha256=_Ly3AAQm2xsJ4eOFvxoABhUDYXQKSdiY7sTtRIuD_HU,6687
72
72
  dingo/run/vsl.py,sha256=ygmlVdKH99mo2JfVDfMpv2UItjkn6S-eoPRosrxOPM4,7341
73
73
  dingo/run/web.py,sha256=Dyl97ur92ecmyf-8JgttdvEEXviWqLtm8iJxtVuauWI,1599
74
74
  dingo/utils/__init__.py,sha256=masgEgU90tbPMKtZz5NF1oraNMrx1xLpHQ9B8QMPm9o,37
75
75
  dingo/utils/log_util/__init__.py,sha256=B4SurbYC7MqlI9ILM2_gS4QPLYj_UbyPRQQSpcGccdI,721
76
76
  dingo/utils/log_util/logger.py,sha256=jliGVit4mHB17nBeXOqbLHrlEWwuZJsNu_xBDmxr42I,1424
77
- dingo_python-1.2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
78
- dingo_python-1.2.dist-info/METADATA,sha256=BbTFYYiWeigLydAhycy-RugD67BksnttG_v4Gt7TsO4,9964
79
- dingo_python-1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
80
- dingo_python-1.2.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
81
- dingo_python-1.2.dist-info/RECORD,,
77
+ dingo_python-1.2.2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
78
+ dingo_python-1.2.2.dist-info/METADATA,sha256=cBUwU1lcfSBtUsbotDm0ha_H7Jf7hF8lwlqzWPPTsdo,10027
79
+ dingo_python-1.2.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
80
+ dingo_python-1.2.2.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
81
+ dingo_python-1.2.2.dist-info/RECORD,,