deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,578 @@
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import copy
18
+ import logging
19
+ import math
20
+ import os
21
+ import re
22
+ import string
23
+ import sys
24
+ import threading
25
+
26
+ import datrie
27
+ from hanziconv import HanziConv
28
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
29
+
30
+ from ..common.model_store import resolve_tokenizer_dict_prefix
31
+ from .nltk_manager import ensure_nltk_data
32
+
33
+
34
+ class RagTokenizer:
35
+ def key_(self, line):
36
+ return str(line.lower().encode("utf-8"))[2:-1]
37
+
38
+ def rkey_(self, line):
39
+ return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
40
+
41
+ def loadDict_(self, fnm):
42
+ logging.info(f"[HUQIE]:Build trie from {fnm}")
43
+ try:
44
+ of = open(fnm, "r", encoding='utf-8')
45
+ while True:
46
+ line = of.readline()
47
+ if not line:
48
+ break
49
+ line = re.sub(r"[\r\n]+", "", line)
50
+ line = re.split(r"[ \t]", line)
51
+ k = self.key_(line[0])
52
+ F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
53
+ if k not in self.trie_ or self.trie_[k][0] < F:
54
+ self.trie_[self.key_(line[0])] = (F, line[2])
55
+ self.trie_[self.rkey_(line[0])] = 1
56
+
57
+ dict_file_cache = fnm + ".trie"
58
+ logging.info(f"[HUQIE]:Build trie cache to {dict_file_cache}")
59
+ self.trie_.save(dict_file_cache)
60
+ of.close()
61
+ except Exception:
62
+ logging.exception(f"[HUQIE]:Build trie {fnm} failed")
63
+
64
+ def __init__(
65
+ self,
66
+ debug=False,
67
+ dict_prefix: str | None = None,
68
+ model_home: str | None = None,
69
+ model_provider: str | None = None,
70
+ offline: bool | None = None,
71
+ nltk_data_dir: str | None = None,
72
+ ):
73
+ self.DEBUG = debug
74
+ self.DENOMINATOR = 1000000
75
+
76
+ ensure_nltk_data(
77
+ data_dir=nltk_data_dir,
78
+ offline=offline,
79
+ )
80
+
81
+ from nltk import word_tokenize
82
+
83
+ self.word_tokenize = word_tokenize
84
+ self.DIR_ = dict_prefix or resolve_tokenizer_dict_prefix(
85
+ model_home=model_home,
86
+ provider=model_provider,
87
+ offline=offline,
88
+ )
89
+
90
+ self.stemmer = PorterStemmer()
91
+ self.lemmatizer = WordNetLemmatizer()
92
+
93
+ self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-zA-Z0-9,\.-]+)"
94
+
95
+ trie_file_name = self.DIR_ + ".txt.trie"
96
+ # check if trie file existence
97
+ if os.path.exists(trie_file_name):
98
+ try:
99
+ # load trie from file
100
+ self.trie_ = datrie.Trie.load(trie_file_name)
101
+ return
102
+ except Exception:
103
+ # fail to load trie from file, build default trie
104
+ logging.exception(f"[HUQIE]:Fail to load trie file {trie_file_name}, build the default trie file")
105
+ self.trie_ = datrie.Trie(string.printable)
106
+ else:
107
+ # file not exist, build default trie
108
+ logging.info(f"[HUQIE]:Trie file {trie_file_name} not found, build the default trie file")
109
+ self.trie_ = datrie.Trie(string.printable)
110
+
111
+ # load data from dict file and save to trie file
112
+ self.loadDict_(self.DIR_ + ".txt")
113
+
114
+ def loadUserDict(self, fnm):
115
+ try:
116
+ self.trie_ = datrie.Trie.load(fnm + ".trie")
117
+ return
118
+ except Exception:
119
+ self.trie_ = datrie.Trie(string.printable)
120
+ self.loadDict_(fnm)
121
+
122
+ def addUserDict(self, fnm):
123
+ self.loadDict_(fnm)
124
+
125
+ def _strQ2B(self, ustring):
126
+ """Convert full-width characters to half-width characters"""
127
+ rstring = ""
128
+ for uchar in ustring:
129
+ inside_code = ord(uchar)
130
+ if inside_code == 0x3000:
131
+ inside_code = 0x0020
132
+ else:
133
+ inside_code -= 0xfee0
134
+ if inside_code < 0x0020 or inside_code > 0x7e: # After the conversion, if it's not a half-width character, return the original character.
135
+ rstring += uchar
136
+ else:
137
+ rstring += chr(inside_code)
138
+ return rstring
139
+
140
+ def _tradi2simp(self, line):
141
+ return HanziConv.toSimplified(line)
142
+
143
+ def dfs_(self, chars, s, preTks, tkslist, _depth=0, _memo=None):
144
+ if _memo is None:
145
+ _memo = {}
146
+ MAX_DEPTH = 10
147
+ if _depth > MAX_DEPTH:
148
+ if s < len(chars):
149
+ copy_pretks = copy.deepcopy(preTks)
150
+ remaining = "".join(chars[s:])
151
+ copy_pretks.append((remaining, (-12, '')))
152
+ tkslist.append(copy_pretks)
153
+ return s
154
+
155
+ state_key = (s, tuple(tk[0] for tk in preTks)) if preTks else (s, None)
156
+ if state_key in _memo:
157
+ return _memo[state_key]
158
+
159
+ res = s
160
+ if s >= len(chars):
161
+ tkslist.append(preTks)
162
+ _memo[state_key] = s
163
+ return s
164
+ if s < len(chars) - 4:
165
+ is_repetitive = True
166
+ char_to_check = chars[s]
167
+ for i in range(1, 5):
168
+ if s + i >= len(chars) or chars[s + i] != char_to_check:
169
+ is_repetitive = False
170
+ break
171
+ if is_repetitive:
172
+ end = s
173
+ while end < len(chars) and chars[end] == char_to_check:
174
+ end += 1
175
+ mid = s + min(10, end - s)
176
+ t = "".join(chars[s:mid])
177
+ k = self.key_(t)
178
+ copy_pretks = copy.deepcopy(preTks)
179
+ if k in self.trie_:
180
+ copy_pretks.append((t, self.trie_[k]))
181
+ else:
182
+ copy_pretks.append((t, (-12, '')))
183
+ next_res = self.dfs_(chars, mid, copy_pretks, tkslist, _depth + 1, _memo)
184
+ res = max(res, next_res)
185
+ _memo[state_key] = res
186
+ return res
187
+
188
+ S = s + 1
189
+ if s + 2 <= len(chars):
190
+ t1 = "".join(chars[s:s + 1])
191
+ t2 = "".join(chars[s:s + 2])
192
+ if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(self.key_(t2)):
193
+ S = s + 2
194
+ if len(preTks) > 2 and len(preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
195
+ t1 = preTks[-1][0] + "".join(chars[s:s + 1])
196
+ if self.trie_.has_keys_with_prefix(self.key_(t1)):
197
+ S = s + 2
198
+
199
+ for e in range(S, len(chars) + 1):
200
+ t = "".join(chars[s:e])
201
+ k = self.key_(t)
202
+ if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
203
+ break
204
+ if k in self.trie_:
205
+ pretks = copy.deepcopy(preTks)
206
+ pretks.append((t, self.trie_[k]))
207
+ res = max(res, self.dfs_(chars, e, pretks, tkslist, _depth + 1, _memo))
208
+
209
+ if res > s:
210
+ _memo[state_key] = res
211
+ return res
212
+
213
+ t = "".join(chars[s:s + 1])
214
+ k = self.key_(t)
215
+ copy_pretks = copy.deepcopy(preTks)
216
+ if k in self.trie_:
217
+ copy_pretks.append((t, self.trie_[k]))
218
+ else:
219
+ copy_pretks.append((t, (-12, '')))
220
+ result = self.dfs_(chars, s + 1, copy_pretks, tkslist, _depth + 1, _memo)
221
+ _memo[state_key] = result
222
+ return result
223
+
224
+ def freq(self, tk):
225
+ k = self.key_(tk)
226
+ if k not in self.trie_:
227
+ return 0
228
+ return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)
229
+
230
+ def tag(self, tk):
231
+ k = self.key_(tk)
232
+ if k not in self.trie_:
233
+ return ""
234
+ return self.trie_[k][1]
235
+
236
+ def score_(self, tfts):
237
+ B = 30
238
+ F, L, tks = 0, 0, []
239
+ for tk, (freq, tag) in tfts:
240
+ F += freq
241
+ L += 0 if len(tk) < 2 else 1
242
+ tks.append(tk)
243
+ #F /= len(tks)
244
+ L /= len(tks)
245
+ logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
246
+ return tks, B / len(tks) + L + F
247
+
248
+ def sortTks_(self, tkslist):
249
+ res = []
250
+ for tfts in tkslist:
251
+ tks, s = self.score_(tfts)
252
+ res.append((tks, s))
253
+ return sorted(res, key=lambda x: x[1], reverse=True)
254
+
255
+ def merge_(self, tks):
256
+ # if split chars is part of token
257
+ res = []
258
+ tks = re.sub(r"[ ]+", " ", tks).split()
259
+ s = 0
260
+ while True:
261
+ if s >= len(tks):
262
+ break
263
+ E = s + 1
264
+ for e in range(s + 2, min(len(tks) + 2, s + 6)):
265
+ tk = "".join(tks[s:e])
266
+ if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
267
+ E = e
268
+ res.append("".join(tks[s:E]))
269
+ s = E
270
+
271
+ return " ".join(res)
272
+
273
+ def maxForward_(self, line):
274
+ res = []
275
+ s = 0
276
+ while s < len(line):
277
+ e = s + 1
278
+ t = line[s:e]
279
+ while e < len(line) and self.trie_.has_keys_with_prefix(
280
+ self.key_(t)):
281
+ e += 1
282
+ t = line[s:e]
283
+
284
+ while e - 1 > s and self.key_(t) not in self.trie_:
285
+ e -= 1
286
+ t = line[s:e]
287
+
288
+ if self.key_(t) in self.trie_:
289
+ res.append((t, self.trie_[self.key_(t)]))
290
+ else:
291
+ res.append((t, (0, '')))
292
+
293
+ s = e
294
+
295
+ return self.score_(res)
296
+
297
+ def maxBackward_(self, line):
298
+ res = []
299
+ s = len(line) - 1
300
+ while s >= 0:
301
+ e = s + 1
302
+ t = line[s:e]
303
+ while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
304
+ s -= 1
305
+ t = line[s:e]
306
+
307
+ while s + 1 < e and self.key_(t) not in self.trie_:
308
+ s += 1
309
+ t = line[s:e]
310
+
311
+ if self.key_(t) in self.trie_:
312
+ res.append((t, self.trie_[self.key_(t)]))
313
+ else:
314
+ res.append((t, (0, '')))
315
+
316
+ s -= 1
317
+
318
+ return self.score_(res[::-1])
319
+
320
+ def english_normalize_(self, tks):
321
+ return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
322
+
323
+ def _split_by_lang(self, line):
324
+ txt_lang_pairs = []
325
+ arr = re.split(self.SPLIT_CHAR, line)
326
+ for a in arr:
327
+ if not a:
328
+ continue
329
+ s = 0
330
+ e = s + 1
331
+ zh = is_chinese(a[s])
332
+ while e < len(a):
333
+ _zh = is_chinese(a[e])
334
+ if _zh == zh:
335
+ e += 1
336
+ continue
337
+ txt_lang_pairs.append((a[s: e], zh))
338
+ s = e
339
+ e = s + 1
340
+ zh = _zh
341
+ if s >= len(a):
342
+ continue
343
+ txt_lang_pairs.append((a[s: e], zh))
344
+ return txt_lang_pairs
345
+
346
+ def tokenize(self, line):
347
+ line = re.sub(r"\W+", " ", line)
348
+ line = self._strQ2B(line).lower()
349
+ line = self._tradi2simp(line)
350
+
351
+ arr = self._split_by_lang(line)
352
+ res = []
353
+ for L,lang in arr:
354
+ if not lang:
355
+ res.extend([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in self.word_tokenize(L)])
356
+ continue
357
+ if len(L) < 2 or re.match(
358
+ r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
359
+ res.append(L)
360
+ continue
361
+
362
+ # use maxforward for the first time
363
+ tks, s = self.maxForward_(L)
364
+ tks1, s1 = self.maxBackward_(L)
365
+ if self.DEBUG:
366
+ logging.debug("[FW] {} {}".format(tks, s))
367
+ logging.debug("[BW] {} {}".format(tks1, s1))
368
+
369
+ i, j, _i, _j = 0, 0, 0, 0
370
+ same = 0
371
+ while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
372
+ same += 1
373
+ if same > 0:
374
+ res.append(" ".join(tks[j: j + same]))
375
+ _i = i + same
376
+ _j = j + same
377
+ j = _j + 1
378
+ i = _i + 1
379
+
380
+ while i < len(tks1) and j < len(tks):
381
+ tk1, tk = "".join(tks1[_i:i]), "".join(tks[_j:j])
382
+ if tk1 != tk:
383
+ if len(tk1) > len(tk):
384
+ j += 1
385
+ else:
386
+ i += 1
387
+ continue
388
+
389
+ if tks1[i] != tks[j]:
390
+ i += 1
391
+ j += 1
392
+ continue
393
+ # backward tokens from_i to i are different from forward tokens from _j to j.
394
+ tkslist = []
395
+ self.dfs_("".join(tks[_j:j]), 0, [], tkslist)
396
+ res.append(" ".join(self.sortTks_(tkslist)[0][0]))
397
+
398
+ same = 1
399
+ while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
400
+ same += 1
401
+ res.append(" ".join(tks[j: j + same]))
402
+ _i = i + same
403
+ _j = j + same
404
+ j = _j + 1
405
+ i = _i + 1
406
+
407
+ if _i < len(tks1):
408
+ assert _j < len(tks)
409
+ assert "".join(tks1[_i:]) == "".join(tks[_j:])
410
+ tkslist = []
411
+ self.dfs_("".join(tks[_j:]), 0, [], tkslist)
412
+ res.append(" ".join(self.sortTks_(tkslist)[0][0]))
413
+
414
+ res = " ".join(res)
415
+ logging.debug("[TKS] {}".format(self.merge_(res)))
416
+ return self.merge_(res)
417
+
418
+ def fine_grained_tokenize(self, tks):
419
+ tks = tks.split()
420
+ zh_num = len([1 for c in tks if c and is_chinese(c[0])])
421
+ if zh_num < len(tks) * 0.2:
422
+ res = []
423
+ for tk in tks:
424
+ res.extend(tk.split("/"))
425
+ return " ".join(res)
426
+
427
+ res = []
428
+ for tk in tks:
429
+ if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
430
+ res.append(tk)
431
+ continue
432
+ tkslist = []
433
+ if len(tk) > 10:
434
+ tkslist.append(tk)
435
+ else:
436
+ self.dfs_(tk, 0, [], tkslist)
437
+ if len(tkslist) < 2:
438
+ res.append(tk)
439
+ continue
440
+ stk = self.sortTks_(tkslist)[1][0]
441
+ if len(stk) == len(tk):
442
+ stk = tk
443
+ else:
444
+ if re.match(r"[a-z\.-]+$", tk):
445
+ for t in stk:
446
+ if len(t) < 3:
447
+ stk = tk
448
+ break
449
+ else:
450
+ stk = " ".join(stk)
451
+ else:
452
+ stk = " ".join(stk)
453
+
454
+ res.append(stk)
455
+
456
+ return " ".join(self.english_normalize_(res))
457
+
458
+
459
def is_chinese(s):
    """Return True when ``s`` compares within the range U+4E00..U+9FA5 (inclusive)."""
    return u'\u4e00' <= s <= u'\u9fa5'
464
+
465
+
466
def is_number(s):
    """Return True when ``s`` compares within the range ``'0'``..``'9'`` (inclusive)."""
    return u'\u0030' <= s <= u'\u0039'
471
+
472
+
473
def is_alphabet(s):
    """Return True when ``s`` compares within ``'A'``..``'Z'`` or ``'a'``..``'z'``."""
    is_upper = u'\u0041' <= s <= u'\u005a'
    is_lower = u'\u0061' <= s <= u'\u007a'
    return is_upper or is_lower
479
+
480
+
481
def naiveQie(txt):
    """Split ``txt`` on whitespace and return the pieces as a list.

    A ``" "`` element is inserted between two consecutive pieces that both end
    with an ASCII letter.
    """
    ends_with_letter = re.compile(r".*[a-zA-Z]$").match
    pieces = []
    for word in txt.split():
        previous_is_latin = bool(pieces) and ends_with_letter(pieces[-1])
        if previous_is_latin and ends_with_letter(word):
            pieces.append(" ")
        pieces.append(word)
    return pieces
489
+
490
+
491
# Guards creation of the shared tokenizer below.
_default_tokenizer_lock = threading.Lock()
# Shared RagTokenizer instance; stays None until the first request for it.
_default_tokenizer: RagTokenizer | None = None


def get_default_tokenizer() -> RagTokenizer:
    """Return the shared ``RagTokenizer``, creating it on first use.

    The instance is built with default arguments while holding
    ``_default_tokenizer_lock``; the global is re-checked inside the lock so
    only one instance is created.
    """
    global _default_tokenizer

    tokenizer = _default_tokenizer
    if tokenizer is None:
        with _default_tokenizer_lock:
            tokenizer = _default_tokenizer
            if tokenizer is None:
                tokenizer = RagTokenizer()
                _default_tokenizer = tokenizer
    return tokenizer
505
+
506
+
507
def tokenize(line):
    """Tokenize ``line`` with the shared default tokenizer."""
    shared = get_default_tokenizer()
    return shared.tokenize(line)
509
+
510
+
511
def fine_grained_tokenize(tks):
    """Run ``fine_grained_tokenize`` of the shared default tokenizer on ``tks``."""
    shared = get_default_tokenizer()
    return shared.fine_grained_tokenize(tks)
513
+
514
+
515
def tag(tk):
    """Return the tag the shared default tokenizer reports for ``tk``."""
    shared = get_default_tokenizer()
    return shared.tag(tk)
517
+
518
+
519
def freq(tk):
    """Return the frequency the shared default tokenizer reports for ``tk``."""
    shared = get_default_tokenizer()
    return shared.freq(tk)
521
+
522
+
523
def loadUserDict(fnm):
    """Call ``loadUserDict(fnm)`` on the shared default tokenizer."""
    shared = get_default_tokenizer()
    return shared.loadUserDict(fnm)
525
+
526
+
527
def addUserDict(fnm):
    """Call ``addUserDict(fnm)`` on the shared default tokenizer."""
    shared = get_default_tokenizer()
    return shared.addUserDict(fnm)
529
+
530
+
531
def tradi2simp(line):
    """Convert ``line`` via ``_tradi2simp`` of the shared default tokenizer."""
    shared = get_default_tokenizer()
    return shared._tradi2simp(line)
533
+
534
+
535
def strQ2B(line):
    """Convert ``line`` via ``_strQ2B`` of the shared default tokenizer."""
    shared = get_default_tokenizer()
    return shared._strQ2B(line)
537
+
538
+
539
if __name__ == '__main__':
    # Manual smoke test: tokenize a set of sample sentences, then optionally
    # tokenize a file.
    # Usage: <script> [<user_dict> <text_file>]
    tknzr = RagTokenizer(debug=True)
    samples = (
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈",
        "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。",
        "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥",
        "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa",
        "虽然我不怎么玩",
        "蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的",
        "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了",
        "这周日你去吗?这周日你有空吗?",
        "Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ",
        "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-",
    )
    for sample in samples:
        tks = tknzr.tokenize(sample)
        logging.info(tknzr.fine_grained_tokenize(tks))
    # Both the user dictionary (argv[1]) and the text file (argv[2]) are
    # required for the file pass, so stop unless both were given.
    if len(sys.argv) < 3:
        sys.exit()
    tknzr.DEBUG = False
    tknzr.loadUserDict(sys.argv[1])
    # The context manager closes the file even if tokenizing a line raises.
    with open(sys.argv[2], "r") as of:
        for line in of:
            logging.info(tknzr.tokenize(line))