deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,692 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import re
19
+ import copy
20
+ import time
21
+ import datetime
22
+ import demjson3
23
+ import traceback
24
+ import signal
25
+ import numpy as np
26
+ from .entities import degrees, schools, corporations
27
+ from ...depend import rag_tokenizer
28
+ from ...depend.surname import surname
29
+ from xpinyin import Pinyin
30
+ from contextlib import contextmanager
31
+
32
+
33
class TimeoutException(Exception):
    """Raised by the SIGALRM handler that ``time_limit`` installs."""
35
+
36
+
37
@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after ``seconds``.

    A SIGALRM handler is installed and an alarm is armed for the duration of
    the ``with`` body. On exit the alarm is cancelled and the handler that was
    active before entering is put back, so using this helper no longer leaves
    its own handler installed for the rest of the process.

    Args:
        seconds: Whole number of seconds passed to ``signal.alarm``.

    Raises:
        TimeoutException: From inside the ``with`` body when the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal() returns the handler that was previously installed.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # The previous handler is None when it was not installed from Python;
        # such a value cannot be passed back to signal.signal().
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
48
+
49
+
50
# Module-level placeholder; it is set to None here and not read anywhere in this file.
ENV = None
# Shared xpinyin converter; parse() uses it to build the pinyin fields of a name.
PY = Pinyin()
52
+
53
+
54
def rmHtmlTag(line):
    """Return ``line`` with simple HTML-like tags replaced by one space each.

    A tag is ``<...>`` whose content consists only of letters, digits, spaces
    and the punctuation listed in the pattern (matched case-insensitively).
    At most 100000 replacements are made.
    """
    tag_pattern = re.compile(r"<[a-z0-9.\"=';,:\+_/ -]+>", re.IGNORECASE)
    return tag_pattern.sub(" ", line, count=100000)
56
+
57
+
58
def highest_degree(dg):
    """Return the highest-ranked degree name among ``dg``.

    ``dg`` may be a single name or a collection of names. Names missing from
    the ranking table rank below every known degree. When several names share
    the top rank, the one that appears first wins. A falsy ``dg`` yields "".
    """
    if not dg:
        return ""
    candidates = [dg] if isinstance(dg, str) else dg
    rank = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    # max() keeps the first maximal element, matching a stable descending sort.
    return max(candidates, key=lambda name: rank.get(name, -1))
65
+
66
+
67
def forEdu(cv):
    """Derive education features from ``cv["education_obj"]`` and store them on ``cv``.

    When there is no education history the completeness score
    ``cv["integerity_flt"]`` is multiplied by 0.8 and nothing else is done.
    Otherwise the records are walked in ascending ``start_time`` order and the
    function fills in, when available: per-record summaries (``edu_nst``),
    school/major/degree keyword lists, start/end years, the best school rank,
    a school tier label (``sch_rank_kwd``), the "first degree" fields, quality
    tags in ``tag_kwd``, tokenised variants (``*_tks``) and an upper bound for
    ``work_exp_flt`` based on the latest education end date.

    Args:
        cv: Resume dict; it is modified in place.

    Returns:
        The same ``cv`` dict.
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000
    for n in sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3")):
        # Per-record summary. It must not share a name with an ``except ... as``
        # target: Python unbinds that name when the handler finishes, which
        # previously destroyed this dict after a failed date parse.
        entry = {}
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt:
                edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, _, _ = getYMD(dt)
                # Values such as "至今" carry no year; recording "" here would
                # make the int() conversion for edu_end_int fail later on.
                if y:
                    ed_dt.append(str(y))
                    entry["end_dt_kwd"] = str(y)
            except Exception:
                logging.debug("forEdu: unparsable end_time %r", n.get("end_time"))
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, _, _ = getYMD(dt)
                if y:
                    st_dt.append(str(y))
                    entry["start_dt_kwd"] = str(y)
            except Exception:
                logging.debug("forEdu: unparsable start_time %r", n.get("start_time"))

        r = schools.select(n.get("school_name", ""))
        if r:
            # NOTE(review): types "1" and "2" both map to "211" - confirm this is intended.
            if str(r.get("type", "")) == "1":
                fea.append("211")
            if str(r.get("type", "")) == "2":
                fea.append("211")
            if str(r.get("is_abroad", "")) == "1":
                fea.append("留学")
            if str(r.get("is_double_first", "")) == "1":
                fea.append("双一流")
            if str(r.get("is_985", "")) == "1":
                fea.append("985")
            if str(r.get("is_world_known", "")) == "1":
                fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
                cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            entry["sch_nm_kwd"] = sch[-1]
            name_tokens = rag_tokenizer.fine_grained_tokenize(
                rag_tokenizer.tokenize(n.get("school_name", ""))).split()
            # The last fine-grained token of the school name is kept as a feature;
            # tokenisation may yield nothing, in which case there is no token to add.
            if name_tokens:
                fea.append(name_tokens[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            entry["major_kwd"] = n["discipline_name"]

        # A 985 school with no stated degree and no first degree seen yet gets degree code "1".
        if not n.get("degree") and "985" in fea and not first_fea:
            n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d:
                entry["degree_kwd"] = d
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg
                              or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
                d = "专升本"
            if d:
                deg.append(d)

            # for first degree
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"):
                    fsch = [n["school_name"]]
                if n.get("discipline_name"):
                    fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(entry)

    # NOTE(review): "海外名校" is tested below but only "海外知名" is ever appended to
    # ``fea`` above - confirm which label is intended.
    cv["sch_rank_kwd"] = []
    if cv["school_rank_int"] <= 20 or ("海外名校" in fea and cv["school_rank_int"] <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif 50 >= cv["school_rank_int"] > 20 or ("海外名校" in fea and 500 >= cv["school_rank_int"] > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) or ("海外名校" in fea and cv["school_rank_int"] > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst:
        cv["edu_nst"] = edu_nst
    if fea:
        cv["edu_fea_kwd"] = list(set(fea))
    if first_fea:
        cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj:
        cv["major_kwd"] = maj
    if fsch:
        cv["first_school_name_kwd"] = fsch
    if fdeg:
        cv["first_degree_kwd"] = fdeg
    if fmaj:
        cv["first_major_kwd"] = fmaj
    if st_dt:
        cv["edu_start_kwd"] = st_dt
    if ed_dt:
        cv["edu_end_kwd"] = ed_dt
        cv["edu_end_int"] = max([int(t) for t in ed_dt])
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt):
                edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今":
                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, _, _ = getYMD(edu_end_dt)
            # Work experience cannot exceed the years elapsed since the latest education end.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as exc:
            logging.exception("forEdu {} {} {}".format(exc, edu_end_dt, cv.get("work_exp_flt")))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv:
                        cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"] and
            any([d.lower() in ["硕士", "博士", "mba"] for d in cv.get("degree_kwd", [])])) \
                or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
                or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
            if "tag_kwd" not in cv:
                cv["tag_kwd"] = []
            if "好学历" not in cv["tag_kwd"]:
                cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"):
        cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
    if cv.get("school_name_kwd"):
        cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
    if cv.get("first_school_name_kwd"):
        cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
    if cv.get("first_major_kwd"):
        cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

    return cv
227
+
228
+
229
def forProj(cv):
    """Derive project name/description token fields from ``cv["project_obj"]``.

    Projects are visited from most to least recently updated (ordered by the
    string form of ``updated_at``). The first collected name and description
    feed ``project_name_tks`` / ``project_desc_ltks``; all descriptions joined
    together feed ``pro_desc_ltks``. Entries that are not dicts are ignored:
    the sort key already tolerated them, but the loop body used to fail on them.

    Args:
        cv: Resume dict; it is modified in place.

    Returns:
        The same ``cv`` dict.
    """
    if not cv.get("project_obj"):
        return cv

    pro_nms, desc = [], []
    ordered = sorted(cv.get("project_obj", []),
                     key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
                     reverse=True)
    for n in ordered:
        if not isinstance(n, dict):
            continue
        if n.get("name"):
            pro_nms.append(n["name"])
        # "achivement" is the key spelling used by the incoming data; keep it as is.
        for key in ("describe", "responsibilities", "achivement"):
            if n.get(key):
                desc.append(str(n[key]))

    if pro_nms:
        cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
    if desc:
        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))

    return cv
254
+
255
+
256
def json_loads(line):
    """Decode ``line`` with demjson3 after quoting bare ``True``/``False`` values.

    Every ``True`` or ``False`` that follows a colon (with optional spaces) is
    wrapped in single quotes before decoding, so it is decoded as a string.
    """
    quoted = re.sub(r": *(True|False)", r": '\1'", line)
    return demjson3.decode(quoted)
258
+
259
+
260
def forWork(cv):
    """Derive work-history features from ``cv["work_obj"]`` and store them on ``cv``.

    When there is no work history the completeness score
    ``cv["integerity_flt"]`` is multiplied by 0.7 and nothing else is done.
    Otherwise records are visited in descending ``start_time`` order (index 0
    is the most recent start) and the function fills in, when available:
    company quality tags, tokenised position/industry/company/responsibility
    fields, ``max_sub_cnt_int``, ``baike_flt``, an upper bound for
    ``work_exp_flt``, job duration statistics and ``scale_flt``.

    Records given as strings are decoded with ``json_loads``; records that are
    not dicts (after decoding) are skipped.

    Args:
        cv: Resume dict; it is modified in place.

    Returns:
        The same ``cv`` dict.
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []
    scales = []
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False
    goodcorp_ = False
    work_st_tm = ""
    corp_tags = []
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
                   reverse=True)):
        if isinstance(n, str):
            try:
                n = json_loads(n)
            except Exception:
                continue
        # Everything below relies on dict access; skip anything else.
        if not isinstance(n, dict):
            continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                # Keep the per-field lists aligned with the record order.
                fea[c].append("")
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        # NOTE(review): this is overwritten for every record, so it ends up holding the
        # start of the last record processed (the earliest job) - confirm intent.
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        # NOTE(review): the cutoff year 2022 is hard-coded; end dates after it fall
        # back to the record's updated_at - confirm whether this should track the current year.
        if not y or not m or int(y) > 2022:
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

    # Keep only purely numeric head-counts. The previous filter matched strings made
    # of NON-digits, which dropped every real count and made int() fail on the rest.
    fea["subordinates_count"] = [int(cnt) for cnt in fea["subordinates_count"]
                                 if re.fullmatch(r"[0-9]+", str(cnt))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if isinstance(cv.get("corporation_id"), int):
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            # Work experience cannot exceed the years elapsed since the earliest job start.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
401
+
402
+
403
def turnTm2Dt(b):
    """Turn an epoch-like value into a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    Falsy input returns None. Otherwise the value is stringified and stripped;
    if it starts with at least ten digits, the first ten are read as epoch
    seconds and formatted in local time. Any other text is returned unchanged
    (after stripping).
    """
    if not b:
        return None
    text = str(b).strip()
    if re.match(r"[0-9]{10,}", text) is None:
        return text
    epoch_seconds = int(text[:10])
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch_seconds))
410
+
411
+
412
def getYMD(b):
    """Extract ``(year, month, day)`` from a date-like value.

    Falsy input returns ``("", "", "01")``. Otherwise the value is normalised
    with ``turnTm2Dt``; the year is an int when the text starts with four
    digits (else ""), while month and day are strings that fall back to "1"
    when missing or out of range.

    NOTE(review): the separator in both patterns is ``.?``, which also matches
    a digit, so separator-free or year-month-only inputs (e.g. "2020-05") can
    yield unexpected month/day values - behaviour is kept as is here.
    """
    year, month, day = "", "", "01"
    if not b:
        return year, month, day

    text = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", text):
        year = int(text[:4])

    month_hit = re.search(r"[0-9]{4}.?([0-9]{1,2})", text)
    if month_hit:
        month = month_hit.group(1)
    day_hit = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", text)
    if day_hit:
        day = day_hit.group(1)

    day_is_valid = bool(day) and 0 < int(day) <= 31
    if not day_is_valid:
        day = "1"
    month_is_valid = bool(month) and 1 <= int(month) <= 12
    if not month_is_valid:
        month = "1"
    return year, month, day
430
+
431
+
432
def birth(cv):
    """Fill birth-date fields on ``cv`` from ``cv["birth"]``.

    Without a birth value the completeness score ``integerity_flt`` is scaled
    by 0.9. Otherwise, when ``getYMD`` yields both a year and a month, this
    sets ``birth_dt`` (YYYY-MM-DD), ``birthday_kwd`` (MMDD) and ``age_int``
    (current year minus birth year). ``cv`` is modified in place and returned.
    """
    if not cv.get("birth"):
        cv["integerity_flt"] *= 0.9
        return cv

    year, month, day = getYMD(cv["birth"])
    if year and month:
        month_no, day_no = int(month), int(day)
        cv["birth_dt"] = "%s-%02d-%02d" % (year, month_no, day_no)
        cv["birthday_kwd"] = "%02d%02d" % (month_no, day_no)
        cv["age_int"] = datetime.datetime.now().year - int(year)
    return cv
445
+
446
+
447
def parse(cv):
    """Normalise a raw resume dict into index-ready fields.

    ``cv`` is modified in place. The function cleans placeholder/empty values,
    computes a completeness score (``integerity_flt``), normalises a few
    free-text fields, derives ``*_tks`` / ``*_kwd`` / ``*_int`` variants,
    processes the name, phone and update date, delegates to ``forEdu``,
    ``forProj``, ``forWork`` and ``birth``, and finally drops every key whose
    name does not end in one of the recognised suffixes.

    Returns:
        The result of ``dealWithInt64(cv)``.

    Raises:
        KeyError: If ``cv`` has no ``tob_resume_id`` entry.
    """
    # '\N' is treated as an empty value.
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    # cv = cv.asDict()
    # Fields that get a tokenised "<field>_tks" variant.
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    # Subset of tks_fld that additionally gets a "<field>_sm_tks" variant.
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    # Fields that get a "<field>_kwd" list of lower-cased keywords.
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    # Fields copied to "<field>_int" when truthy.
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, tag added when the value is '是', tag added when the value is '否').
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop keys whose value is None or an empty list/string.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    integrity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count how many of ``flds`` hold a value whose string form is not
        # empty, '0' or '[]'. A field listed in several groups is counted once
        # per group.
        nonlocal integrity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integrity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    # Completeness score: share of listed fields that have a value.
    cv["integerity_flt"] = integrity / flds_num

    if cv.get("corporation_type"):
        # Apply the substitutions in order; the field is dropped if fewer than
        # two characters remain.
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    if cv.get("political_status"):
        # Collapse to one of three labels; drop the field if none of them applies.
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    if cv.get("phone"):
        # Keep digits only, then strip a leading (0-padded) "86" that precedes 11 digits.
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    # Iterate over a snapshot of the keys because new keys are added in the loop.
    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                # The decoded value is expected to be a mapping; only its values are kept.
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    # Strip the last four characters of the key (the "_obj" suffix
                    # when the key ends with it) to build the derived key names.
                    t = k[:-4]
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                # Any failure leaves the object list empty.
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
            if k in small_tks_fld:
                # NOTE(review): forWork builds its *_sm_tks fields with
                # fine_grained_tokenize; confirm tokenize() is intended here.
                cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld:
            # Runs of spaces between two non-letter characters become commas,
            # then the text is split on the listed separators.
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        # Cut the name at the first newline, dash, bracket or plus sign and collapse whitespace.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # A Latin-letter name is kept only when it has more than one word.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            # Starts with a known surname: keep at most five characters and drop any Latin tail.
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pinyin and its prefixes
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        # Character prefixes (length >= 2) of each unseparated pinyin reading.
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        # Syllable prefixes of each space-separated pinyin reading.
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        # Tokenised name plus, for non-Latin names, the individual characters.
        cv["name_tks"] = (
                rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        # A resume without a name has its completeness score halved.
        cv["integerity_flt"] /= 2.

    if cv.get("phone"):
        # Keep only the first 11-digit sequence starting with 13-19; otherwise blank the field.
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        # Fall back to 2012-01-01 for whichever parts could not be determined.
        if not y:
            y = "2012"
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize

    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    # Section-specific enrichment; each helper mutates and returns cv.
    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combined keyword: each company tag, followed by "+<school tier>" for every
    # school tier label and "+<highest degree>" when one is known.
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    try:
        # Only used when the helpers above did not produce a work_exp_flt.
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                # NOTE(review): the value is divided by 1000 here (as if in milliseconds)
                # while turnTm2Dt reads the first ten digits as seconds - confirm the unit.
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        # work_experience is divided by 12 (presumably months -> years; TODO confirm).
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Keep only keys that end in one of the recognised suffixes.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # For list-valued *_kwd / *_id fields: drop '中国' and '0', strip a trailing
    # "市", stringify and de-duplicate (order is not preserved).
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    # Remove *_fea / *_feas entries whose value is not positive.
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    # "tob_resume_id" must be present; it is also exposed as "id".
    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    logging.debug("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
680
+
681
+
682
def dealWithInt64(d):
    """Recursively convert NumPy integer scalars to built-in ``int``.

    Dicts are updated in place and returned; lists are rebuilt as new lists;
    NumPy integers become ``int``; any other value is returned unchanged.
    """
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
        return d
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, np.integer):
        return int(d)
    return d