deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,692 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import re
|
|
19
|
+
import copy
|
|
20
|
+
import time
|
|
21
|
+
import datetime
|
|
22
|
+
import demjson3
|
|
23
|
+
import traceback
|
|
24
|
+
import signal
|
|
25
|
+
import numpy as np
|
|
26
|
+
from .entities import degrees, schools, corporations
|
|
27
|
+
from ...depend import rag_tokenizer
|
|
28
|
+
from ...depend.surname import surname
|
|
29
|
+
from xpinyin import Pinyin
|
|
30
|
+
from contextlib import contextmanager
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TimeoutException(Exception):
    """Error raised by the SIGALRM handler of ``time_limit`` ("Timed out!")."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@contextmanager
def time_limit(seconds):
    """Run the ``with`` body under a SIGALRM-based time limit.

    A handler raising ``TimeoutException("Timed out!")`` is installed for
    ``SIGALRM`` and ``signal.alarm(seconds)`` is armed. On exit (normal or
    exceptional) the alarm is cancelled and the handler that was active before
    entry is put back, so the limit does not leak into unrelated code that
    uses ``SIGALRM``.

    Args:
        seconds: Value passed to ``signal.alarm``.

    Raises:
        TimeoutException: Raised from the signal handler when the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal returns the handler it replaces; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # None means the old handler was not installed from Python and cannot
        # be re-installed through signal.signal().
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# NOTE(review): ENV is not referenced by any other code visible in this file —
# confirm whether external code reads it before removing.
ENV = None
# Shared xpinyin converter; parse() uses it to build the pinyin fields derived
# from the candidate's name.
PY = Pinyin()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def rmHtmlTag(line):
    """Replace simple HTML-like tags in *line* with a single space.

    A tag is ``<`` + one or more of letters, digits and ``. " = ' ; , : + _ / -``
    or space + ``>``, matched case-insensitively; at most 100000 tags are
    replaced.
    """
    tag_pattern = re.compile(r"<[a-z0-9.\"=';,:\+_/ -]+>", re.IGNORECASE)
    return tag_pattern.sub(" ", line, count=100000)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def highest_degree(dg):
    """Return the highest-ranked degree name in *dg*.

    *dg* may be a single string or an iterable of strings. Names missing from
    the ranking table rank below every known degree; on ties the earliest
    entry wins. An empty/falsy *dg* yields ``""``.
    """
    if not dg:
        return ""
    candidates = [dg] if isinstance(dg, str) else dg
    rank = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    # max() keeps the first maximal element, matching a stable descending sort.
    return max(candidates, key=lambda name: rank.get(name, -1))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def forEdu(cv):
    """Derive education features from ``cv["education_obj"]`` and store them on *cv*.

    Without education entries ``cv["integerity_flt"]`` is multiplied by 0.8 and
    *cv* is returned unchanged otherwise. With entries (visited in ascending
    ``start_time`` order) the function fills, when data is available:
    ``school_rank_int``, ``sch_rank_kwd``, ``edu_nst``, ``edu_fea_kwd``,
    ``edu_first_fea_kwd``, ``major_kwd``, ``first_school_name_kwd``,
    ``first_degree_kwd``, ``first_major_kwd``, ``edu_start_kwd``,
    ``edu_end_kwd``, ``edu_end_int``, ``degree_kwd``, ``highest_degree_kwd``,
    ``work_exp_flt``, ``school_name_kwd``, ``tag_kwd`` and the ``*_tks``
    tokenised variants of major / school names.

    Args:
        cv: Resume dict; mutated in place. Must already hold ``integerity_flt``
            when ``education_obj`` is empty or missing.

    Returns:
        The same *cv* dict.
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000
    for n in sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3")):
        entry = {}  # per-education nested record appended to edu_nst
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt:
                edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, _, _ = getYMD(dt)
                ed_dt.append(str(y))
                entry["end_dt_kwd"] = str(y)
            except Exception:
                # Best effort: an unparsable end date must not drop the entry.
                # (The handler must not bind a name shared with the record dict.)
                logging.debug("forEdu: cannot parse end_time %r", n.get("end_time"))
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, _, _ = getYMD(dt)
                st_dt.append(str(y))
                entry["start_dt_kwd"] = str(y)
            except Exception:
                logging.debug("forEdu: cannot parse start_time %r", n.get("start_time"))

        r = schools.select(n.get("school_name", ""))
        if r:
            # NOTE(review): both type "1" and type "2" map to "211" — confirm intended.
            if str(r.get("type", "")) == "1":
                fea.append("211")
            if str(r.get("type", "")) == "2":
                fea.append("211")
            if str(r.get("is_abroad", "")) == "1":
                fea.append("留学")
            if str(r.get("is_double_first", "")) == "1":
                fea.append("双一流")
            if str(r.get("is_985", "")) == "1":
                fea.append("985")
            if str(r.get("is_world_known", "")) == "1":
                fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
                cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            entry["sch_nm_kwd"] = sch[-1]
        # The last fine-grained token of the school name is kept as a feature;
        # guard against names that produce no tokens at all.
        school_tokens = rag_tokenizer.fine_grained_tokenize(
            rag_tokenizer.tokenize(n.get("school_name", ""))).split()
        if school_tokens:
            fea.append(school_tokens[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            entry["major_kwd"] = n["discipline_name"]

        # A 985 school with no stated degree is given degree code "1" as long as
        # no first-degree record has been captured yet.
        if not n.get("degree") and "985" in fea and not first_fea:
            n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d:
                entry["degree_kwd"] = d
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
                d = "专升本"
            if d:
                deg.append(d)

            # for first degree
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"):
                    fsch = [n["school_name"]]
                if n.get("discipline_name"):
                    fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(entry)

    # NOTE(review): the tier checks look for "海外名校" while the tag appended
    # above is "海外知名" — confirm which spelling is intended.
    cv["sch_rank_kwd"] = []
    if cv["school_rank_int"] <= 20 or ("海外名校" in fea and cv["school_rank_int"] <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif 50 >= cv["school_rank_int"] > 20 or ("海外名校" in fea and 500 >= cv["school_rank_int"] > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) or ("海外名校" in fea and cv["school_rank_int"] > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst:
        cv["edu_nst"] = edu_nst
    if fea:
        cv["edu_fea_kwd"] = list(set(fea))
    if first_fea:
        cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj:
        cv["major_kwd"] = maj
    if fsch:
        cv["first_school_name_kwd"] = fsch
    if fdeg:
        cv["first_degree_kwd"] = fdeg
    if fmaj:
        cv["first_major_kwd"] = fmaj
    if st_dt:
        cv["edu_start_kwd"] = st_dt
    if ed_dt:
        cv["edu_end_kwd"] = ed_dt
        # getYMD yields "" as the year for values such as "至今"; int("") would
        # raise, so only numeric years take part in the maximum.
        end_years = [int(t) for t in ed_dt if t.isdigit()]
        if end_years:
            cv["edu_end_int"] = max(end_years)
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt):
                edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今":
                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, _, _ = getYMD(edu_end_dt)
            # Years since the latest education end date, never above an
            # already-known work_exp_flt.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv:
                        cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
        # NOTE(review): "博士" appears twice in the first list below; "博士后"
        # is already covered by the last clause, so the result is unaffected.
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"] and
            any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
                or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
                or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
            if "tag_kwd" not in cv:
                cv["tag_kwd"] = []
            if "好学历" not in cv["tag_kwd"]:
                cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"):
        cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
    if cv.get("school_name_kwd"):
        cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
    if cv.get("first_school_name_kwd"):
        cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
    if cv.get("first_major_kwd"):
        cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

    return cv
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def forProj(cv):
    """Tokenise project names and descriptions from ``cv["project_obj"]``.

    Projects are visited in descending ``updated_at`` order (compared as
    strings). Entries that are not dicts are skipped: the sort key already
    tolerates them, so the loop must as well. When names exist,
    ``project_name_tks`` holds the tokens of the first name; when descriptive
    text exists, ``pro_desc_ltks`` holds the tokens of all of it and
    ``project_desc_ltks`` those of the first piece (HTML-like tags removed).

    Args:
        cv: Resume dict; mutated in place.

    Returns:
        The same *cv* dict.
    """
    if not cv.get("project_obj"):
        return cv

    pro_nms, desc = [], []
    projects = sorted(
        cv.get("project_obj", []),
        key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
        reverse=True)
    for n in projects:
        if not isinstance(n, dict):
            continue
        if n.get("name"):
            pro_nms.append(n["name"])
        # "achivement" is the key spelling used by the incoming data.
        for field in ("describe", "responsibilities", "achivement"):
            if n.get(field):
                desc.append(str(n[field]))

    if pro_nms:
        cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
    if desc:
        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))

    return cv
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def json_loads(line):
    """Decode *line* with ``demjson3`` after quoting bare ``True``/``False`` values.

    Any ``: True`` / ``: False`` (with optional spaces after the colon) is
    rewritten to the quoted string form before decoding.
    """
    quoted = re.sub(r": *(True|False)", r": '\1'", line)
    return demjson3.decode(quoted)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def forWork(cv):
    """Derive work-history features from ``cv["work_obj"]`` and store them on *cv*.

    Without work entries ``cv["integerity_flt"]`` is multiplied by 0.7 and *cv*
    is returned. Otherwise jobs are visited in descending ``start_time`` order
    (compared as strings; index 0 is treated as the current job). String
    entries are decoded with ``json_loads``; entries that are not dicts after
    decoding are skipped.

    Keys written when data is available: ``tag_kwd``, ``corp_tag_kwd``,
    ``latest_job_dt``, ``corporation_id``, the ``position_name`` /
    ``industry_name`` / ``corporation_name`` / ``responsibilities`` token
    fields, ``max_sub_cnt_int``, ``baike_flt``, ``work_exp_flt``, ``dua_flt``,
    ``cur_dua_int``, ``job_num_int`` and ``scale_flt``.

    Args:
        cv: Resume dict; mutated in place. Must already hold ``integerity_flt``
            when ``work_obj`` is empty or missing.

    Returns:
        The same *cv* dict.
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []
    scales = []
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False   # current job is at a "good" corporation
    goodcorp_ = False  # some earlier job was at a "good" corporation
    work_st_tm = ""
    corp_tags = []
    # End dates later than the current year are treated as "still employed".
    this_year = datetime.date.today().year
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
                   reverse=True)):
        if isinstance(n, str):
            try:
                n = json_loads(n)
            except Exception:
                continue
        if not isinstance(n, dict):
            continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                fea[c].append("")
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    # Tags of former employers carry a "(曾)" suffix.
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        # NOTE(review): this is overwritten on every iteration, so with the
        # descending sort it ends up holding the OLDEST parsable start date —
        # confirm whether only the first (most recent) job was intended.
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        if not y or not m or int(y) > this_year:
            # Missing or future end date: fall back to the record's update time.
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    # For each field the first value belongs to the current job; the remaining
    # values are joined into the history fields.
    if fea["position_name"]:
        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

    if fea["subordinates_count"]:
        # Keep purely numeric counts only (the filter used to select the
        # NON-numeric strings and then fail in int()).
        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                     re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if isinstance(cv.get("corporation_id"), int):
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            # Years since the earliest job start, never above a known value.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def turnTm2Dt(b):
    """Normalise a timestamp-or-date value to a string.

    Falsy input gives ``None``. Otherwise the value is stringified and
    stripped; if it starts with ten or more digits, the first ten are read as
    epoch seconds and formatted in local time as ``%Y-%m-%d %H:%M:%S``. Any
    other text is returned as is.
    """
    if not b:
        return None
    text = str(b).strip()
    if re.match(r"[0-9]{10,}", text) is None:
        return text
    epoch_seconds = int(text[:10])
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch_seconds))
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def getYMD(b):
    """Extract ``(year, month, day)`` from a date-like value.

    The value is first normalised with ``turnTm2Dt``. The year is taken only
    when the text starts with four digits (returned as ``int``, else ``""``).
    Month and day are the one- or two-digit groups following a four-digit
    run, each optionally preceded by a single non-digit separator. Separators
    must be non-digits so that a digit of the month can never be consumed as a
    separator or re-used as the day (e.g. "2020-05" has no day, and
    "20200515" is month 05, day 15).

    Args:
        b: Date string, epoch timestamp, or any falsy value.

    Returns:
        Tuple ``(y, m, d)``: ``y`` is an ``int`` or ``""``; ``m`` and ``d`` are
        digit strings. Falsy input gives ``("", "", "01")``. A missing or
        out-of-range month/day is replaced by ``"1"``; a date with a month but
        no day keeps the default day ``"01"``.
    """
    y, m, d = "", "", "01"
    if not b:
        return y, m, d
    b = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", b):
        y = int(b[:4])
    r = re.search(r"[0-9]{4}[^0-9]?([0-9]{1,2})(?:[^0-9]?([0-9]{1,2}))?", b)
    if r:
        m = r.group(1)
        if r.group(2):
            d = r.group(2)
    if not d or int(d) == 0 or int(d) > 31:
        d = "1"
    if not m or int(m) > 12 or int(m) < 1:
        m = "1"
    return y, m, d
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def birth(cv):
    """Fill birth-related fields on *cv* from ``cv["birth"]``.

    A missing/empty birth value multiplies ``cv["integerity_flt"]`` by 0.9.
    Otherwise, when ``getYMD`` yields both a year and a month, the function
    sets ``birth_dt`` (``YYYY-MM-DD``), ``birthday_kwd`` (``MMDD``) and
    ``age_int`` (current year minus birth year). Returns the same dict.
    """
    raw_birth = cv.get("birth")
    if not raw_birth:
        cv["integerity_flt"] *= 0.9
        return cv

    year, month, day = getYMD(raw_birth)
    if not (month and year):
        return cv

    month_no, day_no = int(month), int(day)
    cv["birth_dt"] = f"{year}-{month_no:02d}-{day_no:02d}"
    cv["birthday_kwd"] = f"{month_no:02d}{day_no:02d}"
    cv["age_int"] = datetime.datetime.now().year - int(year)
    return cv
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def parse(cv):
    """Normalise a raw resume record into suffix-typed index fields.

    Steps, in order:
      1. Turn ``'\\N'`` values into ``''`` and drop ``None`` / empty list or
         string values.
      2. Compute ``integerity_flt`` as the share of the listed fields that
         hold a value.
      3. Normalise ``corporation_type``, ``political_status`` and ``phone``.
      4. For every key: decode ``*_obj`` JSON objects and collect their
         ``name`` values, tokenise text fields, split keyword fields and copy
         numeric fields.
      5. Clean the candidate name and derive its pinyin fields.
      6. Build ``updated_at_dt``, yes/no tags, then run ``forEdu``,
         ``forProj``, ``forWork`` and ``birth``.
      7. Combine corporation tags with school rank and highest degree, derive
         ``work_exp_flt``, and finally keep only keys ending in one of
         ``_fea _tks _nst _dt _int _flt _ltks _kwd _id``.

    Args:
        cv: Mutable mapping of raw resume fields; mutated in place. Must
            contain ``tob_resume_id``.

    Returns:
        The processed dict after ``dealWithInt64`` has converted NumPy
        integers to ``int``.
    """
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, tag when the value is '是', tag when the value is '否')
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    integrity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count how many of *flds* carry a non-empty, non-'0', non-'[]' value.
        nonlocal integrity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integrity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integrity / flds_num

    if cv.get("corporation_type"):
        # Rewrite rules are applied in sequence; each works on the previous result.
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    if cv.get("political_status"):
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    if cv.get("phone"):
        # Keep digits only, then strip a leading (0)86 in front of 11 digits.
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
            if k in small_tks_fld:
                # "_sm_tks" fields hold fine-grained tokens, as in forWork.
                cv[f"{k}_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld:
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # Latin-only names are kept only when they have at least two words.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pinyin and its prefixes; each variant is computed once
        py_joined = PY.get_pinyins(nm[:20], '')
        py_spaced = PY.get_pinyins(nm[:20], ' ')
        cv["name_py_tks"] = " ".join(py_joined) + " " + " ".join(py_spaced)
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in py_joined:
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        for py in py_spaced:
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = py_spaced[:3]
        cv["name_tks"] = (
                rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        cv["integerity_flt"] /= 2.

    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize

    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Each corporation tag is suffixed with "+<school rank>" for every school
    # rank keyword and then with "+<highest degree>" when one is known.
    combo_suffix = "".join("+" + j for j in cv.get("sch_rank_kwd", []))
    if cv.get("highest_degree_kwd"):
        combo_suffix += "+" + cv["highest_degree_kwd"]
    cv["corp_proj_sch_deg_kwd"] = [c + combo_suffix for c in cv.get("corp_tag_kwd", [])]

    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                # The elapsed-time formula divides work_start_time by 1000,
                # i.e. it treats the value as epoch milliseconds.
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        # work_experience is divided by 12 to obtain years.
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Drop every key that does not carry one of the typed suffixes.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # De-duplicate list-valued keyword/id fields and strip a trailing "市".
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]

    return dealWithInt64(cv)
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def dealWithInt64(d):
    """Recursively replace NumPy integer scalars with built-in ``int``.

    Dicts are updated in place and returned; lists are rebuilt as new lists;
    any other value is returned unchanged.
    """
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
        return d
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, np.integer):
        return int(d)
    return d
|