deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
from docx import Document
|
|
18
|
+
import re
|
|
19
|
+
import pandas as pd
|
|
20
|
+
from collections import Counter
|
|
21
|
+
from io import BytesIO
|
|
22
|
+
|
|
23
|
+
from ..config import TokenizerConfig
|
|
24
|
+
from ..depend.rag_tokenizer import RagTokenizer
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RAGFlowDocxParser:
    """Extract paragraph text and flattened table text from a .docx document.

    Calling an instance returns ``(sections, tables)``:

    * ``sections`` is a list of ``(text, style_name)`` tuples, one per
      paragraph visited.
    * ``tables`` holds, for every table in the document, the list of strings
      produced by ``__compose_table_content``.
    """

    # Ordered (compiled regex, tag) pairs; the first pattern that matches a
    # cell decides its tag. Compiled once instead of once per cell.
    # NOTE(review): the "DT" tag (upper-case T) is kept exactly as found; it is
    # counted separately from "Dt" - confirm whether that is intentional.
    _BLOCK_TYPE_PATTERNS = tuple(
        (re.compile(pattern), tag)
        for pattern, tag in (
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    )

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, tokenizer_cfg: "TokenizerConfig | None" = None):
        """Build the tokenizer used to classify table cells.

        Args:
            tokenizer_cfg: Tokenizer settings; when ``None`` they are obtained
                from ``TokenizerConfig.from_env()``.
        """
        if tokenizer_cfg is None:
            tokenizer_cfg = TokenizerConfig.from_env()
        self.tokenizer_cfg = tokenizer_cfg
        self.tokenizer = RagTokenizer(
            dict_prefix=tokenizer_cfg.resolve_dict_prefix(),
            offline=tokenizer_cfg.offline,
            nltk_data_dir=tokenizer_cfg.nltk_data_dir,
        )

    def __extract_table_content(self, tb):
        """Flatten one table object (``tb.rows`` -> ``row.cells`` -> ``cell.text``)."""
        rows = [[c.text for c in row.cells] for row in tb.rows]
        # Pad ragged rows with "" ourselves: otherwise pandas fills the gaps
        # with None, which is later stringified to the literal text "None".
        width = max((len(r) for r in rows), default=0)
        padded = [r + [""] * (width - len(r)) for r in rows]
        return self.__compose_table_content(pd.DataFrame(padded))

    def _block_type(self, text):
        """Return the tag of a cell's text: regex match first, tokenizer fallback."""
        for regex, tag in self._BLOCK_TYPE_PATTERNS:
            if regex.search(text):
                return tag
        tokens = [t for t in self.tokenizer.tokenize(text).split() if len(t) > 1]
        if len(tokens) > 3:
            return "Tx" if len(tokens) < 12 else "Lx"
        if len(tokens) == 1 and self.tokenizer.tag(tokens[0]) == "nr":
            return "Nr"
        return "Ot"

    @staticmethod
    def _most_frequent(tags):
        """Return the most common tag; ties go to the tag seen first."""
        counts = Counter(tags)
        return max(counts.items(), key=lambda kv: kv[1])[0]

    def __compose_table_content(self, df):
        """Render table rows as ``"header: value;header: value"`` strings.

        Row 0 is always a header row. When the dominant cell type of the body
        is numeric ("Nu"), any body row whose own dominant type differs is
        treated as an additional header row.

        Args:
            df: DataFrame holding the table, one row per table row.

        Returns:
            ``[]`` for tables with fewer than two rows or with no columns.
            Otherwise one string per data row when the table has more than
            three columns, else a single-element list with all data rows
            joined by newlines.
        """
        n_rows, n_cols = df.shape
        # n_cols == 0 would leave nothing to classify (max() on an empty Counter).
        if n_rows < 2 or n_cols == 0:
            return []

        text = [[str(df.iloc[i, j]) for j in range(n_cols)] for i in range(n_rows)]
        # Classify every body cell once; reused for the per-row check below.
        body_tags = [[self._block_type(cell) for cell in row] for row in text[1:]]
        max_type = self._most_frequent(tag for row in body_tags for tag in row)

        hdrows = [0]  # the header does not necessarily appear only in the first line
        if max_type == "Nu":
            for r, tags in enumerate(body_tags, start=1):
                if self._most_frequent(tags) != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, n_rows):
            if i in hdrows:
                continue
            # Offsets (negative) of the header rows located above row i.
            hr = [r - i for r in hdrows if r < i]
            # Keep only the nearest contiguous run of header rows.
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            headers = []
            for j in range(n_cols):
                parts = []
                for h in hr:
                    x = text[i + h][j].strip()
                    if x not in parts:
                        parts.append(x)
                label = ",".join(parts)
                headers.append(label + ": " if label else label)

            cells = [headers[j] + text[i][j] for j in range(n_cols) if text[i][j]]
            lines.append(";".join(cells))

        if n_cols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a document into paragraph sections and flattened tables.

        Args:
            fnm: Path string, raw document bytes, or a binary file-like object.
            from_page: First page index (inclusive) whose text is collected.
            to_page: Page index (exclusive) at which text collection stops.

        Returns:
            ``(secs, tbls)`` - see the class docstring. Pages are counted via
            ``lastRenderedPageBreak`` markers found in run XML.
        """
        # Strings and readable streams go to Document() unchanged; anything
        # else is treated as a bytes-like payload, as before.
        if isinstance(fnm, str) or hasattr(fnm, "read"):
            source = fnm
        else:
            source = BytesIO(fnm)
        self.doc = Document(source)

        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            has_text = bool(p.text.strip())  # same for every run of this paragraph
            runs_within_single_paragraph = []  # runs that fall inside the page range
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and has_text:
                    runs_within_single_paragraph.append(run.text)

                # Advance the page counter after collecting the run's text.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            style_name = p.style.name if hasattr(p.style, 'name') else ''
            secs.append(("".join(runs_within_single_paragraph), style_name))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from io import BytesIO
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
from openpyxl import Workbook, load_workbook
|
|
21
|
+
from PIL import Image
|
|
22
|
+
|
|
23
|
+
from ..depend.find_codec import find_codec
|
|
24
|
+
|
|
25
|
+
# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")


class RAGFlowExcelParser:
    """Convert spreadsheets (Excel workbooks or CSV text) to text, HTML or Markdown.

    Every public entry point accepts the document as raw bytes, a path string,
    or a seekable binary file-like object.
    """

    @staticmethod
    def _as_binary_stream(source):
        """Return a seekable binary stream for a path, a bytes-like object or a stream."""
        if isinstance(source, str):
            # A string is a filesystem path: read it fully so the rest of the
            # code can seek/re-read without keeping the file open.
            with open(source, "rb") as fh:
                return BytesIO(fh.read())
        if hasattr(source, "read"):
            return source
        return BytesIO(source)

    @staticmethod
    def _is_missing(value):
        """Tell whether *value* is None or a pandas missing marker (NaN/NaT)."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar values yield an array with no single truth value.
            return False

    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        """Load the input into an openpyxl ``Workbook``.

        The first four bytes decide the route: anything that does not start
        with the ZIP or OLE2 signature is parsed as CSV. Otherwise openpyxl is
        tried first, then ``pandas.read_excel`` with its default engine, then
        the ``calamine`` engine.

        Raises:
            Exception: when the CSV cannot be parsed, or when all Excel
                loaders fail (the message carries the underlying errors).
        """
        stream = RAGFlowExcelParser._as_binary_stream(file_like_object)

        # Read first 4 bytes to determine file type
        stream.seek(0)
        file_head = stream.read(4)
        stream.seek(0)

        if not file_head.startswith((b"PK\x03\x04", b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")
            try:
                df = pd.read_csv(stream, on_bad_lines='skip')
                return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_csv:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}") from e_csv

        try:
            return load_workbook(stream, data_only=True)
        except Exception as e:
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
                stream.seek(0)
                try:
                    dfs = pd.read_excel(stream, sheet_name=None)
                    return RAGFlowExcelParser._dataframe_to_workbook(dfs)
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    stream.seek(0)
                    df = pd.read_excel(stream, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") from e_pandas

    @staticmethod
    def _clean_dataframe(df: pd.DataFrame):
        """Return a copy of *df* whose string cells have illegal control characters replaced by a space."""
        def clean_string(s):
            if isinstance(s, str):
                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
            return s

        return df.apply(lambda col: col.map(clean_string))

    @staticmethod
    def _fill_worksheet(ws, df):
        """Write *df* into *ws*: column names in row 1, values from row 2 on."""
        df = RAGFlowExcelParser._clean_dataframe(df)
        for col_num, column_name in enumerate(df.columns, 1):
            ws.cell(row=1, column=col_num, value=column_name)
        for row_num, row in enumerate(df.values, 2):
            for col_num, value in enumerate(row, 1):
                # Missing values are stored as empty cells instead of NaN, which
                # would otherwise surface as the text "nan" downstream.
                cell_value = None if RAGFlowExcelParser._is_missing(value) else value
                ws.cell(row=row_num, column=col_num, value=cell_value)

    @staticmethod
    def _dataframe_to_workbook(df):
        """Build a workbook from one DataFrame, or from a ``{sheet: DataFrame}`` dict."""
        if isinstance(df, dict):
            if len(df) != 1:
                return RAGFlowExcelParser._dataframes_to_workbook(df)
            # read_excel(sheet_name=None) returns a dict even for one sheet;
            # unwrap it instead of handing the dict to _clean_dataframe.
            (df,) = df.values()

        wb = Workbook()
        ws = wb.active
        ws.title = "Data"
        RAGFlowExcelParser._fill_worksheet(ws, df)
        return wb

    @staticmethod
    def _dataframes_to_workbook(dfs: dict):
        """Build a workbook with one sheet per ``{sheet_name: DataFrame}`` entry."""
        wb = Workbook()
        default_sheet = wb.active
        wb.remove(default_sheet)

        for sheet_name, df in dfs.items():
            ws = wb.create_sheet(title=sheet_name)
            RAGFlowExcelParser._fill_worksheet(ws, df)
        return wb

    @staticmethod
    def _extract_images_from_worksheet(ws, sheetname=None):
        """Collect the images embedded in a worksheet together with their cell anchors.

        Reads the worksheet's ``_images`` attribute. ``image_description`` is
        always returned empty; this method does not describe the images.

        Returns:
            List[dict] with keys ``sheet``, ``image`` (RGB PIL image),
            ``image_description``, ``row_from``, ``col_from``, ``row_to``,
            ``col_to`` (all 1-based) and ``span_type`` (``"single_cell"`` or
            ``"multi_cell"``). Images that cannot be decoded or located are
            skipped.
        """
        images = getattr(ws, "_images", [])
        if not images:
            return []

        raw_items = []
        for img in images:
            try:
                img_bytes = img._data()
                pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")

                anchor = img.anchor
                r1, c1 = anchor._from.row + 1, anchor._from.col + 1
                if hasattr(anchor, "_to"):
                    r2, c2 = anchor._to.row + 1, anchor._to.col + 1
                else:
                    # Only a start marker is available: treat as one cell.
                    r2, c2 = r1, c1
                span = "single_cell" if (r1 == r2 and c1 == c2) else "multi_cell"

                raw_items.append({
                    "sheet": sheetname or ws.title,
                    "image": pil_img,
                    "image_description": "",
                    "row_from": r1,
                    "col_from": c1,
                    "row_to": r2,
                    "col_to": c2,
                    "span_type": span,
                })
            except Exception as e:
                # Best effort: one unreadable image must not abort the sheet.
                logging.warning(f"Skip image in sheet '{sheetname or getattr(ws, 'title', '')}': {e}")
                continue
        return raw_items

    @staticmethod
    def _iter_sheet_rows(wb):
        """Yield ``(sheetname, rows)`` for every readable, non-empty sheet of *wb*."""
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            try:
                rows = list(ws.rows)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                continue
            if rows:
                yield sheetname, rows

    @staticmethod
    def _render_html_chunks(sheetname, rows, chunk_rows):
        """Render one sheet as HTML tables of at most *chunk_rows* data rows each.

        ``rows[0]`` is repeated as the ``<th>`` header of every table. A sheet
        holding only the header row yields a single header-only table.
        """
        from html import escape

        def _fmt(v):
            return "" if v is None else str(v).strip()

        header = "<tr>" + "".join(f"<th>{escape(_fmt(c.value))}</th>" for c in rows[0]) + "</tr>"
        # The sheet name is escaped like the cell text around it.
        opening = f"<table><caption>{escape(str(sheetname))}</caption>"
        body_rows = rows[1:]

        chunks = []
        # Stepping over the data rows avoids the trailing empty table that
        # appeared when the row count was an exact multiple of chunk_rows.
        for start in range(0, max(len(body_rows), 1), chunk_rows):
            parts = [opening, header]
            for r in body_rows[start:start + chunk_rows]:
                parts.append("<tr>" + "".join(f"<td>{escape(_fmt(c.value))}</td>" for c in r) + "</tr>")
            parts.append("</table>\n")
            chunks.append("".join(parts))
        return chunks

    @staticmethod
    def _format_row(header_cells, row_cells):
        """Join a row's non-empty cells as ``"header:value"`` items separated by ``"; "``."""
        fields = []
        for i, c in enumerate(row_cells):
            value = c.value
            # Skip only genuinely empty cells; 0 and False are real data.
            if value is None or (isinstance(value, str) and value == ""):
                continue
            header = header_cells[i].value if i < len(header_cells) else None
            prefix = "" if header is None else str(header)
            fields.append(prefix + (":" if prefix else "") + str(value))
        return "; ".join(fields)

    def html(self, fnm, chunk_rows=256):
        """Return the workbook as a list of HTML ``<table>`` strings.

        Args:
            fnm: Document bytes, path string, or binary stream.
            chunk_rows: Maximum number of data rows per emitted table.
        """
        wb = RAGFlowExcelParser._load_excel_to_workbook(fnm)
        tb_chunks = []
        for sheetname, rows in RAGFlowExcelParser._iter_sheet_rows(wb):
            tb_chunks.extend(RAGFlowExcelParser._render_html_chunks(sheetname, rows, chunk_rows))
        return tb_chunks

    def markdown(self, fnm):
        """Return the spreadsheet as one Markdown table.

        The input is read with ``pandas.read_excel`` (default arguments); on
        failure it is re-read as CSV.
        """
        file_like_object = RAGFlowExcelParser._as_binary_stream(fnm)
        try:
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
        except Exception as e:
            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
            file_like_object.seek(0)
            df = pd.read_csv(file_like_object, on_bad_lines='skip')
        # Collapse whitespace-only cells to empty strings.
        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):
        """Return one ``"header:value; ..."`` line per data row of every sheet.

        Sheets whose name does not contain "sheet" (case-insensitive) get
        ``" ——<sheetname>"`` appended to each of their lines.
        """
        wb = RAGFlowExcelParser._load_excel_to_workbook(fnm)

        res = []
        for sheetname, rows in RAGFlowExcelParser._iter_sheet_rows(wb):
            ti = list(rows[0])
            named_sheet = sheetname.lower().find("sheet") < 0
            for r in rows[1:]:
                line = RAGFlowExcelParser._format_row(ti, r)
                if named_sheet:
                    line += " ——" + sheetname
                res.append(line)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count rows in *binary*, choosing the strategy from the extension of *fnm*.

        Returns:
            The summed row count of all readable sheets for extensions
            containing "xls"; the number of newline-separated pieces for
            "csv"/"txt"; ``None`` for any other extension.
        """
        ext = fnm.split(".")[-1].lower()
        if ext.find("xls") >= 0:
            wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
            total = 0
            for sheetname in wb.sheetnames:
                try:
                    ws = wb[sheetname]
                    total += len(list(ws.rows))
                except Exception as e:
                    logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                    continue
            return total

        if ext in ["csv", "txt"]:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))

        return None
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
if __name__ == "__main__":
    # Ad-hoc entry point: parse the file named by the first CLI argument and
    # print one line per data row. The parser is handed the file's bytes
    # because it wraps its input in BytesIO and seeks on it.
    with open(sys.argv[1], "rb") as fh:
        for parsed_line in RAGFlowExcelParser()(fh.read()):
            print(parsed_line)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
17
|
+
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
from ..llm_adapter import LLMType, LLMAdapter
|
|
21
|
+
from ..llm_adapter.vision import vision_llm_chunk as picture_vision_llm_chunk
|
|
22
|
+
from ..depend.prompts import vision_llm_figure_describe_prompt
|
|
23
|
+
|
|
24
|
+
# Try to import timeout from common, fallback to local
|
|
25
|
+
try:
|
|
26
|
+
from ..common.connection_utils import timeout
|
|
27
|
+
except ImportError:
|
|
28
|
+
from ..depend.timeout import timeout
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    """Re-shape position-less figure entries into the positioned figure format.

    Every entry is indexed as ``entry[0]`` / ``entry[1]``; entries whose
    second element is not a PIL image are dropped. Each kept entry becomes
    ``((entry[1], [entry[0]]), [(0, 0, 0, 0, 0)])``, i.e. the image first, its
    companion value wrapped in a list, plus a single all-zero placeholder
    position.

    Returns:
        The list of re-shaped entries; ``[]`` when the input is empty or None.
    """
    wrapped = []
    for entry in figures_data_without_positions or ():
        figure = entry[1]
        if not isinstance(figure, Image.Image):
            continue
        placeholder_positions = [(0, 0, 0, 0, 0)]
        wrapped.append(((figure, [entry[0]]), placeholder_positions))
    return wrapped
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def vision_figure_parser_docx_wrapper(sections, tbls, callback=None, **kwargs):
    """Append vision-model figure descriptions for DOCX sections to *tbls*.

    Args:
        sections: Parsed sections; handed to
            ``vision_figure_parser_figure_data_wrapper`` to pick out figures.
        tbls: List that receives the described figures (extended in place).
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` selects the model; all keyword arguments are
            also forwarded to ``VisionFigureParser``.

    Returns:
        *tbls*; left untouched when there are no sections, no vision model
        can be created, or the enhancement fails.
    """
    if not sections:
        return tbls

    # Without this, the default callback=None raised TypeError inside the try
    # below, which was mistaken for "no vision model available".
    report = callback if callback is not None else (lambda prog, msg: None)

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        report(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        vision_model = None

    if vision_model:
        figures_data = vision_figure_parser_figure_data_wrapper(sections)
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=report)
            tbls.extend(boosted_figures)
        except Exception as e:
            report(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls
|
|
61
|
+
|
|
62
|
+
def vision_figure_parser_figure_xlsx_wrapper(images, callback=None, **kwargs):
    """Describe images extracted from an Excel workbook with a vision model.

    Args:
        images: Iterable of dicts providing at least the ``"image"`` and
            ``"image_description"`` keys.
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` selects the model; all keyword arguments are
            also forwarded to ``VisionFigureParser``.

    Returns:
        The figures returned by ``VisionFigureParser``; ``[]`` when there are
        no images, no vision model can be created, or the enhancement fails.
    """
    tbls = []
    if not images:
        return []

    # Without this, the default callback=None raised TypeError inside the try
    # below, which was mistaken for "no vision model available".
    report = callback if callback is not None else (lambda prog, msg: None)

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        report(0.2, "Visual model detected. Attempting to enhance Excel image extraction...")
    except Exception:
        vision_model = None

    if vision_model:
        figures_data = [
            (
                (
                    img["image"],  # Image.Image
                    [img["image_description"]],  # description list (must be list)
                ),
                [(0, 0, 0, 0, 0)],  # dummy position
            )
            for img in images
        ]
        try:
            parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            report(0.22, "Parsing images...")
            boosted_figures = parser(callback=report)
            tbls.extend(boosted_figures)
        except Exception as e:
            report(0.25, f"Excel visual model error: {e}. Skipping vision enhancement.")
    return tbls
|
|
87
|
+
|
|
88
|
+
def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
    """Replace the figure items of *tbls* with vision-model-described versions.

    An item counts as a figure when ``item[0][0]`` is a PIL image and
    ``item[0][1]`` is a list.

    Args:
        tbls: Table/figure items produced by PDF parsing.
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` selects the model; all keyword arguments are
            also forwarded to ``VisionFigureParser``.

    Returns:
        ``[]`` for empty input. On success, a new list holding the non-figure
        items followed by the described figures. Otherwise *tbls* unchanged.
    """
    if not tbls:
        return []

    # Without this, the default callback=None raised TypeError inside the try
    # below, which was mistaken for "no vision model available".
    report = callback if callback is not None else (lambda prog, msg: None)

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        report(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        vision_model = None

    if vision_model:
        def is_figure_item(item):
            return (
                isinstance(item[0][0], Image.Image) and
                isinstance(item[0][1], list)
            )

        figures_data = [item for item in tbls if is_figure_item(item)]
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=report)
            # Swap the raw figures for their described counterparts.
            tbls = [item for item in tbls if not is_figure_item(item)]
            tbls.extend(boosted_figures)
        except Exception as e:
            report(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Pool shared by every VisionFigureParser call in this module.
shared_executor = ThreadPoolExecutor(max_workers=10)


class VisionFigureParser:
    """Describe figures with a vision model and re-assemble them with their metadata.

    ``figures_data`` items come in one of two forms:

    * ``((figure, [description, ...]), [position, ...])`` where each position
      is a 5-tuple, or
    * ``(figure, [description, ...])`` without positions.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        """Store the model and split *figures_data* into parallel lists.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    def _extract_figures_info(self, figures_data):
        """Fill ``self.figures``, ``self.descriptions`` and ``self.positions``.

        Raises:
            AssertionError: when an item matches neither supported form.
        """
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # An empty position list is checked before it is indexed, so a
            # malformed item reaches the descriptive assertion below instead
            # of failing with IndexError.
            has_positions = (
                len(item) == 2
                and isinstance(item[0], tuple)
                and len(item[0]) == 2
                and isinstance(item[1], list)
                and len(item[1]) > 0
                and isinstance(item[1][0], tuple)
                and len(item[1][0]) == 5
            )
            if has_positions:
                img_desc = item[0]
                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Rebuild ``self.assembled`` from the parallel lists and return it.

        Each entry is ``((figure, description), position)`` when positions
        were supplied, else the 1-tuple ``((figure, description),)``.
        """
        self.assembled = []
        self.has_positions = len(self.positions) != 0
        for i, figure in enumerate(self.figures):
            figure_desc = (figure, self.descriptions[i])
            pos = self.positions[i] if self.has_positions else None
            if pos is not None:
                self.assembled.append((figure_desc, pos))
            else:
                self.assembled.append((figure_desc,))

        return self.assembled

    def __call__(self, **kwargs):
        """Describe every figure on the shared executor and return the assembled list.

        Args:
            **kwargs: ``callback`` - optional ``callback(progress, message)``
                forwarded to the vision call; ``None`` or absent means no
                reporting.

        Returns:
            ``self.assembled`` (see ``_assemble``). A figure that received a
            non-empty description has its description list replaced by one
            string: the model text immediately followed by the previous
            descriptions joined with newlines.
        """
        # .get() with a default does not cover an explicit callback=None.
        callback = kwargs.get("callback") or (lambda prog, msg: None)

        @timeout(30, 3)
        def process(figure_idx, figure_binary):
            description_text = picture_vision_llm_chunk(
                binary=figure_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )
            return figure_idx, description_text

        futures = [
            shared_executor.submit(process, idx, img_binary)
            for idx, img_binary in enumerate(self.figures or [])
        ]

        for future in as_completed(futures):
            figure_num, txt = future.result()
            if txt:
                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])

        self._assemble()

        return self.assembled
|