pembot 0.0.3__py2.py3-none-any.whl → 0.0.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic. Click here for more details.
- pembot/AnyToText/convertor.py +250 -146
- pembot/__init__.py +1 -1
- pembot/config/config.yaml +1 -1
- pembot/main.py +26 -8
- pembot/pdf2markdown/extract.py +255 -307
- {pembot-0.0.3.dist-info → pembot-0.0.4.dist-info}/METADATA +1 -1
- {pembot-0.0.3.dist-info → pembot-0.0.4.dist-info}/RECORD +9 -9
- {pembot-0.0.3.dist-info → pembot-0.0.4.dist-info}/WHEEL +0 -0
- {pembot-0.0.3.dist-info → pembot-0.0.4.dist-info}/licenses/LICENSE +0 -0
pembot/AnyToText/convertor.py
CHANGED
|
@@ -7,6 +7,7 @@ import json
|
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from typing import Literal, Union, Dict, Any, List
|
|
9
9
|
import tempfile
|
|
10
|
+
from datetime import datetime, date
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]
|
|
@@ -30,19 +31,19 @@ EXCEL_FILE_TYPES= [
|
|
|
30
31
|
class Convertor():
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def __init__(self, myfile: Path | None, output_dir: Path | None, file_bytes: bytes | None, suffix: str | None, file_type: str | None):
|
|
34
|
+
def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
|
|
34
35
|
|
|
35
36
|
self.output= ""
|
|
36
37
|
|
|
37
38
|
# file_type can be pdf, excel, etc.
|
|
38
|
-
if output_dir is None and
|
|
39
|
+
if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
|
|
39
40
|
with tempfile.TemporaryDirectory() as dp:
|
|
40
41
|
with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
|
|
41
42
|
fp.write(file_bytes)
|
|
42
43
|
myfile= Path(fp.name)
|
|
43
44
|
output_dir= Path(dp)
|
|
44
45
|
if file_type == 'pdf':
|
|
45
|
-
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(
|
|
46
|
+
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
|
|
46
47
|
extractor.extract()
|
|
47
48
|
with open(output_dir / (myfile.stem + '.md')) as output_file:
|
|
48
49
|
self.output= output_file.read()
|
|
@@ -75,162 +76,235 @@ class Convertor():
|
|
|
75
76
|
else:
|
|
76
77
|
print(mt)
|
|
77
78
|
|
|
78
|
-
|
|
79
|
-
|
|
80
79
|
def convert_file_to_json(
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
80
|
+
self,
|
|
81
|
+
sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
|
|
82
|
+
orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
|
|
83
|
+
date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
|
|
84
|
+
csv_encoding: str = 'utf-8', # For reading CSV files
|
|
85
|
+
excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
|
|
87
86
|
) -> bool:
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
87
|
+
"""
|
|
88
|
+
Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
|
|
89
|
+
into an equivalent JSON format.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
sheet_to_convert (str | int | None, optional):
|
|
93
|
+
- For Excel/ODS:
|
|
94
|
+
- If None (default): Converts all sheets. The JSON output will be a
|
|
95
|
+
dictionary where keys are sheet names and values are the JSON
|
|
96
|
+
representation of each sheet.
|
|
97
|
+
- If str: Name of the specific sheet to convert.
|
|
98
|
+
- If int: Index of the specific sheet to convert (0-based).
|
|
99
|
+
If a specific sheet is requested, the JSON output will directly be
|
|
100
|
+
the representation of that sheet.
|
|
101
|
+
- For CSV: This parameter is ignored. The entire CSV is processed.
|
|
102
|
+
orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
|
|
103
|
+
Default: 'records'. See pandas.DataFrame.to_dict() documentation.
|
|
104
|
+
date_format (str | None, optional): Format for datetime objects.
|
|
105
|
+
- 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
|
|
106
|
+
- 'epoch': Milliseconds since epoch.
|
|
107
|
+
- None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
|
|
108
|
+
csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
|
|
109
|
+
excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
|
|
110
|
+
- For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
|
|
111
|
+
- For ODS: 'odf' (requires 'odfpy' library).
|
|
112
|
+
If None, pandas auto-detects based on file extension and installed libraries.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
bool: True if conversion was successful, False otherwise.
|
|
116
|
+
"""
|
|
117
|
+
|
|
118
|
+
input_filepath = self.input_filepath
|
|
119
|
+
json_filepath = self.json_filepath
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
|
|
123
|
+
if not input_filepath.exists():
|
|
124
|
+
print(f"Error: Input file not found at {input_filepath}")
|
|
125
|
+
return False
|
|
126
126
|
|
|
127
|
-
|
|
128
|
-
|
|
127
|
+
# Ensure output directory exists
|
|
128
|
+
json_filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
file_suffix = input_filepath.suffix.lower()
|
|
131
|
+
output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
|
|
132
132
|
|
|
133
|
-
|
|
133
|
+
dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
|
|
134
134
|
|
|
135
|
-
|
|
135
|
+
current_engine: PandasReadEngineType = excel_ods_engine
|
|
136
136
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
return False
|
|
146
|
-
|
|
147
|
-
elif file_suffix in ['.xls', '.xlsx', '.ods']:
|
|
148
|
-
try:
|
|
149
|
-
if file_suffix == '.ods':
|
|
150
|
-
if current_engine is None:
|
|
151
|
-
current_engine = 'odf'
|
|
152
|
-
elif current_engine != 'odf':
|
|
153
|
-
print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
|
|
154
|
-
current_engine = 'odf'
|
|
155
|
-
|
|
156
|
-
if sheet_to_convert is not None:
|
|
157
|
-
df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
|
|
158
|
-
dataframes_to_process.append((df, None))
|
|
159
|
-
|
|
160
|
-
else:
|
|
161
|
-
excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
|
|
162
|
-
if not excel_file.sheet_names:
|
|
163
|
-
print(f"Warning: File '{input_filepath.name}' contains no sheets.")
|
|
164
|
-
for sheet_name in excel_file.sheet_names:
|
|
165
|
-
df = excel_file.parse(sheet_name) # engine is inherited
|
|
166
|
-
dataframes_to_process.append((df, sheet_name))
|
|
167
|
-
except ImportError as ie:
|
|
168
|
-
if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
|
|
169
|
-
print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
|
|
170
|
-
elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
|
|
171
|
-
print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
|
|
172
|
-
elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
|
|
173
|
-
print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
|
|
174
|
-
else:
|
|
175
|
-
print(f"ImportError reading file '{input_filepath.name}': {ie}")
|
|
176
|
-
return False
|
|
177
|
-
except Exception as e:
|
|
178
|
-
print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
|
|
179
|
-
return False
|
|
180
|
-
else:
|
|
181
|
-
print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
|
|
137
|
+
if file_suffix == '.csv':
|
|
138
|
+
if sheet_to_convert is not None:
|
|
139
|
+
print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
|
|
140
|
+
try:
|
|
141
|
+
df = pd.read_csv(input_filepath, encoding=csv_encoding)
|
|
142
|
+
dataframes_to_process.append((df, None))
|
|
143
|
+
except Exception as e:
|
|
144
|
+
print(f"Error reading CSV file '{input_filepath.name}': {e}")
|
|
182
145
|
return False
|
|
183
146
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
147
|
+
elif file_suffix in ['.xls', '.xlsx', '.ods']:
|
|
148
|
+
try:
|
|
149
|
+
if file_suffix == '.ods':
|
|
150
|
+
if current_engine is None:
|
|
151
|
+
current_engine = 'odf'
|
|
152
|
+
elif current_engine != 'odf':
|
|
153
|
+
print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
|
|
154
|
+
current_engine = 'odf'
|
|
188
155
|
|
|
156
|
+
if sheet_to_convert is not None:
|
|
157
|
+
df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
|
|
158
|
+
dataframes_to_process.append((df, None))
|
|
189
159
|
|
|
190
|
-
is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
|
|
191
|
-
temp_processed_data: Dict[str, Any] = {}
|
|
192
|
-
|
|
193
|
-
for df_original, name_key in dataframes_to_process:
|
|
194
|
-
df = df_original.copy()
|
|
195
|
-
|
|
196
|
-
if date_format:
|
|
197
|
-
for col_name in df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns:
|
|
198
|
-
try:
|
|
199
|
-
if date_format == 'iso':
|
|
200
|
-
df[col_name] = df[col_name].apply(lambda x: x.isoformat() if pd.notnull(x) and hasattr(x, 'isoformat') else None)
|
|
201
|
-
elif date_format == 'epoch':
|
|
202
|
-
df[col_name] = df[col_name].apply(lambda x: int(x.timestamp() * 1000) if pd.notnull(x) and hasattr(x, 'timestamp') else None)
|
|
203
|
-
except Exception as e_date:
|
|
204
|
-
print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}. Problematic values might be None.")
|
|
205
|
-
|
|
206
|
-
df = df.astype(object).where(pd.notnull(df), None)
|
|
207
|
-
current_json_segment = df.to_dict(orient=orient)
|
|
208
|
-
|
|
209
|
-
if is_direct_output:
|
|
210
|
-
output_data_final = current_json_segment
|
|
211
|
-
break
|
|
212
160
|
else:
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
161
|
+
excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
|
|
162
|
+
if not excel_file.sheet_names:
|
|
163
|
+
print(f"Warning: File '{input_filepath.name}' contains no sheets.")
|
|
164
|
+
for sheet_name in excel_file.sheet_names:
|
|
165
|
+
df = excel_file.parse(sheet_name) # engine is inherited
|
|
166
|
+
dataframes_to_process.append((df, sheet_name))
|
|
167
|
+
except ImportError as ie:
|
|
168
|
+
if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
|
|
169
|
+
print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
|
|
170
|
+
elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
|
|
171
|
+
print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
|
|
172
|
+
elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
|
|
173
|
+
print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
|
|
174
|
+
else:
|
|
175
|
+
print(f"ImportError reading file '{input_filepath.name}': {ie}")
|
|
176
|
+
return False
|
|
177
|
+
except Exception as e:
|
|
178
|
+
print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
|
|
179
|
+
return False
|
|
180
|
+
else:
|
|
181
|
+
print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
|
|
182
|
+
return False
|
|
221
183
|
|
|
222
|
-
|
|
223
|
-
|
|
184
|
+
if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
|
|
185
|
+
print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
|
|
186
|
+
elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
|
|
190
|
+
temp_processed_data: Dict[str, Any] = {}
|
|
191
|
+
|
|
192
|
+
for df_original, name_key in dataframes_to_process:
|
|
193
|
+
df = df_original.copy()
|
|
194
|
+
|
|
195
|
+
# Handle datetime columns with improved detection and conversion
|
|
196
|
+
if date_format:
|
|
197
|
+
# Check for datetime columns using multiple approaches
|
|
198
|
+
datetime_columns = []
|
|
199
|
+
|
|
200
|
+
# Method 1: Use pandas dtype detection
|
|
201
|
+
datetime_columns.extend(df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns.tolist())
|
|
202
|
+
|
|
203
|
+
# Method 2: Check for datetime objects in each column
|
|
204
|
+
for col in df.columns:
|
|
205
|
+
if col not in datetime_columns:
|
|
206
|
+
# Sample a few non-null values to check type
|
|
207
|
+
sample_values = df[col].dropna().head(10)
|
|
208
|
+
if len(sample_values) > 0:
|
|
209
|
+
for val in sample_values:
|
|
210
|
+
if isinstance(val, (datetime, date, pd.Timestamp)):
|
|
211
|
+
datetime_columns.append(col)
|
|
212
|
+
break
|
|
213
|
+
|
|
214
|
+
# Convert datetime columns
|
|
215
|
+
for col_name in datetime_columns:
|
|
216
|
+
try:
|
|
217
|
+
if date_format == 'iso':
|
|
218
|
+
df[col_name] = df[col_name].apply(lambda x: self._convert_to_iso(x))
|
|
219
|
+
elif date_format == 'epoch':
|
|
220
|
+
df[col_name] = df[col_name].apply(lambda x: self._convert_to_epoch(x))
|
|
221
|
+
except Exception as e_date:
|
|
222
|
+
print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}")
|
|
223
|
+
|
|
224
|
+
# Replace NaN values with None for JSON compatibility
|
|
225
|
+
df = df.astype(object).where(pd.notnull(df), None)
|
|
226
|
+
|
|
227
|
+
# Final safety check: convert any remaining datetime objects
|
|
228
|
+
for col in df.columns:
|
|
229
|
+
df[col] = df[col].apply(lambda x: self._safe_datetime_convert(x, date_format))
|
|
230
|
+
|
|
231
|
+
current_json_segment = df.to_dict(orient=orient)
|
|
232
|
+
|
|
233
|
+
if is_direct_output:
|
|
234
|
+
output_data_final = current_json_segment
|
|
235
|
+
break
|
|
236
|
+
else:
|
|
237
|
+
if name_key is not None:
|
|
238
|
+
temp_processed_data[name_key] = current_json_segment
|
|
239
|
+
|
|
240
|
+
if not is_direct_output:
|
|
241
|
+
output_data_final = temp_processed_data
|
|
242
|
+
|
|
243
|
+
with open(json_filepath, 'w', encoding='utf-8') as f:
|
|
244
|
+
json.dump(output_data_final, f, indent=4, ensure_ascii=False)
|
|
245
|
+
|
|
246
|
+
print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
|
|
247
|
+
return True
|
|
248
|
+
|
|
249
|
+
except FileNotFoundError:
|
|
250
|
+
print(f"Error: Input file not found at {input_filepath.name}")
|
|
251
|
+
return False
|
|
252
|
+
except ValueError as ve:
|
|
253
|
+
print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
|
|
254
|
+
return False
|
|
255
|
+
except Exception as e:
|
|
256
|
+
print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
|
|
257
|
+
return False
|
|
258
|
+
|
|
259
|
+
def _convert_to_iso(self, value):
|
|
260
|
+
"""Convert datetime-like objects to ISO format string."""
|
|
261
|
+
if pd.isnull(value) or value is None:
|
|
262
|
+
return None
|
|
263
|
+
|
|
264
|
+
try:
|
|
265
|
+
if isinstance(value, str):
|
|
266
|
+
return value # Already a string
|
|
267
|
+
elif hasattr(value, 'isoformat'):
|
|
268
|
+
return value.isoformat()
|
|
269
|
+
elif isinstance(value, pd.Timestamp):
|
|
270
|
+
return value.isoformat()
|
|
271
|
+
else:
|
|
272
|
+
return str(value)
|
|
273
|
+
except:
|
|
274
|
+
return str(value) if value is not None else None
|
|
275
|
+
|
|
276
|
+
def _convert_to_epoch(self, value):
|
|
277
|
+
"""Convert datetime-like objects to epoch milliseconds."""
|
|
278
|
+
if pd.isnull(value) or value is None:
|
|
279
|
+
return None
|
|
280
|
+
|
|
281
|
+
try:
|
|
282
|
+
if isinstance(value, (int, float)):
|
|
283
|
+
return int(value) # Assume already epoch
|
|
284
|
+
elif hasattr(value, 'timestamp'):
|
|
285
|
+
return int(value.timestamp() * 1000)
|
|
286
|
+
elif isinstance(value, pd.Timestamp):
|
|
287
|
+
return int(value.timestamp() * 1000)
|
|
288
|
+
else:
|
|
289
|
+
return str(value)
|
|
290
|
+
except:
|
|
291
|
+
return str(value) if value is not None else None
|
|
292
|
+
|
|
293
|
+
def _safe_datetime_convert(self, value, date_format):
|
|
294
|
+
"""Final safety conversion for any remaining datetime objects."""
|
|
295
|
+
if pd.isnull(value) or value is None:
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
# If it's a datetime-like object, convert it
|
|
299
|
+
if isinstance(value, (datetime, date, pd.Timestamp)):
|
|
300
|
+
if date_format == 'iso':
|
|
301
|
+
return self._convert_to_iso(value)
|
|
302
|
+
elif date_format == 'epoch':
|
|
303
|
+
return self._convert_to_epoch(value)
|
|
304
|
+
else:
|
|
305
|
+
return str(value)
|
|
224
306
|
|
|
225
|
-
|
|
226
|
-
print(f"Error: Input file not found at {input_filepath.name}")
|
|
227
|
-
return False
|
|
228
|
-
except ValueError as ve:
|
|
229
|
-
print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
|
|
230
|
-
return False
|
|
231
|
-
except Exception as e:
|
|
232
|
-
print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
|
|
233
|
-
return False
|
|
307
|
+
return value
|
|
234
308
|
|
|
235
309
|
|
|
236
310
|
def chunk_text(text, chunk_size=500, overlap_size=50):
|
|
@@ -257,4 +331,34 @@ def chunk_text(text, chunk_size=500, overlap_size=50):
|
|
|
257
331
|
return chunks
|
|
258
332
|
|
|
259
333
|
if __name__ == '__main__':
|
|
260
|
-
print("
|
|
334
|
+
print("Test Run Start:")
|
|
335
|
+
try:
|
|
336
|
+
# print("Test 1: scaned pdf page, bytes")
|
|
337
|
+
# with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
|
|
338
|
+
# conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
|
|
339
|
+
# print(conv.output)
|
|
340
|
+
|
|
341
|
+
# print("Test 2: JD pdf, bytes")
|
|
342
|
+
# with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
|
|
343
|
+
# conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
|
|
344
|
+
# print(conv.output)
|
|
345
|
+
|
|
346
|
+
# print("Test 3: excel schedule, bytes")
|
|
347
|
+
# with open("/home/cyto/Downloads/Assignment schedule.xlsx", "rb") as imgpdf:
|
|
348
|
+
# conv= Convertor(file_bytes= imgpdf.read(), suffix= ".xlsx", file_type= "excel")
|
|
349
|
+
# print(conv.output)
|
|
350
|
+
|
|
351
|
+
# without bytes example:
|
|
352
|
+
print("Test 4: scanned pdf, path")
|
|
353
|
+
conv= Convertor(myfile= Path('/home/cyto/Documents/scanned.pdf'), output_dir= Path('/home/cyto/Documents'))
|
|
354
|
+
print(conv.output)
|
|
355
|
+
|
|
356
|
+
# print("Test 5: schedule excel, path")
|
|
357
|
+
# conv= Convertor(myfile= Path('/home/cyto/Downloads/Assignment schedule.xlsx'), output_dir= Path('/home/cyto/Downloads'))
|
|
358
|
+
# print(conv.output)
|
|
359
|
+
except FileNotFoundError as fe:
|
|
360
|
+
print("file not found, modify the driver code to get sample files to test:\n\n", fe)
|
|
361
|
+
except Exception as e:
|
|
362
|
+
print("unhandled: ", e)
|
|
363
|
+
|
|
364
|
+
print("Test Run End.")
|
pembot/__init__.py
CHANGED
pembot/config/config.yaml
CHANGED
pembot/main.py
CHANGED
|
@@ -10,7 +10,11 @@ from pembot.query import rag_query_llm, remove_bs
|
|
|
10
10
|
import os
|
|
11
11
|
import json
|
|
12
12
|
from pembot.utils.string_tools import make_it_an_id
|
|
13
|
-
|
|
13
|
+
import pickle
|
|
14
|
+
from sys import argv
|
|
15
|
+
|
|
16
|
+
required_fields_path= ""
|
|
17
|
+
required_fields= None
|
|
14
18
|
|
|
15
19
|
|
|
16
20
|
def make_query(required_fields: list[tuple[str, str, str, str]]):
|
|
@@ -67,8 +71,8 @@ def save_to_json_file(llm_output: str, filepath: Path):
|
|
|
67
71
|
except Exception as e:
|
|
68
72
|
print(f"An unexpected error occurred in save_to_json_file: {e}")
|
|
69
73
|
|
|
70
|
-
def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
|
|
71
|
-
# give required output fields
|
|
74
|
+
def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
|
|
75
|
+
# give required output fields
|
|
72
76
|
# take the documents
|
|
73
77
|
# convert to text
|
|
74
78
|
# upload to chromadb
|
|
@@ -80,7 +84,7 @@ def make_document_summarization_and_embeddings(db_client, llm_client, inference_
|
|
|
80
84
|
expected_json= text_out_dir / 'json' / (file_root + '.json')
|
|
81
85
|
document_id= make_it_an_id(file_root)
|
|
82
86
|
|
|
83
|
-
if docfile.is_file and not (expected_json).exists():
|
|
87
|
+
if docfile.is_file and not (expected_json).exists():
|
|
84
88
|
|
|
85
89
|
expected_markdown= text_out_dir / (file_root + '.md')
|
|
86
90
|
if not (expected_markdown).exists():
|
|
@@ -161,6 +165,23 @@ if __name__ == "__main__":
|
|
|
161
165
|
# provider="Jina AI",
|
|
162
166
|
# api_key= JINA_API_KEY,
|
|
163
167
|
# )
|
|
168
|
+
#
|
|
169
|
+
|
|
170
|
+
try:
|
|
171
|
+
if len(argv) > 1:
|
|
172
|
+
print(f"First argument: {argv[1]}")
|
|
173
|
+
required_fields_path= argv[1]
|
|
174
|
+
with open(required_fields_path, "rb") as rf:
|
|
175
|
+
required_fields= pickle.load(rf)
|
|
176
|
+
except Exception as e:
|
|
177
|
+
print("error while getting required_fields pickle. Please pickle it and put it in project directory to continue\n", e)
|
|
178
|
+
|
|
179
|
+
if required_fields is None:
|
|
180
|
+
print("couldnt load required fields. please provide path to pickle in command line argument")
|
|
181
|
+
exit()
|
|
182
|
+
else:
|
|
183
|
+
print(required_fields)
|
|
184
|
+
|
|
164
185
|
|
|
165
186
|
inference_client= InferenceClient(
|
|
166
187
|
provider="hf-inference",
|
|
@@ -178,7 +199,7 @@ if __name__ == "__main__":
|
|
|
178
199
|
llm_provider_name: PROVIDER_T="nebius"
|
|
179
200
|
|
|
180
201
|
# nerfed, but provided by hf serverless inference: BAAI/bge-small-en-v1.5
|
|
181
|
-
# Worth mentioning:
|
|
202
|
+
# Worth mentioning:
|
|
182
203
|
# jinaai/jina-embeddings-v3
|
|
183
204
|
# BAAI/bge-base-en-v1.5
|
|
184
205
|
# nomic-ai/nomic-embed-text-v1.5
|
|
@@ -203,6 +224,3 @@ if __name__ == "__main__":
|
|
|
203
224
|
|
|
204
225
|
docs_collection= database["summary_docs"]
|
|
205
226
|
upload_summaries(process_output_dir / 'json', docs_collection)
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
pembot/pdf2markdown/extract.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
import fitz
|
|
1
|
+
import fitz
|
|
2
2
|
import pdfplumber
|
|
3
3
|
import re
|
|
4
4
|
import yaml
|
|
5
5
|
# import pytesseract
|
|
6
6
|
import numpy as np
|
|
7
|
-
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
|
|
8
|
-
# VisionEncoderDecoderModel, ViTImageProcessor,
|
|
7
|
+
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
|
|
9
8
|
from typing import Literal, final
|
|
10
9
|
import torch
|
|
11
10
|
from PIL import Image
|
|
@@ -16,22 +15,11 @@ import warnings
|
|
|
16
15
|
from pathlib import Path
|
|
17
16
|
from abc import ABC, abstractmethod
|
|
18
17
|
import argparse
|
|
19
|
-
from PIL import Image
|
|
20
18
|
import io
|
|
21
|
-
from
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
model = AutoModelForImageTextToText.from_pretrained(
|
|
26
|
-
model_path,
|
|
27
|
-
torch_dtype="auto",
|
|
28
|
-
device_map="auto",
|
|
29
|
-
attn_implementation="flash_attention_2"
|
|
30
|
-
)
|
|
31
|
-
model.eval()
|
|
19
|
+
from google import genai
|
|
20
|
+
from google.genai import types
|
|
21
|
+
import mimetypes
|
|
32
22
|
|
|
33
|
-
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
34
|
-
processor = AutoProcessor.from_pretrained(model_path)
|
|
35
23
|
|
|
36
24
|
|
|
37
25
|
warnings.filterwarnings("ignore")
|
|
@@ -74,9 +62,31 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
74
62
|
|
|
75
63
|
BULLET_POINTS = "•◦▪▫●○"
|
|
76
64
|
|
|
77
|
-
def __init__(self, pdf_path, output_path= config["OUTPUT_DIR"], page_delimiter= config["PAGE_DELIMITER"]):
|
|
65
|
+
def __init__(self, pdf_path, output_path= config["OUTPUT_DIR"], page_delimiter= config["PAGE_DELIMITER"], model_name: str | None= None):
|
|
78
66
|
super().__init__(pdf_path)
|
|
79
67
|
|
|
68
|
+
if model_name is None:
|
|
69
|
+
self.MODEL_NAME= "gemini-2.5-flash"
|
|
70
|
+
else:
|
|
71
|
+
self.MODEL_NAME= model_name
|
|
72
|
+
|
|
73
|
+
if "gemini" in self.MODEL_NAME:
|
|
74
|
+
self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
|
|
75
|
+
else:
|
|
76
|
+
model_path = "nanonets/Nanonets-OCR-s"
|
|
77
|
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
|
78
|
+
model_path,
|
|
79
|
+
torch_dtype="auto",
|
|
80
|
+
device_map="auto",
|
|
81
|
+
attn_implementation="flash_attention_2"
|
|
82
|
+
)
|
|
83
|
+
self.model.eval()
|
|
84
|
+
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
85
|
+
self.processor = AutoProcessor.from_pretrained(model_path)
|
|
86
|
+
self.setup_image_captioning()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
80
90
|
self.markdown_content= ""
|
|
81
91
|
self.pdf_filename = Path(pdf_path).stem
|
|
82
92
|
self.output_path= output_path
|
|
@@ -87,26 +97,26 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
87
97
|
self.page_delimiter= page_delimiter
|
|
88
98
|
Path(output_path).mkdir(parents=True, exist_ok=True)
|
|
89
99
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def setup_image_captioning(self):
|
|
103
|
+
"""Set up the image captioning model."""
|
|
104
|
+
try:
|
|
105
|
+
self.model = VisionEncoderDecoderModel.from_pretrained(
|
|
106
|
+
"nlpconnect/vit-gpt2-image-captioning"
|
|
107
|
+
)
|
|
108
|
+
self.feature_extractor = ViTImageProcessor.from_pretrained(
|
|
109
|
+
"nlpconnect/vit-gpt2-image-captioning"
|
|
110
|
+
)
|
|
111
|
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
112
|
+
"nlpconnect/vit-gpt2-image-captioning"
|
|
113
|
+
)
|
|
114
|
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
115
|
+
self.model.to(self.device)
|
|
116
|
+
self.logger.info("Image captioning model set up successfully.")
|
|
117
|
+
except Exception as e:
|
|
118
|
+
self.logger.error(f"Error setting up image captioning model: {e}")
|
|
119
|
+
self.logger.exception(traceback.format_exc())
|
|
110
120
|
|
|
111
121
|
def extract(self):
|
|
112
122
|
try:
|
|
@@ -123,282 +133,197 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
123
133
|
self.logger.exception(traceback.format_exc())
|
|
124
134
|
return "", []
|
|
125
135
|
|
|
126
|
-
def extract_markdown_by_blocks(self):
|
|
127
|
-
"""Main method to extract markdown from PDF."""
|
|
128
|
-
try:
|
|
129
|
-
doc = fitz.open(self.pdf_path)
|
|
130
|
-
markdown_content = ""
|
|
131
|
-
markdown_pages = []
|
|
132
|
-
tables = self.extract_tables()
|
|
133
|
-
table_index = 0
|
|
134
|
-
list_counter = 0
|
|
135
|
-
in_code_block = False
|
|
136
|
-
code_block_content = ""
|
|
137
|
-
code_block_lang = None
|
|
138
|
-
prev_line = ""
|
|
139
|
-
|
|
140
|
-
for page_num, page in enumerate(doc):
|
|
141
|
-
self.logger.info(f"Processing page {page_num + 1}")
|
|
142
|
-
page_content = ""
|
|
143
|
-
blocks = page.get_text("dict")["blocks"]
|
|
144
|
-
page_height = page.rect.height
|
|
145
|
-
links = self.extract_links(page)
|
|
146
|
-
|
|
147
|
-
if len(page.get_images()) > 0 and len(page.get_images()) <= 128:
|
|
148
|
-
for block in blocks:
|
|
149
|
-
if block["type"] == 0: # Text
|
|
150
|
-
page_content += self.process_text_block(
|
|
151
|
-
block,
|
|
152
|
-
page_height,
|
|
153
|
-
links,
|
|
154
|
-
list_counter,
|
|
155
|
-
in_code_block,
|
|
156
|
-
code_block_content,
|
|
157
|
-
code_block_lang,
|
|
158
|
-
prev_line,
|
|
159
|
-
)
|
|
160
|
-
elif block["type"] == 1: # Image
|
|
161
|
-
page_content += self.process_image_block(page, block)
|
|
162
|
-
|
|
163
|
-
else:
|
|
164
|
-
for block in blocks:
|
|
165
|
-
if block["type"] == 0: # Text
|
|
166
|
-
page_content += self.process_text_block(
|
|
167
|
-
block,
|
|
168
|
-
page_height,
|
|
169
|
-
links,
|
|
170
|
-
list_counter,
|
|
171
|
-
in_code_block,
|
|
172
|
-
code_block_content,
|
|
173
|
-
code_block_lang,
|
|
174
|
-
prev_line,
|
|
175
|
-
)
|
|
176
|
-
|
|
177
|
-
# Insert tables at their approximate positions
|
|
178
|
-
while (
|
|
179
|
-
table_index < len(tables)
|
|
180
|
-
and tables[table_index]["page"] == page.number
|
|
181
|
-
):
|
|
182
|
-
page_content += (
|
|
183
|
-
"\n\n"
|
|
184
|
-
+ self.table_to_markdown(tables[table_index]["content"])
|
|
185
|
-
+ "\n\n"
|
|
186
|
-
)
|
|
187
|
-
table_index += 1
|
|
188
|
-
|
|
189
|
-
markdown_pages.append(self.post_process_markdown(page_content))
|
|
190
|
-
markdown_content += page_content + config["PAGE_DELIMITER"]
|
|
191
136
|
|
|
192
|
-
|
|
193
|
-
return markdown_content, markdown_pages
|
|
194
|
-
except Exception as e:
|
|
195
|
-
self.logger.error(f"Error extracting markdown: {e}")
|
|
196
|
-
self.logger.exception(traceback.format_exc())
|
|
197
|
-
return "", []
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def ocr_page_with_nanonets_s(self, pil_image, model, processor, max_new_tokens: int | None = None):
|
|
137
|
+
def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
|
|
201
138
|
prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
|
|
202
139
|
if max_new_tokens is None:
|
|
203
140
|
max_new_tokens= 4096
|
|
204
141
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
142
|
+
if 'gemini' in self.MODEL_NAME:
|
|
143
|
+
|
|
144
|
+
image_format = pil_image.format
|
|
145
|
+
dummy_filename = f"dummy.{image_format.lower()}"
|
|
146
|
+
mime_type, _ = mimetypes.guess_type(dummy_filename)
|
|
147
|
+
response= self.gclient.models.generate_content(
|
|
148
|
+
model= self.MODEL_NAME,
|
|
149
|
+
contents=[
|
|
150
|
+
types.Part.from_bytes(
|
|
151
|
+
data=img_bytes.getvalue(),
|
|
152
|
+
mime_type= mime_type
|
|
153
|
+
),
|
|
154
|
+
prompt
|
|
155
|
+
]
|
|
156
|
+
)
|
|
157
|
+
# print("response :", response)
|
|
158
|
+
return response.text
|
|
159
|
+
else:
|
|
160
|
+
image = pil_image
|
|
161
|
+
messages = [
|
|
162
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
163
|
+
{"role": "user", "content": [
|
|
164
|
+
{"type": "image", "image": image},
|
|
165
|
+
{"type": "text", "text": prompt},
|
|
166
|
+
]},
|
|
167
|
+
]
|
|
168
|
+
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
169
|
+
inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
|
|
170
|
+
inputs = inputs.to(self.model.device)
|
|
217
171
|
|
|
218
|
-
|
|
219
|
-
|
|
172
|
+
output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
|
|
173
|
+
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
|
220
174
|
|
|
221
|
-
|
|
222
|
-
|
|
175
|
+
output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
|
176
|
+
return output_text[0]
|
|
223
177
|
|
|
224
178
|
|
|
225
179
|
|
|
226
180
|
def extract_markdown(self):
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
181
|
+
"""
|
|
182
|
+
Extracts all possible content from a PDF, prioritizing searchable text,
|
|
183
|
+
then OCR for embedded images, and finally full-page OCR for scanned pages.
|
|
184
|
+
Avoids redundant OCR where possible.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
tuple: A tuple containing:
|
|
188
|
+
- str: The concatenated markdown content of all pages.
|
|
189
|
+
- list: A list of strings, where each string is the comprehensive markdown
|
|
190
|
+
for a corresponding page.
|
|
191
|
+
"""
|
|
192
|
+
all_pages_markdown = []
|
|
193
|
+
full_document_markdown = [] # Changed to list of lines/blocks to handle insertions better
|
|
194
|
+
|
|
195
|
+
try:
|
|
196
|
+
doc = fitz.open(self.pdf_path)
|
|
197
|
+
self.logger.info(f"Opened PDF: {self.pdf_path}")
|
|
198
|
+
|
|
199
|
+
tables = self.extract_tables()
|
|
200
|
+
table_index = 0
|
|
201
|
+
|
|
202
|
+
# State variables for process_text_block that might need to persist across blocks
|
|
203
|
+
# Re-initialize for each new document, but allow state management within process_text_block for lines
|
|
204
|
+
list_counter = 0
|
|
205
|
+
in_code_block = False
|
|
206
|
+
code_block_content = ""
|
|
207
|
+
code_block_lang = None
|
|
208
|
+
prev_line = ""
|
|
209
|
+
|
|
210
|
+
for page_num, page in enumerate(doc):
|
|
211
|
+
current_page_markdown_blocks = [] # Collect markdown blocks for the current page
|
|
212
|
+
page_has_searchable_text = False
|
|
213
|
+
page_has_embedded_images = False
|
|
214
|
+
|
|
215
|
+
self.logger.info(f"\nProcessing page {page_num + 1}...")
|
|
216
|
+
|
|
217
|
+
blocks = page.get_text('dict')['blocks']
|
|
218
|
+
page_height = page.rect.height
|
|
219
|
+
links = self.extract_links(page)
|
|
220
|
+
|
|
221
|
+
# Phase 1: Process text blocks and embedded image blocks
|
|
222
|
+
for block_num, block in enumerate(blocks):
|
|
223
|
+
if block['type'] == 0: # Text block
|
|
224
|
+
page_has_searchable_text = True
|
|
225
|
+
processed_text = self.process_text_block(
|
|
226
|
+
block,
|
|
227
|
+
page_height,
|
|
228
|
+
links,
|
|
229
|
+
list_counter,
|
|
230
|
+
in_code_block,
|
|
231
|
+
code_block_content,
|
|
232
|
+
code_block_lang,
|
|
233
|
+
prev_line,
|
|
234
|
+
)
|
|
235
|
+
if processed_text.strip():
|
|
236
|
+
current_page_markdown_blocks.append(processed_text)
|
|
237
|
+
|
|
238
|
+
elif block['type'] == 1: # Image block
|
|
239
|
+
page_has_embedded_images = True
|
|
240
|
+
self.logger.info(f" Found embedded image block (Page {page_num+1}, Block {block_num+1})")
|
|
241
|
+
img_data = block['image']
|
|
242
|
+
|
|
243
|
+
try:
|
|
244
|
+
image_bytes= io.BytesIO(img_data)
|
|
245
|
+
pil_image = Image.open(image_bytes)
|
|
246
|
+
ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
|
|
247
|
+
pil_image, image_bytes, max_new_tokens=15000
|
|
248
|
+
)
|
|
231
249
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
250
|
+
if ocr_text_from_block_image.strip():
|
|
251
|
+
self.logger.info(" OCR found text in embedded image block.")
|
|
252
|
+
current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_block_image.strip()}\n\n")
|
|
253
|
+
else:
|
|
254
|
+
self.logger.info(f" No OCR text from embedded image block. Adding generic placeholder.")
|
|
255
|
+
current_page_markdown_blocks.append("\n\n\n\n") # Consider saving images
|
|
256
|
+
except Exception as e:
|
|
257
|
+
self.logger.error(f" Error processing embedded image block for OCR: {e}")
|
|
258
|
+
current_page_markdown_blocks.append("\n\n\n\n")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# Insert tables at their approximate positions (after blocks are processed for the page)
|
|
262
|
+
# You might need more sophisticated logic here if table positions are granular
|
|
263
|
+
while (
|
|
264
|
+
table_index < len(tables)
|
|
265
|
+
and tables[table_index]["page"] == page.number
|
|
266
|
+
):
|
|
267
|
+
current_page_markdown_blocks.append(
|
|
268
|
+
self.table_to_markdown(tables[table_index]["content"])
|
|
269
|
+
)
|
|
270
|
+
table_index += 1
|
|
236
271
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
272
|
+
# Phase 2: Full-page OCR if the page seems to be a scanned image or lacks sufficient searchable text
|
|
273
|
+
# We prioritize actual searchable text and embedded image OCR.
|
|
274
|
+
# Only if very little or no text was found, we resort to full-page OCR.
|
|
275
|
+
combined_current_page_text_length = len("".join(current_page_markdown_blocks).strip())
|
|
241
276
|
|
|
242
|
-
|
|
243
|
-
|
|
277
|
+
# A heuristic: if almost no searchable text and no significant OCR from embedded images
|
|
278
|
+
if not page_has_searchable_text and combined_current_page_text_length < 100: # Threshold for considering "minimal text"
|
|
279
|
+
self.logger.info(f" Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
|
|
280
|
+
try:
|
|
281
|
+
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
|
|
282
|
+
img_bytes = pix.tobytes("png")
|
|
283
|
+
image_bytestream= io.BytesIO(img_bytes)
|
|
284
|
+
pil_image = Image.open(image_bytestream)
|
|
244
285
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
tables = self.extract_tables()
|
|
250
|
-
table_index = 0
|
|
251
|
-
list_counter = 0
|
|
252
|
-
in_code_block = False
|
|
253
|
-
code_block_content = ""
|
|
254
|
-
code_block_lang = None
|
|
255
|
-
prev_line = ""
|
|
256
|
-
|
|
257
|
-
for page_num, page in enumerate(doc):
|
|
258
|
-
page_text_content = []
|
|
259
|
-
page_has_searchable_text = False
|
|
260
|
-
|
|
261
|
-
logging.info(f"\nProcessing page {page_num + 1}...")
|
|
262
|
-
|
|
263
|
-
# --- Phase 1: Extract text from direct text blocks and process embedded images ---
|
|
264
|
-
blocks = page.get_text('dict')['blocks']
|
|
265
|
-
text_blocks_content = []
|
|
266
|
-
image_block_text_content = []
|
|
267
|
-
|
|
268
|
-
page_height = page.rect.height
|
|
269
|
-
links = self.extract_links(page)
|
|
270
|
-
|
|
271
|
-
for block_num, block in enumerate(blocks):
|
|
272
|
-
if block['type'] == 0: # Text block
|
|
273
|
-
page_has_searchable_text = True
|
|
274
|
-
text_blocks_content.append(self.process_text_block(
|
|
275
|
-
block,
|
|
276
|
-
page_height,
|
|
277
|
-
links,
|
|
278
|
-
list_counter,
|
|
279
|
-
in_code_block,
|
|
280
|
-
code_block_content,
|
|
281
|
-
code_block_lang,
|
|
282
|
-
prev_line,
|
|
283
|
-
))
|
|
284
|
-
|
|
285
|
-
# for line in block['lines']:
|
|
286
|
-
# for span in line['spans']:
|
|
287
|
-
# text_blocks_content.append(span['text'])
|
|
288
|
-
elif block['type'] == 1: # Image block
|
|
289
|
-
logging.info(f" Found embedded image block (Page {page_num+1}, Block {block_num+1})")
|
|
290
|
-
img_data = block['image']
|
|
291
|
-
img_ext = block['ext']
|
|
286
|
+
ocr_text_from_page = self.ocr_page_with_nanonets_s(
|
|
287
|
+
pil_image, image_bytestream, max_new_tokens=15000
|
|
288
|
+
)
|
|
292
289
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
290
|
+
if ocr_text_from_page.strip():
|
|
291
|
+
self.logger.info(f" Successfully extracted text via full-page OCR for page {page_num + 1}.")
|
|
292
|
+
# If full-page OCR yields significant content and other methods didn't,
|
|
293
|
+
# replace or augment. Here, we'll replace to avoid double-counting if it's primarily scanned.
|
|
294
|
+
# You might choose to append if you want to combine (e.g., if there's header text + scanned body)
|
|
295
|
+
if combined_current_page_text_length < 50: # If almost nothing was found before, replace
|
|
296
|
+
current_page_markdown_blocks = [ocr_text_from_page.strip()]
|
|
297
|
+
else: # Otherwise, augment (append)
|
|
298
|
+
current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_page.strip()}\n\n")
|
|
302
299
|
else:
|
|
303
|
-
|
|
304
|
-
# caption = self.caption_image(pil_image)
|
|
305
|
-
# if caption:
|
|
306
|
-
# logging.info(f" No OCR text, using caption for embedded image block.")
|
|
307
|
-
# image_block_text_content.append(caption)
|
|
308
|
-
# else:
|
|
309
|
-
# logging.info(f" No OCR text and no caption for embedded image block.")
|
|
310
|
-
|
|
311
|
-
# a) captioning sucks, b) no need
|
|
312
|
-
image_block_text_content.append("An Image")
|
|
313
|
-
|
|
314
|
-
# except pytesseract.TesseractNotFoundError:
|
|
315
|
-
# logging.warning(" Tesseract-OCR not found. Skipping OCR for embedded image block.")
|
|
316
|
-
# caption = self.process_image_block(page, block)
|
|
317
|
-
# if caption: image_block_text_content.append(caption)
|
|
318
|
-
|
|
319
|
-
# image_block_text_content.append("An Image")
|
|
300
|
+
self.logger.info(f" Full-page OCR yielded no text for page {page_num+1}.")
|
|
320
301
|
except Exception as e:
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
image_block_text_content.append("An Image")
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
# Insert tables at their approximate positions
|
|
328
|
-
while (
|
|
329
|
-
table_index < len(tables)
|
|
330
|
-
and tables[table_index]["page"] == page.number
|
|
331
|
-
):
|
|
332
|
-
page_text_content += (
|
|
333
|
-
"\n\n"
|
|
334
|
-
+ self.table_to_markdown(tables[table_index]["content"])
|
|
335
|
-
+ "\n\n"
|
|
336
|
-
)
|
|
337
|
-
table_index += 1
|
|
338
|
-
|
|
339
|
-
# Add content from text blocks
|
|
340
|
-
if text_blocks_content:
|
|
341
|
-
page_text_content.append(" ".join(text_blocks_content))
|
|
342
|
-
|
|
343
|
-
# Add content from image blocks
|
|
344
|
-
if image_block_text_content:
|
|
345
|
-
page_text_content.append("\n".join(image_block_text_content))
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
# --- Phase 2: OCR the entire page IF it seems to be a scanned image ---
|
|
349
|
-
# We check if page_has_searchable_text is False or if the amount of text
|
|
350
|
-
# is very small, suggesting it might be mostly a scanned page.
|
|
351
|
-
# A threshold of 50 characters is arbitrary; adjust as needed.
|
|
352
|
-
current_text_len = len(" ".join(page_text_content).strip())
|
|
353
|
-
|
|
354
|
-
if not page_has_searchable_text or current_text_len < 50:
|
|
355
|
-
logging.info(f" Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
|
|
356
|
-
try:
|
|
357
|
-
# Render the page as a high-resolution image (e.g., 300 DPI)
|
|
358
|
-
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
|
|
359
|
-
img_bytes = pix.tobytes("png")
|
|
360
|
-
|
|
361
|
-
pil_image = Image.open(io.BytesIO(img_bytes))
|
|
362
|
-
|
|
363
|
-
# Perform OCR on the entire page image
|
|
364
|
-
# ocr_text_from_page = pytesseract.image_to_string(pil_image)
|
|
365
|
-
ocr_text_from_page= self.ocr_page_with_nanonets_s(pil_image, model, processor, max_new_tokens=15000)
|
|
366
|
-
|
|
367
|
-
if ocr_text_from_page.strip():
|
|
368
|
-
logging.info(f" Successfully extracted text via full-page OCR.")
|
|
369
|
-
page_text_content.append(ocr_text_from_page.strip())
|
|
370
|
-
else:
|
|
371
|
-
logging.info(f" Full-page OCR yielded no text for page {page_num+1}.")
|
|
372
|
-
|
|
373
|
-
# except pytesseract.TesseractNotFoundError:
|
|
374
|
-
# logging.warning(" Tesseract-OCR not found. Skipping full-page OCR for this page.")
|
|
375
|
-
except Exception as e:
|
|
376
|
-
logging.error(f" Error during full-page OCR on page {page_num+1}: {e}")
|
|
377
|
-
else:
|
|
378
|
-
logging.info(f" Page {page_num + 1} has sufficient searchable text; skipping full-page OCR.")
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
# Concatenate all collected text for the current page
|
|
382
|
-
final_page_text = "\n".join(filter(None, page_text_content)).strip() # Use filter(None, ...) to remove empty strings
|
|
383
|
-
all_pages_text.append(self.post_process_markdown(final_page_text))
|
|
384
|
-
the_text += final_page_text + self.page_delimiter
|
|
385
|
-
|
|
386
|
-
logging.info(f" Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_text[:200]}...")
|
|
302
|
+
self.logger.error(f" Error during full-page OCR on page {page_num+1}: {e}")
|
|
303
|
+
else:
|
|
304
|
+
self.logger.info(f" Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")
|
|
387
305
|
|
|
388
|
-
|
|
389
|
-
|
|
306
|
+
# Join collected markdown blocks for the current page
|
|
307
|
+
final_page_markdown = "\n".join(filter(None, current_page_markdown_blocks)).strip()
|
|
308
|
+
all_pages_markdown.append(self.post_process_markdown(final_page_markdown))
|
|
309
|
+
full_document_markdown.append(self.post_process_markdown(final_page_markdown))
|
|
310
|
+
full_document_markdown.append(self.page_delimiter)
|
|
390
311
|
|
|
391
312
|
|
|
392
|
-
|
|
393
|
-
|
|
313
|
+
self.logger.info(f" Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_markdown[:200]}...")
|
|
314
|
+
print(f"\n--- Page {page_num+1} Done ---\n")
|
|
315
|
+
print(final_page_markdown[:500]) # Print first 500 chars of page markdown
|
|
394
316
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
return []
|
|
398
|
-
except Exception as e:
|
|
399
|
-
logging.critical(f"An unexpected error occurred: {e}")
|
|
400
|
-
return []
|
|
317
|
+
doc.close()
|
|
318
|
+
return "".join(full_document_markdown), all_pages_markdown
|
|
401
319
|
|
|
320
|
+
except fitz.FileNotFoundError:
|
|
321
|
+
self.logger.error(f"PDF file not found: {self.pdf_path}")
|
|
322
|
+
return "", []
|
|
323
|
+
except Exception as e:
|
|
324
|
+
self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
|
|
325
|
+
self.logger.exception(traceback.format_exc())
|
|
326
|
+
return "", []
|
|
402
327
|
|
|
403
328
|
def extract_tables(self):
|
|
404
329
|
"""Extract tables from PDF using pdfplumber."""
|
|
@@ -449,13 +374,13 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
449
374
|
self.logger.exception(traceback.format_exc())
|
|
450
375
|
return ""
|
|
451
376
|
|
|
452
|
-
def perform_ocr(self, image):
|
|
377
|
+
def perform_ocr(self, image, image_bytes):
|
|
453
378
|
"""Perform OCR on the given image."""
|
|
454
379
|
try:
|
|
455
380
|
# ocr_result = pytesseract.image_to_string(
|
|
456
381
|
# image
|
|
457
382
|
# )
|
|
458
|
-
ocr_result= self.ocr_page_with_nanonets_s(image,
|
|
383
|
+
ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)
|
|
459
384
|
|
|
460
385
|
|
|
461
386
|
return ocr_result.strip()
|
|
@@ -464,10 +389,10 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
464
389
|
self.logger.exception(traceback.format_exc())
|
|
465
390
|
return ""
|
|
466
391
|
|
|
467
|
-
def caption_image(self, image):
|
|
392
|
+
def caption_image(self, image, image_bytes):
|
|
468
393
|
"""Generate a caption for the given image."""
|
|
469
394
|
try:
|
|
470
|
-
ocr_text = self.perform_ocr(image)
|
|
395
|
+
ocr_text = self.perform_ocr(image, image_bytes)
|
|
471
396
|
if ocr_text:
|
|
472
397
|
return ocr_text
|
|
473
398
|
|
|
@@ -475,19 +400,38 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
475
400
|
if image.mode != "RGB":
|
|
476
401
|
image = image.convert("RGB")
|
|
477
402
|
|
|
478
|
-
|
|
479
|
-
|
|
403
|
+
image_format = image.format
|
|
404
|
+
dummy_filename = f"dummy.{image_format.lower()}"
|
|
405
|
+
mime_type, _ = mimetypes.guess_type(dummy_filename)
|
|
406
|
+
|
|
407
|
+
if "gemini" in self.MODEL_NAME:
|
|
408
|
+
response= self.gclient.models.generate_content(
|
|
409
|
+
model= self.MODEL_NAME,
|
|
410
|
+
contents=[
|
|
411
|
+
types.Part.from_bytes(
|
|
412
|
+
data=image_bytes.getvalue(),
|
|
413
|
+
mime_type= mime_type
|
|
414
|
+
),
|
|
415
|
+
"Write a caption for this image"
|
|
416
|
+
]
|
|
417
|
+
)
|
|
418
|
+
return response.text
|
|
419
|
+
else:
|
|
420
|
+
# Ensure the image is in the correct shape
|
|
421
|
+
image = np.array(image).transpose(2, 0, 1) # Convert to (C, H, W) format
|
|
480
422
|
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
423
|
+
inputs = self.feature_extractor(images=image, return_tensors="pt").to(
|
|
424
|
+
self.device
|
|
425
|
+
)
|
|
426
|
+
pixel_values = inputs.pixel_values
|
|
485
427
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
generated_ids,
|
|
489
|
-
|
|
490
|
-
|
|
428
|
+
generated_ids = self.model.generate(pixel_values, max_length=30)
|
|
429
|
+
|
|
430
|
+
generated_ids = self.model.generate(pixel_values, max_length=30)
|
|
431
|
+
generated_caption = self.tokenizer.batch_decode(
|
|
432
|
+
generated_ids, skip_special_tokens=True
|
|
433
|
+
)[0]
|
|
434
|
+
return generated_caption.strip()
|
|
491
435
|
except Exception as e:
|
|
492
436
|
self.logger.error(f"Error captioning image: {e}")
|
|
493
437
|
self.logger.exception(traceback.format_exc())
|
|
@@ -789,7 +733,11 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
789
733
|
Path(self.output_path) / image_filename
|
|
790
734
|
) # Convert to Path object
|
|
791
735
|
image.save(image_path, "PNG", optimize=True, quality=95)
|
|
792
|
-
|
|
736
|
+
|
|
737
|
+
img_byte_arr = io.BytesIO()
|
|
738
|
+
image.save(img_byte_arr)
|
|
739
|
+
caption = self.caption_image(image, img_byte_arr)
|
|
740
|
+
|
|
793
741
|
if not caption:
|
|
794
742
|
caption = (
|
|
795
743
|
f"{self.pdf_filename}_image_{int(page.number)+1}_{block['number']}"
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
pembot/.gitignore,sha256=_7FTsZokJ_pzEyyPjOsGw5x5Xx3gUBFaafs7UlPsv9E,98
|
|
2
2
|
pembot/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
3
|
-
pembot/__init__.py,sha256=
|
|
3
|
+
pembot/__init__.py,sha256=gDyRo0zcHlKqOSZfoLopwIZ7zXGBXlogi4tihmIeC5o,211
|
|
4
4
|
pembot/gartner.py,sha256=3ALknQ5mSXIimmwCa3JFDzB_EW2hHEcQO1T2odyBquk,5408
|
|
5
|
-
pembot/main.py,sha256=
|
|
5
|
+
pembot/main.py,sha256=lZLIV8XPonvNoY4LVS-5fct1y9URMXWoSGJUKMw3Yg8,9667
|
|
6
6
|
pembot/output_structure_local.py,sha256=YfpHzfTNeLMSsB_CjAamha9D6Iz7E1IC-tW9xPCMWFc,3000
|
|
7
7
|
pembot/pem.py,sha256=mv6iGcN1peSY7z2dtCQ_BKj31EFBNfczBhps_d-0XDo,6377
|
|
8
8
|
pembot/query.py,sha256=EDbvQ4It1S8Zg4y-sUCLZjiyBmvQGUAMOVq9yPs_hdE,8242
|
|
@@ -79,17 +79,17 @@ pembot/.git/refs/heads/main,sha256=mQvXNyC6knHS2Siuig65zcGaLxzAR8zX_Zoqw2Oj6S0,4
|
|
|
79
79
|
pembot/.git/refs/remotes/origin/HEAD,sha256=K7aiSqD8bEhBAPXVGim7rYQc0sdV9dk_qiBOXbtOsrQ,30
|
|
80
80
|
pembot/.git/refs/remotes/origin/main,sha256=mQvXNyC6knHS2Siuig65zcGaLxzAR8zX_Zoqw2Oj6S0,41
|
|
81
81
|
pembot/AnyToText/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
|
-
pembot/AnyToText/convertor.py,sha256=
|
|
82
|
+
pembot/AnyToText/convertor.py,sha256=26Pq4OLhVNHgIhJdLLcxGPFTtdnG2lsQkR_53_zkZZM,16997
|
|
83
83
|
pembot/TextEmbedder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
84
84
|
pembot/TextEmbedder/gemini_embedder.py,sha256=P679-2mmQESlYKML1vcrwx_-CSgWJgIQk7NL4F7BLQE,677
|
|
85
85
|
pembot/TextEmbedder/mongodb_embedder.py,sha256=pD8mP-uC_o0COPdOrCTMpoC5PdF8hXlqARHvTr2T-VI,9642
|
|
86
86
|
pembot/TextEmbedder/mongodb_index_creator.py,sha256=ejpsF_y1zY6Z0nux02vjODiDPnxx-YA_xy2PmT94zZ4,5306
|
|
87
87
|
pembot/TextEmbedder/vector_query.py,sha256=Kh1uhx9CatB-oQlQtnW-1I2Qz7MGHI20n2h_8peAChM,1986
|
|
88
|
-
pembot/config/config.yaml,sha256=
|
|
88
|
+
pembot/config/config.yaml,sha256=IJG9CGEPxH83Cr4X5OEZj9SLFT4ZfwINrA63GpjQssA,156
|
|
89
89
|
pembot/pdf2markdown/LICENSE,sha256=1JTJhQjUYDqJzFJhNtitm7mHyE71PRHgetIqRRWg6Pk,1068
|
|
90
90
|
pembot/pdf2markdown/README.md,sha256=jitM1pwI69oa0N4mXv5-SY1ka9Sz3jsRNCDdpW-50kY,4545
|
|
91
91
|
pembot/pdf2markdown/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
92
|
-
pembot/pdf2markdown/extract.py,sha256=
|
|
92
|
+
pembot/pdf2markdown/extract.py,sha256=LV9cqkWWmQk-32hsfPdjzbzVVF0kYxzyH7-ssUALxpU,34956
|
|
93
93
|
pembot/pdf2markdown/requirements.txt,sha256=0vZQzkSZKLNVUttd4euoDyYEy0nc2W3CIVxhepHW5Ho,76
|
|
94
94
|
pembot/pdf2markdown/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
|
|
95
95
|
pembot/pdf2markdown/.git/config,sha256=ltEWI476vFz2goGWD7QmCDvC6UCQ9ELviXuURlvte_w,269
|
|
@@ -123,7 +123,7 @@ pembot/pdf2markdown/config/config.yaml,sha256=w75W2Eg4-tu8rRk_23PqxWDh0010kRKLmP
|
|
|
123
123
|
pembot/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
124
|
pembot/utils/inference_client.py,sha256=jeURmY2P5heVlH1dCV0XSgiX3U2qYGEmrnUv0KFpdww,5380
|
|
125
125
|
pembot/utils/string_tools.py,sha256=gtRa5rBR0Q7GspTu2WtCnvhJQLFjPfWLvhmyiPkyStU,1883
|
|
126
|
-
pembot-0.0.
|
|
127
|
-
pembot-0.0.
|
|
128
|
-
pembot-0.0.
|
|
129
|
-
pembot-0.0.
|
|
126
|
+
pembot-0.0.4.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
127
|
+
pembot-0.0.4.dist-info/WHEEL,sha256=Dyt6SBfaasWElUrURkknVFAZDHSTwxg3PaTza7RSbkY,100
|
|
128
|
+
pembot-0.0.4.dist-info/METADATA,sha256=v9TTMdInby4h5K0aKMrT11NgpyBw0ZMpzrC9dgI0O14,313
|
|
129
|
+
pembot-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|