pembot 0.0.3__py2.py3-none-any.whl → 0.0.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pembot might be problematic. Click here for more details.

@@ -7,6 +7,7 @@ import json
7
7
  import pandas as pd
8
8
  from typing import Literal, Union, Dict, Any, List
9
9
  import tempfile
10
+ from datetime import datetime, date
10
11
 
11
12
 
12
13
  PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]
@@ -30,19 +31,19 @@ EXCEL_FILE_TYPES= [
30
31
  class Convertor():
31
32
 
32
33
 
33
- def __init__(self, myfile: Path | None, output_dir: Path | None, file_bytes: bytes | None, suffix: str | None, file_type: str | None):
34
+ def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
34
35
 
35
36
  self.output= ""
36
37
 
37
38
  # file_type can be pdf, excel, etc.
38
- if output_dir is None and file_bytes is not None and suffix is not None and myfile is None:
39
+ if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
39
40
  with tempfile.TemporaryDirectory() as dp:
40
41
  with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
41
42
  fp.write(file_bytes)
42
43
  myfile= Path(fp.name)
43
44
  output_dir= Path(dp)
44
45
  if file_type == 'pdf':
45
- extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
46
+ extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
46
47
  extractor.extract()
47
48
  with open(output_dir / (myfile.stem + '.md')) as output_file:
48
49
  self.output= output_file.read()
@@ -75,162 +76,235 @@ class Convertor():
75
76
  else:
76
77
  print(mt)
77
78
 
78
-
79
-
80
79
  def convert_file_to_json(
81
- self,
82
- sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
83
- orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
84
- date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
85
- csv_encoding: str = 'utf-8', # For reading CSV files
86
- excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
80
+ self,
81
+ sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
82
+ orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
83
+ date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
84
+ csv_encoding: str = 'utf-8', # For reading CSV files
85
+ excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
87
86
  ) -> bool:
88
- """
89
- Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
90
- into an equivalent JSON format.
91
-
92
- Args:
93
- sheet_to_convert (str | int | None, optional):
94
- - For Excel/ODS:
95
- - If None (default): Converts all sheets. The JSON output will be a
96
- dictionary where keys are sheet names and values are the JSON
97
- representation of each sheet.
98
- - If str: Name of the specific sheet to convert.
99
- - If int: Index of the specific sheet to convert (0-based).
100
- If a specific sheet is requested, the JSON output will directly be
101
- the representation of that sheet.
102
- - For CSV: This parameter is ignored. The entire CSV is processed.
103
- orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
104
- Default: 'records'. See pandas.DataFrame.to_dict() documentation.
105
- date_format (str | None, optional): Format for datetime objects.
106
- - 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
107
- - 'epoch': Milliseconds since epoch.
108
- - None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
109
- csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
110
- excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
111
- - For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
112
- - For ODS: 'odf' (requires 'odfpy' library).
113
- If None, pandas auto-detects based on file extension and installed libraries.
114
-
115
- Returns:
116
- bool: True if conversion was successful, False otherwise.
117
- """
118
- input_filepath = self.input_filepath
119
- json_filepath = self.json_filepath
120
-
121
- try:
122
-
123
- if not input_filepath.exists():
124
- print(f"Error: Input file not found at {input_filepath}")
125
- return False
87
+ """
88
+ Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
89
+ into an equivalent JSON format.
90
+
91
+ Args:
92
+ sheet_to_convert (str | int | None, optional):
93
+ - For Excel/ODS:
94
+ - If None (default): Converts all sheets. The JSON output will be a
95
+ dictionary where keys are sheet names and values are the JSON
96
+ representation of each sheet.
97
+ - If str: Name of the specific sheet to convert.
98
+ - If int: Index of the specific sheet to convert (0-based).
99
+ If a specific sheet is requested, the JSON output will directly be
100
+ the representation of that sheet.
101
+ - For CSV: This parameter is ignored. The entire CSV is processed.
102
+ orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
103
+ Default: 'records'. See pandas.DataFrame.to_dict() documentation.
104
+ date_format (str | None, optional): Format for datetime objects.
105
+ - 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
106
+ - 'epoch': Milliseconds since epoch.
107
+ - None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
108
+ csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
109
+ excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
110
+ - For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
111
+ - For ODS: 'odf' (requires 'odfpy' library).
112
+ If None, pandas auto-detects based on file extension and installed libraries.
113
+
114
+ Returns:
115
+ bool: True if conversion was successful, False otherwise.
116
+ """
117
+
118
+ input_filepath = self.input_filepath
119
+ json_filepath = self.json_filepath
120
+
121
+ try:
122
+
123
+ if not input_filepath.exists():
124
+ print(f"Error: Input file not found at {input_filepath}")
125
+ return False
126
126
 
127
- # Ensure output directory exists
128
- json_filepath.parent.mkdir(parents=True, exist_ok=True)
127
+ # Ensure output directory exists
128
+ json_filepath.parent.mkdir(parents=True, exist_ok=True)
129
129
 
130
- file_suffix = input_filepath.suffix.lower()
131
- output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
130
+ file_suffix = input_filepath.suffix.lower()
131
+ output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
132
132
 
133
- dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
133
+ dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
134
134
 
135
- current_engine: PandasReadEngineType = excel_ods_engine
135
+ current_engine: PandasReadEngineType = excel_ods_engine
136
136
 
137
- if file_suffix == '.csv':
138
- if sheet_to_convert is not None:
139
- print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
140
- try:
141
- df = pd.read_csv(input_filepath, encoding=csv_encoding)
142
- dataframes_to_process.append((df, None))
143
- except Exception as e:
144
- print(f"Error reading CSV file '{input_filepath.name}': {e}")
145
- return False
146
-
147
- elif file_suffix in ['.xls', '.xlsx', '.ods']:
148
- try:
149
- if file_suffix == '.ods':
150
- if current_engine is None:
151
- current_engine = 'odf'
152
- elif current_engine != 'odf':
153
- print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
154
- current_engine = 'odf'
155
-
156
- if sheet_to_convert is not None:
157
- df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
158
- dataframes_to_process.append((df, None))
159
-
160
- else:
161
- excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
162
- if not excel_file.sheet_names:
163
- print(f"Warning: File '{input_filepath.name}' contains no sheets.")
164
- for sheet_name in excel_file.sheet_names:
165
- df = excel_file.parse(sheet_name) # engine is inherited
166
- dataframes_to_process.append((df, sheet_name))
167
- except ImportError as ie:
168
- if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
169
- print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
170
- elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
171
- print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
172
- elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
173
- print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
174
- else:
175
- print(f"ImportError reading file '{input_filepath.name}': {ie}")
176
- return False
177
- except Exception as e:
178
- print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
179
- return False
180
- else:
181
- print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
137
+ if file_suffix == '.csv':
138
+ if sheet_to_convert is not None:
139
+ print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
140
+ try:
141
+ df = pd.read_csv(input_filepath, encoding=csv_encoding)
142
+ dataframes_to_process.append((df, None))
143
+ except Exception as e:
144
+ print(f"Error reading CSV file '{input_filepath.name}': {e}")
182
145
  return False
183
146
 
184
- if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
185
- print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
186
- elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
187
- pass
147
+ elif file_suffix in ['.xls', '.xlsx', '.ods']:
148
+ try:
149
+ if file_suffix == '.ods':
150
+ if current_engine is None:
151
+ current_engine = 'odf'
152
+ elif current_engine != 'odf':
153
+ print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
154
+ current_engine = 'odf'
188
155
 
156
+ if sheet_to_convert is not None:
157
+ df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
158
+ dataframes_to_process.append((df, None))
189
159
 
190
- is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
191
- temp_processed_data: Dict[str, Any] = {}
192
-
193
- for df_original, name_key in dataframes_to_process:
194
- df = df_original.copy()
195
-
196
- if date_format:
197
- for col_name in df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns:
198
- try:
199
- if date_format == 'iso':
200
- df[col_name] = df[col_name].apply(lambda x: x.isoformat() if pd.notnull(x) and hasattr(x, 'isoformat') else None)
201
- elif date_format == 'epoch':
202
- df[col_name] = df[col_name].apply(lambda x: int(x.timestamp() * 1000) if pd.notnull(x) and hasattr(x, 'timestamp') else None)
203
- except Exception as e_date:
204
- print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}. Problematic values might be None.")
205
-
206
- df = df.astype(object).where(pd.notnull(df), None)
207
- current_json_segment = df.to_dict(orient=orient)
208
-
209
- if is_direct_output:
210
- output_data_final = current_json_segment
211
- break
212
160
  else:
213
- if name_key is not None:
214
- temp_processed_data[name_key] = current_json_segment
215
-
216
- if not is_direct_output:
217
- output_data_final = temp_processed_data
218
-
219
- with open(json_filepath, 'w', encoding='utf-8') as f:
220
- json.dump(output_data_final, f, indent=4, ensure_ascii=False)
161
+ excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
162
+ if not excel_file.sheet_names:
163
+ print(f"Warning: File '{input_filepath.name}' contains no sheets.")
164
+ for sheet_name in excel_file.sheet_names:
165
+ df = excel_file.parse(sheet_name) # engine is inherited
166
+ dataframes_to_process.append((df, sheet_name))
167
+ except ImportError as ie:
168
+ if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
169
+ print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
170
+ elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
171
+ print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
172
+ elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
173
+ print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
174
+ else:
175
+ print(f"ImportError reading file '{input_filepath.name}': {ie}")
176
+ return False
177
+ except Exception as e:
178
+ print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
179
+ return False
180
+ else:
181
+ print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
182
+ return False
221
183
 
222
- print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
223
- return True
184
+ if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
185
+ print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
186
+ elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
187
+ pass
188
+
189
+ is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
190
+ temp_processed_data: Dict[str, Any] = {}
191
+
192
+ for df_original, name_key in dataframes_to_process:
193
+ df = df_original.copy()
194
+
195
+ # Handle datetime columns with improved detection and conversion
196
+ if date_format:
197
+ # Check for datetime columns using multiple approaches
198
+ datetime_columns = []
199
+
200
+ # Method 1: Use pandas dtype detection
201
+ datetime_columns.extend(df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns.tolist())
202
+
203
+ # Method 2: Check for datetime objects in each column
204
+ for col in df.columns:
205
+ if col not in datetime_columns:
206
+ # Sample a few non-null values to check type
207
+ sample_values = df[col].dropna().head(10)
208
+ if len(sample_values) > 0:
209
+ for val in sample_values:
210
+ if isinstance(val, (datetime, date, pd.Timestamp)):
211
+ datetime_columns.append(col)
212
+ break
213
+
214
+ # Convert datetime columns
215
+ for col_name in datetime_columns:
216
+ try:
217
+ if date_format == 'iso':
218
+ df[col_name] = df[col_name].apply(lambda x: self._convert_to_iso(x))
219
+ elif date_format == 'epoch':
220
+ df[col_name] = df[col_name].apply(lambda x: self._convert_to_epoch(x))
221
+ except Exception as e_date:
222
+ print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}")
223
+
224
+ # Replace NaN values with None for JSON compatibility
225
+ df = df.astype(object).where(pd.notnull(df), None)
226
+
227
+ # Final safety check: convert any remaining datetime objects
228
+ for col in df.columns:
229
+ df[col] = df[col].apply(lambda x: self._safe_datetime_convert(x, date_format))
230
+
231
+ current_json_segment = df.to_dict(orient=orient)
232
+
233
+ if is_direct_output:
234
+ output_data_final = current_json_segment
235
+ break
236
+ else:
237
+ if name_key is not None:
238
+ temp_processed_data[name_key] = current_json_segment
239
+
240
+ if not is_direct_output:
241
+ output_data_final = temp_processed_data
242
+
243
+ with open(json_filepath, 'w', encoding='utf-8') as f:
244
+ json.dump(output_data_final, f, indent=4, ensure_ascii=False)
245
+
246
+ print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
247
+ return True
248
+
249
+ except FileNotFoundError:
250
+ print(f"Error: Input file not found at {input_filepath.name}")
251
+ return False
252
+ except ValueError as ve:
253
+ print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
254
+ return False
255
+ except Exception as e:
256
+ print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
257
+ return False
258
+
259
+ def _convert_to_iso(self, value):
260
+ """Convert datetime-like objects to ISO format string."""
261
+ if pd.isnull(value) or value is None:
262
+ return None
263
+
264
+ try:
265
+ if isinstance(value, str):
266
+ return value # Already a string
267
+ elif hasattr(value, 'isoformat'):
268
+ return value.isoformat()
269
+ elif isinstance(value, pd.Timestamp):
270
+ return value.isoformat()
271
+ else:
272
+ return str(value)
273
+ except:
274
+ return str(value) if value is not None else None
275
+
276
+ def _convert_to_epoch(self, value):
277
+ """Convert datetime-like objects to epoch milliseconds."""
278
+ if pd.isnull(value) or value is None:
279
+ return None
280
+
281
+ try:
282
+ if isinstance(value, (int, float)):
283
+ return int(value) # Assume already epoch
284
+ elif hasattr(value, 'timestamp'):
285
+ return int(value.timestamp() * 1000)
286
+ elif isinstance(value, pd.Timestamp):
287
+ return int(value.timestamp() * 1000)
288
+ else:
289
+ return str(value)
290
+ except:
291
+ return str(value) if value is not None else None
292
+
293
+ def _safe_datetime_convert(self, value, date_format):
294
+ """Final safety conversion for any remaining datetime objects."""
295
+ if pd.isnull(value) or value is None:
296
+ return None
297
+
298
+ # If it's a datetime-like object, convert it
299
+ if isinstance(value, (datetime, date, pd.Timestamp)):
300
+ if date_format == 'iso':
301
+ return self._convert_to_iso(value)
302
+ elif date_format == 'epoch':
303
+ return self._convert_to_epoch(value)
304
+ else:
305
+ return str(value)
224
306
 
225
- except FileNotFoundError:
226
- print(f"Error: Input file not found at {input_filepath.name}")
227
- return False
228
- except ValueError as ve:
229
- print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
230
- return False
231
- except Exception as e:
232
- print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
233
- return False
307
+ return value
234
308
 
235
309
 
236
310
  def chunk_text(text, chunk_size=500, overlap_size=50):
@@ -257,4 +331,34 @@ def chunk_text(text, chunk_size=500, overlap_size=50):
257
331
  return chunks
258
332
 
259
333
  if __name__ == '__main__':
260
- print("do you want a rice bag?")
334
+ print("Test Run Start:")
335
+ try:
336
+ # print("Test 1: scaned pdf page, bytes")
337
+ # with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
338
+ # conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
339
+ # print(conv.output)
340
+
341
+ # print("Test 2: JD pdf, bytes")
342
+ # with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
343
+ # conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
344
+ # print(conv.output)
345
+
346
+ # print("Test 3: excel schedule, bytes")
347
+ # with open("/home/cyto/Downloads/Assignment schedule.xlsx", "rb") as imgpdf:
348
+ # conv= Convertor(file_bytes= imgpdf.read(), suffix= ".xlsx", file_type= "excel")
349
+ # print(conv.output)
350
+
351
+ # without bytes example:
352
+ print("Test 4: scanned pdf, path")
353
+ conv= Convertor(myfile= Path('/home/cyto/Documents/scanned.pdf'), output_dir= Path('/home/cyto/Documents'))
354
+ print(conv.output)
355
+
356
+ # print("Test 5: schedule excel, path")
357
+ # conv= Convertor(myfile= Path('/home/cyto/Downloads/Assignment schedule.xlsx'), output_dir= Path('/home/cyto/Downloads'))
358
+ # print(conv.output)
359
+ except FileNotFoundError as fe:
360
+ print("file not found, modify the driver code to get sample files to test:\n\n", fe)
361
+ except Exception as e:
362
+ print("unhandled: ", e)
363
+
364
+ print("Test Run End.")
pembot/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """
2
2
  A Python Package to convert PEM blog content to usseful information by leveraging LLMs
3
3
  """
4
- __version__ = '0.0.3'
4
+ __version__ = '0.0.4'
5
5
  from .main import save_to_json_file, make_query
6
6
  __all__ = ["save_to_json_file", "make_query"]
pembot/config/config.yaml CHANGED
@@ -2,4 +2,4 @@ OUTPUT_DIR: /home/cyto/dev/pembotdir
2
2
  PAGE_DELIMITER: ___________________________ NEXT PAGE ___________________________
3
3
  app:
4
4
  name: pembot
5
- version: 0.0.3
5
+ version: 0.0.4
pembot/main.py CHANGED
@@ -10,7 +10,11 @@ from pembot.query import rag_query_llm, remove_bs
10
10
  import os
11
11
  import json
12
12
  from pembot.utils.string_tools import make_it_an_id
13
- from schema.structure import required_fields
13
+ import pickle
14
+ from sys import argv
15
+
16
+ required_fields_path= ""
17
+ required_fields= None
14
18
 
15
19
 
16
20
  def make_query(required_fields: list[tuple[str, str, str, str]]):
@@ -67,8 +71,8 @@ def save_to_json_file(llm_output: str, filepath: Path):
67
71
  except Exception as e:
68
72
  print(f"An unexpected error occurred in save_to_json_file: {e}")
69
73
 
70
- def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
71
- # give required output fields
74
+ def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
75
+ # give required output fields
72
76
  # take the documents
73
77
  # convert to text
74
78
  # upload to chromadb
@@ -80,7 +84,7 @@ def make_document_summarization_and_embeddings(db_client, llm_client, inference_
80
84
  expected_json= text_out_dir / 'json' / (file_root + '.json')
81
85
  document_id= make_it_an_id(file_root)
82
86
 
83
- if docfile.is_file and not (expected_json).exists():
87
+ if docfile.is_file and not (expected_json).exists():
84
88
 
85
89
  expected_markdown= text_out_dir / (file_root + '.md')
86
90
  if not (expected_markdown).exists():
@@ -161,6 +165,23 @@ if __name__ == "__main__":
161
165
  # provider="Jina AI",
162
166
  # api_key= JINA_API_KEY,
163
167
  # )
168
+ #
169
+
170
+ try:
171
+ if len(argv) > 1:
172
+ print(f"First argument: {argv[1]}")
173
+ required_fields_path= argv[1]
174
+ with open(required_fields_path, "rb") as rf:
175
+ required_fields= pickle.load(rf)
176
+ except Exception as e:
177
+ print("error while getting required_fields pickle. Please pickle it and put it in project directory to continue\n", e)
178
+
179
+ if required_fields is None:
180
+ print("couldnt load required fields. please provide path to pickle in command line argument")
181
+ exit()
182
+ else:
183
+ print(required_fields)
184
+
164
185
 
165
186
  inference_client= InferenceClient(
166
187
  provider="hf-inference",
@@ -178,7 +199,7 @@ if __name__ == "__main__":
178
199
  llm_provider_name: PROVIDER_T="nebius"
179
200
 
180
201
  # nerfed, but provided by hf serverless inference: BAAI/bge-small-en-v1.5
181
- # Worth mentioning:
202
+ # Worth mentioning:
182
203
  # jinaai/jina-embeddings-v3
183
204
  # BAAI/bge-base-en-v1.5
184
205
  # nomic-ai/nomic-embed-text-v1.5
@@ -203,6 +224,3 @@ if __name__ == "__main__":
203
224
 
204
225
  docs_collection= database["summary_docs"]
205
226
  upload_summaries(process_output_dir / 'json', docs_collection)
206
-
207
-
208
-
@@ -1,11 +1,10 @@
1
- import fitz # PyMuPDF
1
+ import fitz
2
2
  import pdfplumber
3
3
  import re
4
4
  import yaml
5
5
  # import pytesseract
6
6
  import numpy as np
7
- from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
8
- # VisionEncoderDecoderModel, ViTImageProcessor,
7
+ from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
9
8
  from typing import Literal, final
10
9
  import torch
11
10
  from PIL import Image
@@ -16,22 +15,11 @@ import warnings
16
15
  from pathlib import Path
17
16
  from abc import ABC, abstractmethod
18
17
  import argparse
19
- from PIL import Image
20
18
  import io
21
- from PIL import Image
22
-
23
- model_path = "nanonets/Nanonets-OCR-s"
24
-
25
- model = AutoModelForImageTextToText.from_pretrained(
26
- model_path,
27
- torch_dtype="auto",
28
- device_map="auto",
29
- attn_implementation="flash_attention_2"
30
- )
31
- model.eval()
19
+ from google import genai
20
+ from google.genai import types
21
+ import mimetypes
32
22
 
33
- tokenizer = AutoTokenizer.from_pretrained(model_path)
34
- processor = AutoProcessor.from_pretrained(model_path)
35
23
 
36
24
 
37
25
  warnings.filterwarnings("ignore")
@@ -74,9 +62,31 @@ class MarkdownPDFExtractor(PDFExtractor):
74
62
 
75
63
  BULLET_POINTS = "•◦▪▫●○"
76
64
 
77
- def __init__(self, pdf_path, output_path= config["OUTPUT_DIR"], page_delimiter= config["PAGE_DELIMITER"]):
65
+ def __init__(self, pdf_path, output_path= config["OUTPUT_DIR"], page_delimiter= config["PAGE_DELIMITER"], model_name: str | None= None):
78
66
  super().__init__(pdf_path)
79
67
 
68
+ if model_name is None:
69
+ self.MODEL_NAME= "gemini-2.5-flash"
70
+ else:
71
+ self.MODEL_NAME= model_name
72
+
73
+ if "gemini" in self.MODEL_NAME:
74
+ self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
75
+ else:
76
+ model_path = "nanonets/Nanonets-OCR-s"
77
+ self.model = AutoModelForImageTextToText.from_pretrained(
78
+ model_path,
79
+ torch_dtype="auto",
80
+ device_map="auto",
81
+ attn_implementation="flash_attention_2"
82
+ )
83
+ self.model.eval()
84
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path)
85
+ self.processor = AutoProcessor.from_pretrained(model_path)
86
+ self.setup_image_captioning()
87
+
88
+
89
+
80
90
  self.markdown_content= ""
81
91
  self.pdf_filename = Path(pdf_path).stem
82
92
  self.output_path= output_path
@@ -87,26 +97,26 @@ class MarkdownPDFExtractor(PDFExtractor):
87
97
  self.page_delimiter= page_delimiter
88
98
  Path(output_path).mkdir(parents=True, exist_ok=True)
89
99
 
90
- # self.setup_image_captioning()
91
-
92
- # def setup_image_captioning(self):
93
- # """Set up the image captioning model."""
94
- # try:
95
- # self.model = VisionEncoderDecoderModel.from_pretrained(
96
- # "nlpconnect/vit-gpt2-image-captioning"
97
- # )
98
- # self.feature_extractor = ViTImageProcessor.from_pretrained(
99
- # "nlpconnect/vit-gpt2-image-captioning"
100
- # )
101
- # self.tokenizer = AutoTokenizer.from_pretrained(
102
- # "nlpconnect/vit-gpt2-image-captioning"
103
- # )
104
- # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
105
- # self.model.to(self.device)
106
- # self.logger.info("Image captioning model set up successfully.")
107
- # except Exception as e:
108
- # self.logger.error(f"Error setting up image captioning model: {e}")
109
- # self.logger.exception(traceback.format_exc())
100
+
101
+
102
+ def setup_image_captioning(self):
103
+ """Set up the image captioning model."""
104
+ try:
105
+ self.model = VisionEncoderDecoderModel.from_pretrained(
106
+ "nlpconnect/vit-gpt2-image-captioning"
107
+ )
108
+ self.feature_extractor = ViTImageProcessor.from_pretrained(
109
+ "nlpconnect/vit-gpt2-image-captioning"
110
+ )
111
+ self.tokenizer = AutoTokenizer.from_pretrained(
112
+ "nlpconnect/vit-gpt2-image-captioning"
113
+ )
114
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
115
+ self.model.to(self.device)
116
+ self.logger.info("Image captioning model set up successfully.")
117
+ except Exception as e:
118
+ self.logger.error(f"Error setting up image captioning model: {e}")
119
+ self.logger.exception(traceback.format_exc())
110
120
 
111
121
  def extract(self):
112
122
  try:
@@ -123,282 +133,197 @@ class MarkdownPDFExtractor(PDFExtractor):
123
133
  self.logger.exception(traceback.format_exc())
124
134
  return "", []
125
135
 
126
- def extract_markdown_by_blocks(self):
127
- """Main method to extract markdown from PDF."""
128
- try:
129
- doc = fitz.open(self.pdf_path)
130
- markdown_content = ""
131
- markdown_pages = []
132
- tables = self.extract_tables()
133
- table_index = 0
134
- list_counter = 0
135
- in_code_block = False
136
- code_block_content = ""
137
- code_block_lang = None
138
- prev_line = ""
139
-
140
- for page_num, page in enumerate(doc):
141
- self.logger.info(f"Processing page {page_num + 1}")
142
- page_content = ""
143
- blocks = page.get_text("dict")["blocks"]
144
- page_height = page.rect.height
145
- links = self.extract_links(page)
146
-
147
- if len(page.get_images()) > 0 and len(page.get_images()) <= 128:
148
- for block in blocks:
149
- if block["type"] == 0: # Text
150
- page_content += self.process_text_block(
151
- block,
152
- page_height,
153
- links,
154
- list_counter,
155
- in_code_block,
156
- code_block_content,
157
- code_block_lang,
158
- prev_line,
159
- )
160
- elif block["type"] == 1: # Image
161
- page_content += self.process_image_block(page, block)
162
-
163
- else:
164
- for block in blocks:
165
- if block["type"] == 0: # Text
166
- page_content += self.process_text_block(
167
- block,
168
- page_height,
169
- links,
170
- list_counter,
171
- in_code_block,
172
- code_block_content,
173
- code_block_lang,
174
- prev_line,
175
- )
176
-
177
- # Insert tables at their approximate positions
178
- while (
179
- table_index < len(tables)
180
- and tables[table_index]["page"] == page.number
181
- ):
182
- page_content += (
183
- "\n\n"
184
- + self.table_to_markdown(tables[table_index]["content"])
185
- + "\n\n"
186
- )
187
- table_index += 1
188
-
189
- markdown_pages.append(self.post_process_markdown(page_content))
190
- markdown_content += page_content + config["PAGE_DELIMITER"]
191
136
 
192
- markdown_content = self.post_process_markdown(markdown_content)
193
- return markdown_content, markdown_pages
194
- except Exception as e:
195
- self.logger.error(f"Error extracting markdown: {e}")
196
- self.logger.exception(traceback.format_exc())
197
- return "", []
198
-
199
-
200
- def ocr_page_with_nanonets_s(self, pil_image, model, processor, max_new_tokens: int | None = None):
137
def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
    """Run OCR on one image with the configured vision model.

    When ``self.MODEL_NAME`` contains ``"gemini"`` the raw image bytes are
    sent to the Gemini client (``self.gclient``); otherwise the locally
    loaded ``self.model`` / ``self.processor`` pair generates the text.

    Args:
        pil_image: The decoded PIL image. Used for its ``format`` in the
            Gemini branch and as the model input in the local branch.
        img_bytes: An ``io.BytesIO``-like object holding the encoded image;
            only read (via ``getvalue()``) in the Gemini branch.
        max_new_tokens: Generation budget for the local model; defaults to
            4096 when ``None``. It is not forwarded to Gemini.

    Returns:
        str: The extracted text, or ``""`` when Gemini returns no text.
    """
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    if max_new_tokens is None:
        max_new_tokens = 4096

    if 'gemini' in self.MODEL_NAME:
        # PIL leaves .format as None for images that were not decoded from
        # a file/stream (e.g. after convert()), so guard before .lower().
        # Falling back to PNG is a best-effort default -- callers in this
        # file pass PNG-encoded page renders.
        image_format = pil_image.format or "png"
        mime_type, _ = mimetypes.guess_type(f"dummy.{image_format.lower()}")
        if mime_type is None:
            mime_type = "image/png"

        response = self.gclient.models.generate_content(
            model=self.MODEL_NAME,
            contents=[
                types.Part.from_bytes(
                    data=img_bytes.getvalue(),
                    mime_type=mime_type,
                ),
                prompt,
            ],
        )
        # response.text is None when the reply carries no text part;
        # callers immediately call .strip() on the result.
        return response.text or ""

    image = pil_image
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(self.model.device)

    output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # Drop the echoed prompt tokens so only newly generated tokens are decoded.
    generated_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
    ]

    output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]
223
177
 
224
178
 
225
179
 
226
180
def extract_markdown(self):
    """Convert the PDF at ``self.pdf_path`` to markdown, page by page.

    For every page the text blocks are converted with
    ``process_text_block`` and embedded image blocks are OCR'd. Tables
    found by ``extract_tables`` for that page are appended afterwards.
    If the page had no text block and fewer than 100 characters were
    collected, the whole page is rendered at 300 DPI and OCR'd instead.

    Returns:
        tuple: ``(document_markdown, pages)`` where ``document_markdown``
        is every post-processed page followed by ``self.page_delimiter``,
        concatenated, and ``pages`` is the list of post-processed page
        strings. ``("", [])`` is returned when any error occurs.
    """
    all_pages_markdown = []
    full_document_markdown = []
    doc = None

    try:
        doc = fitz.open(self.pdf_path)
        self.logger.info(f"Opened PDF: {self.pdf_path}")

        tables = self.extract_tables()
        table_index = 0

        # Formatting state handed to process_text_block. These are immutable
        # values that are never reassigned here, so every call receives the
        # same initial state.
        list_counter = 0
        in_code_block = False
        code_block_content = ""
        code_block_lang = None
        prev_line = ""

        for page_num, page in enumerate(doc):
            current_page_markdown_blocks = []
            page_has_searchable_text = False

            self.logger.info(f"\nProcessing page {page_num + 1}...")

            blocks = page.get_text('dict')['blocks']
            page_height = page.rect.height
            links = self.extract_links(page)

            # Phase 1: text blocks and embedded image blocks.
            for block_num, block in enumerate(blocks):
                if block['type'] == 0:  # Text block
                    page_has_searchable_text = True
                    processed_text = self.process_text_block(
                        block,
                        page_height,
                        links,
                        list_counter,
                        in_code_block,
                        code_block_content,
                        code_block_lang,
                        prev_line,
                    )
                    if processed_text.strip():
                        current_page_markdown_blocks.append(processed_text)

                elif block['type'] == 1:  # Image block
                    self.logger.info(f" Found embedded image block (Page {page_num+1}, Block {block_num+1})")
                    img_data = block['image']

                    try:
                        image_bytes = io.BytesIO(img_data)
                        pil_image = Image.open(image_bytes)
                        # Guard against a None result so .strip() cannot fail.
                        ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
                            pil_image, image_bytes, max_new_tokens=15000
                        ) or ""

                        if ocr_text_from_block_image.strip():
                            self.logger.info(" OCR found text in embedded image block.")
                            current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_block_image.strip()}\n\n")
                        else:
                            self.logger.info(" No OCR text from embedded image block. Adding generic placeholder.")
                            # f-string so the page/block numbers are actually interpolated.
                            current_page_markdown_blocks.append(
                                f"\n\n![Image Placeholder](image_on_page_{page_num+1}_block_{block_num+1}.png)\n\n"
                            )
                    except Exception as e:
                        # Best effort: one bad image must not abort the page.
                        self.logger.error(f" Error processing embedded image block for OCR: {e}")
                        current_page_markdown_blocks.append(
                            f"\n\n![Image Processing Error](error_on_page_{page_num+1}_block_{block_num+1}.png)\n\n"
                        )

            # Tables are appended after the page's blocks, not at their
            # exact position inside the page.
            while (
                table_index < len(tables)
                and tables[table_index]["page"] == page.number
            ):
                current_page_markdown_blocks.append(
                    self.table_to_markdown(tables[table_index]["content"])
                )
                table_index += 1

            # Phase 2: full-page OCR only when the page had no text block
            # and almost nothing was collected above.
            combined_current_page_text_length = len("".join(current_page_markdown_blocks).strip())

            if not page_has_searchable_text and combined_current_page_text_length < 100:
                self.logger.info(f" Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
                try:
                    # 300/72 scales the default 72 DPI render up to 300 DPI.
                    pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
                    img_bytes = pix.tobytes("png")
                    image_bytestream = io.BytesIO(img_bytes)
                    pil_image = Image.open(image_bytestream)

                    ocr_text_from_page = self.ocr_page_with_nanonets_s(
                        pil_image, image_bytestream, max_new_tokens=15000
                    ) or ""

                    if ocr_text_from_page.strip():
                        self.logger.info(f" Successfully extracted text via full-page OCR for page {page_num + 1}.")
                        if combined_current_page_text_length < 50:
                            # Practically nothing was found before: replace
                            # rather than duplicate content.
                            current_page_markdown_blocks = [ocr_text_from_page.strip()]
                        else:
                            current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_page.strip()}\n\n")
                    else:
                        self.logger.info(f" Full-page OCR yielded no text for page {page_num+1}.")
                except Exception as e:
                    self.logger.error(f" Error during full-page OCR on page {page_num+1}: {e}")
            else:
                self.logger.info(f" Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")

            final_page_markdown = "\n".join(filter(None, current_page_markdown_blocks)).strip()
            # Post-process once and reuse the result for both outputs.
            processed_page_markdown = self.post_process_markdown(final_page_markdown)
            all_pages_markdown.append(processed_page_markdown)
            full_document_markdown.append(processed_page_markdown)
            full_document_markdown.append(self.page_delimiter)

            self.logger.info(f" Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_markdown[:200]}...")
            print(f"\n--- Page {page_num+1} Done ---\n")
            print(final_page_markdown[:500])

        return "".join(full_document_markdown), all_pages_markdown

    except fitz.FileNotFoundError:
        self.logger.error(f"PDF file not found: {self.pdf_path}")
        return "", []
    except Exception as e:
        self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
        self.logger.exception(traceback.format_exc())
        return "", []
    finally:
        # Release the document on error paths too, not only on success.
        if doc is not None:
            doc.close()
402
327
 
403
328
  def extract_tables(self):
404
329
  """Extract tables from PDF using pdfplumber."""
@@ -449,13 +374,13 @@ class MarkdownPDFExtractor(PDFExtractor):
449
374
  self.logger.exception(traceback.format_exc())
450
375
  return ""
451
376
 
452
- def perform_ocr(self, image):
377
+ def perform_ocr(self, image, image_bytes):
453
378
  """Perform OCR on the given image."""
454
379
  try:
455
380
  # ocr_result = pytesseract.image_to_string(
456
381
  # image
457
382
  # )
458
- ocr_result= self.ocr_page_with_nanonets_s(image, model, processor, max_new_tokens=15000)
383
+ ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)
459
384
 
460
385
 
461
386
  return ocr_result.strip()
@@ -464,10 +389,10 @@ class MarkdownPDFExtractor(PDFExtractor):
464
389
  self.logger.exception(traceback.format_exc())
465
390
  return ""
466
391
 
467
- def caption_image(self, image):
392
+ def caption_image(self, image, image_bytes):
468
393
  """Generate a caption for the given image."""
469
394
  try:
470
- ocr_text = self.perform_ocr(image)
395
+ ocr_text = self.perform_ocr(image, image_bytes)
471
396
  if ocr_text:
472
397
  return ocr_text
473
398
 
@@ -475,19 +400,38 @@ class MarkdownPDFExtractor(PDFExtractor):
475
400
  if image.mode != "RGB":
476
401
  image = image.convert("RGB")
477
402
 
478
- # Ensure the image is in the correct shape
479
- image = np.array(image).transpose(2, 0, 1) # Convert to (C, H, W) format
403
+ image_format = image.format
404
+ dummy_filename = f"dummy.{image_format.lower()}"
405
+ mime_type, _ = mimetypes.guess_type(dummy_filename)
406
+
407
+ if "gemini" in self.MODEL_NAME:
408
+ response= self.gclient.models.generate_content(
409
+ model= self.MODEL_NAME,
410
+ contents=[
411
+ types.Part.from_bytes(
412
+ data=image_bytes.getvalue(),
413
+ mime_type= mime_type
414
+ ),
415
+ "Write a caption for this image"
416
+ ]
417
+ )
418
+ return response.text
419
+ else:
420
+ # Ensure the image is in the correct shape
421
+ image = np.array(image).transpose(2, 0, 1) # Convert to (C, H, W) format
480
422
 
481
- inputs = self.feature_extractor(images=image, return_tensors="pt").to(
482
- self.device
483
- )
484
- pixel_values = inputs.pixel_values
423
+ inputs = self.feature_extractor(images=image, return_tensors="pt").to(
424
+ self.device
425
+ )
426
+ pixel_values = inputs.pixel_values
485
427
 
486
- generated_ids = self.model.generate(pixel_values, max_length=30)
487
- generated_caption = self.tokenizer.batch_decode(
488
- generated_ids, skip_special_tokens=True
489
- )[0]
490
- return generated_caption.strip()
428
+ generated_ids = self.model.generate(pixel_values, max_length=30)
429
+
430
+ generated_ids = self.model.generate(pixel_values, max_length=30)
431
+ generated_caption = self.tokenizer.batch_decode(
432
+ generated_ids, skip_special_tokens=True
433
+ )[0]
434
+ return generated_caption.strip()
491
435
  except Exception as e:
492
436
  self.logger.error(f"Error captioning image: {e}")
493
437
  self.logger.exception(traceback.format_exc())
@@ -789,7 +733,11 @@ class MarkdownPDFExtractor(PDFExtractor):
789
733
  Path(self.output_path) / image_filename
790
734
  ) # Convert to Path object
791
735
  image.save(image_path, "PNG", optimize=True, quality=95)
792
- caption = self.caption_image(image)
736
+
737
+ img_byte_arr = io.BytesIO()
738
+ image.save(img_byte_arr)
739
+ caption = self.caption_image(image, img_byte_arr)
740
+
793
741
  if not caption:
794
742
  caption = (
795
743
  f"{self.pdf_filename}_image_{int(page.number)+1}_{block['number']}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pembot
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: A Python Package to convert PEM blog content to usseful information by leveraging LLMs
5
5
  Author-email: cyto <aryan_sidhwani@protonmail.com>
6
6
  License-Expression: MIT
@@ -1,8 +1,8 @@
1
1
  pembot/.gitignore,sha256=_7FTsZokJ_pzEyyPjOsGw5x5Xx3gUBFaafs7UlPsv9E,98
2
2
  pembot/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
3
- pembot/__init__.py,sha256=2Q6n9cB2MiCO_Fysy8FImzSyQPY4ZWcHfoUGrWP8y8M,211
3
+ pembot/__init__.py,sha256=gDyRo0zcHlKqOSZfoLopwIZ7zXGBXlogi4tihmIeC5o,211
4
4
  pembot/gartner.py,sha256=3ALknQ5mSXIimmwCa3JFDzB_EW2hHEcQO1T2odyBquk,5408
5
- pembot/main.py,sha256=qjfqM_vHRZsnvU7DtvR2dJNUy1B6oOyGJ0LAe1sIOu4,9052
5
+ pembot/main.py,sha256=lZLIV8XPonvNoY4LVS-5fct1y9URMXWoSGJUKMw3Yg8,9667
6
6
  pembot/output_structure_local.py,sha256=YfpHzfTNeLMSsB_CjAamha9D6Iz7E1IC-tW9xPCMWFc,3000
7
7
  pembot/pem.py,sha256=mv6iGcN1peSY7z2dtCQ_BKj31EFBNfczBhps_d-0XDo,6377
8
8
  pembot/query.py,sha256=EDbvQ4It1S8Zg4y-sUCLZjiyBmvQGUAMOVq9yPs_hdE,8242
@@ -79,17 +79,17 @@ pembot/.git/refs/heads/main,sha256=mQvXNyC6knHS2Siuig65zcGaLxzAR8zX_Zoqw2Oj6S0,4
79
79
  pembot/.git/refs/remotes/origin/HEAD,sha256=K7aiSqD8bEhBAPXVGim7rYQc0sdV9dk_qiBOXbtOsrQ,30
80
80
  pembot/.git/refs/remotes/origin/main,sha256=mQvXNyC6knHS2Siuig65zcGaLxzAR8zX_Zoqw2Oj6S0,41
81
81
  pembot/AnyToText/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- pembot/AnyToText/convertor.py,sha256=0_ogFWvZEl2x3t3W8-sHjmUUCJe2-Nuxjp_SjTb-mqc,13098
82
+ pembot/AnyToText/convertor.py,sha256=26Pq4OLhVNHgIhJdLLcxGPFTtdnG2lsQkR_53_zkZZM,16997
83
83
  pembot/TextEmbedder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
84
  pembot/TextEmbedder/gemini_embedder.py,sha256=P679-2mmQESlYKML1vcrwx_-CSgWJgIQk7NL4F7BLQE,677
85
85
  pembot/TextEmbedder/mongodb_embedder.py,sha256=pD8mP-uC_o0COPdOrCTMpoC5PdF8hXlqARHvTr2T-VI,9642
86
86
  pembot/TextEmbedder/mongodb_index_creator.py,sha256=ejpsF_y1zY6Z0nux02vjODiDPnxx-YA_xy2PmT94zZ4,5306
87
87
  pembot/TextEmbedder/vector_query.py,sha256=Kh1uhx9CatB-oQlQtnW-1I2Qz7MGHI20n2h_8peAChM,1986
88
- pembot/config/config.yaml,sha256=tM1q7yXsVKJTUDya1JMaclNkAbpTgmcn_7uWiLU-8V8,156
88
+ pembot/config/config.yaml,sha256=IJG9CGEPxH83Cr4X5OEZj9SLFT4ZfwINrA63GpjQssA,156
89
89
  pembot/pdf2markdown/LICENSE,sha256=1JTJhQjUYDqJzFJhNtitm7mHyE71PRHgetIqRRWg6Pk,1068
90
90
  pembot/pdf2markdown/README.md,sha256=jitM1pwI69oa0N4mXv5-SY1ka9Sz3jsRNCDdpW-50kY,4545
91
91
  pembot/pdf2markdown/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
- pembot/pdf2markdown/extract.py,sha256=4AtYbpsWfQw7w-oU-ue4IL_jdNC4zpA-OjPHHXC2Jdo,35648
92
+ pembot/pdf2markdown/extract.py,sha256=LV9cqkWWmQk-32hsfPdjzbzVVF0kYxzyH7-ssUALxpU,34956
93
93
  pembot/pdf2markdown/requirements.txt,sha256=0vZQzkSZKLNVUttd4euoDyYEy0nc2W3CIVxhepHW5Ho,76
94
94
  pembot/pdf2markdown/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
95
95
  pembot/pdf2markdown/.git/config,sha256=ltEWI476vFz2goGWD7QmCDvC6UCQ9ELviXuURlvte_w,269
@@ -123,7 +123,7 @@ pembot/pdf2markdown/config/config.yaml,sha256=w75W2Eg4-tu8rRk_23PqxWDh0010kRKLmP
123
123
  pembot/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
124
  pembot/utils/inference_client.py,sha256=jeURmY2P5heVlH1dCV0XSgiX3U2qYGEmrnUv0KFpdww,5380
125
125
  pembot/utils/string_tools.py,sha256=gtRa5rBR0Q7GspTu2WtCnvhJQLFjPfWLvhmyiPkyStU,1883
126
- pembot-0.0.3.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
127
- pembot-0.0.3.dist-info/WHEEL,sha256=Dyt6SBfaasWElUrURkknVFAZDHSTwxg3PaTza7RSbkY,100
128
- pembot-0.0.3.dist-info/METADATA,sha256=lhYkhIOKuUxiOYnD7Ms28SWjHMJQE6rfLAshsyQ_JBk,313
129
- pembot-0.0.3.dist-info/RECORD,,
126
+ pembot-0.0.4.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
127
+ pembot-0.0.4.dist-info/WHEEL,sha256=Dyt6SBfaasWElUrURkknVFAZDHSTwxg3PaTza7RSbkY,100
128
+ pembot-0.0.4.dist-info/METADATA,sha256=v9TTMdInby4h5K0aKMrT11NgpyBw0ZMpzrC9dgI0O14,313
129
+ pembot-0.0.4.dist-info/RECORD,,
File without changes