lexoid 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -4,13 +4,19 @@ import re
4
4
  import tempfile
5
5
  from concurrent.futures import ProcessPoolExecutor
6
6
  from enum import Enum
7
+ from functools import wraps
7
8
  from glob import glob
8
9
  from time import time
9
- from typing import Union, Dict, List
10
+ from typing import Optional, Union, Dict, List
10
11
 
11
12
  from loguru import logger
12
13
 
13
- from lexoid.core.parse_type.llm_parser import parse_llm_doc
14
+ from lexoid.core.parse_type.llm_parser import (
15
+ parse_llm_doc,
16
+ create_response,
17
+ convert_doc_to_base64_images,
18
+ get_api_provider_for_model,
19
+ )
14
20
  from lexoid.core.parse_type.static_parser import parse_static_doc
15
21
  from lexoid.core.utils import (
16
22
  convert_to_pdf,
@@ -31,6 +37,51 @@ class ParserType(Enum):
31
37
  AUTO = "AUTO"
32
38
 
33
39
 
40
def retry_with_different_parser_type(func):
    """
    Decorator that resolves ``ParserType.AUTO`` and falls back to the other parser on failure.

    The wrapped function is always invoked with keyword arguments only: the
    first positional argument is forwarded as ``path`` and the second as
    ``parser_type``. When the parser type is ``ParserType.AUTO`` it is replaced
    by the type chosen by ``router`` and ``routed=True`` is added to the
    keyword arguments passed to ``func``.

    If the call raises and the parser type was auto-routed, the call is retried
    once with the opposite parser type (LLM_PARSE <-> STATIC_PARSE) and
    ``routed=False``. Any other failure is logged and re-raised.

    Args:
        func: Callable accepting ``path`` and ``parser_type`` keyword arguments.

    Returns:
        The wrapping callable.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            # Normalise the two supported positional arguments into keywords so
            # the fallback below can re-invoke ``func`` with a different parser.
            if len(args) > 0:
                kwargs["path"] = args[0]
            if len(args) > 1:
                kwargs["parser_type"] = args[1]

            # Resolve AUTO regardless of whether parser_type arrived positionally
            # or by keyword; previously a keyword AUTO skipped routing entirely.
            if kwargs.get("parser_type") == ParserType.AUTO:
                router_priority = kwargs.get("router_priority", "speed")
                parser_type = ParserType[router(kwargs["path"], router_priority)]
                logger.debug(f"Auto-detected parser type: {parser_type}")
                kwargs["parser_type"] = parser_type
                kwargs["routed"] = True
            return func(**kwargs)
        except Exception as e:
            failed_type = kwargs.get("parser_type")
            was_routed = kwargs.get("routed", False)
            if failed_type == ParserType.LLM_PARSE and was_routed:
                logger.warning(
                    f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
                )
                kwargs["parser_type"] = ParserType.STATIC_PARSE
                # Mark as not routed so the retry is a single, final attempt.
                kwargs["routed"] = False
                return func(**kwargs)
            elif failed_type == ParserType.STATIC_PARSE and was_routed:
                logger.warning(
                    f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
                )
                kwargs["parser_type"] = ParserType.LLM_PARSE
                kwargs["routed"] = False
                return func(**kwargs)
            else:
                logger.error(
                    f"Parsing failed with error: {e}. No fallback parser available."
                )
                # Bare raise keeps the original traceback intact.
                raise

    return wrapper
82
+
83
+
84
+ @retry_with_different_parser_type
34
85
  def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
35
86
  """
36
87
  Parses a file using the specified parser type.
@@ -49,21 +100,20 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
49
100
  - parent_title: Title of parent doc if recursively parsed
50
101
  - recursive_docs: List of dictionaries for recursively parsed documents
51
102
  - token_usage: Dictionary containing token usage statistics
103
+ - parser_used: Which parser was actually used
52
104
  """
53
- if parser_type == ParserType.AUTO:
54
- router_priority = kwargs.get("router_priority", "speed")
55
- parser_type = ParserType[router(path, router_priority)]
56
- logger.debug(f"Auto-detected parser type: {parser_type}")
57
-
58
105
  kwargs["start"] = (
59
106
  int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
60
107
  )
61
108
  if parser_type == ParserType.STATIC_PARSE:
62
109
  logger.debug("Using static parser")
63
- return parse_static_doc(path, **kwargs)
110
+ result = parse_static_doc(path, **kwargs)
64
111
  else:
65
112
  logger.debug("Using LLM parser")
66
- return parse_llm_doc(path, **kwargs)
113
+ result = parse_llm_doc(path, **kwargs)
114
+
115
+ result["parser_used"] = parser_type
116
+ return result
67
117
 
68
118
 
69
119
  def parse_chunk_list(
@@ -82,15 +132,18 @@ def parse_chunk_list(
82
132
  """
83
133
  combined_segments = []
84
134
  raw_texts = []
85
- token_usage = {"input": 0, "output": 0, "image_count": 0}
135
+ token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
86
136
  for file_path in file_paths:
87
137
  result = parse_chunk(file_path, parser_type, **kwargs)
88
138
  combined_segments.extend(result["segments"])
89
139
  raw_texts.append(result["raw"])
90
- if "token_usage" in result:
140
+ if (
141
+ result.get("parser_used") == ParserType.LLM_PARSE
142
+ and "token_usage" in result
143
+ ):
91
144
  token_usage["input"] += result["token_usage"]["input"]
92
145
  token_usage["output"] += result["token_usage"]["output"]
93
- token_usage["image_count"] += len(result["segments"])
146
+ token_usage["llm_page_count"] += len(result["segments"])
94
147
  token_usage["total"] = token_usage["input"] + token_usage["output"]
95
148
 
96
149
  return {
@@ -136,7 +189,7 @@ def parse(
136
189
  as_pdf = kwargs.get("as_pdf", False)
137
190
  depth = kwargs.get("depth", 1)
138
191
 
139
- if type(parser_type) == str:
192
+ if type(parser_type) is str:
140
193
  parser_type = ParserType[parser_type]
141
194
  if (
142
195
  path.lower().endswith((".doc", ".docx"))
@@ -182,9 +235,9 @@ def parse(
182
235
  sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
183
236
  path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
184
237
 
185
- if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
238
+ if not path.lower().endswith(".pdf"):
186
239
  kwargs["split"] = False
187
- result = parse_chunk(path, parser_type, **kwargs)
240
+ result = parse_chunk_list([path], parser_type, kwargs)
188
241
  else:
189
242
  kwargs["split"] = True
190
243
  split_dir = os.path.join(temp_dir, "splits/")
@@ -219,42 +272,43 @@ def parse(
219
272
  "token_usage": {
220
273
  "input": sum(r["token_usage"]["input"] for r in chunk_results),
221
274
  "output": sum(r["token_usage"]["output"] for r in chunk_results),
222
- "image_count": sum(
223
- r["token_usage"]["image_count"] for r in chunk_results
275
+ "llm_page_count": sum(
276
+ r["token_usage"]["llm_page_count"] for r in chunk_results
224
277
  ),
225
278
  "total": sum(r["token_usage"]["total"] for r in chunk_results),
226
279
  },
227
280
  }
228
281
 
229
- if "api_cost_mapping" in kwargs:
230
- api_cost_mapping = kwargs["api_cost_mapping"]
231
- if isinstance(api_cost_mapping, dict):
232
- api_cost_mapping = api_cost_mapping
233
- elif isinstance(api_cost_mapping, str) and os.path.exists(
234
- api_cost_mapping
235
- ):
236
- with open(api_cost_mapping, "r") as f:
237
- api_cost_mapping = json.load(f)
238
- else:
239
- raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
282
+ if "api_cost_mapping" in kwargs and "token_usage" in result:
283
+ api_cost_mapping = kwargs["api_cost_mapping"]
284
+ if isinstance(api_cost_mapping, dict):
285
+ api_cost_mapping = api_cost_mapping
286
+ elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
287
+ with open(api_cost_mapping, "r") as f:
288
+ api_cost_mapping = json.load(f)
289
+ else:
290
+ raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
240
291
 
241
- api_cost = api_cost_mapping.get(
242
- kwargs.get("model", "gemini-2.0-flash"), None
292
+ api_cost = api_cost_mapping.get(
293
+ kwargs.get("model", "gemini-2.0-flash"), None
294
+ )
295
+ if api_cost:
296
+ token_usage = result["token_usage"]
297
+ token_cost = {
298
+ "input": token_usage["input"] * api_cost["input"] / 1_000_000,
299
+ "input-image": api_cost.get("input-image", 0)
300
+ * token_usage.get("llm_page_count", 0),
301
+ "output": token_usage["output"] * api_cost["output"] / 1_000_000,
302
+ }
303
+ token_cost["total"] = (
304
+ token_cost["input"]
305
+ + token_cost["input-image"]
306
+ + token_cost["output"]
243
307
  )
244
- if api_cost:
245
- token_usage = result["token_usage"]
246
- token_cost = {
247
- "input": token_usage["input"] * api_cost["input"] / 1_000_000
248
- + api_cost.get("input-image", 0) * token_usage["image_count"],
249
- "output": token_usage["output"]
250
- * api_cost["output"]
251
- / 1_000_000,
252
- }
253
- token_cost["total"] = token_cost["input"] + token_cost["output"]
254
- result["token_cost"] = token_cost
255
-
256
- if as_pdf:
257
- result["pdf_path"] = path
308
+ result["token_cost"] = token_cost
309
+
310
+ if as_pdf:
311
+ result["pdf_path"] = path
258
312
 
259
313
  if depth > 1:
260
314
  recursive_docs = []
@@ -285,3 +339,71 @@ def parse(
285
339
  result["recursive_docs"] = recursive_docs
286
340
 
287
341
  return result
342
+
343
+
344
def _extract_json_payload(text: str) -> str:
    """Return the JSON portion of an LLM reply, dropping a surrounding Markdown code fence."""
    if "```json" in text:
        # Content after the last ```json tag, up to the next closing fence.
        return text.split("```json")[-1].split("```")[0].strip()
    parts = text.split("```")
    payload = parts[0].strip()
    if not payload and len(parts) > 1:
        # Reply opened with an untagged fence: the JSON is the first fenced body.
        payload = parts[1].strip()
    return payload


def parse_with_schema(
    path: str,
    schema: Dict,
    api: Optional[str] = None,
    model: str = "gpt-4o-mini",
    **kwargs,
) -> List[List[Dict]]:
    """
    Parses a document page by page using an LLM to generate structured output conforming to a given JSON schema.

    Args:
        path (str): Path to the PDF (or image) file.
        schema (Dict): JSON schema to which the parsed output should conform.
        api (str, optional): LLM API provider (e.g. "openai", "huggingface", "together",
            "openrouter", "fireworks", "anthropic" or "gemini"). When omitted it is
            derived from ``model`` via ``get_api_provider_for_model``.
        model (str, optional): LLM model name.
        **kwargs: Additional arguments for the parser. Only ``temperature``
            (default 0.0) and ``max_tokens`` (default 1024) are read here.

    Returns:
        A list with one entry per page, in page order; each entry is the JSON
        value decoded from the model's reply for that page.

    Raises:
        json.JSONDecodeError: If a page's reply cannot be decoded as JSON.
    """
    if not api:
        api = get_api_provider_for_model(model)
        logger.debug(f"Using API provider: {api}")

    # The lines of this literal are part of the prompt text sent to the model.
    system_prompt = f"""
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {{
"properties": {{
"foo": {{
"title": "Foo",
"description": "a list of strings",
"type": "array",
"items": {{"type": "string"}}
}}
}},
"required": ["foo"]
}}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.

Here is the output schema:
{json.dumps(schema, indent=2)}

"""

    user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."

    responses = []
    images = convert_doc_to_base64_images(path)
    for page_num, image in images:
        resp_dict = create_response(
            api=api,
            model=model,
            user_prompt=user_prompt,
            system_prompt=system_prompt,
            image_url=image,
            temperature=kwargs.get("temperature", 0.0),
            max_tokens=kwargs.get("max_tokens", 1024),
        )

        response = _extract_json_payload(resp_dict.get("response", ""))
        logger.debug(f"Processing page {page_num + 1} with response: {response}")
        try:
            new_dict = json.loads(response)
        except json.JSONDecodeError:
            # Surface which page failed before propagating the decode error.
            logger.error(f"Page {page_num + 1} did not return valid JSON.")
            raise
        responses.append(new_dict)

    return responses
@@ -3,23 +3,25 @@ import io
3
3
  import mimetypes
4
4
  import os
5
5
  import time
6
+ from functools import wraps
7
+ from typing import Dict, List, Optional, Tuple
8
+
6
9
  import pypdfium2 as pdfium
7
10
  import requests
8
- from functools import wraps
11
+ from anthropic import Anthropic
12
+ from huggingface_hub import InferenceClient
13
+ from loguru import logger
14
+ from openai import OpenAI
9
15
  from requests.exceptions import HTTPError
10
- from typing import Dict, List
16
+ from together import Together
11
17
 
12
18
  from lexoid.core.prompt_templates import (
13
19
  INSTRUCTIONS_ADD_PG_BREAK,
20
+ LLAMA_PARSER_PROMPT,
14
21
  OPENAI_USER_PROMPT,
15
22
  PARSER_PROMPT,
16
- LLAMA_PARSER_PROMPT,
17
23
  )
18
24
  from lexoid.core.utils import convert_image_to_pdf
19
- from loguru import logger
20
- from openai import OpenAI
21
- from together import Together
22
- from huggingface_hub import InferenceClient
23
25
 
24
26
 
25
27
  def retry_on_http_error(func):
@@ -48,33 +50,41 @@ def retry_on_http_error(func):
48
50
  return wrapper
49
51
 
50
52
 
51
- @retry_on_http_error
52
- def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
53
- if "api_provider" in kwargs and kwargs["api_provider"]:
54
- return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
55
- if "model" not in kwargs:
56
- kwargs["model"] = "gemini-2.0-flash"
57
- model = kwargs.get("model")
53
+ def get_api_provider_for_model(model: str) -> str:
58
54
  if model.startswith("gemini"):
59
- return parse_with_gemini(path, **kwargs)
55
+ return "gemini"
60
56
  if model.startswith("gpt"):
61
- return parse_with_api(path, api="openai", **kwargs)
57
+ return "openai"
62
58
  if model.startswith("meta-llama"):
63
59
  if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
64
- return parse_with_api(path, api="together", **kwargs)
65
- return parse_with_api(path, api="huggingface", **kwargs)
60
+ return "together"
61
+ return "huggingface"
66
62
  if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
67
- return parse_with_api(path, api="openrouter", **kwargs)
63
+ return "openrouter"
64
+ if model.startswith("accounts/fireworks"):
65
+ return "fireworks"
66
+ if model.startswith("claude"):
67
+ return "anthropic"
68
68
  raise ValueError(f"Unsupported model: {model}")
69
69
 
70
70
 
71
- def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
72
- api_key = os.environ.get("GOOGLE_API_KEY")
73
- if not api_key:
74
- raise ValueError("GOOGLE_API_KEY environment variable is not set")
71
+ @retry_on_http_error
72
+ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
73
+ if "api_provider" in kwargs and kwargs["api_provider"]:
74
+ return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
75
75
 
76
- url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
76
+ model = kwargs.get("model", "gemini-2.0-flash")
77
+ kwargs["model"] = model
78
+
79
+ api_provider = get_api_provider_for_model(model)
77
80
 
81
+ if api_provider == "gemini":
82
+ return parse_with_gemini(path, **kwargs)
83
+ else:
84
+ return parse_with_api(path, api=api_provider, **kwargs)
85
+
86
+
87
+ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
78
88
  # Check if the file is an image and convert to PDF if necessary
79
89
  mime_type, _ = mimetypes.guess_type(path)
80
90
  if mime_type and mime_type.startswith("image"):
@@ -86,6 +96,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
86
96
  file_content = file.read()
87
97
  base64_file = base64.b64encode(file_content).decode("utf-8")
88
98
 
99
+ return parse_image_with_gemini(
100
+ base64_file=base64_file, mime_type=mime_type, **kwargs
101
+ )
102
+
103
+
104
+ def parse_image_with_gemini(
105
+ base64_file: str, mime_type: str = "image/png", **kwargs
106
+ ) -> List[Dict] | str:
107
+ api_key = os.environ.get("GOOGLE_API_KEY")
108
+ if not api_key:
109
+ raise ValueError("GOOGLE_API_KEY environment variable is not set")
110
+
111
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
112
+
89
113
  if "system_prompt" in kwargs:
90
114
  prompt = kwargs["system_prompt"]
91
115
  else:
@@ -105,7 +129,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
105
129
  }
106
130
  ],
107
131
  "generationConfig": {
108
- "temperature": kwargs.get("temperature", 0.7),
132
+ "temperature": kwargs.get("temperature", 0),
109
133
  },
110
134
  }
111
135
 
@@ -125,24 +149,23 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
125
149
  if "text" in part
126
150
  )
127
151
 
128
- combined_text = ""
152
+ combined_text = raw_text
129
153
  if "<output>" in raw_text:
130
- combined_text = raw_text.split("<output>")[1].strip()
131
- if "</output>" in result:
132
- combined_text = result.split("</output>")[0].strip()
154
+ combined_text = raw_text.split("<output>")[-1].strip()
155
+ if "</output>" in combined_text:
156
+ combined_text = combined_text.split("</output>")[0].strip()
133
157
 
134
158
  token_usage = result["usageMetadata"]
135
159
  input_tokens = token_usage.get("promptTokenCount", 0)
136
160
  output_tokens = token_usage.get("candidatesTokenCount", 0)
137
161
  total_tokens = input_tokens + output_tokens
138
-
139
162
  return {
140
163
  "raw": combined_text.replace("<page-break>", "\n\n"),
141
164
  "segments": [
142
165
  {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
143
166
  for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
144
167
  ],
145
- "title": kwargs["title"],
168
+ "title": kwargs.get("title", ""),
146
169
  "url": kwargs.get("url", ""),
147
170
  "parent_title": kwargs.get("parent_title", ""),
148
171
  "recursive_docs": [],
@@ -169,18 +192,54 @@ def convert_pdf_page_to_base64(
169
192
  return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
170
193
 
171
194
 
172
- def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
173
- """
174
- Parse documents (PDFs or images) using various vision model APIs.
195
+ def get_messages(
196
+ system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
197
+ ) -> List[Dict]:
198
+ messages = []
199
+ if system_prompt:
200
+ messages.append(
201
+ {
202
+ "role": "system",
203
+ "content": system_prompt,
204
+ }
205
+ )
206
+ base_message = (
207
+ [
208
+ {"type": "text", "text": user_prompt},
209
+ ]
210
+ if user_prompt
211
+ else []
212
+ )
213
+ image_message = (
214
+ [
215
+ {
216
+ "type": "image_url",
217
+ "image_url": {"url": image_url},
218
+ }
219
+ ]
220
+ if image_url
221
+ else []
222
+ )
175
223
 
176
- Args:
177
- path (str): Path to the document to parse
178
- api (str): Which API to use ("openai", "huggingface", or "together")
179
- **kwargs: Additional arguments including model, temperature, title, etc.
224
+ messages.append(
225
+ {
226
+ "role": "user",
227
+ "content": base_message + image_message,
228
+ }
229
+ )
180
230
 
181
- Returns:
182
- Dict: Dictionary containing parsed document data
183
- """
231
+ return messages
232
+
233
+
234
+ def create_response(
235
+ api: str,
236
+ model: str,
237
+ system_prompt: Optional[str] = None,
238
+ user_prompt: Optional[str] = None,
239
+ image_url: Optional[str] = None,
240
+ temperature: float = 0.0,
241
+ max_tokens: int = 1024,
242
+ ) -> Dict:
184
243
  # Initialize appropriate client
185
244
  clients = {
186
245
  "openai": lambda: OpenAI(),
@@ -192,11 +251,110 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
192
251
  base_url="https://openrouter.ai/api/v1",
193
252
  api_key=os.environ["OPENROUTER_API_KEY"],
194
253
  ),
254
+ "fireworks": lambda: OpenAI(
255
+ base_url="https://api.fireworks.ai/inference/v1",
256
+ api_key=os.environ["FIREWORKS_API_KEY"],
257
+ ),
258
+ "anthropic": lambda: Anthropic(
259
+ api_key=os.environ["ANTHROPIC_API_KEY"],
260
+ ),
261
+ "gemini": lambda: None, # Gemini is handled separately
195
262
  }
196
263
  assert api in clients, f"Unsupported API: {api}"
197
- logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
264
+
265
+ if api == "gemini":
266
+ image_url = image_url.split("data:image/png;base64,")[1]
267
+ response = parse_image_with_gemini(
268
+ base64_file=image_url,
269
+ model=model,
270
+ temperature=temperature,
271
+ max_tokens=max_tokens,
272
+ system_prompt=system_prompt,
273
+ )
274
+ return {
275
+ "response": response["raw"],
276
+ "usage": response["token_usage"],
277
+ }
278
+
198
279
  client = clients[api]()
199
280
 
281
+ if api == "anthropic":
282
+ image_media_type = image_url.split(";")[0].split(":")[1]
283
+ image_data = image_url.split(",")[1]
284
+ response = client.messages.create(
285
+ model=model,
286
+ messages=[
287
+ {
288
+ "role": "user",
289
+ "content": [
290
+ {
291
+ "type": "image",
292
+ "source": {
293
+ "type": "base64",
294
+ "media_type": image_media_type,
295
+ "data": image_data,
296
+ },
297
+ },
298
+ {"type": "text", "text": user_prompt},
299
+ ],
300
+ }
301
+ ],
302
+ max_tokens=max_tokens,
303
+ temperature=temperature,
304
+ )
305
+
306
+ return {
307
+ "response": response.content[0].text,
308
+ "usage": {
309
+ "input_tokens": response.usage.input_tokens,
310
+ "output_tokens": response.usage.output_tokens,
311
+ "total_tokens": response.usage.input_tokens
312
+ + response.usage.output_tokens,
313
+ },
314
+ }
315
+
316
+ # Prepare messages for the API call
317
+ messages = get_messages(system_prompt, user_prompt, image_url)
318
+
319
+ # Common completion parameters
320
+ completion_params = {
321
+ "model": model,
322
+ "messages": messages,
323
+ "max_tokens": max_tokens,
324
+ "temperature": temperature,
325
+ }
326
+
327
+ # Get completion from selected API
328
+ response = client.chat.completions.create(**completion_params)
329
+ token_usage = response.usage
330
+
331
+ # Extract the response text
332
+ page_text = response.choices[0].message.content
333
+
334
+ return {
335
+ "response": page_text,
336
+ "usage": {
337
+ "input_tokens": token_usage.prompt_tokens,
338
+ "output_tokens": token_usage.completion_tokens,
339
+ "total_tokens": token_usage.total_tokens,
340
+ },
341
+ }
342
+
343
+
344
+ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
345
+ """
346
+ Parse documents (PDFs or images) using various vision model APIs.
347
+
348
+ Args:
349
+ path (str): Path to the document to parse
350
+ api (str): Which API to use ("openai", "huggingface", or "together")
351
+ **kwargs: Additional arguments including model, temperature, title, etc.
352
+
353
+ Returns:
354
+ Dict: Dictionary containing parsed document data
355
+ """
356
+ logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
357
+
200
358
  # Handle different input types
201
359
  mime_type, _ = mimetypes.guess_type(path)
202
360
  if mime_type and mime_type.startswith("image"):
@@ -215,76 +373,48 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
215
373
  for page_num in range(len(pdf_document))
216
374
  ]
217
375
 
218
- # API-specific message formatting
219
- def get_messages(page_num: int, image_url: str) -> List[Dict]:
220
- image_message = {
221
- "type": "image_url",
222
- "image_url": {"url": image_url},
223
- }
224
-
376
+ # Process each page/image
377
+ all_results = []
378
+ for page_num, image_url in images:
225
379
  if api == "openai":
226
380
  system_prompt = kwargs.get(
227
381
  "system_prompt", PARSER_PROMPT.format(custom_instructions="")
228
382
  )
229
383
  user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
230
- return [
231
- {
232
- "role": "system",
233
- "content": system_prompt,
234
- },
235
- {
236
- "role": "user",
237
- "content": [
238
- {"type": "text", "text": user_prompt},
239
- image_message,
240
- ],
241
- },
242
- ]
243
384
  else:
244
- prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
245
- base_message = {"type": "text", "text": prompt}
246
- return [
247
- {
248
- "role": "user",
249
- "content": [base_message, image_message],
250
- }
251
- ]
252
-
253
- # Process each page/image
254
- all_results = []
255
- for page_num, image_url in images:
256
- messages = get_messages(page_num, image_url)
257
-
258
- # Common completion parameters
259
- completion_params = {
260
- "model": kwargs["model"],
261
- "messages": messages,
262
- "max_tokens": kwargs.get("max_tokens", 1024),
263
- "temperature": kwargs.get("temperature", 0.7),
264
- }
385
+ system_prompt = kwargs.get("system_prompt", None)
386
+ user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
387
+
388
+ response = create_response(
389
+ api=api,
390
+ model=kwargs["model"],
391
+ system_prompt=system_prompt,
392
+ user_prompt=user_prompt,
393
+ image_url=image_url,
394
+ temperature=kwargs.get("temperature", 0.0),
395
+ max_tokens=kwargs.get("max_tokens", 1024),
396
+ )
265
397
 
266
398
  # Get completion from selected API
267
- response = client.chat.completions.create(**completion_params)
268
- token_usage = response.usage
399
+ page_text = response["response"]
400
+ token_usage = response["usage"]
269
401
 
270
- # Extract the response text
271
- page_text = response.choices[0].message.content
272
402
  if kwargs.get("verbose", None):
273
403
  logger.debug(f"Page {page_num + 1} response: {page_text}")
274
404
 
275
405
  # Extract content between output tags if present
276
406
  result = page_text
277
407
  if "<output>" in page_text:
278
- result = page_text.split("<output>")[1].strip()
408
+ result = page_text.split("<output>")[-1].strip()
279
409
  if "</output>" in result:
280
410
  result = result.split("</output>")[0].strip()
281
411
  all_results.append(
282
412
  (
283
413
  page_num,
284
414
  result,
285
- token_usage.prompt_tokens,
286
- token_usage.completion_tokens,
287
- token_usage.total_tokens,
415
+ token_usage["input_tokens"],
416
+ token_usage["output_tokens"],
417
+ token_usage["total_tokens"],
288
418
  )
289
419
  )
290
420
 
@@ -319,3 +449,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
319
449
  "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
320
450
  },
321
451
  }
452
+
453
+
454
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
    """
    Converts a document (PDF or image) to base64 data URLs, one per page.

    Args:
        path (str): Path to a PDF or image file.

    Returns:
        List[Tuple[int, str]]: ``(page_num, data_url)`` pairs with 0-based page
        numbers. A PDF yields one pair per page; an image yields a single pair
        with page number 0.

    Raises:
        ValueError: If the path is neither a PDF nor a file whose guessed MIME
            type is an image.
    """
    # Case-insensitive, matching the ".pdf" check used by parse().
    if path.lower().endswith(".pdf"):
        pdf_document = pdfium.PdfDocument(path)
        return [
            (
                page_num,
                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
            )
            for page_num in range(len(pdf_document))
        ]

    # guess_type returns (None, None) for unknown extensions; guard before use.
    mime_type, _ = mimetypes.guess_type(path)
    if mime_type and mime_type.startswith("image"):
        with open(path, "rb") as img_file:
            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
        # The prefix is always image/png, whatever the real image type is.
        # NOTE(review): create_response's Gemini branch splits on this exact
        # prefix, so it must not change without updating that code too.
        return [(0, f"data:image/png;base64,{image_base64}")]

    raise ValueError(f"Unsupported file type for image conversion: {path}")
@@ -1,12 +1,14 @@
1
1
  import os
2
2
  import re
3
3
  import tempfile
4
+ from functools import wraps
4
5
  from time import time
5
6
  from typing import Dict, List
6
7
 
7
8
  import pandas as pd
8
9
  import pdfplumber
9
10
  from docx import Document
11
+ from loguru import logger
10
12
  from pdfminer.high_level import extract_pages
11
13
  from pdfminer.layout import LTTextContainer
12
14
  from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
@@ -22,6 +24,38 @@ from lexoid.core.utils import (
22
24
  )
23
25
 
24
26
 
27
def retry_with_different_parser(func):
    """
    Decorator that retries a failed static parse once with the other PDF framework.

    When the wrapped call raises and the ``routed`` keyword argument is falsy,
    the call is repeated with ``framework`` switched between "pdfplumber" and
    "pdfminer" (a missing ``framework`` is treated as "pdfplumber"). If
    ``routed`` is truthy, or the framework matches neither name, the error is
    logged and re-raised. An exception from the retry itself propagates.

    Args:
        func: Callable that reads the ``framework`` keyword argument.

    Returns:
        The wrapping callable.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Capture the framework before overwriting it so the log message
            # reports the one that actually failed.
            original_framework = kwargs.get("framework", "pdfplumber")
            routed = kwargs.get("routed", False)
            if "pdfplumber" in original_framework and not routed:
                kwargs["framework"] = "pdfminer"
                logger.warning(
                    f"Retrying with pdfminer due to error: {e}. Original framework: {original_framework}"
                )
                return func(*args, **kwargs)
            elif "pdfminer" in original_framework and not routed:
                kwargs["framework"] = "pdfplumber"
                logger.warning(
                    f"Retrying with pdfplumber due to error: {e}. Original framework: {original_framework}"
                )
                return func(*args, **kwargs)
            else:
                logger.error(
                    f"Failed to parse document with both pdfplumber and pdfminer: {e}"
                )
                # Bare raise keeps the original traceback intact.
                raise

    return wrapper
56
+
57
+
58
+ @retry_with_different_parser
25
59
  def parse_static_doc(path: str, **kwargs) -> Dict:
26
60
  """
27
61
  Parses a document using static parsing methods.
@@ -41,7 +41,8 @@ Think step-by-step.
41
41
  '0' is typically more oval than 'O'
42
42
  '8' has a more angular top than 'B'
43
43
  {custom_instructions}
44
- - Return only the correct markdown without additional text or explanations. Do not any additional text (such as "```html" or "```markdown") in the output.
44
+ - Return only the correct markdown without additional text or explanations.
45
+ - DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
45
46
  - Think before generating the output in <thinking></thinking> tags.
46
47
 
47
48
  Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
lexoid/core/utils.py CHANGED
@@ -69,15 +69,45 @@ def convert_image_to_pdf(image_path: str) -> bytes:
69
69
 
70
70
  def remove_html_tags(text: str):
71
71
  html = markdown(text, extensions=["tables"])
72
- return re.sub(HTML_TAG_PATTERN, "", html)
72
+ return re.sub(HTML_TAG_PATTERN, " ", html)
73
73
 
74
74
 
75
- def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float:
75
def clean_text(txt: str) -> str:
    """
    Normalise text to ASCII alphanumeric tokens separated by single spaces.

    LaTeX commands (with one optional bracketed and one optional braced
    argument) are removed, every remaining character other than an ASCII
    letter, digit or space is replaced by a space, and runs of whitespace are
    collapsed. The result is idempotent: cleaning already-cleaned text
    returns it unchanged.

    Args:
        txt: Text to normalise.

    Returns:
        The normalised text with no leading, trailing or repeated spaces.
    """
    # Remove LaTeX commands (e.g. \command, \command[args]{args})
    txt = re.sub(r"\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?", " ", txt)

    # Replace all non-alphanumeric characters except spaces (this also covers
    # tabs and newlines) with a space
    txt = re.sub(r"[^a-zA-Z0-9 ]", " ", txt)

    # Collapse whitespace last, so spaces introduced by the replacements above
    # do not survive as runs of several spaces
    txt = re.sub(r"\s+", " ", txt)

    return txt.strip()
86
+
87
+
88
+ def calculate_similarity(
89
+ text1: str, text2: str, ignore_html: bool = True, diff_save_path: str = ""
90
+ ) -> float:
76
91
  """Calculate similarity ratio between two texts using SequenceMatcher."""
77
92
  if ignore_html:
78
93
  text1 = remove_html_tags(text1)
79
94
  text2 = remove_html_tags(text2)
80
- return SequenceMatcher(None, text1, text2).ratio()
95
+
96
+ text1 = clean_text(clean_text(text1))
97
+ text2 = clean_text(clean_text(text2))
98
+
99
+ sm = SequenceMatcher(None, text1, text2)
100
+ # Save the diff and the texts for debugging
101
+ if diff_save_path:
102
+ with open(diff_save_path, "w") as f:
103
+ f.write(f"Text 1:\n{text1}\n\n")
104
+ f.write(f"Text 2:\n{text2}\n\n")
105
+ f.write("Differences:\n")
106
+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
107
+ if tag == "equal":
108
+ continue
109
+ f.write(f"{tag} {text1[i1:i2]} -> {text2[j1:j2]}\n")
110
+ return sm.ratio()
81
111
 
82
112
 
83
113
  def convert_pdf_page_to_image(
@@ -345,7 +375,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
345
375
  # Additional wait for any dynamic content
346
376
  try:
347
377
  await page.wait_for_selector("body", timeout=30000)
348
- except:
378
+ except Exception:
349
379
  pass
350
380
 
351
381
  html = await page.content()
@@ -561,24 +591,32 @@ def router(path: str, priority: str = "speed") -> str:
561
591
  priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
562
592
  """
563
593
  file_type = get_file_type(path)
564
- if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
594
+ if (
595
+ file_type.startswith("text/")
596
+ or "spreadsheet" in file_type
597
+ or "presentation" in file_type
598
+ ):
565
599
  return "STATIC_PARSE"
566
600
 
567
601
  if priority == "accuracy":
568
602
  # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
569
603
  # Otherwise, use LLM_PARSE
570
- if (
571
- file_type == "application/pdf"
572
- and not has_image_in_pdf(path)
573
- and has_hyperlink_in_pdf(path)
574
- ):
604
+ has_image = has_image_in_pdf(path)
605
+ has_hyperlink = has_hyperlink_in_pdf(path)
606
+ if file_type == "application/pdf" and not has_image and has_hyperlink:
607
+ logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
575
608
  return "STATIC_PARSE"
609
+ logger.debug(
610
+ f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
611
+ )
576
612
  return "LLM_PARSE"
577
613
  else:
578
614
  # If the file is a PDF without images, use STATIC_PARSE
579
615
  # Otherwise, use LLM_PARSE
580
616
  if file_type == "application/pdf" and not has_image_in_pdf(path):
617
+ logger.debug("Using STATIC_PARSE for PDF without images.")
581
618
  return "STATIC_PARSE"
619
+ logger.debug("Using LLM_PARSE because PDF has images")
582
620
  return "LLM_PARSE"
583
621
 
584
622
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3.10
8
8
  Classifier: Programming Language :: Python :: 3.11
9
9
  Classifier: Programming Language :: Python :: 3.12
10
10
  Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: anthropic (>=0.55.0,<0.56.0)
11
12
  Requires-Dist: bs4 (>=0.0.2,<0.0.3)
12
13
  Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
13
14
  Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
@@ -49,7 +50,8 @@ Description-Content-Type: text/markdown
49
50
  </div>
50
51
 
51
52
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
52
- [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
53
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid)
54
+ [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
53
55
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
54
56
  [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
55
57
 
@@ -144,6 +146,7 @@ print(parsed_md)
144
146
  * Hugging Face
145
147
  * Together AI
146
148
  * OpenRouter
149
+ * Fireworks
147
150
 
148
151
  ## Benchmark
149
152
 
@@ -151,22 +154,25 @@ Results aggregated across 5 iterations each for 5 documents.
151
154
 
152
155
  _Note:_ Benchmarks are currently done in the zero-shot setting.
153
156
 
154
- | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
155
- | ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
156
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
157
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
158
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
159
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
160
- | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
161
- | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
162
- | 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
163
- | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
164
- | 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
165
- | 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
166
- | 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
167
- | 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
168
- | 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
169
- | 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
170
- | 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
171
- | 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
157
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
158
+ | --- | --- | --- | --- | --- | --- |
159
+ | 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
160
+ | 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
161
+ | 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
162
+ | 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
163
+ | 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
164
+ | 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
165
+ | 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
166
+ | 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
167
+ | 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
168
+ | 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
169
+ | 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
170
+ | 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
171
+ | 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
172
+ | 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
173
+ | 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
174
+ | 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
175
+ | 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
176
+ | 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
177
+ | 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
172
178
 
@@ -0,0 +1,9 @@
1
+ lexoid/api.py,sha256=RxVJZ9JrQyJMKi-tx4GqdQoq_Bh956s90Fl8BVX15NY,15619
2
+ lexoid/core/parse_type/llm_parser.py,sha256=KPB4JHXrvVASBIp9dGApM4j70P75tXffUsTP-SQTKZA,15506
3
+ lexoid/core/parse_type/static_parser.py,sha256=5DMGDDYTTn5gXrqyz3JOxyLbVlhOj9Njk8n83pA0bFY,22534
4
+ lexoid/core/prompt_templates.py,sha256=zftophGU0DNt0AKwYqN8jnAtH90-KlFTyJhs-I2yDiE,6351
5
+ lexoid/core/utils.py,sha256=3goAgS8_8ybLxdhhqhDQDFyGeG_YytF45p81bVI42uI,21122
6
+ lexoid-0.1.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ lexoid-0.1.15.dist-info/METADATA,sha256=cjNZsYCiCGVQ2GfuutCPsFNeO_iGONo5XtCMO4VU5DI,6235
8
+ lexoid-0.1.15.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
+ lexoid-0.1.15.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
2
- lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
3
- lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
4
- lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
5
- lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
6
- lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
- lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
8
- lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
- lexoid-0.1.13.dist-info/RECORD,,