lexoid 0.1.14__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.14
3
+ Version: 0.1.15
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3.10
8
8
  Classifier: Programming Language :: Python :: 3.11
9
9
  Classifier: Programming Language :: Python :: 3.12
10
10
  Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: anthropic (>=0.55.0,<0.56.0)
11
12
  Requires-Dist: bs4 (>=0.0.2,<0.0.3)
12
13
  Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
13
14
  Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
@@ -155,23 +156,23 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
155
156
 
156
157
  | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
157
158
  | --- | --- | --- | --- | --- | --- |
158
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
159
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
160
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
161
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
162
- | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
163
- | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
164
- | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
165
- | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
166
- | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
167
- | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
168
- | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
169
- | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
170
- | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
171
- | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
172
- | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
173
- | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
174
- | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
175
- | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
176
- | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
159
+ | 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
160
+ | 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
161
+ | 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
162
+ | 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
163
+ | 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
164
+ | 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
165
+ | 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
166
+ | 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
167
+ | 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
168
+ | 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
169
+ | 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
170
+ | 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
171
+ | 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
172
+ | 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
173
+ | 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
174
+ | 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
175
+ | 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
176
+ | 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
177
+ | 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
177
178
 
@@ -120,22 +120,22 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
120
120
 
121
121
  | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
122
122
  | --- | --- | --- | --- | --- | --- |
123
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
124
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
125
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
126
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
127
- | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
128
- | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
129
- | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
130
- | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
131
- | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
132
- | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
133
- | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
134
- | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
135
- | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
136
- | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
137
- | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
138
- | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
139
- | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
140
- | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
141
- | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
123
+ | 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
124
+ | 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
125
+ | 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
126
+ | 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
127
+ | 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
128
+ | 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
129
+ | 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
130
+ | 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
131
+ | 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
132
+ | 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
133
+ | 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
134
+ | 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
135
+ | 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
136
+ | 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
137
+ | 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
138
+ | 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
139
+ | 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
140
+ | 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
141
+ | 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
@@ -4,9 +4,10 @@ import re
4
4
  import tempfile
5
5
  from concurrent.futures import ProcessPoolExecutor
6
6
  from enum import Enum
7
+ from functools import wraps
7
8
  from glob import glob
8
9
  from time import time
9
- from typing import Union, Dict, List
10
+ from typing import Optional, Union, Dict, List
10
11
 
11
12
  from loguru import logger
12
13
 
@@ -14,6 +15,7 @@ from lexoid.core.parse_type.llm_parser import (
14
15
  parse_llm_doc,
15
16
  create_response,
16
17
  convert_doc_to_base64_images,
18
+ get_api_provider_for_model,
17
19
  )
18
20
  from lexoid.core.parse_type.static_parser import parse_static_doc
19
21
  from lexoid.core.utils import (
@@ -35,6 +37,51 @@ class ParserType(Enum):
35
37
  AUTO = "AUTO"
36
38
 
37
39
 
40
def retry_with_different_parser_type(func):
    """Decorate a parse function with AUTO routing and a one-shot parser fallback.

    The wrapper folds the first two positional arguments into the ``path`` and
    ``parser_type`` keyword arguments and always calls ``func`` with keywords
    only. Positional arguments beyond the second are ignored.

    If ``parser_type`` is ``ParserType.AUTO`` (whether it was passed
    positionally or by keyword), it is replaced by the type returned by
    ``router(path, router_priority)`` and ``routed=True`` is added to the
    keyword arguments.

    If the call raises and the parser type was chosen by routing, the call is
    retried once with the other type (``LLM_PARSE`` <-> ``STATIC_PARSE``) and
    ``routed=False``. An explicitly requested parser type is never swapped:
    the error is logged and re-raised.

    Args:
        func: Callable accepting ``path`` and ``parser_type`` as keywords.

    Returns:
        The wrapped callable.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            if len(args) > 0:
                kwargs["path"] = args[0]
            if len(args) > 1:
                kwargs["parser_type"] = args[1]

            # Resolve AUTO from the normalized keywords so that a keyword
            # ``parser_type=ParserType.AUTO`` is routed as well; the wrapped
            # function no longer resolves AUTO itself.
            if (
                "path" in kwargs
                and "parser_type" in kwargs
                and kwargs["parser_type"] == ParserType.AUTO
            ):
                router_priority = kwargs.get("router_priority", "speed")
                parser_type = ParserType[router(kwargs["path"], router_priority)]
                logger.debug(f"Auto-detected parser type: {parser_type}")
                kwargs["parser_type"] = parser_type
                kwargs["routed"] = True

            return func(**kwargs)
        except Exception as e:
            failed_type = kwargs.get("parser_type")
            was_routed = kwargs.get("routed", False)

            if was_routed and failed_type == ParserType.LLM_PARSE:
                logger.warning(
                    f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
                )
                kwargs["parser_type"] = ParserType.STATIC_PARSE
            elif was_routed and failed_type == ParserType.STATIC_PARSE:
                logger.warning(
                    f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
                )
                kwargs["parser_type"] = ParserType.LLM_PARSE
            else:
                logger.error(
                    f"Parsing failed with error: {e}. No fallback parser available."
                )
                # Bare raise keeps the original traceback intact.
                raise

            # Clear the flag so the retried call is treated as an explicit choice.
            kwargs["routed"] = False
            return func(**kwargs)

    return wrapper
82
+
83
+
84
+ @retry_with_different_parser_type
38
85
  def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
39
86
  """
40
87
  Parses a file using the specified parser type.
@@ -55,11 +102,6 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
55
102
  - token_usage: Dictionary containing token usage statistics
56
103
  - parser_used: Which parser was actually used
57
104
  """
58
- if parser_type == ParserType.AUTO:
59
- router_priority = kwargs.get("router_priority", "speed")
60
- parser_type = ParserType[router(path, router_priority)]
61
- logger.debug(f"Auto-detected parser type: {parser_type}")
62
-
63
105
  kwargs["start"] = (
64
106
  int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
65
107
  )
@@ -193,7 +235,7 @@ def parse(
193
235
  sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
194
236
  path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
195
237
 
196
- if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
238
+ if not path.lower().endswith(".pdf"):
197
239
  kwargs["split"] = False
198
240
  result = parse_chunk_list([path], parser_type, kwargs)
199
241
  else:
@@ -300,7 +342,11 @@ def parse(
300
342
 
301
343
 
302
344
  def parse_with_schema(
303
- path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
345
+ path: str,
346
+ schema: Dict,
347
+ api: Optional[str] = None,
348
+ model: str = "gpt-4o-mini",
349
+ **kwargs,
304
350
  ) -> List[List[Dict]]:
305
351
  """
306
352
  Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
@@ -315,6 +361,10 @@ def parse_with_schema(
315
361
  Returns:
316
362
  List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
317
363
  """
364
+ if not api:
365
+ api = get_api_provider_for_model(model)
366
+ logger.debug(f"Using API provider: {api}")
367
+
318
368
  system_prompt = f"""
319
369
  The output should be formatted as a JSON instance that conforms to the JSON schema below.
320
370
 
@@ -8,6 +8,7 @@ from typing import Dict, List, Optional, Tuple
8
8
 
9
9
  import pypdfium2 as pdfium
10
10
  import requests
11
+ from anthropic import Anthropic
11
12
  from huggingface_hub import InferenceClient
12
13
  from loguru import logger
13
14
  from openai import OpenAI
@@ -49,36 +50,41 @@ def retry_on_http_error(func):
49
50
  return wrapper
50
51
 
51
52
 
52
- @retry_on_http_error
53
- def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
54
- if "api_provider" in kwargs and kwargs["api_provider"]:
55
- return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
56
- if "model" not in kwargs:
57
- kwargs["model"] = "gemini-2.0-flash"
58
- model = kwargs.get("model")
53
+ def get_api_provider_for_model(model: str) -> str:
59
54
  if model.startswith("gemini"):
60
- return parse_with_gemini(path, **kwargs)
55
+ return "gemini"
61
56
  if model.startswith("gpt"):
62
- return parse_with_api(path, api="openai", **kwargs)
57
+ return "openai"
63
58
  if model.startswith("meta-llama"):
64
59
  if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
65
- return parse_with_api(path, api="together", **kwargs)
66
- return parse_with_api(path, api="huggingface", **kwargs)
60
+ return "together"
61
+ return "huggingface"
67
62
  if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
68
- return parse_with_api(path, api="openrouter", **kwargs)
63
+ return "openrouter"
69
64
  if model.startswith("accounts/fireworks"):
70
- return parse_with_api(path, api="fireworks", **kwargs)
65
+ return "fireworks"
66
+ if model.startswith("claude"):
67
+ return "anthropic"
71
68
  raise ValueError(f"Unsupported model: {model}")
72
69
 
73
70
 
74
- def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
75
- logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
76
- api_key = os.environ.get("GOOGLE_API_KEY")
77
- if not api_key:
78
- raise ValueError("GOOGLE_API_KEY environment variable is not set")
71
+ @retry_on_http_error
72
+ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
73
+ if "api_provider" in kwargs and kwargs["api_provider"]:
74
+ return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
79
75
 
80
- url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
76
+ model = kwargs.get("model", "gemini-2.0-flash")
77
+ kwargs["model"] = model
81
78
 
79
+ api_provider = get_api_provider_for_model(model)
80
+
81
+ if api_provider == "gemini":
82
+ return parse_with_gemini(path, **kwargs)
83
+ else:
84
+ return parse_with_api(path, api=api_provider, **kwargs)
85
+
86
+
87
+ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
82
88
  # Check if the file is an image and convert to PDF if necessary
83
89
  mime_type, _ = mimetypes.guess_type(path)
84
90
  if mime_type and mime_type.startswith("image"):
@@ -90,6 +96,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
90
96
  file_content = file.read()
91
97
  base64_file = base64.b64encode(file_content).decode("utf-8")
92
98
 
99
+ return parse_image_with_gemini(
100
+ base64_file=base64_file, mime_type=mime_type, **kwargs
101
+ )
102
+
103
+
104
+ def parse_image_with_gemini(
105
+ base64_file: str, mime_type: str = "image/png", **kwargs
106
+ ) -> List[Dict] | str:
107
+ api_key = os.environ.get("GOOGLE_API_KEY")
108
+ if not api_key:
109
+ raise ValueError("GOOGLE_API_KEY environment variable is not set")
110
+
111
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
112
+
93
113
  if "system_prompt" in kwargs:
94
114
  prompt = kwargs["system_prompt"]
95
115
  else:
@@ -109,7 +129,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
109
129
  }
110
130
  ],
111
131
  "generationConfig": {
112
- "temperature": kwargs.get("temperature", 0.2),
132
+ "temperature": kwargs.get("temperature", 0),
113
133
  },
114
134
  }
115
135
 
@@ -129,24 +149,23 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
129
149
  if "text" in part
130
150
  )
131
151
 
132
- combined_text = ""
152
+ combined_text = raw_text
133
153
  if "<output>" in raw_text:
134
154
  combined_text = raw_text.split("<output>")[-1].strip()
135
- if "</output>" in result:
136
- combined_text = result.split("</output>")[0].strip()
155
+ if "</output>" in combined_text:
156
+ combined_text = combined_text.split("</output>")[0].strip()
137
157
 
138
158
  token_usage = result["usageMetadata"]
139
159
  input_tokens = token_usage.get("promptTokenCount", 0)
140
160
  output_tokens = token_usage.get("candidatesTokenCount", 0)
141
161
  total_tokens = input_tokens + output_tokens
142
-
143
162
  return {
144
163
  "raw": combined_text.replace("<page-break>", "\n\n"),
145
164
  "segments": [
146
165
  {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
147
166
  for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
148
167
  ],
149
- "title": kwargs["title"],
168
+ "title": kwargs.get("title", ""),
150
169
  "url": kwargs.get("url", ""),
151
170
  "parent_title": kwargs.get("parent_title", ""),
152
171
  "recursive_docs": [],
@@ -218,7 +237,7 @@ def create_response(
218
237
  system_prompt: Optional[str] = None,
219
238
  user_prompt: Optional[str] = None,
220
239
  image_url: Optional[str] = None,
221
- temperature: float = 0.2,
240
+ temperature: float = 0.0,
222
241
  max_tokens: int = 1024,
223
242
  ) -> Dict:
224
243
  # Initialize appropriate client
@@ -236,10 +255,64 @@ def create_response(
236
255
  base_url="https://api.fireworks.ai/inference/v1",
237
256
  api_key=os.environ["FIREWORKS_API_KEY"],
238
257
  ),
258
+ "anthropic": lambda: Anthropic(
259
+ api_key=os.environ["ANTHROPIC_API_KEY"],
260
+ ),
261
+ "gemini": lambda: None, # Gemini is handled separately
239
262
  }
240
263
  assert api in clients, f"Unsupported API: {api}"
264
+
265
+ if api == "gemini":
266
+ image_url = image_url.split("data:image/png;base64,")[1]
267
+ response = parse_image_with_gemini(
268
+ base64_file=image_url,
269
+ model=model,
270
+ temperature=temperature,
271
+ max_tokens=max_tokens,
272
+ system_prompt=system_prompt,
273
+ )
274
+ return {
275
+ "response": response["raw"],
276
+ "usage": response["token_usage"],
277
+ }
278
+
241
279
  client = clients[api]()
242
280
 
281
+ if api == "anthropic":
282
+ image_media_type = image_url.split(";")[0].split(":")[1]
283
+ image_data = image_url.split(",")[1]
284
+ response = client.messages.create(
285
+ model=model,
286
+ messages=[
287
+ {
288
+ "role": "user",
289
+ "content": [
290
+ {
291
+ "type": "image",
292
+ "source": {
293
+ "type": "base64",
294
+ "media_type": image_media_type,
295
+ "data": image_data,
296
+ },
297
+ },
298
+ {"type": "text", "text": user_prompt},
299
+ ],
300
+ }
301
+ ],
302
+ max_tokens=max_tokens,
303
+ temperature=temperature,
304
+ )
305
+
306
+ return {
307
+ "response": response.content[0].text,
308
+ "usage": {
309
+ "input_tokens": response.usage.input_tokens,
310
+ "output_tokens": response.usage.output_tokens,
311
+ "total_tokens": response.usage.input_tokens
312
+ + response.usage.output_tokens,
313
+ },
314
+ }
315
+
243
316
  # Prepare messages for the API call
244
317
  messages = get_messages(system_prompt, user_prompt, image_url)
245
318
 
@@ -260,7 +333,11 @@ def create_response(
260
333
 
261
334
  return {
262
335
  "response": page_text,
263
- "usage": token_usage,
336
+ "usage": {
337
+ "input_tokens": token_usage.prompt_tokens,
338
+ "output_tokens": token_usage.completion_tokens,
339
+ "total_tokens": token_usage.total_tokens,
340
+ },
264
341
  }
265
342
 
266
343
 
@@ -314,7 +391,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
314
391
  system_prompt=system_prompt,
315
392
  user_prompt=user_prompt,
316
393
  image_url=image_url,
317
- temperature=kwargs.get("temperature", 0.2),
394
+ temperature=kwargs.get("temperature", 0.0),
318
395
  max_tokens=kwargs.get("max_tokens", 1024),
319
396
  )
320
397
 
@@ -335,9 +412,9 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
335
412
  (
336
413
  page_num,
337
414
  result,
338
- token_usage.prompt_tokens,
339
- token_usage.completion_tokens,
340
- token_usage.total_tokens,
415
+ token_usage["input_tokens"],
416
+ token_usage["output_tokens"],
417
+ token_usage["total_tokens"],
341
418
  )
342
419
  )
343
420
 
@@ -1,12 +1,14 @@
1
1
  import os
2
2
  import re
3
3
  import tempfile
4
+ from functools import wraps
4
5
  from time import time
5
6
  from typing import Dict, List
6
7
 
7
8
  import pandas as pd
8
9
  import pdfplumber
9
10
  from docx import Document
11
+ from loguru import logger
10
12
  from pdfminer.high_level import extract_pages
11
13
  from pdfminer.layout import LTTextContainer
12
14
  from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
@@ -22,6 +24,38 @@ from lexoid.core.utils import (
22
24
  )
23
25
 
24
26
 
27
def retry_with_different_parser(func):
    """Decorate a static-parse function with a one-shot framework fallback.

    The wrapped function is called unchanged. If it raises and the keyword
    arguments do not carry a truthy ``routed`` flag, the call is repeated once
    with the ``framework`` keyword switched to the other backend
    (``pdfplumber`` <-> ``pdfminer``; a missing ``framework`` counts as
    ``pdfplumber``). When ``routed`` is truthy, or the framework name matches
    neither backend, the error is logged and re-raised.

    NOTE(review): skipping the retry when ``routed`` is set presumably leaves
    the fallback to the caller that did the routing -- confirm against the
    call site.

    Args:
        func: Callable that reads an optional ``framework`` keyword argument.

    Returns:
        The wrapped callable.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Capture the framework that failed *before* overwriting it so the
            # warning below reports the real original value.
            original_framework = kwargs.get("framework", "pdfplumber")

            if kwargs.get("routed", False):
                fallback = None
            elif "pdfplumber" in original_framework:
                fallback = "pdfminer"
            elif "pdfminer" in original_framework:
                fallback = "pdfplumber"
            else:
                fallback = None

            if fallback is None:
                logger.error(
                    f"Failed to parse document with both pdfplumber and pdfminer: {e}"
                )
                # Bare raise keeps the original traceback intact.
                raise

            kwargs["framework"] = fallback
            logger.warning(
                f"Retrying with {fallback} due to error: {e}. Original framework: {original_framework}"
            )
            return func(*args, **kwargs)

    return wrapper
56
+
57
+
58
+ @retry_with_different_parser
25
59
  def parse_static_doc(path: str, **kwargs) -> Dict:
26
60
  """
27
61
  Parses a document using static parsing methods.
@@ -69,15 +69,45 @@ def convert_image_to_pdf(image_path: str) -> bytes:
69
69
 
70
70
  def remove_html_tags(text: str):
71
71
  html = markdown(text, extensions=["tables"])
72
- return re.sub(HTML_TAG_PATTERN, "", html)
72
+ return re.sub(HTML_TAG_PATTERN, " ", html)
73
73
 
74
74
 
75
- def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float:
75
def clean_text(txt):
    """Reduce text to ASCII alphanumeric tokens separated by single spaces.

    LaTeX commands are removed first, together with at most one ``[...]`` and
    one ``{...}`` argument directly following the command name. Every other
    character that is not an ASCII letter, digit or space becomes a space.
    Whitespace is collapsed last, so the result never contains runs of spaces
    and calling the function again on its own output changes nothing.

    Args:
        txt: Text to normalize.

    Returns:
        The normalized text with no leading or trailing whitespace.
    """
    # Remove LaTeX commands (e.g. \command, \command[args]{args})
    txt = re.sub(r"\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?", " ", txt)

    # Replace all non-alphanumeric characters except spaces with a space
    txt = re.sub(r"[^a-zA-Z0-9 ]", " ", txt)

    # Collapse whitespace only after the substitutions above, because they
    # introduce new spaces that would otherwise survive as multi-space runs.
    txt = re.sub(r"\s+", " ", txt)

    return txt.strip()
86
+
87
+
88
+ def calculate_similarity(
89
+ text1: str, text2: str, ignore_html: bool = True, diff_save_path: str = ""
90
+ ) -> float:
76
91
  """Calculate similarity ratio between two texts using SequenceMatcher."""
77
92
  if ignore_html:
78
93
  text1 = remove_html_tags(text1)
79
94
  text2 = remove_html_tags(text2)
80
- return SequenceMatcher(None, text1, text2).ratio()
95
+
96
+ text1 = clean_text(clean_text(text1))
97
+ text2 = clean_text(clean_text(text2))
98
+
99
+ sm = SequenceMatcher(None, text1, text2)
100
+ # Save the diff and the texts for debugging
101
+ if diff_save_path:
102
+ with open(diff_save_path, "w") as f:
103
+ f.write(f"Text 1:\n{text1}\n\n")
104
+ f.write(f"Text 2:\n{text2}\n\n")
105
+ f.write("Differences:\n")
106
+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
107
+ if tag == "equal":
108
+ continue
109
+ f.write(f"{tag} {text1[i1:i2]} -> {text2[j1:j2]}\n")
110
+ return sm.ratio()
81
111
 
82
112
 
83
113
  def convert_pdf_page_to_image(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.14"
3
+ version = "0.1.15"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
@@ -30,6 +30,7 @@ huggingface-hub = "^0.27.0"
30
30
  together = "^1.4.0"
31
31
  openpyxl = "^3.1.5"
32
32
  pptx2md = "^2.0.6"
33
+ anthropic = "^0.55.0"
33
34
 
34
35
  [tool.poetry.group.dev.dependencies]
35
36
  ipykernel = "^6.29.5"
@@ -40,6 +41,7 @@ pytest = "^8.3.2"
40
41
  [tool.poetry.group.docs.dependencies]
41
42
  sphinx = "^8.1.3"
42
43
  pydata-sphinx-theme = "^0.16.1"
44
+ docutils = "^0.21.2"
43
45
 
44
46
  [build-system]
45
47
  requires = ["poetry-core", "wheel"]
File without changes