lexoid 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -49,7 +49,8 @@ Description-Content-Type: text/markdown
49
49
  </div>
50
50
 
51
51
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
52
- [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
52
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid)
53
+ [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
53
54
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
54
55
  [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
55
56
 
@@ -144,6 +145,7 @@ print(parsed_md)
144
145
  * Hugging Face
145
146
  * Together AI
146
147
  * OpenRouter
148
+ * Fireworks
147
149
 
148
150
  ## Benchmark
149
151
 
@@ -151,22 +153,25 @@ Results aggregated across 5 iterations each for 5 documents.
151
153
 
152
154
  _Note:_ Benchmarks are currently done in the zero-shot setting.
153
155
 
154
- | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
155
- | ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
156
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
157
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
158
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
159
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
160
- | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
161
- | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
162
- | 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
163
- | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
164
- | 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
165
- | 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
166
- | 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
167
- | 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
168
- | 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
169
- | 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
170
- | 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
171
- | 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
156
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
157
+ | --- | --- | --- | --- | --- | --- |
158
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
159
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
160
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
161
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
162
+ | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
163
+ | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
164
+ | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
165
+ | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
166
+ | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
167
+ | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
168
+ | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
169
+ | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
170
+ | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
171
+ | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
172
+ | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
173
+ | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
174
+ | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
175
+ | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
176
+ | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
172
177
 
@@ -14,7 +14,8 @@
14
14
  </div>
15
15
 
16
16
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
17
- [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
17
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid)
18
+ [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
18
19
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
19
20
  [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
20
21
 
@@ -109,6 +110,7 @@ print(parsed_md)
109
110
  * Hugging Face
110
111
  * Together AI
111
112
  * OpenRouter
113
+ * Fireworks
112
114
 
113
115
  ## Benchmark
114
116
 
@@ -116,21 +118,24 @@ Results aggregated across 5 iterations each for 5 documents.
116
118
 
117
119
  _Note:_ Benchmarks are currently done in the zero-shot setting.
118
120
 
119
- | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
120
- | ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
121
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
122
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
123
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
124
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
125
- | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
126
- | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
127
- | 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
128
- | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
129
- | 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
130
- | 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
131
- | 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
132
- | 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
133
- | 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
134
- | 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
135
- | 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
136
- | 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
121
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
122
+ | --- | --- | --- | --- | --- | --- |
123
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
124
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
125
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
126
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
127
+ | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
128
+ | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
129
+ | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
130
+ | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
131
+ | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
132
+ | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
133
+ | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
134
+ | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
135
+ | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
136
+ | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
137
+ | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
138
+ | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
139
+ | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
140
+ | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
141
+ | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
@@ -10,7 +10,11 @@ from typing import Union, Dict, List
10
10
 
11
11
  from loguru import logger
12
12
 
13
- from lexoid.core.parse_type.llm_parser import parse_llm_doc
13
+ from lexoid.core.parse_type.llm_parser import (
14
+ parse_llm_doc,
15
+ create_response,
16
+ convert_doc_to_base64_images,
17
+ )
14
18
  from lexoid.core.parse_type.static_parser import parse_static_doc
15
19
  from lexoid.core.utils import (
16
20
  convert_to_pdf,
@@ -49,6 +53,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
49
53
  - parent_title: Title of parent doc if recursively parsed
50
54
  - recursive_docs: List of dictionaries for recursively parsed documents
51
55
  - token_usage: Dictionary containing token usage statistics
56
+ - parser_used: Which parser was actually used
52
57
  """
53
58
  if parser_type == ParserType.AUTO:
54
59
  router_priority = kwargs.get("router_priority", "speed")
@@ -60,10 +65,13 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
60
65
  )
61
66
  if parser_type == ParserType.STATIC_PARSE:
62
67
  logger.debug("Using static parser")
63
- return parse_static_doc(path, **kwargs)
68
+ result = parse_static_doc(path, **kwargs)
64
69
  else:
65
70
  logger.debug("Using LLM parser")
66
- return parse_llm_doc(path, **kwargs)
71
+ result = parse_llm_doc(path, **kwargs)
72
+
73
+ result["parser_used"] = parser_type
74
+ return result
67
75
 
68
76
 
69
77
  def parse_chunk_list(
@@ -82,15 +90,18 @@ def parse_chunk_list(
82
90
  """
83
91
  combined_segments = []
84
92
  raw_texts = []
85
- token_usage = {"input": 0, "output": 0, "image_count": 0}
93
+ token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
86
94
  for file_path in file_paths:
87
95
  result = parse_chunk(file_path, parser_type, **kwargs)
88
96
  combined_segments.extend(result["segments"])
89
97
  raw_texts.append(result["raw"])
90
- if "token_usage" in result:
98
+ if (
99
+ result.get("parser_used") == ParserType.LLM_PARSE
100
+ and "token_usage" in result
101
+ ):
91
102
  token_usage["input"] += result["token_usage"]["input"]
92
103
  token_usage["output"] += result["token_usage"]["output"]
93
- token_usage["image_count"] += len(result["segments"])
104
+ token_usage["llm_page_count"] += len(result["segments"])
94
105
  token_usage["total"] = token_usage["input"] + token_usage["output"]
95
106
 
96
107
  return {
@@ -136,7 +147,7 @@ def parse(
136
147
  as_pdf = kwargs.get("as_pdf", False)
137
148
  depth = kwargs.get("depth", 1)
138
149
 
139
- if type(parser_type) == str:
150
+ if type(parser_type) is str:
140
151
  parser_type = ParserType[parser_type]
141
152
  if (
142
153
  path.lower().endswith((".doc", ".docx"))
@@ -184,7 +195,7 @@ def parse(
184
195
 
185
196
  if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
186
197
  kwargs["split"] = False
187
- result = parse_chunk(path, parser_type, **kwargs)
198
+ result = parse_chunk_list([path], parser_type, kwargs)
188
199
  else:
189
200
  kwargs["split"] = True
190
201
  split_dir = os.path.join(temp_dir, "splits/")
@@ -219,42 +230,43 @@ def parse(
219
230
  "token_usage": {
220
231
  "input": sum(r["token_usage"]["input"] for r in chunk_results),
221
232
  "output": sum(r["token_usage"]["output"] for r in chunk_results),
222
- "image_count": sum(
223
- r["token_usage"]["image_count"] for r in chunk_results
233
+ "llm_page_count": sum(
234
+ r["token_usage"]["llm_page_count"] for r in chunk_results
224
235
  ),
225
236
  "total": sum(r["token_usage"]["total"] for r in chunk_results),
226
237
  },
227
238
  }
228
239
 
229
- if "api_cost_mapping" in kwargs:
230
- api_cost_mapping = kwargs["api_cost_mapping"]
231
- if isinstance(api_cost_mapping, dict):
232
- api_cost_mapping = api_cost_mapping
233
- elif isinstance(api_cost_mapping, str) and os.path.exists(
234
- api_cost_mapping
235
- ):
236
- with open(api_cost_mapping, "r") as f:
237
- api_cost_mapping = json.load(f)
238
- else:
239
- raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
240
-
241
- api_cost = api_cost_mapping.get(
242
- kwargs.get("model", "gemini-2.0-flash"), None
240
+ if "api_cost_mapping" in kwargs and "token_usage" in result:
241
+ api_cost_mapping = kwargs["api_cost_mapping"]
242
+ if isinstance(api_cost_mapping, dict):
243
+ api_cost_mapping = api_cost_mapping
244
+ elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
245
+ with open(api_cost_mapping, "r") as f:
246
+ api_cost_mapping = json.load(f)
247
+ else:
248
+ raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
249
+
250
+ api_cost = api_cost_mapping.get(
251
+ kwargs.get("model", "gemini-2.0-flash"), None
252
+ )
253
+ if api_cost:
254
+ token_usage = result["token_usage"]
255
+ token_cost = {
256
+ "input": token_usage["input"] * api_cost["input"] / 1_000_000,
257
+ "input-image": api_cost.get("input-image", 0)
258
+ * token_usage.get("llm_page_count", 0),
259
+ "output": token_usage["output"] * api_cost["output"] / 1_000_000,
260
+ }
261
+ token_cost["total"] = (
262
+ token_cost["input"]
263
+ + token_cost["input-image"]
264
+ + token_cost["output"]
243
265
  )
244
- if api_cost:
245
- token_usage = result["token_usage"]
246
- token_cost = {
247
- "input": token_usage["input"] * api_cost["input"] / 1_000_000
248
- + api_cost.get("input-image", 0) * token_usage["image_count"],
249
- "output": token_usage["output"]
250
- * api_cost["output"]
251
- / 1_000_000,
252
- }
253
- token_cost["total"] = token_cost["input"] + token_cost["output"]
254
- result["token_cost"] = token_cost
255
-
256
- if as_pdf:
257
- result["pdf_path"] = path
266
+ result["token_cost"] = token_cost
267
+
268
+ if as_pdf:
269
+ result["pdf_path"] = path
258
270
 
259
271
  if depth > 1:
260
272
  recursive_docs = []
@@ -285,3 +297,63 @@ def parse(
285
297
  result["recursive_docs"] = recursive_docs
286
298
 
287
299
  return result
300
+
301
+
302
+ def parse_with_schema(
303
+ path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
304
+ ) -> List[List[Dict]]:
305
+ """
306
+ Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
307
+
308
+ Args:
309
+ path (str): Path to the PDF file.
310
+ schema (Dict): JSON schema to which the parsed output should conform.
311
+ api (str, optional): LLM API provider (one of "openai", "huggingface", "together", "openrouter", or "fireworks").
312
+ model (str, optional): LLM model name.
313
+ **kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens).
314
+
315
+ Returns:
316
+ List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
317
+ """
318
+ system_prompt = f"""
319
+ The output should be formatted as a JSON instance that conforms to the JSON schema below.
320
+
321
+ As an example, for the schema {{
322
+ "properties": {{
323
+ "foo": {{
324
+ "title": "Foo",
325
+ "description": "a list of strings",
326
+ "type": "array",
327
+ "items": {{"type": "string"}}
328
+ }}
329
+ }},
330
+ "required": ["foo"]
331
+ }}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
332
+
333
+ Here is the output schema:
334
+ {json.dumps(schema, indent=2)}
335
+
336
+ """
337
+
338
+ user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
339
+
340
+ responses = []
341
+ images = convert_doc_to_base64_images(path)
342
+ for i, (page_num, image) in enumerate(images):
343
+ resp_dict = create_response(
344
+ api=api,
345
+ model=model,
346
+ user_prompt=user_prompt,
347
+ system_prompt=system_prompt,
348
+ image_url=image,
349
+ temperature=kwargs.get("temperature", 0.0),
350
+ max_tokens=kwargs.get("max_tokens", 1024),
351
+ )
352
+
353
+ response = resp_dict.get("response", "")
354
+ response = response.split("```json")[-1].split("```")[0].strip()
355
+ logger.debug(f"Processing page {page_num + 1} with response: {response}")
356
+ new_dict = json.loads(response)
357
+ responses.append(new_dict)
358
+
359
+ return responses
@@ -3,23 +3,24 @@ import io
3
3
  import mimetypes
4
4
  import os
5
5
  import time
6
+ from functools import wraps
7
+ from typing import Dict, List, Optional, Tuple
8
+
6
9
  import pypdfium2 as pdfium
7
10
  import requests
8
- from functools import wraps
11
+ from huggingface_hub import InferenceClient
12
+ from loguru import logger
13
+ from openai import OpenAI
9
14
  from requests.exceptions import HTTPError
10
- from typing import Dict, List
15
+ from together import Together
11
16
 
12
17
  from lexoid.core.prompt_templates import (
13
18
  INSTRUCTIONS_ADD_PG_BREAK,
19
+ LLAMA_PARSER_PROMPT,
14
20
  OPENAI_USER_PROMPT,
15
21
  PARSER_PROMPT,
16
- LLAMA_PARSER_PROMPT,
17
22
  )
18
23
  from lexoid.core.utils import convert_image_to_pdf
19
- from loguru import logger
20
- from openai import OpenAI
21
- from together import Together
22
- from huggingface_hub import InferenceClient
23
24
 
24
25
 
25
26
  def retry_on_http_error(func):
@@ -65,10 +66,13 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
65
66
  return parse_with_api(path, api="huggingface", **kwargs)
66
67
  if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
67
68
  return parse_with_api(path, api="openrouter", **kwargs)
69
+ if model.startswith("accounts/fireworks"):
70
+ return parse_with_api(path, api="fireworks", **kwargs)
68
71
  raise ValueError(f"Unsupported model: {model}")
69
72
 
70
73
 
71
74
  def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
75
+ logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
72
76
  api_key = os.environ.get("GOOGLE_API_KEY")
73
77
  if not api_key:
74
78
  raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -105,7 +109,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
105
109
  }
106
110
  ],
107
111
  "generationConfig": {
108
- "temperature": kwargs.get("temperature", 0.7),
112
+ "temperature": kwargs.get("temperature", 0.2),
109
113
  },
110
114
  }
111
115
 
@@ -127,7 +131,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
127
131
 
128
132
  combined_text = ""
129
133
  if "<output>" in raw_text:
130
- combined_text = raw_text.split("<output>")[1].strip()
134
+ combined_text = raw_text.split("<output>")[-1].strip()
131
135
  if "</output>" in result:
132
136
  combined_text = result.split("</output>")[0].strip()
133
137
 
@@ -169,18 +173,54 @@ def convert_pdf_page_to_base64(
169
173
  return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
170
174
 
171
175
 
172
- def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
173
- """
174
- Parse documents (PDFs or images) using various vision model APIs.
176
+ def get_messages(
177
+ system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
178
+ ) -> List[Dict]:
179
+ messages = []
180
+ if system_prompt:
181
+ messages.append(
182
+ {
183
+ "role": "system",
184
+ "content": system_prompt,
185
+ }
186
+ )
187
+ base_message = (
188
+ [
189
+ {"type": "text", "text": user_prompt},
190
+ ]
191
+ if user_prompt
192
+ else []
193
+ )
194
+ image_message = (
195
+ [
196
+ {
197
+ "type": "image_url",
198
+ "image_url": {"url": image_url},
199
+ }
200
+ ]
201
+ if image_url
202
+ else []
203
+ )
175
204
 
176
- Args:
177
- path (str): Path to the document to parse
178
- api (str): Which API to use ("openai", "huggingface", or "together")
179
- **kwargs: Additional arguments including model, temperature, title, etc.
205
+ messages.append(
206
+ {
207
+ "role": "user",
208
+ "content": base_message + image_message,
209
+ }
210
+ )
180
211
 
181
- Returns:
182
- Dict: Dictionary containing parsed document data
183
- """
212
+ return messages
213
+
214
+
215
+ def create_response(
216
+ api: str,
217
+ model: str,
218
+ system_prompt: Optional[str] = None,
219
+ user_prompt: Optional[str] = None,
220
+ image_url: Optional[str] = None,
221
+ temperature: float = 0.2,
222
+ max_tokens: int = 1024,
223
+ ) -> Dict:
184
224
  # Initialize appropriate client
185
225
  clients = {
186
226
  "openai": lambda: OpenAI(),
@@ -192,11 +232,52 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
192
232
  base_url="https://openrouter.ai/api/v1",
193
233
  api_key=os.environ["OPENROUTER_API_KEY"],
194
234
  ),
235
+ "fireworks": lambda: OpenAI(
236
+ base_url="https://api.fireworks.ai/inference/v1",
237
+ api_key=os.environ["FIREWORKS_API_KEY"],
238
+ ),
195
239
  }
196
240
  assert api in clients, f"Unsupported API: {api}"
197
- logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
198
241
  client = clients[api]()
199
242
 
243
+ # Prepare messages for the API call
244
+ messages = get_messages(system_prompt, user_prompt, image_url)
245
+
246
+ # Common completion parameters
247
+ completion_params = {
248
+ "model": model,
249
+ "messages": messages,
250
+ "max_tokens": max_tokens,
251
+ "temperature": temperature,
252
+ }
253
+
254
+ # Get completion from selected API
255
+ response = client.chat.completions.create(**completion_params)
256
+ token_usage = response.usage
257
+
258
+ # Extract the response text
259
+ page_text = response.choices[0].message.content
260
+
261
+ return {
262
+ "response": page_text,
263
+ "usage": token_usage,
264
+ }
265
+
266
+
267
+ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
268
+ """
269
+ Parse documents (PDFs or images) using various vision model APIs.
270
+
271
+ Args:
272
+ path (str): Path to the document to parse
273
+ api (str): Which API to use ("openai", "huggingface", "together", "openrouter", or "fireworks")
274
+ **kwargs: Additional arguments including model, temperature, title, etc.
275
+
276
+ Returns:
277
+ Dict: Dictionary containing parsed document data
278
+ """
279
+ logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
280
+
200
281
  # Handle different input types
201
282
  mime_type, _ = mimetypes.guess_type(path)
202
283
  if mime_type and mime_type.startswith("image"):
@@ -215,67 +296,39 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
215
296
  for page_num in range(len(pdf_document))
216
297
  ]
217
298
 
218
- # API-specific message formatting
219
- def get_messages(page_num: int, image_url: str) -> List[Dict]:
220
- image_message = {
221
- "type": "image_url",
222
- "image_url": {"url": image_url},
223
- }
224
-
299
+ # Process each page/image
300
+ all_results = []
301
+ for page_num, image_url in images:
225
302
  if api == "openai":
226
303
  system_prompt = kwargs.get(
227
304
  "system_prompt", PARSER_PROMPT.format(custom_instructions="")
228
305
  )
229
306
  user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
230
- return [
231
- {
232
- "role": "system",
233
- "content": system_prompt,
234
- },
235
- {
236
- "role": "user",
237
- "content": [
238
- {"type": "text", "text": user_prompt},
239
- image_message,
240
- ],
241
- },
242
- ]
243
307
  else:
244
- prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
245
- base_message = {"type": "text", "text": prompt}
246
- return [
247
- {
248
- "role": "user",
249
- "content": [base_message, image_message],
250
- }
251
- ]
252
-
253
- # Process each page/image
254
- all_results = []
255
- for page_num, image_url in images:
256
- messages = get_messages(page_num, image_url)
257
-
258
- # Common completion parameters
259
- completion_params = {
260
- "model": kwargs["model"],
261
- "messages": messages,
262
- "max_tokens": kwargs.get("max_tokens", 1024),
263
- "temperature": kwargs.get("temperature", 0.7),
264
- }
308
+ system_prompt = kwargs.get("system_prompt", None)
309
+ user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
310
+
311
+ response = create_response(
312
+ api=api,
313
+ model=kwargs["model"],
314
+ system_prompt=system_prompt,
315
+ user_prompt=user_prompt,
316
+ image_url=image_url,
317
+ temperature=kwargs.get("temperature", 0.2),
318
+ max_tokens=kwargs.get("max_tokens", 1024),
319
+ )
265
320
 
266
321
  # Get completion from selected API
267
- response = client.chat.completions.create(**completion_params)
268
- token_usage = response.usage
322
+ page_text = response["response"]
323
+ token_usage = response["usage"]
269
324
 
270
- # Extract the response text
271
- page_text = response.choices[0].message.content
272
325
  if kwargs.get("verbose", None):
273
326
  logger.debug(f"Page {page_num + 1} response: {page_text}")
274
327
 
275
328
  # Extract content between output tags if present
276
329
  result = page_text
277
330
  if "<output>" in page_text:
278
- result = page_text.split("<output>")[1].strip()
331
+ result = page_text.split("<output>")[-1].strip()
279
332
  if "</output>" in result:
280
333
  result = result.split("</output>")[0].strip()
281
334
  all_results.append(
@@ -319,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
319
372
  "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
320
373
  },
321
374
  }
375
+
376
+
377
+ def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
378
+ """
379
+ Converts a document (PDF or image) to a list of (page number, base64 data URL) pairs.
380
+
381
+ Args:
382
+ path (str): Path to the PDF or image file.
383
+
384
+ Returns:
385
+ List[Tuple[int, str]]: (page_num, base64 data URL) pairs; one per PDF page, or a single pair with page_num 0 for an image.
386
+ """
387
+ if path.endswith(".pdf"):
388
+ pdf_document = pdfium.PdfDocument(path)
389
+ return [
390
+ (
391
+ page_num,
392
+ f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
393
+ )
394
+ for page_num in range(len(pdf_document))
395
+ ]
396
+ elif mimetypes.guess_type(path)[0].startswith("image"):
397
+ with open(path, "rb") as img_file:
398
+ image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
399
+ return [(0, f"data:image/png;base64,{image_base64}")]
@@ -1,7 +1,8 @@
1
1
  import os
2
+ import re
2
3
  import tempfile
3
4
  from time import time
4
- from typing import List, Dict
5
+ from typing import Dict, List
5
6
 
6
7
  import pandas as pd
7
8
  import pdfplumber
@@ -9,14 +10,15 @@ from docx import Document
9
10
  from pdfminer.high_level import extract_pages
10
11
  from pdfminer.layout import LTTextContainer
11
12
  from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
12
- from pptx2md import convert, ConversionConfig
13
+ from pptx2md import ConversionConfig, convert
14
+
13
15
 
14
16
  from lexoid.core.utils import (
15
17
  get_file_type,
16
18
  get_uri_rect,
17
19
  html_to_markdown,
18
- split_pdf,
19
20
  split_md_by_headings,
21
+ split_pdf,
20
22
  )
21
23
 
22
24
 
@@ -203,6 +205,25 @@ def embed_links_in_text(page, text, links):
203
205
  return text
204
206
 
205
207
 
208
+ def detect_indentation_level(word, base_left_position):
209
+ """Determine indentation level based on left position difference."""
210
+ left_diff = word["x0"] - base_left_position
211
+ if left_diff < 5:
212
+ return 0
213
+ return int(left_diff // 25) + 1
214
+
215
+
216
+ def embed_email_links(text: str) -> str:
217
+ """
218
+ Detect email addresses in text and wrap them in angle brackets.
219
+ For example, 'mail@example.com' becomes '<mail@example.com>'.
220
+ """
221
+ email_pattern = re.compile(
222
+ r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
223
+ )
224
+ return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
225
+
226
+
206
227
  def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
207
228
  """
208
229
  Process a single page's content and return formatted markdown text.
@@ -213,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
213
234
  last_y = None
214
235
  x_tolerance = kwargs.get("x_tolerance", 1)
215
236
  y_tolerance = kwargs.get("y_tolerance", 5)
216
-
237
+ next_h_line_idx = 0
238
+
239
+ # First detect horizontal lines that could be markdown rules
240
+ horizontal_lines = []
241
+ if hasattr(page, "lines"):
242
+ for line in page.lines:
243
+ # Check if line is approximately horizontal (near-zero height, or width more than 20x its height)
244
+ if (
245
+ abs(line["height"]) < 0.1
246
+ or abs(line["width"]) > abs(line["height"]) * 20
247
+ ):
248
+ # Consider it a horizontal rule candidate
249
+ horizontal_lines.append(
250
+ {
251
+ "top": line["top"],
252
+ "bottom": line["bottom"],
253
+ "x0": line["x0"],
254
+ "x1": line["x1"],
255
+ }
256
+ )
217
257
  # Table settings
218
258
  vertical_strategy = kwargs.get("vertical_strategy", "lines")
219
259
  horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
@@ -243,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
243
283
  extra_attrs=["size", "top", "bottom", "fontname"],
244
284
  )
245
285
 
246
- def format_paragraph(text_elements):
247
- """Format a paragraph with styling applied to individual words"""
248
- formatted_words = []
249
- for element in text_elements:
250
- text = element["text"]
251
- formatting = get_text_formatting(element)
252
- formatted_words.append(apply_markdown_formatting(text, formatting))
253
- return f"{' '.join(formatted_words)}\n\n"
286
+ if words:
287
+ font_sizes = [w.get("size", 12) for w in words]
288
+ body_font_size = max(set(font_sizes), key=font_sizes.count)
289
+ else:
290
+ body_font_size = 12
291
+
292
+ left_positions = []
293
+ prev_bottom = None
294
+
295
+ for word in words:
296
+ # Check if this is likely a new line (first word in line)
297
+ if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
298
+ left_positions.append(word["x0"])
299
+ prev_bottom = word["top"]
300
+
301
+ # Find the most common line-start left position (mode)
302
+ if left_positions:
303
+ base_left = max(set(left_positions), key=left_positions.count)
304
+ else:
305
+ base_left = 0
306
+
307
+ for line in horizontal_lines:
308
+ # Check each word to see if it overlaps with this line
309
+ for word in words:
310
+ # Get word bounding box coordinates
311
+ word_left = word["x0"]
312
+ word_right = word["x1"]
313
+ word_top = word["top"]
314
+ word_bottom = word["bottom"]
315
+
316
+ # Check if word overlaps with line in both x and y dimensions
317
+ x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
318
+ y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
319
+
320
+ if x_overlap and y_overlap:
321
+ word["text"] = f"~~{word['text']}~~"
322
+ break
254
323
 
255
324
  def get_text_formatting(word):
256
325
  """
@@ -260,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
260
329
  formatting = {
261
330
  "bold": False,
262
331
  "italic": False,
332
+ "monospace": False,
263
333
  }
264
-
265
334
  # Check font name for common bold/italic indicators
266
335
  font_name = word.get("fontname", "").lower()
267
336
  if any(style in font_name for style in ["bold", "heavy", "black"]):
268
337
  formatting["bold"] = True
269
338
  if any(style in font_name for style in ["italic", "oblique"]):
270
339
  formatting["italic"] = True
271
-
340
+ if "mono" in font_name: # Detect monospace fonts
341
+ formatting["monospace"] = True
272
342
  return formatting
273
343
 
274
344
  def apply_markdown_formatting(text, formatting):
275
345
  """Apply markdown formatting to text based on detected styles"""
346
+ if formatting["monospace"]:
347
+ text = f"`{text}`"
276
348
  if formatting["bold"] and formatting["italic"]:
277
349
  text = f"***{text}***"
278
350
  elif formatting["bold"]:
@@ -281,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
281
353
  text = f"*{text}*"
282
354
  return text
283
355
 
284
- def detect_heading_level(font_size):
285
- if font_size >= 24:
356
+ def format_paragraph(text_elements):
357
+ """
358
+ Format a paragraph with styling applied to individual words.
359
+ If all words are monospace, treat the paragraph as a code block.
360
+ Otherwise, wrap monospace words with backticks (`).
361
+ """
362
+
363
+ all_monospace = True
364
+ formatted_words = []
365
+
366
+ for element in text_elements:
367
+ if isinstance(element, tuple) and element[0] == "indent":
368
+ indent = "&nbsp;" * element[1] * 3
369
+ formatted_words.append(indent)
370
+ continue
371
+
372
+ text = element["text"]
373
+ formatting = get_text_formatting(element)
374
+
375
+ if formatting.get("monospace", False):
376
+ # Wrap monospace words with backticks
377
+ formatted_words.append(f"`{text}`")
378
+ else:
379
+ all_monospace = False
380
+ # Apply other markdown formatting
381
+ formatted_words.append(apply_markdown_formatting(text, formatting))
382
+
383
+ # If all words are monospace, format as a code block
384
+ if all_monospace:
385
+ if isinstance(text_elements[0], tuple):
386
+ indent_str = " " * text_elements[0][1]
387
+ if len(text_elements) > 1:
388
+ text_elements = text_elements[1:]
389
+ text_elements[0]["text"] = indent_str + text_elements[0]["text"]
390
+ else:
391
+ return indent_str
392
+ code_content = " ".join([element["text"] for element in text_elements])
393
+ return f"```\n{code_content}\n```\n\n"
394
+
395
+ # Otherwise, return the formatted paragraph
396
+ return f"{' '.join(formatted_words)}\n\n"
397
+
398
+ def detect_heading_level(font_size, body_font_size):
399
+ """Determine heading level based on font size ratio.
400
+
401
+ Args:
402
+ font_size: The font size to evaluate
403
+ body_font_size: The base body font size for comparison
404
+
405
+ Returns:
406
+ int: The heading level (1-3) or None if not a heading
407
+ """
408
+ size_ratio = font_size / body_font_size
409
+ if size_ratio >= 2:
286
410
  return 1
287
- elif font_size >= 20:
411
+ elif size_ratio >= 1.4:
288
412
  return 2
289
- elif font_size >= 16:
413
+ elif size_ratio >= 1.2:
290
414
  return 3
291
415
  return None
292
416
 
@@ -303,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
303
427
  )
304
428
  )
305
429
  tables.sort(key=lambda x: x[1]["bottom"])
430
+
306
431
  content_elements = []
307
- for word in words:
432
+ for line in horizontal_lines:
433
+ content_elements.append(
434
+ (
435
+ "horizontal_line",
436
+ {
437
+ "top": line["top"],
438
+ "bottom": line["bottom"],
439
+ "x0": line["x0"],
440
+ "x1": line["x1"],
441
+ },
442
+ )
443
+ )
444
+
445
+ for i, word in enumerate(words):
308
446
  while tables and word["bottom"] > tables[0][1]["bottom"]:
309
447
  content_elements.append(tables.pop(0))
448
+
449
+ # Equate position of words on the same line
450
+ if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
451
+ word["top"] = words[i - 1]["top"]
452
+
310
453
  content_elements.append(("word", word))
311
454
  content_elements.extend(tables)
312
455
 
456
+ content_elements.sort(
457
+ key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
458
+ )
459
+
313
460
  for element_type, element in content_elements:
461
+ # If there are any pending paragraphs or headings, add them first
314
462
  if element_type == "table":
315
- # If there are any pending paragraphs or headings, add them first
316
463
  if current_heading:
317
- level = detect_heading_level(current_heading[0]["size"])
464
+ level = detect_heading_level(current_heading[0]["size"], body_font_size)
318
465
  heading_text = format_paragraph(current_heading)
319
466
  markdown_content.append(f"{'#' * level} {heading_text}")
320
467
  current_heading = []
@@ -324,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
324
471
  # Add the table
325
472
  markdown_content.append(element["content"])
326
473
  last_y = element["bottom"]
474
+ elif element_type == "horizontal_line":
475
+ while (next_h_line_idx < len(horizontal_lines)) and (
476
+ last_y is not None
477
+ and horizontal_lines[next_h_line_idx]["top"] <= last_y
478
+ ):
479
+ # Insert the horizontal rule *after* the preceding text
480
+ if current_paragraph: # Flush any pending paragraph
481
+ markdown_content.append(format_paragraph(current_paragraph))
482
+ current_paragraph = []
483
+ markdown_content.append("\n---\n\n") # Add the rule
484
+ next_h_line_idx += 1
327
485
  else:
328
486
  # Process word
329
487
  word = element
330
488
  # Check if this might be a heading
331
- heading_level = detect_heading_level(word["size"])
489
+ heading_level = detect_heading_level(word["size"], body_font_size)
332
490
 
333
491
  # Detect new line based on vertical position
334
492
  is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
@@ -336,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
336
494
  if is_new_line:
337
495
  # If we were collecting a heading
338
496
  if current_heading:
339
- level = detect_heading_level(current_heading[0]["size"])
497
+ level = detect_heading_level(
498
+ current_heading[0]["size"], body_font_size
499
+ )
340
500
  heading_text = format_paragraph(current_heading)
341
501
  markdown_content.append(f"{'#' * level} {heading_text}")
342
502
  current_heading = []
@@ -346,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
346
506
  markdown_content.append(format_paragraph(current_paragraph))
347
507
  current_paragraph = []
348
508
 
509
+ indent_level = detect_indentation_level(word, base_left)
510
+ current_paragraph.append(("indent", indent_level))
511
+
349
512
  # Add word to appropriate collection
350
513
  if heading_level:
351
514
  if current_paragraph: # Flush any pending paragraph
@@ -354,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
354
517
  current_heading.append(word)
355
518
  else:
356
519
  if current_heading: # Flush any pending heading
357
- level = detect_heading_level(current_heading[0]["size"])
520
+ level = detect_heading_level(
521
+ current_heading[0]["size"], body_font_size
522
+ )
358
523
  heading_text = format_paragraph(current_heading)
359
524
  markdown_content.append(f"{'#' * level} {heading_text}")
360
525
  current_heading = []
@@ -364,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
364
529
 
365
530
  # Handle remaining content
366
531
  if current_heading:
367
- level = detect_heading_level(current_heading[0]["size"])
532
+ level = detect_heading_level(current_heading[0]["size"], body_font_size)
368
533
  heading_text = format_paragraph(current_heading)
369
534
  markdown_content.append(f"{'#' * level} {heading_text}")
370
535
 
@@ -383,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
383
548
  if links:
384
549
  content = embed_links_in_text(page, content, links)
385
550
 
551
+ content = embed_email_links(content)
552
+
386
553
  # Remove redundant formatting
387
- content = content.replace("** **", " ").replace("* *", " ")
554
+ content = (
555
+ content.replace("** **", " ")
556
+ .replace("* *", " ")
557
+ .replace("` `", " ")
558
+ .replace("\n```\n\n```", "")
559
+ )
388
560
 
389
561
  return content
390
562
 
@@ -41,7 +41,8 @@ Think step-by-step.
41
41
  '0' is typically more oval than 'O'
42
42
  '8' has a more angular top than 'B'
43
43
  {custom_instructions}
44
- - Return only the correct markdown without additional text or explanations. Do not any additional text (such as "```html" or "```markdown") in the output.
44
+ - Return only the correct markdown without additional text or explanations.
45
+ - DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
45
46
  - Think before generating the output in <thinking></thinking> tags.
46
47
 
47
48
  Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
@@ -345,7 +345,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
345
345
  # Additional wait for any dynamic content
346
346
  try:
347
347
  await page.wait_for_selector("body", timeout=30000)
348
- except:
348
+ except Exception:
349
349
  pass
350
350
 
351
351
  html = await page.content()
@@ -561,24 +561,32 @@ def router(path: str, priority: str = "speed") -> str:
561
561
  priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
562
562
  """
563
563
  file_type = get_file_type(path)
564
- if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
564
+ if (
565
+ file_type.startswith("text/")
566
+ or "spreadsheet" in file_type
567
+ or "presentation" in file_type
568
+ ):
565
569
  return "STATIC_PARSE"
566
570
 
567
571
  if priority == "accuracy":
568
572
  # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
569
573
  # Otherwise, use LLM_PARSE
570
- if (
571
- file_type == "application/pdf"
572
- and not has_image_in_pdf(path)
573
- and has_hyperlink_in_pdf(path)
574
- ):
574
+ has_image = has_image_in_pdf(path)
575
+ has_hyperlink = has_hyperlink_in_pdf(path)
576
+ if file_type == "application/pdf" and not has_image and has_hyperlink:
577
+ logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
575
578
  return "STATIC_PARSE"
579
+ logger.debug(
580
+ f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
581
+ )
576
582
  return "LLM_PARSE"
577
583
  else:
578
584
  # If the file is a PDF without images, use STATIC_PARSE
579
585
  # Otherwise, use LLM_PARSE
580
586
  if file_type == "application/pdf" and not has_image_in_pdf(path):
587
+ logger.debug("Using STATIC_PARSE for PDF without images.")
581
588
  return "STATIC_PARSE"
589
+ logger.debug("Using LLM_PARSE because PDF has images")
582
590
  return "LLM_PARSE"
583
591
 
584
592
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.12"
3
+ version = "0.1.14"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
File without changes