lexoid 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -49,7 +49,8 @@ Description-Content-Type: text/markdown
49
49
  </div>
50
50
 
51
51
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
52
- [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
52
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid)
53
+ [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
53
54
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
54
55
  [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
55
56
 
@@ -144,6 +145,7 @@ print(parsed_md)
144
145
  * Hugging Face
145
146
  * Together AI
146
147
  * OpenRouter
148
+ * Fireworks
147
149
 
148
150
  ## Benchmark
149
151
 
@@ -151,22 +153,25 @@ Results aggregated across 5 iterations each for 5 documents.
151
153
 
152
154
  _Note:_ Benchmarks are currently done in the zero-shot setting.
153
155
 
154
- | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
155
- | ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
156
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
157
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
158
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
159
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
160
- | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
161
- | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
162
- | 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
163
- | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
164
- | 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
165
- | 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
166
- | 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
167
- | 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
168
- | 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
169
- | 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
170
- | 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
171
- | 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
156
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
157
+ | --- | --- | --- | --- | --- | --- |
158
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
159
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
160
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
161
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
162
+ | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
163
+ | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
164
+ | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
165
+ | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
166
+ | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
167
+ | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
168
+ | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
169
+ | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
170
+ | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
171
+ | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
172
+ | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
173
+ | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
174
+ | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
175
+ | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
176
+ | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
172
177
 
@@ -14,7 +14,8 @@
14
14
  </div>
15
15
 
16
16
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
17
- [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
17
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid)
18
+ [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
18
19
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
19
20
  [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
20
21
 
@@ -109,6 +110,7 @@ print(parsed_md)
109
110
  * Hugging Face
110
111
  * Together AI
111
112
  * OpenRouter
113
+ * Fireworks
112
114
 
113
115
  ## Benchmark
114
116
 
@@ -116,21 +118,24 @@ Results aggregated across 5 iterations each for 5 documents.
116
118
 
117
119
  _Note:_ Benchmarks are currently done in the zero-shot setting.
118
120
 
119
- | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
120
- | ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
121
- | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
122
- | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
123
- | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
124
- | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
125
- | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
126
- | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
127
- | 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
128
- | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
129
- | 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
130
- | 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
131
- | 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
132
- | 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
133
- | 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
134
- | 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
135
- | 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
136
- | 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
121
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
122
+ | --- | --- | --- | --- | --- | --- |
123
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
124
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
125
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
126
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
127
+ | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
128
+ | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
129
+ | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
130
+ | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
131
+ | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
132
+ | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
133
+ | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
134
+ | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
135
+ | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
136
+ | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
137
+ | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
138
+ | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
139
+ | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
140
+ | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
141
+ | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
@@ -10,7 +10,11 @@ from typing import Union, Dict, List
10
10
 
11
11
  from loguru import logger
12
12
 
13
- from lexoid.core.parse_type.llm_parser import parse_llm_doc
13
+ from lexoid.core.parse_type.llm_parser import (
14
+ parse_llm_doc,
15
+ create_response,
16
+ convert_doc_to_base64_images,
17
+ )
14
18
  from lexoid.core.parse_type.static_parser import parse_static_doc
15
19
  from lexoid.core.utils import (
16
20
  convert_to_pdf,
@@ -49,6 +53,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
49
53
  - parent_title: Title of parent doc if recursively parsed
50
54
  - recursive_docs: List of dictionaries for recursively parsed documents
51
55
  - token_usage: Dictionary containing token usage statistics
56
+ - parser_used: Which parser was actually used
52
57
  """
53
58
  if parser_type == ParserType.AUTO:
54
59
  router_priority = kwargs.get("router_priority", "speed")
@@ -60,10 +65,13 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
60
65
  )
61
66
  if parser_type == ParserType.STATIC_PARSE:
62
67
  logger.debug("Using static parser")
63
- return parse_static_doc(path, **kwargs)
68
+ result = parse_static_doc(path, **kwargs)
64
69
  else:
65
70
  logger.debug("Using LLM parser")
66
- return parse_llm_doc(path, **kwargs)
71
+ result = parse_llm_doc(path, **kwargs)
72
+
73
+ result["parser_used"] = parser_type
74
+ return result
67
75
 
68
76
 
69
77
  def parse_chunk_list(
@@ -82,15 +90,18 @@ def parse_chunk_list(
82
90
  """
83
91
  combined_segments = []
84
92
  raw_texts = []
85
- token_usage = {"input": 0, "output": 0, "image_count": 0}
93
+ token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
86
94
  for file_path in file_paths:
87
95
  result = parse_chunk(file_path, parser_type, **kwargs)
88
96
  combined_segments.extend(result["segments"])
89
97
  raw_texts.append(result["raw"])
90
- if "token_usage" in result:
98
+ if (
99
+ result.get("parser_used") == ParserType.LLM_PARSE
100
+ and "token_usage" in result
101
+ ):
91
102
  token_usage["input"] += result["token_usage"]["input"]
92
103
  token_usage["output"] += result["token_usage"]["output"]
93
- token_usage["image_count"] += len(result["segments"])
104
+ token_usage["llm_page_count"] += len(result["segments"])
94
105
  token_usage["total"] = token_usage["input"] + token_usage["output"]
95
106
 
96
107
  return {
@@ -136,7 +147,7 @@ def parse(
136
147
  as_pdf = kwargs.get("as_pdf", False)
137
148
  depth = kwargs.get("depth", 1)
138
149
 
139
- if type(parser_type) == str:
150
+ if type(parser_type) is str:
140
151
  parser_type = ParserType[parser_type]
141
152
  if (
142
153
  path.lower().endswith((".doc", ".docx"))
@@ -184,7 +195,7 @@ def parse(
184
195
 
185
196
  if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
186
197
  kwargs["split"] = False
187
- result = parse_chunk(path, parser_type, **kwargs)
198
+ result = parse_chunk_list([path], parser_type, kwargs)
188
199
  else:
189
200
  kwargs["split"] = True
190
201
  split_dir = os.path.join(temp_dir, "splits/")
@@ -219,42 +230,43 @@ def parse(
219
230
  "token_usage": {
220
231
  "input": sum(r["token_usage"]["input"] for r in chunk_results),
221
232
  "output": sum(r["token_usage"]["output"] for r in chunk_results),
222
- "image_count": sum(
223
- r["token_usage"]["image_count"] for r in chunk_results
233
+ "llm_page_count": sum(
234
+ r["token_usage"]["llm_page_count"] for r in chunk_results
224
235
  ),
225
236
  "total": sum(r["token_usage"]["total"] for r in chunk_results),
226
237
  },
227
238
  }
228
239
 
229
- if "api_cost_mapping" in kwargs:
230
- api_cost_mapping = kwargs["api_cost_mapping"]
231
- if isinstance(api_cost_mapping, dict):
232
- api_cost_mapping = api_cost_mapping
233
- elif isinstance(api_cost_mapping, str) and os.path.exists(
234
- api_cost_mapping
235
- ):
236
- with open(api_cost_mapping, "r") as f:
237
- api_cost_mapping = json.load(f)
238
- else:
239
- raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
240
-
241
- api_cost = api_cost_mapping.get(
242
- kwargs.get("model", "gemini-2.0-flash"), None
240
+ if "api_cost_mapping" in kwargs and "token_usage" in result:
241
+ api_cost_mapping = kwargs["api_cost_mapping"]
242
+ if isinstance(api_cost_mapping, dict):
243
+ api_cost_mapping = api_cost_mapping
244
+ elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
245
+ with open(api_cost_mapping, "r") as f:
246
+ api_cost_mapping = json.load(f)
247
+ else:
248
+ raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
249
+
250
+ api_cost = api_cost_mapping.get(
251
+ kwargs.get("model", "gemini-2.0-flash"), None
252
+ )
253
+ if api_cost:
254
+ token_usage = result["token_usage"]
255
+ token_cost = {
256
+ "input": token_usage["input"] * api_cost["input"] / 1_000_000,
257
+ "input-image": api_cost.get("input-image", 0)
258
+ * token_usage.get("llm_page_count", 0),
259
+ "output": token_usage["output"] * api_cost["output"] / 1_000_000,
260
+ }
261
+ token_cost["total"] = (
262
+ token_cost["input"]
263
+ + token_cost["input-image"]
264
+ + token_cost["output"]
243
265
  )
244
- if api_cost:
245
- token_usage = result["token_usage"]
246
- token_cost = {
247
- "input": token_usage["input"] * api_cost["input"] / 1_000_000
248
- + api_cost.get("input-image", 0) * token_usage["image_count"],
249
- "output": token_usage["output"]
250
- * api_cost["output"]
251
- / 1_000_000,
252
- }
253
- token_cost["total"] = token_cost["input"] + token_cost["output"]
254
- result["token_cost"] = token_cost
255
-
256
- if as_pdf:
257
- result["pdf_path"] = path
266
+ result["token_cost"] = token_cost
267
+
268
+ if as_pdf:
269
+ result["pdf_path"] = path
258
270
 
259
271
  if depth > 1:
260
272
  recursive_docs = []
@@ -285,3 +297,63 @@ def parse(
285
297
  result["recursive_docs"] = recursive_docs
286
298
 
287
299
  return result
300
+
301
+
302
+ def parse_with_schema(
303
+ path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
304
+ ) -> List[List[Dict]]:
305
+ """
306
+ Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
307
+
308
+ Args:
309
+ path (str): Path to the PDF file.
310
+ schema (Dict): JSON schema to which the parsed output should conform.
311
+ api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks").
312
+ model (str, optional): LLM model name.
313
+ **kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens).
314
+
315
+ Returns:
316
+ List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
317
+ """
318
+ system_prompt = f"""
319
+ The output should be formatted as a JSON instance that conforms to the JSON schema below.
320
+
321
+ As an example, for the schema {{
322
+ "properties": {{
323
+ "foo": {{
324
+ "title": "Foo",
325
+ "description": "a list of strings",
326
+ "type": "array",
327
+ "items": {{"type": "string"}}
328
+ }}
329
+ }},
330
+ "required": ["foo"]
331
+ }}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
332
+
333
+ Here is the output schema:
334
+ {json.dumps(schema, indent=2)}
335
+
336
+ """
337
+
338
+ user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
339
+
340
+ responses = []
341
+ images = convert_doc_to_base64_images(path)
342
+ for i, (page_num, image) in enumerate(images):
343
+ resp_dict = create_response(
344
+ api=api,
345
+ model=model,
346
+ user_prompt=user_prompt,
347
+ system_prompt=system_prompt,
348
+ image_url=image,
349
+ temperature=kwargs.get("temperature", 0.0),
350
+ max_tokens=kwargs.get("max_tokens", 1024),
351
+ )
352
+
353
+ response = resp_dict.get("response", "")
354
+ response = response.split("```json")[-1].split("```")[0].strip()
355
+ logger.debug(f"Processing page {page_num + 1} with response: {response}")
356
+ new_dict = json.loads(response)
357
+ responses.append(new_dict)
358
+
359
+ return responses
@@ -3,23 +3,24 @@ import io
3
3
  import mimetypes
4
4
  import os
5
5
  import time
6
+ from functools import wraps
7
+ from typing import Dict, List, Optional, Tuple
8
+
6
9
  import pypdfium2 as pdfium
7
10
  import requests
8
- from functools import wraps
11
+ from huggingface_hub import InferenceClient
12
+ from loguru import logger
13
+ from openai import OpenAI
9
14
  from requests.exceptions import HTTPError
10
- from typing import Dict, List
15
+ from together import Together
11
16
 
12
17
  from lexoid.core.prompt_templates import (
13
18
  INSTRUCTIONS_ADD_PG_BREAK,
19
+ LLAMA_PARSER_PROMPT,
14
20
  OPENAI_USER_PROMPT,
15
21
  PARSER_PROMPT,
16
- LLAMA_PARSER_PROMPT,
17
22
  )
18
23
  from lexoid.core.utils import convert_image_to_pdf
19
- from loguru import logger
20
- from openai import OpenAI
21
- from together import Together
22
- from huggingface_hub import InferenceClient
23
24
 
24
25
 
25
26
  def retry_on_http_error(func):
@@ -65,10 +66,13 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
65
66
  return parse_with_api(path, api="huggingface", **kwargs)
66
67
  if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
67
68
  return parse_with_api(path, api="openrouter", **kwargs)
69
+ if model.startswith("accounts/fireworks"):
70
+ return parse_with_api(path, api="fireworks", **kwargs)
68
71
  raise ValueError(f"Unsupported model: {model}")
69
72
 
70
73
 
71
74
  def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
75
+ logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
72
76
  api_key = os.environ.get("GOOGLE_API_KEY")
73
77
  if not api_key:
74
78
  raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -105,7 +109,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
105
109
  }
106
110
  ],
107
111
  "generationConfig": {
108
- "temperature": kwargs.get("temperature", 0.7),
112
+ "temperature": kwargs.get("temperature", 0.2),
109
113
  },
110
114
  }
111
115
 
@@ -127,7 +131,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
127
131
 
128
132
  combined_text = ""
129
133
  if "<output>" in raw_text:
130
- combined_text = raw_text.split("<output>")[1].strip()
134
+ combined_text = raw_text.split("<output>")[-1].strip()
131
135
  if "</output>" in result:
132
136
  combined_text = result.split("</output>")[0].strip()
133
137
 
@@ -169,18 +173,54 @@ def convert_pdf_page_to_base64(
169
173
  return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
170
174
 
171
175
 
172
- def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
173
- """
174
- Parse documents (PDFs or images) using various vision model APIs.
176
+ def get_messages(
177
+ system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
178
+ ) -> List[Dict]:
179
+ messages = []
180
+ if system_prompt:
181
+ messages.append(
182
+ {
183
+ "role": "system",
184
+ "content": system_prompt,
185
+ }
186
+ )
187
+ base_message = (
188
+ [
189
+ {"type": "text", "text": user_prompt},
190
+ ]
191
+ if user_prompt
192
+ else []
193
+ )
194
+ image_message = (
195
+ [
196
+ {
197
+ "type": "image_url",
198
+ "image_url": {"url": image_url},
199
+ }
200
+ ]
201
+ if image_url
202
+ else []
203
+ )
175
204
 
176
- Args:
177
- path (str): Path to the document to parse
178
- api (str): Which API to use ("openai", "huggingface", or "together")
179
- **kwargs: Additional arguments including model, temperature, title, etc.
205
+ messages.append(
206
+ {
207
+ "role": "user",
208
+ "content": base_message + image_message,
209
+ }
210
+ )
180
211
 
181
- Returns:
182
- Dict: Dictionary containing parsed document data
183
- """
212
+ return messages
213
+
214
+
215
+ def create_response(
216
+ api: str,
217
+ model: str,
218
+ system_prompt: Optional[str] = None,
219
+ user_prompt: Optional[str] = None,
220
+ image_url: Optional[str] = None,
221
+ temperature: float = 0.2,
222
+ max_tokens: int = 1024,
223
+ ) -> Dict:
184
224
  # Initialize appropriate client
185
225
  clients = {
186
226
  "openai": lambda: OpenAI(),
@@ -192,11 +232,52 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
192
232
  base_url="https://openrouter.ai/api/v1",
193
233
  api_key=os.environ["OPENROUTER_API_KEY"],
194
234
  ),
235
+ "fireworks": lambda: OpenAI(
236
+ base_url="https://api.fireworks.ai/inference/v1",
237
+ api_key=os.environ["FIREWORKS_API_KEY"],
238
+ ),
195
239
  }
196
240
  assert api in clients, f"Unsupported API: {api}"
197
- logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
198
241
  client = clients[api]()
199
242
 
243
+ # Prepare messages for the API call
244
+ messages = get_messages(system_prompt, user_prompt, image_url)
245
+
246
+ # Common completion parameters
247
+ completion_params = {
248
+ "model": model,
249
+ "messages": messages,
250
+ "max_tokens": max_tokens,
251
+ "temperature": temperature,
252
+ }
253
+
254
+ # Get completion from selected API
255
+ response = client.chat.completions.create(**completion_params)
256
+ token_usage = response.usage
257
+
258
+ # Extract the response text
259
+ page_text = response.choices[0].message.content
260
+
261
+ return {
262
+ "response": page_text,
263
+ "usage": token_usage,
264
+ }
265
+
266
+
267
+ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
268
+ """
269
+ Parse documents (PDFs or images) using various vision model APIs.
270
+
271
+ Args:
272
+ path (str): Path to the document to parse
273
+ api (str): Which API to use ("openai", "huggingface", "together", "openrouter", or "fireworks")
274
+ **kwargs: Additional arguments including model, temperature, title, etc.
275
+
276
+ Returns:
277
+ Dict: Dictionary containing parsed document data
278
+ """
279
+ logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
280
+
200
281
  # Handle different input types
201
282
  mime_type, _ = mimetypes.guess_type(path)
202
283
  if mime_type and mime_type.startswith("image"):
@@ -215,67 +296,39 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
215
296
  for page_num in range(len(pdf_document))
216
297
  ]
217
298
 
218
- # API-specific message formatting
219
- def get_messages(page_num: int, image_url: str) -> List[Dict]:
220
- image_message = {
221
- "type": "image_url",
222
- "image_url": {"url": image_url},
223
- }
224
-
299
+ # Process each page/image
300
+ all_results = []
301
+ for page_num, image_url in images:
225
302
  if api == "openai":
226
303
  system_prompt = kwargs.get(
227
304
  "system_prompt", PARSER_PROMPT.format(custom_instructions="")
228
305
  )
229
306
  user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
230
- return [
231
- {
232
- "role": "system",
233
- "content": system_prompt,
234
- },
235
- {
236
- "role": "user",
237
- "content": [
238
- {"type": "text", "text": user_prompt},
239
- image_message,
240
- ],
241
- },
242
- ]
243
307
  else:
244
- prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
245
- base_message = {"type": "text", "text": prompt}
246
- return [
247
- {
248
- "role": "user",
249
- "content": [base_message, image_message],
250
- }
251
- ]
252
-
253
- # Process each page/image
254
- all_results = []
255
- for page_num, image_url in images:
256
- messages = get_messages(page_num, image_url)
257
-
258
- # Common completion parameters
259
- completion_params = {
260
- "model": kwargs["model"],
261
- "messages": messages,
262
- "max_tokens": kwargs.get("max_tokens", 1024),
263
- "temperature": kwargs.get("temperature", 0.7),
264
- }
308
+ system_prompt = kwargs.get("system_prompt", None)
309
+ user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
310
+
311
+ response = create_response(
312
+ api=api,
313
+ model=kwargs["model"],
314
+ system_prompt=system_prompt,
315
+ user_prompt=user_prompt,
316
+ image_url=image_url,
317
+ temperature=kwargs.get("temperature", 0.2),
318
+ max_tokens=kwargs.get("max_tokens", 1024),
319
+ )
265
320
 
266
321
  # Get completion from selected API
267
- response = client.chat.completions.create(**completion_params)
268
- token_usage = response.usage
322
+ page_text = response["response"]
323
+ token_usage = response["usage"]
269
324
 
270
- # Extract the response text
271
- page_text = response.choices[0].message.content
272
325
  if kwargs.get("verbose", None):
273
326
  logger.debug(f"Page {page_num + 1} response: {page_text}")
274
327
 
275
328
  # Extract content between output tags if present
276
329
  result = page_text
277
330
  if "<output>" in page_text:
278
- result = page_text.split("<output>")[1].strip()
331
+ result = page_text.split("<output>")[-1].strip()
279
332
  if "</output>" in result:
280
333
  result = result.split("</output>")[0].strip()
281
334
  all_results.append(
@@ -319,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
319
372
  "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
320
373
  },
321
374
  }
375
+
376
+
377
+ def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
378
+ """
379
+ Converts a document (PDF or image) to a list of (page number, base64 data URL) tuples.
380
+
381
+ Args:
382
+ path (str): Path to the PDF or image file.
383
+
384
+ Returns:
385
+ List[Tuple[int, str]]: One (page number, base64 data URL) tuple per PDF page, or a single tuple with page number 0 for an image.
386
+ """
387
+ if path.endswith(".pdf"):
388
+ pdf_document = pdfium.PdfDocument(path)
389
+ return [
390
+ (
391
+ page_num,
392
+ f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
393
+ )
394
+ for page_num in range(len(pdf_document))
395
+ ]
396
+ elif mimetypes.guess_type(path)[0].startswith("image"):
397
+ with open(path, "rb") as img_file:
398
+ image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
399
+ return [(0, f"data:image/png;base64,{image_base64}")]
@@ -41,7 +41,8 @@ Think step-by-step.
41
41
  '0' is typically more oval than 'O'
42
42
  '8' has a more angular top than 'B'
43
43
  {custom_instructions}
44
- - Return only the correct markdown without additional text or explanations. Do not any additional text (such as "```html" or "```markdown") in the output.
44
+ - Return only the correct markdown without additional text or explanations.
45
+ - DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
45
46
  - Think before generating the output in <thinking></thinking> tags.
46
47
 
47
48
  Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
@@ -345,7 +345,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
345
345
  # Additional wait for any dynamic content
346
346
  try:
347
347
  await page.wait_for_selector("body", timeout=30000)
348
- except:
348
+ except Exception:
349
349
  pass
350
350
 
351
351
  html = await page.content()
@@ -561,24 +561,32 @@ def router(path: str, priority: str = "speed") -> str:
561
561
  priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
562
562
  """
563
563
  file_type = get_file_type(path)
564
- if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
564
+ if (
565
+ file_type.startswith("text/")
566
+ or "spreadsheet" in file_type
567
+ or "presentation" in file_type
568
+ ):
565
569
  return "STATIC_PARSE"
566
570
 
567
571
  if priority == "accuracy":
568
572
  # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
569
573
  # Otherwise, use LLM_PARSE
570
- if (
571
- file_type == "application/pdf"
572
- and not has_image_in_pdf(path)
573
- and has_hyperlink_in_pdf(path)
574
- ):
574
+ has_image = has_image_in_pdf(path)
575
+ has_hyperlink = has_hyperlink_in_pdf(path)
576
+ if file_type == "application/pdf" and not has_image and has_hyperlink:
577
+ logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
575
578
  return "STATIC_PARSE"
579
+ logger.debug(
580
+ f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
581
+ )
576
582
  return "LLM_PARSE"
577
583
  else:
578
584
  # If the file is a PDF without images, use STATIC_PARSE
579
585
  # Otherwise, use LLM_PARSE
580
586
  if file_type == "application/pdf" and not has_image_in_pdf(path):
587
+ logger.debug("Using STATIC_PARSE for PDF without images.")
581
588
  return "STATIC_PARSE"
589
+ logger.debug("Using LLM_PARSE because PDF has images")
582
590
  return "LLM_PARSE"
583
591
 
584
592
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.13"
3
+ version = "0.1.14"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
File without changes