lexoid 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -50,7 +51,8 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
-        parser_type = ParserType[router(path)]
+        router_priority = kwargs.get("router_priority", "speed")
+        parser_type = ParserType[router(path, router_priority)]
         logger.debug(f"Auto-detected parser type: {parser_type}")

     kwargs["start"] = (
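The hunk above threads a new `router_priority` kwarg through to `router()`, whose default also flips from "accuracy" to "speed" (see the `lexoid/core/utils.py` hunks below). A minimal usage sketch, assuming the public `parse` API; the file name is a placeholder:

```python
from lexoid.api import parse

# AUTO routing now defaults to priority="speed" (prefer STATIC_PARSE
# where it is safe); pass "accuracy" to keep the LLM-leaning behavior.
result = parse("contract.pdf", parser_type="AUTO", router_priority="accuracy")
```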
@@ -80,7 +82,7 @@ def parse_chunk_list(
     """
     combined_segments = []
     raw_texts = []
-    token_usage = {"input": 0, "output": 0}
+    token_usage = {"input": 0, "output": 0, "image_count": 0}
     for file_path in file_paths:
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
@@ -88,6 +90,7 @@ def parse_chunk_list(
         if "token_usage" in result:
             token_usage["input"] += result["token_usage"]["input"]
             token_usage["output"] += result["token_usage"]["output"]
+            token_usage["image_count"] += len(result["segments"])
     token_usage["total"] = token_usage["input"] + token_usage["output"]

     return {
@@ -135,14 +138,20 @@ def parse(

     if type(parser_type) == str:
         parser_type = ParserType[parser_type]
+    if (
+        path.lower().endswith((".doc", ".docx"))
+        and parser_type != ParserType.STATIC_PARSE
+    ):
+        as_pdf = True
+    if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE
+    if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .pptx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE

     with tempfile.TemporaryDirectory() as temp_dir:
-        if (
-            path.lower().endswith((".doc", ".docx"))
-            and parser_type != ParserType.STATIC_PARSE
-        ):
-            as_pdf = True
-
+        kwargs["temp_dir"] = temp_dir
         if path.startswith(("http://", "https://")):
             kwargs["url"] = path
             download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
@@ -210,9 +219,40 @@ def parse(
             "token_usage": {
                 "input": sum(r["token_usage"]["input"] for r in chunk_results),
                 "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "image_count": sum(
+                    r["token_usage"]["image_count"] for r in chunk_results
+                ),
                 "total": sum(r["token_usage"]["total"] for r in chunk_results),
             },
         }
+
+        if "api_cost_mapping" in kwargs:
+            api_cost_mapping = kwargs["api_cost_mapping"]
+            if isinstance(api_cost_mapping, dict):
+                api_cost_mapping = api_cost_mapping
+            elif isinstance(api_cost_mapping, str) and os.path.exists(
+                api_cost_mapping
+            ):
+                with open(api_cost_mapping, "r") as f:
+                    api_cost_mapping = json.load(f)
+            else:
+                raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
+
+            api_cost = api_cost_mapping.get(
+                kwargs.get("model", "gemini-2.0-flash"), None
+            )
+            if api_cost:
+                token_usage = result["token_usage"]
+                token_cost = {
+                    "input": token_usage["input"] * api_cost["input"] / 1_000_000
+                    + api_cost.get("input-image", 0) * token_usage["image_count"],
+                    "output": token_usage["output"]
+                    * api_cost["output"]
+                    / 1_000_000,
+                }
+                token_cost["total"] = token_cost["input"] + token_cost["output"]
+                result["token_cost"] = token_cost
+
         if as_pdf:
             result["pdf_path"] = path

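The new `api_cost_mapping` block accepts either a dict or a path to a JSON file keyed by model name, with per-million-token `input`/`output` prices and an optional flat per-image `input-image` price. A hedged usage sketch (the prices and file name below are placeholders, not real rates):

```python
from lexoid.api import parse

# Placeholder prices in USD: "input"/"output" are per 1M tokens,
# "input-image" is a flat per-image charge.
costs = {"gemini-2.0-flash": {"input": 0.10, "output": 0.40, "input-image": 0.0}}

result = parse(
    "report.pdf",
    parser_type="LLM_PARSE",
    model="gemini-2.0-flash",
    api_cost_mapping=costs,
)
print(result["token_usage"])  # input / output / image_count / total
print(result["token_cost"])   # input / output / total, in dollars
```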
lexoid/core/parse_type/llm_parser.py CHANGED
@@ -31,6 +31,7 @@ def retry_on_http_error(func):
             logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
             time.sleep(10)
             try:
+                logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except HTTPError as e:
                 logger.error(f"Retry failed: {e}")
@@ -49,6 +50,8 @@ def retry_on_http_error(func):

 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
     if "model" not in kwargs:
         kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
@@ -57,9 +60,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if model.startswith("gpt"):
         return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
-        if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+        if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
             return parse_with_api(path, api="together", **kwargs)
         return parse_with_api(path, api="huggingface", **kwargs)
+    if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
+        return parse_with_api(path, api="openrouter", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


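Two new dispatch paths here: an explicit `api_provider` kwarg that short-circuits model-prefix routing, and routing of `microsoft/`, `google/`, and `qwen/` model names to OpenRouter. A sketch under the assumption that `OPENROUTER_API_KEY` is set (the client hunk further down reads it) and with a placeholder input file:

```python
from lexoid.api import parse

# Prefix-based routing: a "qwen/..." model now goes through OpenRouter.
result = parse("scan.png", parser_type="LLM_PARSE",
               model="qwen/qwen-2.5-vl-7b-instruct")

# Or name the provider explicitly and skip prefix matching entirely.
result = parse("scan.png", parser_type="LLM_PARSE",
               model="gpt-4o-mini", api_provider="openai")
```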
@@ -81,20 +86,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         file_content = file.read()
     base64_file = base64.b64encode(file_content).decode("utf-8")

-    # Ideally, we do this ourselves. But, for now this might be a good enough.
-    custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
-    if kwargs["pages_per_split_"] == 1:
-        custom_instruction = ""
+    if "system_prompt" in kwargs:
+        prompt = kwargs["system_prompt"]
+    else:
+        # Ideally, we do this ourselves. But, for now this might be a good enough.
+        custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
+        if kwargs["pages_per_split_"] == 1:
+            custom_instruction = ""
+        prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction)

     payload = {
         "contents": [
             {
                 "parts": [
-                    {
-                        "text": PARSER_PROMPT.format(
-                            custom_instructions=custom_instruction
-                        )
-                    },
+                    {"text": prompt},
                     {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                 ]
             }
@@ -105,9 +110,11 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     }

     headers = {"Content-Type": "application/json"}
-
-    response = requests.post(url, json=payload, headers=headers)
-    response.raise_for_status()
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=120)
+        response.raise_for_status()
+    except requests.Timeout as e:
+        raise HTTPError(f"Timeout error occurred: {e}")

     result = response.json()

@@ -130,7 +137,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     total_tokens = input_tokens + output_tokens

     return {
-        "raw": combined_text,
+        "raw": combined_text.replace("<page-break>", "\n\n"),
         "segments": [
             {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
             for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
@@ -181,6 +188,10 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
         "together": lambda: Together(),
+        "openrouter": lambda: OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        ),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -206,35 +217,32 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:

     # API-specific message formatting
     def get_messages(page_num: int, image_url: str) -> List[Dict]:
-        base_message = {
-            "type": "text",
-            "text": LLAMA_PARSER_PROMPT,
-        }
         image_message = {
             "type": "image_url",
             "image_url": {"url": image_url},
         }

         if api == "openai":
+            system_prompt = kwargs.get(
+                "system_prompt", PARSER_PROMPT.format(custom_instructions="")
+            )
+            user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
             return [
                 {
                     "role": "system",
-                    "content": PARSER_PROMPT.format(
-                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
-                    ),
+                    "content": system_prompt,
                 },
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                        },
+                        {"type": "text", "text": user_prompt},
                         image_message,
                     ],
                 },
             ]
         else:
+            prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
+            base_message = {"type": "text", "text": prompt}
             return [
                 {
                     "role": "user",
@@ -283,7 +291,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
     all_texts = [text for _, text, _, _, _ in all_results]
-    combined_text = "<page-break>".join(all_texts)
+    combined_text = "\n\n".join(all_texts)

     return {
         "raw": combined_text,
lexoid/core/parse_type/static_parser.py CHANGED
@@ -1,12 +1,23 @@
+import os
 import tempfile
+from time import time
+from typing import List, Dict
+
 import pandas as pd
 import pdfplumber
-from typing import List, Dict
-from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
+from docx import Document
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-from docx import Document
+from pptx2md import convert, ConversionConfig
+
+from lexoid.core.utils import (
+    get_file_type,
+    get_uri_rect,
+    html_to_markdown,
+    split_pdf,
+    split_md_by_headings,
+)


 def parse_static_doc(path: str, **kwargs) -> Dict:
@@ -47,8 +58,11 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
-    elif file_type == "text/csv":
-        df = pd.read_csv(path)
+    elif file_type == "text/csv" or "spreadsheet" in file_type:
+        if "spreadsheet" in file_type:
+            df = pd.read_excel(path)
+        else:
+            df = pd.read_csv(path)
         content = df.to_markdown(index=False)
         return {
             "raw": content,
@@ -58,6 +72,27 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
+    elif "presentation" in file_type:
+        md_path = os.path.join(kwargs["temp_dir"], f"{int(time())}.md")
+        convert(
+            ConversionConfig(
+                pptx_path=path,
+                output_path=md_path,
+                image_dir=None,
+                disable_image=True,
+                disable_notes=True,
+            )
+        )
+        with open(md_path, "r") as f:
+            content = f.read()
+        return {
+            "raw": content,
+            "segments": split_md_by_headings(content, "#"),
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")

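Presentations are now converted to Markdown via pptx2md (images and speaker notes disabled) and segmented by headings, while spreadsheets go through `pd.read_excel`; the `api.py` hunk above already forces STATIC_PARSE for these extensions under LLM_PARSE. A sketch with placeholder file names:

```python
from lexoid.api import parse

deck = parse("slides.pptx", parser_type="STATIC_PARSE")
sheet = parse("table.xlsx", parser_type="STATIC_PARSE")

# Assuming parse surfaces the static parser's combined markdown as "raw".
print(deck["raw"])
print(sheet["raw"])
```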
@@ -389,7 +424,7 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     ]

     return {
-        "raw": "<page-break>".join(page_texts),
+        "raw": "\n\n".join(page_texts),
         "segments": segments,
         "title": kwargs["title"],
         "url": kwargs.get("url", ""),
lexoid/core/utils.py CHANGED
@@ -46,7 +46,7 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):


 def create_sub_pdf(
-    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] | int] = None
 ) -> str:
     if isinstance(page_nums, int):
         page_nums = (page_nums,)
@@ -106,6 +106,8 @@ def is_supported_file_type(path: str) -> bool:
     if (
         file_type == "application/pdf"
         or "wordprocessing" in file_type
+        or "spreadsheet" in file_type
+        or "presentation" in file_type
        or file_type.startswith("image/")
        or file_type.startswith("text")
    ):
@@ -217,7 +219,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
     pattern = r"^([^\n]+)\n-+$"
     sections = re.split(pattern, markdown_content, flags=re.MULTILINE)
     # Remove empty sections and strip whitespace
-    sections = [section.strip() for section in sections if section.strip()]
+    sections = [section.strip() for section in sections]

     # Handle content before first heading if it exists
     if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
@@ -244,7 +246,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
     headings = re.findall(regex, markdown_content, flags=re.MULTILINE)

     # Remove empty sections and strip whitespace
-    sections = [section.strip() for section in sections if section.strip()]
+    sections = [section.strip() for section in sections]

     # Handle content before first heading if it exists
     if len(sections) > len(headings):
@@ -299,6 +301,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:

     return content

+
 def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
@@ -473,7 +476,10 @@ def save_webpage_as_pdf(url: str, output_path: str) -> str:
     Returns:
         str: The path to the saved PDF file.
     """
-    app = QApplication(sys.argv)
+    if not QApplication.instance():
+        app = QApplication(sys.argv)
+    else:
+        app = QApplication.instance()
     web = QWebEngineView()
     web.load(QUrl(url))

@@ -546,7 +552,7 @@ def has_hyperlink_in_pdf(path: str):
     )


-def router(path: str, priority: str = "accuracy") -> str:
+def router(path: str, priority: str = "speed") -> str:
     """
     Routes the file path to the appropriate parser based on the file type.

@@ -555,9 +561,9 @@ def router(path: str, priority: str = "speed") -> str:
         priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
     """
     file_type = get_file_type(path)
-    if file_type.startswith("text/"):
+    if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
         return "STATIC_PARSE"
-
+
     if priority == "accuracy":
         # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
@@ -571,13 +577,11 @@ def router(path: str, priority: str = "speed") -> str:
     else:
         # If the file is a PDF without images, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
-        if (
-            file_type == "application/pdf"
-            and not has_image_in_pdf(path)
-        ):
+        if file_type == "application/pdf" and not has_image_in_pdf(path):
             return "STATIC_PARSE"
         return "LLM_PARSE"

+
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
         temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
lexoid-0.1.11.dist-info/METADATA → lexoid-0.1.12.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.11
+Version: 0.1.12
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -18,10 +18,12 @@ Requires-Dist: markdownify (>=0.13.1,<0.14.0)
 Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
 Requires-Dist: openai (>=1.47.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.10.0.84,<5.0.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
 Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
 Requires-Dist: playwright (>=1.49.0,<2.0.0)
+Requires-Dist: pptx2md (>=2.0.6,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
 Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
@@ -31,7 +33,20 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

-# Lexoid
+<div align="center">
+
+```
+ ___       _______  __   __  _______  ___   ______
+|   |     |       ||  |_|  ||       ||   | |      |
+|   |     |    ___||       ||   _   ||   | |  _    |
+|   |     |   |___ |       ||  | |  ||   | | | |   |
+|   |___  |    ___| |     | |  |_|  ||   | | |_|   |
+|       ||   |___ |   _   ||       ||   | |       |
+|_______||_______||__| |__||_______||___| |______|
+
+```
+
+</div>

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
 [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
@@ -43,54 +58,67 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 [Documentation](https://oidlabs-com.github.io/Lexoid/)

 ## Motivation:
+
 - Use the multi-modal advancement of LLMs
 - Enable convenience for users
 - Collaborate with a permissive license

 ## Installation
+
 ### Installing with pip
+
 ```
 pip install lexoid
 ```

 To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+
 ```
 OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```

 Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
+
 ```
 playwright install --with-deps --only-shell chromium
 ```

 ### Building `.whl` from source
+
 ```
 make build
 ```

 ### Creating a local installation
+
 To install dependencies:
+
 ```
 make install
 ```
+
 or, to install with dev-dependencies:
+
 ```
 make dev
 ```

 To activate virtual environment:
+
 ```
 source .venv/bin/activate
 ```

 ## Usage
+
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)

-[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
+[Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)

 Here's a quick example to parse documents using Lexoid:
-``` python
+
+```python
 from lexoid.api import parse
 from lexoid.api import ParserType

@@ -103,30 +131,42 @@ print(parsed_md)
 ```

 ### Parameters
+
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
-- **kwargs: Additional arguments for the parser.
+- \*\*kwargs: Additional arguments for the parser.
+
+## Supported API Providers
+* Google
+* OpenAI
+* Hugging Face
+* Together AI
+* OpenRouter

 ## Benchmark
+
 Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
-|---|---|---|---|---|
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
-| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
-| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
-| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
-| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
-| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
-| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
-| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
-| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
-| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
+| Rank | Model                                                  | Mean Similarity | Std. Dev. | Time (s) | Cost($)  |
+| ---- | ------------------------------------------------------ | --------------- | --------- | -------- | -------- |
+| 1    | gemini-2.0-flash                                       | 0.829           | 0.102     | 7.41     | 0.000480 |
+| 2    | gemini-2.0-flash-001                                   | 0.814           | 0.176     | 6.85     | 0.000421 |
+| 3    | gemini-1.5-flash                                       | 0.797           | 0.143     | 9.54     | 0.000238 |
+| 4    | gemini-2.0-pro-exp                                     | 0.764           | 0.227     | 11.95    | TBA      |
+| 5    | gemini-2.0-flash-thinking-exp                          | 0.746           | 0.266     | 10.46    | TBA      |
+| 6    | gemini-1.5-pro                                         | 0.732           | 0.265     | 11.44    | 0.003332 |
+| 7    | gpt-4o                                                 | 0.687           | 0.247     | 10.16    | 0.004736 |
+| 8    | gpt-4o-mini                                            | 0.642           | 0.213     | 9.71     | 0.000275 |
+| 9    | gemma-3-27b-it (via OpenRouter)                        | 0.628           | 0.299     | 18.79    | 0.000096 |
+| 10   | gemini-1.5-flash-8b                                    | 0.551           | 0.223     | 3.91     | 0.000055 |
+| 11   | Llama-Vision-Free (via Together AI)                    | 0.531           | 0.198     | 6.93     | 0        |
+| 12   | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI)  | 0.524           | 0.192     | 3.68     | 0.000060 |
+| 13   | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter)          | 0.482           | 0.209     | 11.53    | 0.000052 |
+| 14   | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI)  | 0.461           | 0.306     | 19.26    | 0.000426 |
+| 15   | Llama-3.2-11B-Vision-Instruct (via Hugging Face)       | 0.451           | 0.257     | 4.54     | 0        |
+| 16   | microsoft/phi-4-multimodal-instruct (via OpenRouter)   | 0.366           | 0.287     | 10.80    | 0.000019 |

lexoid-0.1.11.dist-info/RECORD → lexoid-0.1.12.dist-info/RECORD RENAMED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
+lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
+lexoid/core/parse_type/static_parser.py,sha256=v4GWUmZVBBIF9TnbkhPBt2gspk0Oq_ujtNGnXZHLBr8,15055
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
+lexoid-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.12.dist-info/METADATA,sha256=XMHFMqwDj2DgSaZcZjXU881NxdPsRGBAsUyPyRsJvyU,6809
+lexoid-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.12.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
-lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=1If_3XoUhPQRY5XMzLJBsHdyjtLgD734eYBYvsg8w5Y,19569
-lexoid-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.11.dist-info/METADATA,sha256=kipDZLbUz_wkJUrzPGH2VppBNMHmaJadHR5_BAqHgjU,4838
-lexoid-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.11.dist-info/RECORD,,