lexoid 0.1.6.post1__py3-none-any.whl → 0.1.8__py3-none-any.whl

lexoid/core/parse_type/llm_parser.py
@@ -2,30 +2,66 @@ import base64
 import io
 import mimetypes
 import os
-from typing import Dict, List
-
+import time
 import pypdfium2 as pdfium
 import requests
+from functools import wraps
+from requests.exceptions import HTTPError
+from typing import Dict, List
+
 from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     OPENAI_USER_PROMPT,
     PARSER_PROMPT,
+    LLAMA_PARSER_PROMPT,
 )
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from huggingface_hub import InferenceClient
 
 
+def retry_on_http_error(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except HTTPError as e:
+            logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
+            time.sleep(10)
+            try:
+                return func(*args, **kwargs)
+            except HTTPError as e:
+                logger.error(f"Retry failed: {e}")
+                if kwargs.get("raw", False):
+                    return ""
+                return [
+                    {
+                        "metadata": {
+                            "title": kwargs["title"],
+                            "page": kwargs.get("start", 0),
+                        },
+                        "content": "",
+                    }
+                ]
+
+    return wrapper
+
+
+@retry_on_http_error
 def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, raw, **kwargs)
-    elif model.startswith("gpt"):
-        return parse_with_gpt(path, raw, **kwargs)
-    else:
-        raise ValueError(f"Unsupported model: {model}")
+    if model.startswith("gpt"):
+        return parse_with_api(path, raw, api="openai", **kwargs)
+    if model.startswith("meta-llama"):
+        if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+            return parse_with_together(path, raw, **kwargs)
+        return parse_with_api(path, raw, api="huggingface", **kwargs)
+    raise ValueError(f"Unsupported model: {model}")
 
 
 def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
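With this release, `parse_llm_doc` is wrapped by `retry_on_http_error`: one retry after a 10-second pause, then a graceful fallback to an empty result whose metadata is built from `kwargs["title"]` and `kwargs.get("start", 0)`, so callers should pass `title`. A minimal usage sketch (the file name is hypothetical, not part of the diff):

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# "report.pdf" is a hypothetical sample input.
pages = parse_llm_doc(
    "report.pdf",
    raw=False,
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",  # dispatched to parse_with_together
    title="report.pdf",  # used for fallback metadata if both HTTP attempts fail
)
```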
@@ -101,7 +137,6 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
             "content": page,
         }
         for page_no, page in enumerate(result.split("<page-break>"), start=1)
-        if page.strip()
     ]
 
 
@@ -120,59 +155,188 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
 
 
-def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
-    client = OpenAI()
+def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    api_key = os.environ.get("TOGETHER_API_KEY")
+    if not api_key:
+        raise ValueError("TOGETHER_API_KEY environment variable is not set")
+
+    url = "https://api.together.xyz/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
 
-    # Handle different input types
     mime_type, _ = mimetypes.guess_type(path)
     if mime_type and mime_type.startswith("image"):
-        # Single image processing
         with open(path, "rb") as img_file:
             image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-        images = [(0, image_base64)]
+        images = [(0, f"data:{mime_type};base64,{image_base64}")]
     else:
-        # PDF processing
         pdf_document = pdfium.PdfDocument(path)
         images = [
-            (page_num, convert_pdf_page_to_base64(pdf_document, page_num))
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
            for page_num in range(len(pdf_document))
        ]
 
-    # Process each page/image
     all_results = []
-    for page_num, image_base64 in images:
+    for page_num, image_url in images:
         messages = [
-            {
-                "role": "system",
-                "content": PARSER_PROMPT,
-            },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                    },
+                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
+                    {"type": "image_url", "image_url": {"url": image_url}},
                 ],
+            }
+        ]
+
+        payload = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        response = requests.post(url, json=payload, headers=headers)
+        response.raise_for_status()
+        response_data = response.json()
+
+        page_text = response_data["choices"][0]["message"]["content"]
+        if kwargs.get("verbose", None):
+            logger.debug(f"Page {page_num + 1} response: {page_text}")
+
+        result = page_text
+        if "<output>" in page_text:
+            result = page_text.split("<output>")[1].strip()
+        if "</output>" in result:
+            result = result.split("</output>")[0].strip()
+        all_results.append((page_num, result))
+
+    all_results.sort(key=lambda x: x[0])
+    all_texts = [text for _, text in all_results]
+    combined_text = "<page-break>".join(all_texts)
+
+    if raw:
+        return combined_text
+
+    return [
+        {
+            "metadata": {
+                "title": kwargs["title"],
+                "page": kwargs.get("start", 0) + page_no,
             },
+            "content": page,
+        }
+        for page_no, page in enumerate(all_texts, start=1)
+    ]
+
+
+def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+    """
+    Parse documents (PDFs or images) using various vision model APIs.
+
+    Args:
+        path (str): Path to the document to parse
+        raw (bool): If True, return raw text; if False, return structured data
+        api (str): Which API to use ("openai" or "huggingface")
+        **kwargs: Additional arguments including model, temperature, title, etc.
+
+    Returns:
+        List[Dict] | str: Parsed content either as raw text or structured data
+    """
+    # Initialize appropriate client
+    clients = {
+        "openai": lambda: OpenAI(),
+        "huggingface": lambda: InferenceClient(
+            token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
+        ),
+    }
+    assert api in clients, f"Unsupported API: {api}"
+    logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
+    client = clients[api]()
+
+    # Handle different input types
+    mime_type, _ = mimetypes.guess_type(path)
+    if mime_type and mime_type.startswith("image"):
+        # Single image processing
+        with open(path, "rb") as img_file:
+            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+        images = [(0, f"data:{mime_type};base64,{image_base64}")]
+    else:
+        # PDF processing
+        pdf_document = pdfium.PdfDocument(path)
+        images = [
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
+            for page_num in range(len(pdf_document))
         ]
 
-    # Get completion from GPT-4 Vision
-    response = client.chat.completions.create(
-        model=kwargs["model"],
-        temperature=kwargs.get("temperature", 0.7),
-        messages=messages,
-    )
+    # API-specific message formatting
+    def get_messages(page_num: int, image_url: str) -> List[Dict]:
+        base_message = {
+            "type": "text",
+            "text": LLAMA_PARSER_PROMPT,
+        }
+        image_message = {
+            "type": "image_url",
+            "image_url": {"url": image_url},
+        }
+
+        if api == "openai":
+            return [
+                {
+                    "role": "system",
+                    "content": PARSER_PROMPT.format(
+                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
+                        },
+                        image_message,
+                    ],
+                },
+            ]
+        else:
+            return [
+                {
+                    "role": "user",
+                    "content": [base_message, image_message],
+                }
+            ]
+
+    # Process each page/image
+    all_results = []
+    for page_num, image_url in images:
+        messages = get_messages(page_num, image_url)
+
+        # Common completion parameters
+        completion_params = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        # Get completion from selected API
+        response = client.chat.completions.create(**completion_params)
 
         # Extract the response text
         page_text = response.choices[0].message.content
         if kwargs.get("verbose", None):
             logger.debug(f"Page {page_num + 1} response: {page_text}")
-        result = ""
+
+        # Extract content between output tags if present
+        result = page_text
         if "<output>" in page_text:
             result = page_text.split("<output>")[1].strip()
         if "</output>" in result:
@@ -196,5 +360,4 @@ def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
             "content": page,
         }
         for page_no, page in enumerate(all_texts, start=1)
-        if page.strip()
     ]
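Both new parsers unwrap model output from `<output>...</output>` tags with the same inlined logic. A standalone mirror of it, pulled out here only so it can be sanity-checked in isolation (`extract_output` is an illustrative helper, not a function in the package):

```python
def extract_output(page_text: str) -> str:
    # Same steps as the inlined logic in parse_with_together / parse_with_api.
    result = page_text
    if "<output>" in page_text:
        result = page_text.split("<output>")[1].strip()
    if "</output>" in result:
        result = result.split("</output>")[0].strip()
    return result

assert extract_output("<output>## Page 1</output>") == "## Page 1"
assert extract_output("plain text, no tags") == "plain text, no tags"
```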
lexoid/core/parse_type/static_parser.py
@@ -89,15 +89,21 @@ def process_table(table) -> str:
 
     # Convert to DataFrame and handle empty cells
     df = pd.DataFrame(table_data)
+    df.replace("", pd.NA, inplace=True)
+    df = df.dropna(how="all", axis=0)
+    df = df.dropna(how="all", axis=1)
     df = df.fillna("")
+    if len(df) == 0:
+        return ""
 
     # Use first row as header and clean it up
     df.columns = df.iloc[0]
-    df = df.drop(0)
+    df = df.drop(df.index[0])
+    df.replace(r"\n", "<br>", regex=True, inplace=True)
 
     # Convert to markdown with some formatting options
     markdown_table = df.to_markdown(index=False, tablefmt="pipe")
-    return f"\n{markdown_table}\n\n"  # Add newlines for proper markdown rendering
+    return f"\n{markdown_table}\n\n"
 
 
 def embed_links_in_text(page, text, links):
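The switch from `df.drop(0)` to `df.drop(df.index[0])` fixes a latent `KeyError`: once fully empty rows are pruned, label `0` may no longer exist, so the header row must be dropped positionally. A toy walkthrough with hypothetical table data:

```python
import pandas as pd

# First row is entirely empty, as extracted tables often are.
table_data = [["", ""], ["Name", "Score"], ["Alice\nSmith", "10"]]
df = pd.DataFrame(table_data)
df.replace("", pd.NA, inplace=True)
df = df.dropna(how="all", axis=0)   # drops row 0; remaining labels are 1 and 2
df = df.dropna(how="all", axis=1)
df = df.fillna("")
df.columns = df.iloc[0]             # header row now carries label 1
df = df.drop(df.index[0])           # the old df.drop(0) would raise KeyError here
df.replace(r"\n", "<br>", regex=True, inplace=True)
print(df.to_markdown(index=False, tablefmt="pipe"))
```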
@@ -157,8 +163,20 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)
 
-    # First, identify tables and their positions
-    tables = page.find_tables()
+    # Table settings
+    vertical_strategy = kwargs.get("vertical_strategy", "lines")
+    horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
+    snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
+    snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
+
+    tables = page.find_tables(
+        table_settings={
+            "vertical_strategy": vertical_strategy,
+            "horizontal_strategy": horizontal_strategy,
+            "snap_x_tolerance": snap_x_tolerance,
+            "snap_y_tolerance": snap_y_tolerance,
+        }
+    )
     table_zones = [(table.bbox, process_table(table)) for table in tables]
 
     # Create a filtered page excluding table areas
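These strategies and tolerances are now read from `**kwargs` with defaults, so table detection can be tuned per document. A hedged sketch, assuming these kwargs are forwarded from `parse` down to `process_pdf_page_with_pdfplumber` (the file name is hypothetical; the strategy values are standard pdfplumber table settings):

```python
from lexoid.api import parse

parsed = parse(
    "tables.pdf",
    parser_type="STATIC_PARSE",
    vertical_strategy="text",     # fall back to text alignment when a PDF has no ruling lines
    horizontal_strategy="lines",
    snap_x_tolerance=5,
)
```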
@@ -171,12 +189,46 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     words = filtered_page.extract_words(
         x_tolerance=x_tolerance,
         y_tolerance=y_tolerance,
-        extra_attrs=["size", "top", "bottom"],
+        extra_attrs=["size", "top", "bottom", "fontname"],
     )
 
-    def format_paragraph(text):
-        text = " ".join(text.split())
-        return f"{text}\n\n"
+    def format_paragraph(text_elements):
+        """Format a paragraph with styling applied to individual words"""
+        formatted_words = []
+        for element in text_elements:
+            text = element["text"]
+            formatting = get_text_formatting(element)
+            formatted_words.append(apply_markdown_formatting(text, formatting))
+        return f"{' '.join(formatted_words)}\n\n"
+
+    def get_text_formatting(word):
+        """
+        Detect text formatting based on font properties
+        Returns a dict of formatting attributes
+        """
+        formatting = {
+            "bold": False,
+            "italic": False,
+        }
+
+        # Check font name for common bold/italic indicators
+        font_name = word.get("fontname", "").lower()
+        if any(style in font_name for style in ["bold", "heavy", "black"]):
+            formatting["bold"] = True
+        if any(style in font_name for style in ["italic", "oblique"]):
+            formatting["italic"] = True
+
+        return formatting
+
+    def apply_markdown_formatting(text, formatting):
+        """Apply markdown formatting to text based on detected styles"""
+        if formatting["bold"] and formatting["italic"]:
+            text = f"***{text}***"
+        elif formatting["bold"]:
+            text = f"**{text}**"
+        elif formatting["italic"]:
+            text = f"*{text}*"
+        return text
 
     def detect_heading_level(font_size):
         if font_size >= 24:
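The style detection relies purely on substrings of the PDF font name. A spot check with a word dict shaped like pdfplumber's `extract_words()` output (the font name is a hypothetical example):

```python
word = {"text": "Summary", "fontname": "Helvetica-BoldOblique", "size": 12.0}

font_name = word.get("fontname", "").lower()
bold = any(style in font_name for style in ["bold", "heavy", "black"])
italic = any(style in font_name for style in ["italic", "oblique"])
print(bold, italic)  # True True, so the word is emitted as ***Summary***
```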
@@ -205,17 +257,18 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
         content_elements.append(("word", word))
+    content_elements.extend(tables)
 
     for element_type, element in content_elements:
         if element_type == "table":
             # If there are any pending paragraphs or headings, add them first
             if current_heading:
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text = " ".join(word["text"] for word in current_heading)
-                markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
             if current_paragraph:
-                markdown_content.append(format_paragraph(" ".join(current_paragraph)))
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
             # Add the table
             markdown_content.append(element["content"])
@@ -233,46 +286,42 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             # If we were collecting a heading
             if current_heading:
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text = " ".join(word["text"] for word in current_heading)
-                markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
 
             # If we were collecting a paragraph
             if current_paragraph:
-                markdown_content.append(
-                    format_paragraph(" ".join(current_paragraph))
-                )
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
 
         # Add word to appropriate collection
         if heading_level:
             if current_paragraph:  # Flush any pending paragraph
-                markdown_content.append(
-                    format_paragraph(" ".join(current_paragraph))
-                )
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
-            current_heading.append({"text": word["text"], "size": word["size"]})
+            current_heading.append(word)
         else:
             if current_heading:  # Flush any pending heading
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text = " ".join(word["text"] for word in current_heading)
-                markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
-            current_paragraph.append(word["text"])
+            current_paragraph.append(word)
 
         last_y = word["top"]
 
     # Handle remaining content
     if current_heading:
         level = detect_heading_level(current_heading[0]["size"])
-        heading_text = " ".join(word["text"] for word in current_heading)
-        markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+        heading_text = format_paragraph(current_heading)
+        markdown_content.append(f"{'#' * level} {heading_text}")
 
     if current_paragraph:
-        markdown_content.append(format_paragraph(" ".join(current_paragraph)))
+        markdown_content.append(format_paragraph(current_paragraph))
 
     # Process links for the page
-    content = "".join(markdown_content)  # Process links using the new function
+    content = "".join(markdown_content)
     if page.annots:
         links = []
         for annot in page.annots:
@@ -283,6 +332,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         if links:
            content = embed_links_in_text(page, content, links)
 
+    # Remove redundant formatting
+    content = content.replace("** **", " ").replace("* *", " ")
+
     return content
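Because formatting is now applied per word, consecutive styled words come out as separate spans, e.g. `**Hello** **World**`. The final cleanup collapses the boundary between them:

```python
content = "**Hello** **World**"
content = content.replace("** **", " ").replace("* *", " ")
print(content)  # **Hello World**
```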
lexoid/core/prompt_templates.py
@@ -76,3 +76,22 @@ Ensure accurate representation of all content, including tables and visual elements
 """
 
 INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
+
+LLAMA_PARSER_PROMPT = """\
+You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible.
+
+Instructions:
+1. Use a combination of Markdown and HTML to replicate the document's layout and formatting.
+2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original.
+3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable.
+4. Always use HTML (`<table>`, `<tr>`, `<td>`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed.
+5. For figures, graphs, or diagrams, represent them using `<img>` tags and use appropriate `alt` text.
+6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout.
+7. Do not include any descriptions of the document's appearance, paper type, or writing implements used.
+8. Do not add any explanatory notes, comments, or additional information outside of the converted content.
+9. Ensure all special characters, symbols, and equations are accurately represented.
+10. Provide the output only once, without any duplication.
+11. Enclose the entire output within <output> and </output> tags.
+
+Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes.
+"""
lexoid-0.1.8.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.6.post1
+Version: 0.1.8
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: bs4 (>=0.0.2,<0.0.3)
 Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
 Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
+Requires-Dist: huggingface-hub (>=0.27.0,<0.28.0)
 Requires-Dist: loguru (>=0.7.2,<0.8.0)
 Requires-Dist: markdown (>=3.7,<4.0)
 Requires-Dist: markdownify (>=0.13.1,<0.14.0)
@@ -31,47 +32,59 @@ Description-Content-Type: text/markdown
 
 # Lexoid
 
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
+[![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
+[![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
+
 Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
 
 ## Motivation:
 - Use the multi-modal advancement of LLMs
-- Enable convenience for users while driving innovation
+- Enable convenience for users
 - Collaborate with a permissive license
 
 ## Installation
-To install dependencies:
+### Installing with pip
 ```
-make install
+pip install lexoid
 ```
-or, to install with dev-dependencies:
+
+To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
 ```
-make dev
+OPENAI_API_KEY=""
+GOOGLE_API_KEY=""
 ```
 
-To activate virtual environment:
+Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
 ```
-source .venv/bin/activate
+playwright install --with-deps --only-shell chromium
 ```
 
-To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+### Building `.whl` from source
 ```
-OPENAI_API_KEY=""
-GOOGLE_API_KEY=""
+make build
 ```
 
-To build a `.whl` file for testing:
+### Creating a local installation
+To install dependencies:
+```
+make install
+```
+or, to install with dev-dependencies:
 ```
-poetry build
+make dev
 ```
 
-Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
+To activate virtual environment:
 ```
-playwright install --with-deps --only-shell chromium
+source .venv/bin/activate
 ```
 
 ## Usage
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
 
+[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
+
 Here's a quick example to parse documents using Lexoid:
 ``` python
 from lexoid.api import parse
@@ -88,17 +101,24 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
+- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
 Initial results (_more updates soon_)
+_Note:_ Benchmarks done in zero-shot scenario currently
 | Rank | Model/Framework | Similarity | Time (s) |
 |------|-----------|------------|----------|
 | 1 | gpt-4o | 0.799 | 21.77 |
-| 2 | gemini-1.5-pro | 0.742 | 15.77 |
-| 3 | gpt-4o-mini | 0.721 | 14.86 |
-| 4 | gemini-1.5-flash | 0.702 | 4.56 |
+| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
+| 3 | gemini-exp-1121 | 0.779 | 30.88 |
+| 4 | gemini-1.5-pro | 0.742 | 15.77 |
+| 5 | gpt-4o-mini | 0.721 | 14.86 |
+| 6 | gemini-1.5-flash | 0.702 | 4.56 |
+| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
+| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
+| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
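Per the parameter list above, `raw=False` returns structured page records rather than a single string. A sketch of consuming that shape (hypothetical input file; assumes `parse` passes through the `metadata`/`content` dicts built by the parsers in `llm_parser.py`):

```python
from lexoid.api import parse

docs = parse("sample.pdf", parser_type="LLM_PARSE", raw=False, pages_per_split=4)
for page in docs:
    print(page["metadata"]["title"], page["metadata"]["page"])
    print(page["content"][:80])  # first characters of the parsed markdown
```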
 
lexoid-0.1.8.dist-info/RECORD
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
+lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
+lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
+lexoid-0.1.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.8.dist-info/METADATA,sha256=iuRu83NSZJhzOkKi-1H1uPxC1mkqHHhExt38CZGg3GE,4421
+lexoid-0.1.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.8.dist-info/RECORD,,

lexoid-0.1.6.post1.dist-info/RECORD
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=R-0HoXATCBnMJpyjOmMw_EdvVS_PKhhgC7z3NoKzhrs,6311
-lexoid/core/parse_type/static_parser.py,sha256=uFmuz_1JQHUp8FZADPhLBPEv1La2AnZ4j2Vj6SlH0fo,11993
-lexoid/core/prompt_templates.py,sha256=0KXHGNunMfrRZh5QfENcxY1s30VioY2fsu3wELc-3z8,4794
-lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
-lexoid-0.1.6.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.6.post1.dist-info/METADATA,sha256=tPhhqCNwJGR5LNSH-J9hCJf2O4AN6QJhFiXHbUcRizM,3436
-lexoid-0.1.6.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.6.post1.dist-info/RECORD,,