lexoid 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
110
110
  - **kwargs: Additional arguments for the parser.
111
111
 
112
112
  ## Benchmark
113
- Initial results (_more updates soon_)
113
+ Results aggregated across 5 iterations each for 5 documents.
114
114
 
115
115
  _Note:_ Benchmarks are currently done in the zero-shot setting.
116
116
 
117
- | Rank | Model/Framework | Similarity | Time (s) |
118
- |------|-----------|------------|----------|
119
- | 1 | gpt-4o | 0.799 | 21.77|
120
- | 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
121
- | 3 | gemini-exp-1121 | 0.779 | 30.88 |
122
- | 4 | gemini-1.5-pro | 0.742 | 15.77 |
123
- | 5 | gpt-4o-mini | 0.721 | 14.86 |
124
- | 6 | gemini-1.5-flash | 0.702 | 4.56 |
125
- | 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
126
- | 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
127
- | 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
128
- | 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
117
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
118
+ |---|---|---|---|---|
119
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
120
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
121
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
122
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
123
+ | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
124
+ | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
125
+ | 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
126
+ | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
127
+ | 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
128
+ | 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
129
+ | 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
130
+ | 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
131
+ | 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
129
132
 
@@ -77,19 +77,22 @@ print(parsed_md)
77
77
  - **kwargs: Additional arguments for the parser.
78
78
 
79
79
  ## Benchmark
80
- Initial results (_more updates soon_)
80
+ Results aggregated across 5 iterations each for 5 documents.
81
81
 
82
82
  _Note:_ Benchmarks are currently done in the zero-shot setting.
83
83
 
84
- | Rank | Model/Framework | Similarity | Time (s) |
85
- |------|-----------|------------|----------|
86
- | 1 | gpt-4o | 0.799 | 21.77|
87
- | 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
88
- | 3 | gemini-exp-1121 | 0.779 | 30.88 |
89
- | 4 | gemini-1.5-pro | 0.742 | 15.77 |
90
- | 5 | gpt-4o-mini | 0.721 | 14.86 |
91
- | 6 | gemini-1.5-flash | 0.702 | 4.56 |
92
- | 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
93
- | 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
94
- | 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
95
- | 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
84
+ | Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
85
+ |---|---|---|---|---|
86
+ | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
87
+ | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
88
+ | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
89
+ | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
90
+ | 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
91
+ | 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
92
+ | 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
93
+ | 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
94
+ | 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
95
+ | 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
96
+ | 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
97
+ | 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
98
+ | 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
@@ -20,6 +20,7 @@ from lexoid.core.utils import (
20
20
  router,
21
21
  split_pdf,
22
22
  create_sub_pdf,
23
+ get_webpage_soup,
23
24
  )
24
25
 
25
26
 
@@ -102,7 +103,7 @@ def parse_chunk_list(
102
103
 
103
104
  def parse(
104
105
  path: str,
105
- parser_type: Union[str, ParserType] = "LLM_PARSE",
106
+ parser_type: Union[str, ParserType] = "AUTO",
106
107
  pages_per_split: int = 4,
107
108
  max_processes: int = 4,
108
109
  **kwargs,
@@ -149,6 +150,7 @@ def parse(
149
150
  if is_supported_url_file_type(path):
150
151
  path = download_file(path, download_dir)
151
152
  elif as_pdf:
153
+ kwargs["title"] = get_webpage_soup(path).title.string.strip()
152
154
  pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
153
155
  if not pdf_filename.endswith(".pdf"):
154
156
  pdf_filename += ".pdf"
@@ -50,7 +50,7 @@ def retry_on_http_error(func):
50
50
  @retry_on_http_error
51
51
  def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
52
52
  if "model" not in kwargs:
53
- kwargs["model"] = "gemini-1.5-flash"
53
+ kwargs["model"] = "gemini-2.0-flash"
54
54
  model = kwargs.get("model")
55
55
  if model.startswith("gemini"):
56
56
  return parse_with_gemini(path, **kwargs)
@@ -299,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:
299
299
 
300
300
  return content
301
301
 
302
-
303
- def read_html_content(url: str) -> Dict:
304
- """
305
- Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
306
-
307
- Args:
308
- url (str): The URL of the HTML page.
309
-
310
- Returns:
311
- Dict: Dictionary containing parsed document data
312
- """
313
-
302
+ def get_webpage_soup(url: str) -> BeautifulSoup:
314
303
  try:
315
304
  from playwright.async_api import async_playwright
316
305
 
@@ -371,6 +360,21 @@ def read_html_content(url: str) -> Dict:
371
360
  soup = BeautifulSoup(
372
361
  response.content, "html.parser", from_encoding="iso-8859-1"
373
362
  )
363
+ return soup
364
+
365
+
366
+ def read_html_content(url: str) -> Dict:
367
+ """
368
+ Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
369
+
370
+ Args:
371
+ url (str): The URL of the HTML page.
372
+
373
+ Returns:
374
+ Dict: Dictionary containing parsed document data
375
+ """
376
+
377
+ soup = get_webpage_soup(url)
374
378
  title = soup.title.string.strip() if soup.title else "No title"
375
379
  url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
376
380
  full_title = f"{title} - {url_hash}"
@@ -542,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
542
546
  )
543
547
 
544
548
 
545
- def router(path: str):
549
+ def router(path: str, priority: str = "accuracy") -> str:
550
+ """
551
+ Routes the file path to the appropriate parser based on the file type.
552
+
553
+ Args:
554
+ path (str): The file path to route.
555
+ priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
556
+ """
546
557
  file_type = get_file_type(path)
547
558
  if file_type.startswith("text/"):
548
559
  return "STATIC_PARSE"
549
- # Naive routing strategy for now.
550
- # Current routing strategy,
551
- # 1. If the PDF has hidden hyperlinks (as alias) and no images: STATIC_PARSE
552
- # 2. Other scenarios: LLM_PARSE
553
- # If you have other needs, do reach out or create an issue.
554
- if (
555
- file_type == "application/pdf"
556
- and not has_image_in_pdf(path)
557
- and has_hyperlink_in_pdf(path)
558
- ):
559
- return "STATIC_PARSE"
560
- return "LLM_PARSE"
561
-
560
+
561
+ if priority == "accuracy":
562
+ # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
563
+ # Otherwise, use LLM_PARSE
564
+ if (
565
+ file_type == "application/pdf"
566
+ and not has_image_in_pdf(path)
567
+ and has_hyperlink_in_pdf(path)
568
+ ):
569
+ return "STATIC_PARSE"
570
+ return "LLM_PARSE"
571
+ else:
572
+ # If the file is a PDF without images, use STATIC_PARSE
573
+ # Otherwise, use LLM_PARSE
574
+ if (
575
+ file_type == "application/pdf"
576
+ and not has_image_in_pdf(path)
577
+ ):
578
+ return "STATIC_PARSE"
579
+ return "LLM_PARSE"
562
580
 
563
581
  def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
564
582
  temp_path = os.path.join(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.10"
3
+ version = "0.1.11"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
File without changes