lexoid-0.1.9-py3-none-any.whl → lexoid-0.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -19,6 +19,8 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
+    get_webpage_soup,
 )
 
 
@@ -83,8 +85,9 @@ def parse_chunk_list(
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
         raw_texts.append(result["raw"])
-        token_usage["input"] += result["token_usage"]["input"]
-        token_usage["output"] += result["token_usage"]["output"]
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
     token_usage["total"] = token_usage["input"] + token_usage["output"]
 
     return {
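The guard matters because not every chunk result carries a `token_usage` key (a statically parsed chunk, for instance, reports no LLM tokens). A minimal sketch of the same aggregation pattern, using hypothetical result dicts:

```python
# Hypothetical chunk results: only LLM-parsed chunks carry "token_usage".
results = [
    {"raw": "page 1 text", "token_usage": {"input": 120, "output": 80}},
    {"raw": "page 2 text"},  # e.g. a statically parsed chunk, no token counts
]

token_usage = {"input": 0, "output": 0}
for result in results:
    if "token_usage" in result:  # guard: the key may be absent
        token_usage["input"] += result["token_usage"]["input"]
        token_usage["output"] += result["token_usage"]["output"]
token_usage["total"] = token_usage["input"] + token_usage["output"]
print(token_usage)  # {'input': 120, 'output': 80, 'total': 200}
```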
@@ -100,7 +103,7 @@ def parse_chunk_list(
 
 def parse(
     path: str,
-    parser_type: Union[str, ParserType] = "LLM_PARSE",
+    parser_type: Union[str, ParserType] = "AUTO",
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
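With the default flipped to "AUTO", `parse` now defers parser selection to `router` (see the updated routing logic in `utils.py` below). A usage sketch; the input path is a placeholder, and LLM routing still needs API credentials at runtime:

```python
from lexoid.api import parse

# "AUTO" is now the default, so these two calls are equivalent;
# the router picks STATIC_PARSE or LLM_PARSE per input file.
result = parse("sample.pdf")                      # placeholder path
result = parse("sample.pdf", parser_type="AUTO")
print(result["raw"])  # combined markdown text
```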
@@ -147,6 +150,7 @@ def parse(
         if is_supported_url_file_type(path):
             path = download_file(path, download_dir)
         elif as_pdf:
+            kwargs["title"] = get_webpage_soup(path).title.string.strip()
             pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
             if not pdf_filename.endswith(".pdf"):
                 pdf_filename += ".pdf"
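The new line pulls the page's `<title>` via `get_webpage_soup` so the rendered PDF keeps a meaningful title. A hedged usage sketch, assuming `as_pdf` and `save_filename` are passed as shown in this branch (the URL is a placeholder):

```python
from lexoid.api import parse

# Render a web page to PDF before parsing; the page <title> is now
# forwarded automatically as kwargs["title"].
result = parse(
    "https://example.com",         # placeholder URL
    as_pdf=True,
    save_filename="example_page",  # ".pdf" is appended if missing
)
```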
@@ -163,6 +167,12 @@ def parse(
             pdf_path = os.path.join(temp_dir, "converted.pdf")
             path = convert_to_pdf(path, pdf_path)
 
+        if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+            sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+            os.makedirs(sub_pdf_dir, exist_ok=True)
+            sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+            path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
         if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
             kwargs["split"] = False
             result = parse_chunk(path, parser_type, **kwargs)
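`parse` can now restrict parsing to selected pages via a `page_nums` kwarg, forwarded to `create_sub_pdf` (defined below in `utils.py`). A sketch with placeholder paths; note `create_sub_pdf` treats the numbers as 1-based:

```python
from lexoid.api import parse

# Parse only pages 1 and 3 of a PDF.
first_and_third = parse("report.pdf", page_nums=(1, 3))

# A bare int is also accepted and normalized to a one-element tuple.
second_page_only = parse("report.pdf", page_nums=2)
```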
lexoid/core/parse_type/llm_parser.py CHANGED
@@ -50,7 +50,7 @@ def retry_on_http_error(func):
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
-        kwargs["model"] = "gemini-1.5-flash"
+        kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, **kwargs)
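The default LLM moves from gemini-1.5-flash to gemini-2.0-flash; an explicit `model` kwarg still wins and is routed by name prefix (any "gemini*" model goes to `parse_with_gemini`). A sketch, with a placeholder input path:

```python
from lexoid.api import parse

# No model kwarg: LLM parsing now defaults to gemini-2.0-flash.
result = parse("sample.pdf", parser_type="LLM_PARSE")

# Explicit override takes precedence over the default.
result = parse("sample.pdf", parser_type="LLM_PARSE", model="gemini-1.5-pro")
```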
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     combined_text = result.split("</output>")[0].strip()
 
     token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
 
     return {
         "raw": combined_text,
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         "parent_title": kwargs.get("parent_title", ""),
         "recursive_docs": [],
         "token_usage": {
-            "input": token_usage["promptTokenCount"],
-            "output": token_usage["candidatesTokenCount"],
-            "total": token_usage["totalTokenCount"],
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
         },
     }
 
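One consequence of switching to `.get(..., 0)`: a response whose `usageMetadata` omits a count no longer raises `KeyError`, and `total` is derived as input + output instead of being read from `totalTokenCount`. A small sketch of the fallback behavior, with hypothetical payloads:

```python
# Hypothetical usageMetadata payloads from a Gemini response.
complete = {"promptTokenCount": 150, "candidatesTokenCount": 50}
partial = {"promptTokenCount": 150}  # candidate count missing

for token_usage in (complete, partial):
    input_tokens = token_usage.get("promptTokenCount", 0)
    output_tokens = token_usage.get("candidatesTokenCount", 0)
    print(input_tokens, output_tokens, input_tokens + output_tokens)
# -> 150 50 200
# -> 150 0 150
```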
lexoid/core/utils.py CHANGED
@@ -6,7 +6,7 @@ import re
 import sys
 from difflib import SequenceMatcher
 from hashlib import md5
-from typing import Dict, List
+from typing import Dict, List, Optional
 from urllib.parse import urlparse
 
 import nest_asyncio
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths
 
 
+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] | int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
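A usage sketch for the new helper (pikepdf is already a lexoid dependency; file names are placeholders). Page numbers are 1-based, since pages are fetched with `pdf.pages[i - 1]`; duplicates are dropped and order is normalized by sorting. Note that, as written, omitting `page_nums` hits `sorted(set(None))` before the all-pages fallback, so callers are expected to pass explicit pages:

```python
import pikepdf

from lexoid.core.utils import create_sub_pdf

# Build a throwaway 5-page PDF so the example is self-contained.
with pikepdf.new() as pdf:
    for _ in range(5):
        pdf.add_blank_page(page_size=(612, 792))
    pdf.save("five_pages.pdf")

# Keep pages 2 and 4 (1-based); (4, 2, 2) normalizes to (2, 4).
out_path = create_sub_pdf("five_pages.pdf", "subset.pdf", page_nums=(4, 2, 2))
with pikepdf.open(out_path) as subset:
    print(len(subset.pages))  # 2
```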
@@ -285,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:
 
     return content
 
-
-def read_html_content(url: str) -> Dict:
-    """
-    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
-
-    Args:
-        url (str): The URL of the HTML page.
-
-    Returns:
-        Dict: Dictionary containing parsed document data
-    """
-
+def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
 
@@ -357,6 +360,21 @@ def read_html_content(url: str) -> Dict:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
+    return soup
+
+
+def read_html_content(url: str) -> Dict:
+    """
+    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
+
+    Args:
+        url (str): The URL of the HTML page.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
+
+    soup = get_webpage_soup(url)
     title = soup.title.string.strip() if soup.title else "No title"
     url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
     full_title = f"{title} - {url_hash}"
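The fetch-and-parse logic is now reusable on its own: both `read_html_content` and the `as_pdf` title lookup in `api.py` call the extracted helper. A sketch with a placeholder URL; the Playwright/requests fallback chain runs inside the helper:

```python
from lexoid.core.utils import get_webpage_soup

soup = get_webpage_soup("https://example.com")  # placeholder URL
title = soup.title.string.strip() if soup.title else "No title"
links = [a.get("href") for a in soup.find_all("a")]
print(title, len(links))
```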
@@ -528,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
     )
 
 
-def router(path: str):
+def router(path: str, priority: str = "accuracy") -> str:
+    """
+    Routes the file path to the appropriate parser based on the file type.
+
+    Args:
+        path (str): The file path to route.
+        priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
+    """
     file_type = get_file_type(path)
     if file_type.startswith("text/"):
         return "STATIC_PARSE"
-    # Naive routing strategy for now.
-    # Current routing strategy,
-    # 1. If the PDF has hidden hyperlinks (as alias) and no images: STATIC_PARSE
-    # 2. Other scenarios: LLM_PARSE
-    # If you have other needs, do reach out or create an issue.
-    if (
-        file_type == "application/pdf"
-        and not has_image_in_pdf(path)
-        and has_hyperlink_in_pdf(path)
-    ):
-        return "STATIC_PARSE"
-    return "LLM_PARSE"
-
+
+    if priority == "accuracy":
+        # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+            and has_hyperlink_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
+    else:
+        # If the file is a PDF without images, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
 
 
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
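A sketch of the two routing modes. With the default "accuracy", a PDF must be image-free *and* contain hyperlinks to qualify for STATIC_PARSE; with "speed", the hyperlink check is dropped. Paths are placeholders, and the results depend on each file's actual contents:

```python
from lexoid.core.utils import router

print(router("notes.txt"))                        # "STATIC_PARSE": text/* short-circuits
print(router("scanned.pdf"))                      # "LLM_PARSE" if the PDF contains images
print(router("text_only.pdf", priority="speed"))  # "STATIC_PARSE" once no images are found
```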
{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.9
+Version: 0.1.11
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
-Initial results (_more updates soon_)
+Results aggregated across 5 iterations each for 5 documents.
 
 _Note:_ Benchmarks are currently done in the zero-shot setting.
 
-| Rank | Model/Framework | Similarity | Time (s) |
-|------|-----------------|------------|----------|
-| 1 | gpt-4o | 0.799 | 21.77 |
-| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
-| 3 | gemini-exp-1121 | 0.779 | 30.88 |
-| 4 | gemini-1.5-pro | 0.742 | 15.77 |
-| 5 | gpt-4o-mini | 0.721 | 14.86 |
-| 6 | gemini-1.5-flash | 0.702 | 4.56 |
-| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
-| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
-| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
-| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|------|-------|-----------------|-----------|----------|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
 
lexoid-0.1.11.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
+lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=1If_3XoUhPQRY5XMzLJBsHdyjtLgD734eYBYvsg8w5Y,19569
+lexoid-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.11.dist-info/METADATA,sha256=kipDZLbUz_wkJUrzPGH2VppBNMHmaJadHR5_BAqHgjU,4838
+lexoid-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.11.dist-info/RECORD,,
lexoid-0.1.9.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
-lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
-lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
-lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.9.dist-info/RECORD,,