lexoid 0.1.8__py3-none-any.whl → 0.1.8.post1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
lexoid/core/utils.py CHANGED
@@ -298,9 +298,44 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
298
298
 
299
299
  async def fetch_page():
300
300
  async with async_playwright() as p:
301
- browser = await p.chromium.launch(headless=True)
302
- page = await browser.new_page()
301
+ browser = await p.chromium.launch(
302
+ headless=True,
303
+ args=[
304
+ "--disable-blink-features=AutomationControlled",
305
+ "--no-sandbox",
306
+ "--window-size=1920,1080",
307
+ ],
308
+ )
309
+ context = await browser.new_context(
310
+ viewport={"width": 1920, "height": 1080},
311
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
312
+ bypass_csp=True,
313
+ )
314
+ page = await context.new_page()
315
+
316
+ # Add headers to appear more like a real browser
317
+ await page.set_extra_http_headers(
318
+ {
319
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
320
+ "Accept-Language": "en-US,en;q=0.5",
321
+ "Sec-Fetch-Dest": "document",
322
+ "Sec-Fetch-Mode": "navigate",
323
+ "Sec-Fetch-Site": "none",
324
+ "Sec-Fetch-User": "?1",
325
+ }
326
+ )
327
+
303
328
  await page.goto(url)
329
+
330
+ # Wait for Cloudflare check to complete
331
+ await page.wait_for_load_state("networkidle")
332
+
333
+ # Additional wait for any dynamic content
334
+ try:
335
+ await page.wait_for_selector("body", timeout=30000)
336
+ except:
337
+ pass
338
+
304
339
  html = await page.content()
305
340
  await browser.close()
306
341
  return html
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.8
3
+ Version: 0.1.8.post1
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -35,9 +35,12 @@ Description-Content-Type: text/markdown
35
35
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
36
36
  [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
37
37
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
38
+ [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)
38
39
 
39
40
  Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
40
41
 
42
+ [Documentation](https://oidlabs-com.github.io/Lexoid/)
43
+
41
44
  ## Motivation:
42
45
  - Use the multi-modal advancement of LLMs
43
46
  - Enable convenience for users
@@ -108,7 +111,9 @@ print(parsed_md)
108
111
 
109
112
  ## Benchmark
110
113
  Initial results (_more updates soon_)
111
- _Note:_ Benchmarks done in zero-shot scenario currently
114
+
115
+ _Note:_ Benchmarks are currently done in the zero-shot setting.
116
+
112
117
  | Rank | Model/Framework | Similarity | Time (s) |
113
118
  |------|-----------|------------|----------|
114
119
  | 1 | gpt-4o | 0.799 | 21.77|
@@ -0,0 +1,9 @@
1
+ lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
2
+ lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
3
+ lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
4
+ lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
5
+ lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
6
+ lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
8
+ lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
+ lexoid-0.1.8.post1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
2
- lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
3
- lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
4
- lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
5
- lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
6
- lexoid-0.1.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
- lexoid-0.1.8.dist-info/METADATA,sha256=iuRu83NSZJhzOkKi-1H1uPxC1mkqHHhExt38CZGg3GE,4421
8
- lexoid-0.1.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
- lexoid-0.1.8.dist-info/RECORD,,