lexoid 0.1.8__py3-none-any.whl → 0.1.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/utils.py +37 -2
- {lexoid-0.1.8.dist-info → lexoid-0.1.8.post1.dist-info}/METADATA +7 -2
- lexoid-0.1.8.post1.dist-info/RECORD +9 -0
- lexoid-0.1.8.dist-info/RECORD +0 -9
- {lexoid-0.1.8.dist-info → lexoid-0.1.8.post1.dist-info}/LICENSE +0 -0
- {lexoid-0.1.8.dist-info → lexoid-0.1.8.post1.dist-info}/WHEEL +0 -0
lexoid/core/utils.py
CHANGED
@@ -298,9 +298,44 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
|
|
298
298
|
|
299
299
|
async def fetch_page():
|
300
300
|
async with async_playwright() as p:
|
301
|
-
browser = await p.chromium.launch(
|
302
|
-
|
301
|
+
browser = await p.chromium.launch(
|
302
|
+
headless=True,
|
303
|
+
args=[
|
304
|
+
"--disable-blink-features=AutomationControlled",
|
305
|
+
"--no-sandbox",
|
306
|
+
"--window-size=1920,1080",
|
307
|
+
],
|
308
|
+
)
|
309
|
+
context = await browser.new_context(
|
310
|
+
viewport={"width": 1920, "height": 1080},
|
311
|
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
312
|
+
bypass_csp=True,
|
313
|
+
)
|
314
|
+
page = await context.new_page()
|
315
|
+
|
316
|
+
# Add headers to appear more like a real browser
|
317
|
+
await page.set_extra_http_headers(
|
318
|
+
{
|
319
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
320
|
+
"Accept-Language": "en-US,en;q=0.5",
|
321
|
+
"Sec-Fetch-Dest": "document",
|
322
|
+
"Sec-Fetch-Mode": "navigate",
|
323
|
+
"Sec-Fetch-Site": "none",
|
324
|
+
"Sec-Fetch-User": "?1",
|
325
|
+
}
|
326
|
+
)
|
327
|
+
|
303
328
|
await page.goto(url)
|
329
|
+
|
330
|
+
# Wait for Cloudflare check to complete
|
331
|
+
await page.wait_for_load_state("networkidle")
|
332
|
+
|
333
|
+
# Additional wait for any dynamic content
|
334
|
+
try:
|
335
|
+
await page.wait_for_selector("body", timeout=30000)
|
336
|
+
except:
|
337
|
+
pass
|
338
|
+
|
304
339
|
html = await page.content()
|
305
340
|
await browser.close()
|
306
341
|
return html
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.8
|
3
|
+
Version: 0.1.8.post1
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -35,9 +35,12 @@ Description-Content-Type: text/markdown
|
|
35
35
|
[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
36
36
|
[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
37
37
|
[](https://pypi.org/project/lexoid/)
|
38
|
+
[](https://oidlabs-com.github.io/Lexoid/)
|
38
39
|
|
39
40
|
Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
|
40
41
|
|
42
|
+
[Documentation](https://oidlabs-com.github.io/Lexoid/)
|
43
|
+
|
41
44
|
## Motivation:
|
42
45
|
- Use the multi-modal advancement of LLMs
|
43
46
|
- Enable convenience for users
|
@@ -108,7 +111,9 @@ print(parsed_md)
|
|
108
111
|
|
109
112
|
## Benchmark
|
110
113
|
Initial results (_more updates soon_)
|
111
|
-
|
114
|
+
|
115
|
+
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
116
|
+
|
112
117
|
| Rank | Model/Framework | Similarity | Time (s) |
|
113
118
|
|------|-----------|------------|----------|
|
114
119
|
| 1 | gpt-4o | 0.799 | 21.77|
|
@@ -0,0 +1,9 @@
|
|
1
|
+
lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
|
2
|
+
lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
|
3
|
+
lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
|
4
|
+
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
+
lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
|
6
|
+
lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
+
lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
|
8
|
+
lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
+
lexoid-0.1.8.post1.dist-info/RECORD,,
|
lexoid-0.1.8.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
|
|
1
|
-
lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
|
2
|
-
lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
|
3
|
-
lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
|
4
|
-
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
-
lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
|
6
|
-
lexoid-0.1.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
-
lexoid-0.1.8.dist-info/METADATA,sha256=iuRu83NSZJhzOkKi-1H1uPxC1mkqHHhExt38CZGg3GE,4421
|
8
|
-
lexoid-0.1.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
-
lexoid-0.1.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|