lexoid 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/core/utils.py ADDED
@@ -0,0 +1,534 @@
1
+ import asyncio
2
+ import io
3
+ import mimetypes
4
+ import os
5
+ import re
6
+ import sys
7
+ from difflib import SequenceMatcher
8
+ from typing import Dict, List, Union
9
+ from urllib.parse import urlparse
10
+
11
+ import nest_asyncio
12
+ import pikepdf
13
+ import pypdfium2
14
+ import requests
15
+ from bs4 import BeautifulSoup
16
+ from docx2pdf import convert
17
+ from loguru import logger
18
+ from markdown import markdown
19
+ from markdownify import markdownify as md
20
+ from PIL import Image
21
+ from PyQt5.QtCore import QMarginsF, QUrl
22
+ from PyQt5.QtGui import QPageLayout, QPageSize
23
+ from PyQt5.QtPrintSupport import QPrinter
24
+ from PyQt5.QtWebEngineWidgets import QWebEngineView
25
+ from PyQt5.QtWidgets import QApplication
26
+
27
+ # Source: https://stackoverflow.com/a/12982689
28
+ HTML_TAG_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
29
+
30
+
31
+ def split_pdf(input_path: str, output_dir: str, pages_per_split: int) -> List[str]:
32
+ paths = []
33
+ with pikepdf.open(input_path) as pdf:
34
+ total_pages = len(pdf.pages)
35
+ for start in range(0, total_pages, pages_per_split):
36
+ end = min(start + pages_per_split, total_pages)
37
+ output_path = os.path.join(
38
+ output_dir, f"split_{str(start + 1).zfill(4)}_{end}.pdf"
39
+ )
40
+ with pikepdf.new() as new_pdf:
41
+ new_pdf.pages.extend(pdf.pages[start:end])
42
+ new_pdf.save(output_path)
43
+ paths.append(output_path)
44
+ return paths
45
+
46
+
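Not part of the packaged file, but a minimal usage sketch for split_pdf, assuming a local report.pdf and an out/ directory (both hypothetical):

    import os

    from lexoid.core.utils import split_pdf

    os.makedirs("out", exist_ok=True)
    # Split into 10-page chunks; returns paths such as
    # out/split_0001_10.pdf, out/split_0011_20.pdf, ...
    paths = split_pdf("report.pdf", "out", pages_per_split=10)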
47
+ def convert_image_to_pdf(image_path: str) -> bytes:
48
+ with Image.open(image_path) as img:
49
+ img_rgb = img.convert("RGB")
50
+ pdf_buffer = io.BytesIO()
51
+ img_rgb.save(pdf_buffer, format="PDF")
52
+ return pdf_buffer.getvalue()
53
+
54
+
55
+ def remove_html_tags(text: str):
56
+ html = markdown(text, extensions=["tables"])
57
+ return re.sub(HTML_TAG_PATTERN, "", html)
58
+
59
+
60
+ def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float:
61
+ """Calculate similarity ratio between two texts using SequenceMatcher."""
62
+ if ignore_html:
63
+ text1 = remove_html_tags(text1)
64
+ text2 = remove_html_tags(text2)
65
+ return SequenceMatcher(None, text1, text2).ratio()
66
+
67
+
68
+ def convert_pdf_page_to_image(
69
+ pdf_document: pypdfium2.PdfDocument, page_number: int
70
+ ) -> bytes:
71
+ """Convert a PDF page to an image."""
72
+ page = pdf_document[page_number]
73
+ # Render with 4x scaling for better quality
74
+ pil_image = page.render(scale=4).to_pil()
75
+
76
+ # Convert to bytes
77
+ img_byte_arr = io.BytesIO()
78
+ pil_image.save(img_byte_arr, format="PNG")
79
+ img_byte_arr.seek(0)
80
+ return img_byte_arr.getvalue()
81
+
82
+
83
+ def get_file_type(path: str) -> str:
84
+ """Get the file type of a file based on its extension."""
85
+ # mimetypes.guess_type returns None for unknown extensions; fall back to a
+ # generic type so callers can safely run substring/startswith checks.
+ return mimetypes.guess_type(path)[0] or "application/octet-stream"
86
+
87
+
88
+ def is_supported_file_type(path: str) -> bool:
89
+ """Check if the file type is supported for parsing."""
90
+ file_type = get_file_type(path)
91
+ if (
92
+ file_type == "application/pdf"
93
+ or "wordprocessing" in file_type
94
+ or file_type.startswith("image/")
95
+ or file_type.startswith("text")
96
+ ):
97
+ return True
98
+ return False
99
+
100
+
101
+ def is_supported_url_file_type(url: str) -> bool:
102
+ """
103
+ Check if the file type from the URL is supported.
104
+
105
+ Args:
106
+ url (str): The URL of the file.
107
+
108
+ Returns:
109
+ bool: True if the file type is supported, False otherwise.
110
+ """
111
+ supported_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"]
112
+ parsed_url = urlparse(url)
113
+ ext = os.path.splitext(parsed_url.path)[1].lower()
114
+
115
+ if ext in supported_extensions:
116
+ return True
117
+
118
+ # If no extension in URL, try to get content type from headers
119
+ try:
120
+ response = requests.head(url)
121
+ except requests.exceptions.ConnectionError:
122
+ return False
123
+ content_type = response.headers.get("Content-Type", "")
124
+ ext = mimetypes.guess_extension(content_type)
125
+
126
+ return ext in supported_extensions
127
+
128
+
129
+ def download_file(url: str, temp_dir: str) -> str:
130
+ """
131
+ Downloads a file from the given URL and saves it to a temporary directory.
132
+
133
+ Args:
134
+ url (str): The URL of the file to download.
135
+ temp_dir (str): The temporary directory to save the file.
136
+
137
+ Returns:
138
+ str: The path to the downloaded file.
139
+ """
140
+ response = requests.get(url)
141
+ file_name = os.path.basename(urlparse(url).path)
142
+ if not file_name:
143
+ content_type = response.headers.get("Content-Type", "")
144
+ ext = mimetypes.guess_extension(content_type)
145
+ file_name = f"downloaded_file{ext}" if ext else "downloaded_file"
146
+
147
+ file_path = os.path.join(temp_dir, file_name)
148
+ with open(file_path, "wb") as f:
149
+ f.write(response.content)
150
+ return file_path
151
+
152
+
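A hedged sketch combining the two URL helpers above; the URL is a hypothetical placeholder:

    import tempfile

    from lexoid.core.utils import download_file, is_supported_url_file_type

    url = "https://example.com/sample.pdf"  # hypothetical URL
    if is_supported_url_file_type(url):
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The file is saved inside tmp_dir and its local path is returned.
            local_path = download_file(url, tmp_dir)
            print(local_path)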
153
+ def find_dominant_heading_level(markdown_content: str) -> str:
154
+ """
155
+ Finds the most common heading level that occurs more than once.
156
+ Also checks for underline style headings (---).
157
+
158
+ Args:
159
+ markdown_content (str): The markdown content to analyze
160
+
161
+ Returns:
162
+ str: The dominant heading pattern (e.g., '##' or 'underline')
163
+ """
164
+ # Check for underline style headings first
165
+ underline_pattern = r"^[^\n]+\n-+$"
166
+ underline_matches = re.findall(underline_pattern, markdown_content, re.MULTILINE)
167
+ if len(underline_matches) > 1:
168
+ return "underline"
169
+
170
+ # Find all hash-style headings in the markdown content
171
+ heading_patterns = ["#####", "####", "###", "##", "#"]
172
+ heading_counts = {}
173
+
174
+ for pattern in heading_patterns:
175
+ # Look for headings at the start of a line
176
+ regex = f"^{pattern} .*$"
177
+ matches = re.findall(regex, markdown_content, re.MULTILINE)
178
+ if len(matches) > 1: # Only consider headings that appear more than once
179
+ heading_counts[pattern] = len(matches)
180
+
181
+ if not heading_counts:
182
+ return "#" # Default to h1 if no repeated headings found
183
+
184
+ return min(heading_counts.keys(), key=len)
185
+
186
+
187
+ def split_md_by_headings(
188
+ markdown_content: str, heading_pattern: str, title: str
189
+ ) -> List[Dict]:
190
+ """
191
+ Splits markdown content by the specified heading pattern and structures it.
192
+
193
+ Args:
194
+ markdown_content (str): The markdown content to split
196
+ heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')
197
+ title (str): The title to attach to each section's metadata
197
+
198
+ Returns:
199
+ List[Dict]: List of dictionaries containing metadata and content
200
+ """
201
+ structured_content = []
202
+
203
+ if heading_pattern == "underline":
204
+ # Split by underline headings
205
+ pattern = r"^([^\n]+)\n-+$"
206
+ sections = re.split(pattern, markdown_content, flags=re.MULTILINE)
207
+ # Remove empty sections and strip whitespace
208
+ sections = [section.strip() for section in sections if section.strip()]
209
+
210
+ # Handle content before first heading if it exists
211
+ if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
212
+ structured_content.append(
213
+ {
214
+ "metadata": {"title": title, "page": "Introduction"},
215
+ "content": sections.pop(0),
216
+ }
217
+ )
218
+
219
+ # Process sections pairwise (heading, content)
220
+ for i in range(0, len(sections), 2):
221
+ if i + 1 < len(sections):
222
+ structured_content.append(
223
+ {
224
+ "metadata": {"title": title, "page": sections[i]},
225
+ "content": sections[i + 1],
226
+ }
227
+ )
228
+ else:
229
+ # Split by hash headings
230
+ regex = f"^{heading_pattern} .*$"
231
+ sections = re.split(regex, markdown_content, flags=re.MULTILINE)
232
+ headings = re.findall(regex, markdown_content, flags=re.MULTILINE)
233
+
234
+ # Remove empty sections and strip whitespace
235
+ sections = [section.strip() for section in sections if section.strip()]
236
+
237
+ # Handle content before first heading if it exists
238
+ if len(sections) > len(headings):
239
+ structured_content.append(
240
+ {
241
+ "metadata": {"title": title, "page": "Introduction"},
242
+ "content": sections.pop(0),
243
+ }
244
+ )
245
+
246
+ # Process remaining sections
247
+ for heading, content in zip(headings, sections):
248
+ clean_heading = heading.replace(heading_pattern, "").strip()
249
+ structured_content.append(
250
+ {
251
+ "metadata": {"title": title, "page": clean_heading},
252
+ "content": content,
253
+ }
254
+ )
255
+
256
+ return structured_content
257
+
258
+
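A small sketch, not part of the packaged file, of how the two heading helpers combine on ad-hoc markdown; the sample text is made up:

    from lexoid.core.utils import find_dominant_heading_level, split_md_by_headings

    sample = "Intro text\n\n## Setup\nInstall it.\n\n## Usage\nRun it.\n"
    pattern = find_dominant_heading_level(sample)  # "##" (only level appearing more than once)
    sections = split_md_by_headings(sample, pattern, title="Sample Doc")
    # -> [{"metadata": {"title": "Sample Doc", "page": "Introduction"}, "content": "Intro text"},
    #     {"metadata": {"title": "Sample Doc", "page": "Setup"}, "content": "Install it."},
    #     {"metadata": {"title": "Sample Doc", "page": "Usage"}, "content": "Run it."}]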
259
+ def html_to_markdown(html: str, raw: bool, title: str) -> Union[str, List[Dict]]:
260
+ """
261
+ Converts HTML content to markdown.
262
+
263
+ Args:
264
+ html (str): The HTML content to convert.
265
+ raw (bool): Whether to return raw markdown text or structured data.
+ title (str): The title attached to each section's metadata when structured data is returned.
266
+
267
+ Returns:
268
+ Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
269
+ """
270
+ markdown_content = md(html)
271
+
272
+ if raw:
273
+ return markdown_content
274
+
275
+ # Find the dominant heading level
276
+ heading_pattern = find_dominant_heading_level(markdown_content)
277
+
278
+ # Split content by headings and structure it
279
+ return split_md_by_headings(markdown_content, heading_pattern, title)
280
+
281
+
282
+ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
283
+ """
284
+ Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
285
+
286
+ Args:
287
+ url (str): The URL of the HTML page.
288
+ raw (bool): Whether to return raw markdown text or structured data.
289
+
290
+ Returns:
291
+ Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
292
+ """
293
+
294
+ try:
295
+ from playwright.async_api import async_playwright
296
+
297
+ nest_asyncio.apply()
298
+
299
+ async def fetch_page():
300
+ async with async_playwright() as p:
301
+ browser = await p.chromium.launch(headless=True)
302
+ page = await browser.new_page()
303
+ await page.goto(url)
304
+ html = await page.content()
305
+ await browser.close()
306
+ return html
307
+
308
+ loop = asyncio.get_event_loop()
309
+ html = loop.run_until_complete(fetch_page())
310
+ soup = BeautifulSoup(html, "html.parser")
311
+ except Exception as e:
312
+ logger.debug(
313
+ f"Error reading HTML content from URL, attempting with default https request: {str(e)}"
314
+ )
315
+ response = requests.get(url)
316
+ soup = BeautifulSoup(
317
+ response.content, "html.parser", from_encoding="iso-8859-1"
318
+ )
319
+ return html_to_markdown(str(soup), raw, title=url)
320
+
321
+
322
+ def extract_urls_from_markdown(content: str) -> List[str]:
323
+ """
324
+ Extracts URLs from markdown content using regex.
325
+ Matches both [text](url) and bare http(s):// URLs.
326
+
327
+ Args:
328
+ content (str): Markdown content to search for URLs
329
+
330
+ Returns:
331
+ List[str]: List of unique URLs found
332
+ """
333
+ # Match markdown links [text](url) and bare URLs
334
+ markdown_pattern = r"\[([^\]]+)\]\((https?://[^\s\)]+)\)"
335
+ bare_url_pattern = r"(?<!\()(https?://[^\s\)]+)"
336
+
337
+ urls = []
338
+ # Extract URLs from markdown links
339
+ urls.extend(match.group(2) for match in re.finditer(markdown_pattern, content))
340
+ # Extract bare URLs
341
+ urls.extend(match.group(0) for match in re.finditer(bare_url_pattern, content))
342
+
343
+ return list(set(urls)) # Remove duplicates
344
+
345
+
346
+ def recursive_read_html(
347
+ url: str, depth: int, raw: bool, visited_urls: set = None
348
+ ) -> Union[str, List[Dict]]:
349
+ """
350
+ Recursively reads HTML content from URLs up to specified depth.
351
+
352
+ Args:
353
+ url (str): The URL to parse
354
+ depth (int): How many levels deep to recursively parse
355
+ raw (bool): Whether to return raw text or structured data
356
+ visited_urls (set): Set of already visited URLs to prevent cycles
357
+
358
+ Returns:
359
+ Union[str, List[Dict]]: Combined content from all parsed URLs
360
+ """
361
+ if visited_urls is None:
362
+ visited_urls = set()
363
+
364
+ if url in visited_urls:
365
+ return "" if raw else []
366
+
367
+ visited_urls.add(url)
368
+
369
+ try:
370
+ content = read_html_content(url, raw)
371
+ except Exception as e:
372
+ print(f"Error processing URL {url}: {str(e)}")
373
+ return "" if raw else []
374
+
375
+ if depth <= 1:
376
+ return content
377
+
378
+ # Extract URLs from the content
379
+ if raw:
380
+ urls = extract_urls_from_markdown(content)
381
+ else:
382
+ # Extract URLs from all content sections
383
+ urls = []
384
+ for doc in content:
385
+ urls.extend(extract_urls_from_markdown(doc["content"]))
386
+
387
+ # Recursively process each URL
388
+ for sub_url in urls:
389
+ if sub_url not in visited_urls:
390
+ sub_content = recursive_read_html(sub_url, depth - 1, raw, visited_urls)
391
+
392
+ if raw:
393
+ if sub_content:
394
+ content += f"\n\n--- Begin content from {sub_url} ---\n\n"
395
+ content += sub_content
396
+ content += f"\n\n--- End content from {sub_url} ---\n\n"
397
+ else:
398
+ if isinstance(sub_content, list):
399
+ content.extend(sub_content)
400
+
401
+ return content
402
+
403
+
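For illustration (not part of the packaged file), fetching a page and following its links one level deep; the URL is hypothetical:

    from lexoid.core.utils import recursive_read_html

    # depth=2 parses the page itself plus every URL found in it; the shared
    # visited_urls set prevents revisiting pages across recursive calls.
    docs = recursive_read_html("https://example.com/docs", depth=2, raw=False)
    for doc in docs:
        print(doc["metadata"]["page"], len(doc["content"]))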
404
+ def save_webpage_as_pdf(url: str, output_path: str) -> str:
405
+ """
406
+ Saves a webpage as a PDF file using PyQt5.
407
+
408
+ Args:
409
+ url (str): The URL of the webpage.
410
+ output_path (str): The path to save the PDF file.
411
+
412
+ Returns:
413
+ str: The path to the saved PDF file.
414
+ """
415
+ app = QApplication(sys.argv)
416
+ web = QWebEngineView()
417
+ web.load(QUrl(url))
418
+
419
+ def handle_print_finished(filename, status):
420
+ print(f"PDF saved to: {filename}")
421
+ app.quit()
422
+
423
+ def handle_load_finished(status):
424
+ if status:
425
+ printer = QPrinter(QPrinter.HighResolution)
426
+ printer.setOutputFormat(QPrinter.PdfFormat)
427
+ printer.setOutputFileName(output_path)
428
+
429
+ page_layout = QPageLayout(
430
+ QPageSize(QPageSize.A4), QPageLayout.Portrait, QMarginsF(15, 15, 15, 15)
431
+ )
432
+ printer.setPageLayout(page_layout)
433
+
434
+ # Pass the layout explicitly; printToPdf does not read the QPrinter settings.
+ web.page().printToPdf(output_path, page_layout)
435
+ web.page().pdfPrintingFinished.connect(handle_print_finished)
436
+
437
+ web.loadFinished.connect(handle_load_finished)
438
+ app.exec_()
439
+
440
+ return output_path
441
+
442
+
443
+ def convert_to_pdf(input_path: str, output_path: str) -> str:
444
+ """
445
+ Converts a file or webpage to PDF.
446
+
447
+ Args:
448
+ input_path (str): The path to the input file or URL.
449
+ output_path (str): The path to save the output PDF file.
450
+
451
+ Returns:
452
+ str: The path to the saved PDF file.
453
+ """
454
+ if input_path.startswith(("http://", "https://")):
455
+ return save_webpage_as_pdf(input_path, output_path)
456
+ file_type = get_file_type(input_path)
457
+ if file_type.startswith("image/"):
458
+ img_data = convert_image_to_pdf(input_path)
459
+ with open(output_path, "wb") as f:
460
+ f.write(img_data)
461
+ elif "word" in file_type:
462
+ return convert_doc_to_pdf(input_path, os.path.dirname(output_path))
463
+ else:
464
+ # Assume it's already a PDF, just copy it
465
+ with open(input_path, "rb") as src, open(output_path, "wb") as dst:
466
+ dst.write(src.read())
467
+
468
+ return output_path
469
+
470
+
471
+ def has_image_in_pdf(path: str):
472
+ with open(path, "rb") as fp:
473
+ content = fp.read()
474
+ return "Image".lower() in list(
475
+ map(lambda x: x.strip(), (str(content).lower().split("/")))
476
+ )
477
+
478
+
479
+ def has_hyperlink_in_pdf(path: str):
480
+ with open(path, "rb") as fp:
481
+ content = fp.read()
482
+ # The /URI tag is present even when links are hidden behind anchor text.
483
+ return "URI".lower() in list(
484
+ map(lambda x: x.strip(), (str(content).lower().split("/")))
485
+ )
486
+
487
+
488
+ def router(path: str):
489
+ file_type = get_file_type(path)
490
+ if file_type.startswith("text/"):
491
+ return "STATIC_PARSE"
492
+ # Naive routing strategy for now.
493
+ # Current strategy:
494
+ # 1. If the PDF has hyperlinks (possibly hidden behind link text) and no images: STATIC_PARSE
495
+ # 2. All other scenarios: LLM_PARSE
496
+ # If you have other needs, reach out or create an issue.
497
+ if (
498
+ file_type == "application/pdf"
499
+ and not has_image_in_pdf(path)
500
+ and has_hyperlink_in_pdf(path)
501
+ ):
502
+ return "STATIC_PARSE"
503
+ return "LLM_PARSE"
504
+
505
+
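A hedged illustration, not part of the packaged file, of how router() might drive parser selection downstream; the input path is hypothetical:

    from lexoid.core.utils import router

    path = "contract.pdf"  # hypothetical input file
    route = router(path)
    if route == "STATIC_PARSE":
        # Text files, or PDFs that have hyperlinks and no images: cheap static extraction.
        print("use the static parser for", path)
    else:
        # Everything else is routed to the LLM-based parser.
        print("use the LLM parser for", path)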
506
+ def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
507
+ temp_path = os.path.join(
508
+ temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
509
+ )
510
+
511
+ # Convert the document to PDF
512
+ # docx2pdf is not supported on Linux; use LibreOffice (lowriter) instead.
514
+ # LibreOffice must be installed separately if it is not already available.
514
+ if "linux" in sys.platform.lower():
515
+ os.system(
516
+ f'lowriter --headless --convert-to pdf --outdir "{temp_dir}" "{input_path}"'
517
+ )
518
+ else:
519
+ convert(input_path, temp_path)
520
+
521
+ # Return the path of the converted PDF
522
+ return temp_path
523
+
524
+
525
+ def get_uri_rect(path):
526
+ with open(path, "rb") as fp:
527
+ byte_str = str(fp.read())
528
+ pattern = r"\((https?://[^\s)]+)\)"
529
+ uris = re.findall(pattern, byte_str)
530
+ rect_splits = byte_str.split("/Rect [")[1:]
531
+ rects = [
532
+ list(map(float, rect_split.split("]")[0].split())) for rect_split in rect_splits
533
+ ]
534
+ return {uri: rect for uri, rect in zip(uris, rects)}
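As a closing sketch (not part of the packaged file): get_uri_rect scans the raw PDF bytes and pairs each (http...) URI with the /Rect array that follows it; a quick way to inspect the mapping, with a hypothetical file name:

    from lexoid.core.utils import get_uri_rect

    for uri, rect in get_uri_rect("linked.pdf").items():
        # rect is [x0, y0, x1, y1] in PDF user-space coordinates.
        print(uri, rect)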