lexoid-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py ADDED
@@ -0,0 +1,200 @@
+import os
+import re
+import tempfile
+from concurrent.futures import ProcessPoolExecutor
+from enum import Enum
+from glob import glob
+from time import time
+from typing import Union, Dict, List
+
+from loguru import logger
+
+from lexoid.core.parse_type.llm_parser import parse_llm_doc
+from lexoid.core.parse_type.static_parser import parse_static_doc
+from lexoid.core.utils import (
+    convert_to_pdf,
+    download_file,
+    is_supported_url_file_type,
+    is_supported_file_type,
+    recursive_read_html,
+    router,
+    split_pdf,
+)
+
+
+class ParserType(Enum):
+    LLM_PARSE = "LLM_PARSE"
+    STATIC_PARSE = "STATIC_PARSE"
+    AUTO = "AUTO"
+
+
+def parse_chunk(
+    path: str, parser_type: ParserType, raw: bool, **kwargs
+) -> List[Dict] | str:
+    """
+    Parses a file using the specified parser type.
+
+    Args:
+        path (str): The file path or URL.
+        parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
+        raw (bool): Whether to return raw text or structured data.
+        **kwargs: Additional arguments for the parser.
+
+    Returns:
+        List[Dict] | str: Parsed document data as a list of dictionaries or raw text.
+    """
+    if parser_type == ParserType.AUTO:
+        parser_type = ParserType[router(path)]
+        logger.debug(f"Auto-detected parser type: {parser_type}")
+
+    kwargs["start"] = (
+        int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
+    )
+    if parser_type == ParserType.STATIC_PARSE:
+        logger.debug("Using static parser")
+        return parse_static_doc(path, raw, **kwargs)
+    else:
+        logger.debug("Using LLM parser")
+        return parse_llm_doc(path, raw, **kwargs)
+
+
+def parse_chunk_list(
+    file_paths: List[str], parser_type: ParserType, raw: bool, kwargs: Dict
+) -> List[Dict | str]:
+    """
+    Parses a list of files using the specified parser type.
+
+    Args:
+        file_paths (list): List of file paths.
+        parser_type (ParserType): The type of parser to use.
+        raw (bool): Whether to return raw text or structured data.
+        kwargs (dict): Additional arguments for the parser.
+
+    Returns:
+        List[Dict | str]: List of parsed documents with raw text and/or metadata.
+    """
+    local_docs = []
+    for file_path in file_paths:
+        result = parse_chunk(file_path, parser_type, raw, **kwargs)
+        if isinstance(result, list):
+            local_docs.extend(result)
+        else:
+            local_docs.append(result.replace("<page break>", "\n\n"))
+    return local_docs
+
+
+def parse(
+    path: str,
+    parser_type: Union[str, ParserType] = "LLM_PARSE",
+    raw: bool = False,
+    pages_per_split: int = 4,
+    max_processes: int = 4,
+    **kwargs,
+) -> Union[List[Dict], str]:
+    """
+    Parses a document or URL, optionally splitting it into chunks and using multiprocessing.
+
+    Args:
+        path (str): The file path or URL.
+        parser_type (Union[str, ParserType], optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "LLM_PARSE".
+        raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
+        pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
+        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+        **kwargs: Additional arguments for the parser.
+
+    Returns:
+        Union[List[Dict], str]: Parsed document data as a list of dictionaries or raw text.
+    """
+    kwargs["title"] = os.path.basename(path)
+    kwargs["pages_per_split_"] = pages_per_split
+    as_pdf = kwargs.get("as_pdf", False)
+    depth = kwargs.get("depth", 1)
+    if type(parser_type) == str:
+        parser_type = ParserType[parser_type]
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        if (
+            path.lower().endswith((".doc", ".docx"))
+            and parser_type != ParserType.STATIC_PARSE
+        ):
+            as_pdf = True
+
+        if path.startswith(("http://", "https://")):
+            download_dir = os.path.join(temp_dir, "downloads/")
+            os.makedirs(download_dir, exist_ok=True)
+            if is_supported_url_file_type(path):
+                path = download_file(path, download_dir)
+            elif as_pdf:
+                pdf_path = os.path.join(download_dir, f"webpage_{int(time())}.pdf")
+                path = convert_to_pdf(path, pdf_path)
+            else:
+                return recursive_read_html(path, depth, raw)
+
+        assert is_supported_file_type(
+            path
+        ), f"Unsupported file type {os.path.splitext(path)[1]}"
+
+        if as_pdf and not path.lower().endswith(".pdf"):
+            pdf_path = os.path.join(temp_dir, "converted.pdf")
+            path = convert_to_pdf(path, pdf_path)
+
+        if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
+            kwargs["split"] = False
+            all_docs = parse_chunk(path, parser_type, raw, **kwargs)
+            if raw:
+                all_docs = [all_docs]
+        else:
+            kwargs["split"] = True
+            split_dir = os.path.join(temp_dir, "splits/")
+            os.makedirs(split_dir, exist_ok=True)
+            split_pdf(path, split_dir, pages_per_split)
+            split_files = sorted(glob(os.path.join(split_dir, "*.pdf")))
+
+            chunk_size = max(1, len(split_files) // max_processes)
+            file_chunks = [
+                split_files[i : i + chunk_size]
+                for i in range(0, len(split_files), chunk_size)
+            ]
+
+            process_args = [(chunk, parser_type, raw, kwargs) for chunk in file_chunks]
+
+            if max_processes == 1 or len(file_chunks) == 1:
+                all_docs = [parse_chunk_list(*args) for args in process_args]
+            else:
+                with ProcessPoolExecutor(max_workers=max_processes) as executor:
+                    all_docs = list(executor.map(parse_chunk_list, *zip(*process_args)))

+            all_docs = [item for sublist in all_docs for item in sublist]
+
+        if depth > 1:
+            new_docs = all_docs.copy()
+            for doc in all_docs:
+                urls = re.findall(
+                    r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
+                    doc if raw else doc["content"],
+                )
+                for url in urls:
+                    if "](" in url:
+                        url = url.split("](")[-1]
+                    logger.debug(f"Reading content from {url}")
+                    if not url.startswith("http"):
+                        url = "https://" + url
+
+                    kwargs_cp = kwargs.copy()
+                    kwargs_cp["depth"] = depth - 1
+                    res = parse(
+                        url,
+                        parser_type=parser_type,
+                        raw=raw,
+                        pages_per_split=pages_per_split,
+                        max_processes=max_processes,
+                        **kwargs_cp,
+                    )
+
+                    if raw:
+                        new_docs.append(res)
+                    else:
+                        new_docs.extend(res)
+            all_docs = new_docs
+
+    return "\n".join(all_docs) if raw else all_docs
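For reference, here is a minimal usage sketch of the `parse` entry point added above. It is not part of the diff: "sample.pdf" is a placeholder path, and the default LLM route additionally expects the API credentials handled by the LLM parser module in the next file.

# Illustrative sketch only; "sample.pdf" is a hypothetical local file.
from lexoid.api import parse

# Structured mode (default): a list of dicts, one per parsed page,
# each with "metadata" (title, page number) and "content" (the parsed text).
docs = parse("sample.pdf", parser_type="AUTO", pages_per_split=4, max_processes=4)
for doc in docs:
    print(doc["metadata"]["page"], doc["content"][:80])

# Raw mode: the page texts are returned joined into a single string.
text = parse("sample.pdf", parser_type="STATIC_PARSE", raw=True)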
lexoid/core/parse_type/llm_parser.py ADDED
@@ -0,0 +1,200 @@
+import base64
+import io
+import mimetypes
+import os
+from typing import Dict, List
+
+import pypdfium2 as pdfium
+import requests
+from lexoid.core.prompt_templates import (
+    INSTRUCTIONS_ADD_PG_BREAK,
+    OPENAI_USER_PROMPT,
+    PARSER_PROMPT,
+)
+from lexoid.core.utils import convert_image_to_pdf
+from loguru import logger
+from openai import OpenAI
+
+
+def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    if "model" not in kwargs:
+        kwargs["model"] = "gemini-1.5-flash"
+    model = kwargs.get("model")
+    if model.startswith("gemini"):
+        return parse_with_gemini(path, raw, **kwargs)
+    elif model.startswith("gpt"):
+        return parse_with_gpt(path, raw, **kwargs)
+    else:
+        raise ValueError(f"Unsupported model: {model}")
+
+
+def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    api_key = os.environ.get("GOOGLE_API_KEY")
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY environment variable is not set")
+
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
+
+    # Check if the file is an image and convert to PDF if necessary
+    mime_type, _ = mimetypes.guess_type(path)
+    if mime_type and mime_type.startswith("image"):
+        pdf_content = convert_image_to_pdf(path)
+        mime_type = "application/pdf"
+        base64_file = base64.b64encode(pdf_content).decode("utf-8")
+    else:
+        with open(path, "rb") as file:
+            file_content = file.read()
+        base64_file = base64.b64encode(file_content).decode("utf-8")
+
+    # Ideally, we do this ourselves. But, for now this might be a good enough.
+    custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
+    if kwargs["pages_per_split_"] == 1:
+        custom_instruction = ""
+
+    payload = {
+        "contents": [
+            {
+                "parts": [
+                    {
+                        "text": PARSER_PROMPT.format(
+                            custom_instructions=custom_instruction
+                        )
+                    },
+                    {"inline_data": {"mime_type": mime_type, "data": base64_file}},
+                ]
+            }
+        ],
+        "generationConfig": {
+            "temperature": kwargs.get("temperature", 0.7),
+        },
+    }
+
+    headers = {"Content-Type": "application/json"}
+
+    response = requests.post(url, json=payload, headers=headers)
+    response.raise_for_status()
+
+    result = response.json()
+
+    raw_text = "".join(
+        part["text"]
+        for candidate in result.get("candidates", [])
+        for part in candidate.get("content", {}).get("parts", [])
+        if "text" in part
+    )
+
+    result = ""
+    if "<output>" in raw_text:
+        result = raw_text.split("<output>")[1].strip()
+        if "</output>" in result:
+            result = result.split("</output>")[0].strip()
+
+    if raw:
+        return result
+
+    return [
+        {
+            "metadata": {
+                "title": kwargs["title"],
+                "page": kwargs.get("start", 0) + page_no,
+            },
+            "content": page,
+        }
+        for page_no, page in enumerate(result.split("<page-break>"), start=1)
+        if page.strip()
+    ]
+
+
+def convert_pdf_page_to_base64(
+    pdf_document: pdfium.PdfDocument, page_number: int
+) -> str:
+    """Convert a PDF page to a base64-encoded PNG string."""
+    page = pdf_document[page_number]
+    # Render with 4x scaling for better quality
+    pil_image = page.render(scale=4).to_pil()
+
+    # Convert to base64
+    img_byte_arr = io.BytesIO()
+    pil_image.save(img_byte_arr, format="PNG")
+    img_byte_arr.seek(0)
+    return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+
+
+def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    client = OpenAI()
+
+    # Handle different input types
+    mime_type, _ = mimetypes.guess_type(path)
+    if mime_type and mime_type.startswith("image"):
+        # Single image processing
+        with open(path, "rb") as img_file:
+            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+            images = [(0, image_base64)]
+    else:
+        # PDF processing
+        pdf_document = pdfium.PdfDocument(path)
+        images = [
+            (page_num, convert_pdf_page_to_base64(pdf_document, page_num))
+            for page_num in range(len(pdf_document))
+        ]
+
+    # Process each page/image
+    all_results = []
+    for page_num, image_base64 in images:
+        messages = [
+            {
+                "role": "system",
+                "content": PARSER_PROMPT,
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                    },
+                ],
+            },
+        ]
+
+        # Get completion from GPT-4 Vision
+        response = client.chat.completions.create(
+            model=kwargs["model"],
+            temperature=kwargs.get("temperature", 0.7),
+            messages=messages,
+        )
+
+        # Extract the response text
+        page_text = response.choices[0].message.content
+        if kwargs.get("verbose", None):
+            logger.debug(f"Page {page_num + 1} response: {page_text}")
+        result = ""
+        if "<output>" in page_text:
+            result = page_text.split("<output>")[1].strip()
+            if "</output>" in result:
+                result = result.split("</output>")[0].strip()
+        all_results.append((page_num, result))
+
+    # Sort results by page number and combine
+    all_results.sort(key=lambda x: x[0])
+    all_texts = [text for _, text in all_results]
+    combined_text = "<page-break>".join(all_texts)
+
+    if raw:
+        return combined_text
+
+    return [
+        {
+            "metadata": {
+                "title": kwargs["title"],
+                "page": kwargs.get("start", 0) + page_no,
+            },
+            "content": page,
+        }
+        for page_no, page in enumerate(all_texts, start=1)
+        if page.strip()
+    ]
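A short sketch of how the model routing in parse_llm_doc above surfaces through the top-level parse call. It is illustrative, not part of the diff: "report.pdf" is a placeholder path, the Gemini branch reads GOOGLE_API_KEY from the environment, and the GPT branch relies on the OpenAI client's standard OPENAI_API_KEY lookup.

# Illustrative sketch only; "report.pdf" is a hypothetical local file.
import os
from lexoid.api import parse

# Default: kwargs["model"] falls back to "gemini-1.5-flash",
# so parse_with_gemini handles the request and GOOGLE_API_KEY must be set.
os.environ.setdefault("GOOGLE_API_KEY", "<your-key>")
gemini_docs = parse("report.pdf", parser_type="LLM_PARSE")

# A model name starting with "gpt" routes to parse_with_gpt,
# which constructs OpenAI() and therefore needs OPENAI_API_KEY.
gpt_docs = parse("report.pdf", parser_type="LLM_PARSE", model="gpt-4o-mini", temperature=0.2)

# Any other model name raises ValueError("Unsupported model: ...").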