lexoid-0.1.6-py3-none-any.whl
This diff represents the content of a publicly available package version released to one of the supported registries. The information is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- lexoid/api.py +200 -0
- lexoid/core/parse_type/llm_parser.py +200 -0
- lexoid/core/parse_type/static_parser.py +350 -0
- lexoid/core/prompt_templates.py +78 -0
- lexoid/core/utils.py +534 -0
- lexoid-0.1.6.dist-info/LICENSE +201 -0
- lexoid-0.1.6.dist-info/METADATA +102 -0
- lexoid-0.1.6.dist-info/RECORD +9 -0
- lexoid-0.1.6.dist-info/WHEEL +4 -0
lexoid/api.py
ADDED
@@ -0,0 +1,200 @@
import os
import re
import tempfile
from concurrent.futures import ProcessPoolExecutor
from enum import Enum
from glob import glob
from time import time
from typing import Union, Dict, List

from loguru import logger

from lexoid.core.parse_type.llm_parser import parse_llm_doc
from lexoid.core.parse_type.static_parser import parse_static_doc
from lexoid.core.utils import (
    convert_to_pdf,
    download_file,
    is_supported_url_file_type,
    is_supported_file_type,
    recursive_read_html,
    router,
    split_pdf,
)


class ParserType(Enum):
    LLM_PARSE = "LLM_PARSE"
    STATIC_PARSE = "STATIC_PARSE"
    AUTO = "AUTO"


def parse_chunk(
    path: str, parser_type: ParserType, raw: bool, **kwargs
) -> List[Dict] | str:
    """
    Parses a file using the specified parser type.

    Args:
        path (str): The file path or URL.
        parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
        raw (bool): Whether to return raw text or structured data.
        **kwargs: Additional arguments for the parser.

    Returns:
        List[Dict] | str: Parsed document data as a list of dictionaries or raw text.
    """
    if parser_type == ParserType.AUTO:
        parser_type = ParserType[router(path)]
        logger.debug(f"Auto-detected parser type: {parser_type}")

    kwargs["start"] = (
        int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
    )
    if parser_type == ParserType.STATIC_PARSE:
        logger.debug("Using static parser")
        return parse_static_doc(path, raw, **kwargs)
    else:
        logger.debug("Using LLM parser")
        return parse_llm_doc(path, raw, **kwargs)


def parse_chunk_list(
    file_paths: List[str], parser_type: ParserType, raw: bool, kwargs: Dict
) -> List[Dict | str]:
    """
    Parses a list of files using the specified parser type.

    Args:
        file_paths (list): List of file paths.
        parser_type (ParserType): The type of parser to use.
        raw (bool): Whether to return raw text or structured data.
        kwargs (dict): Additional arguments for the parser.

    Returns:
        List[Dict | str]: List of parsed documents with raw text and/or metadata.
    """
    local_docs = []
    for file_path in file_paths:
        result = parse_chunk(file_path, parser_type, raw, **kwargs)
        if isinstance(result, list):
            local_docs.extend(result)
        else:
            local_docs.append(result.replace("<page break>", "\n\n"))
    return local_docs


def parse(
    path: str,
    parser_type: Union[str, ParserType] = "LLM_PARSE",
    raw: bool = False,
    pages_per_split: int = 4,
    max_processes: int = 4,
    **kwargs,
) -> Union[List[Dict], str]:
    """
    Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

    Args:
        path (str): The file path or URL.
        parser_type (Union[str, ParserType], optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "LLM_PARSE".
        raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
        pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
        **kwargs: Additional arguments for the parser.

    Returns:
        Union[List[Dict], str]: Parsed document data as a list of dictionaries or raw text.
    """
    kwargs["title"] = os.path.basename(path)
    kwargs["pages_per_split_"] = pages_per_split
    as_pdf = kwargs.get("as_pdf", False)
    depth = kwargs.get("depth", 1)
    if type(parser_type) == str:
        parser_type = ParserType[parser_type]

    with tempfile.TemporaryDirectory() as temp_dir:
        if (
            path.lower().endswith((".doc", ".docx"))
            and parser_type != ParserType.STATIC_PARSE
        ):
            as_pdf = True

        if path.startswith(("http://", "https://")):
            download_dir = os.path.join(temp_dir, "downloads/")
            os.makedirs(download_dir, exist_ok=True)
            if is_supported_url_file_type(path):
                path = download_file(path, download_dir)
            elif as_pdf:
                pdf_path = os.path.join(download_dir, f"webpage_{int(time())}.pdf")
                path = convert_to_pdf(path, pdf_path)
            else:
                return recursive_read_html(path, depth, raw)

        assert is_supported_file_type(
            path
        ), f"Unsupported file type {os.path.splitext(path)[1]}"

        if as_pdf and not path.lower().endswith(".pdf"):
            pdf_path = os.path.join(temp_dir, "converted.pdf")
            path = convert_to_pdf(path, pdf_path)

        if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
            kwargs["split"] = False
            all_docs = parse_chunk(path, parser_type, raw, **kwargs)
            if raw:
                all_docs = [all_docs]
        else:
            kwargs["split"] = True
            split_dir = os.path.join(temp_dir, "splits/")
            os.makedirs(split_dir, exist_ok=True)
            split_pdf(path, split_dir, pages_per_split)
            split_files = sorted(glob(os.path.join(split_dir, "*.pdf")))

            chunk_size = max(1, len(split_files) // max_processes)
            file_chunks = [
                split_files[i : i + chunk_size]
                for i in range(0, len(split_files), chunk_size)
            ]

            process_args = [(chunk, parser_type, raw, kwargs) for chunk in file_chunks]

            if max_processes == 1 or len(file_chunks) == 1:
                all_docs = [parse_chunk_list(*args) for args in process_args]
            else:
                with ProcessPoolExecutor(max_workers=max_processes) as executor:
                    all_docs = list(executor.map(parse_chunk_list, *zip(*process_args)))

            all_docs = [item for sublist in all_docs for item in sublist]

        if depth > 1:
            new_docs = all_docs.copy()
            for doc in all_docs:
                urls = re.findall(
                    r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
                    doc if raw else doc["content"],
                )
                for url in urls:
                    if "](" in url:
                        url = url.split("](")[-1]
                    logger.debug(f"Reading content from {url}")
                    if not url.startswith("http"):
                        url = "https://" + url

                    kwargs_cp = kwargs.copy()
                    kwargs_cp["depth"] = depth - 1
                    res = parse(
                        url,
                        parser_type=parser_type,
                        raw=raw,
                        pages_per_split=pages_per_split,
                        max_processes=max_processes,
                        **kwargs_cp,
                    )

                    if raw:
                        new_docs.append(res)
                    else:
                        new_docs.extend(res)
            all_docs = new_docs

    return "\n".join(all_docs) if raw else all_docs
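A minimal usage sketch of the parse entry point above, not part of the package itself. The path "sample.pdf" is a placeholder; with the default LLM route, the Gemini key handling in llm_parser.py below applies, so GOOGLE_API_KEY must be set in the environment.

# Hypothetical usage sketch; "sample.pdf" is a placeholder file.
from lexoid.api import parse, ParserType

# Structured output: a list of {"metadata": {...}, "content": ...} dicts.
docs = parse("sample.pdf", parser_type=ParserType.AUTO, pages_per_split=4, max_processes=4)
for doc in docs:
    print(doc["metadata"]["page"], doc["content"][:80])

# raw=True returns a single concatenated string instead of a list of page dicts.
text = parse("sample.pdf", parser_type="LLM_PARSE", raw=True)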
lexoid/core/parse_type/llm_parser.py
ADDED
@@ -0,0 +1,200 @@
import base64
import io
import mimetypes
import os
from typing import Dict, List

import pypdfium2 as pdfium
import requests
from lexoid.core.prompt_templates import (
    INSTRUCTIONS_ADD_PG_BREAK,
    OPENAI_USER_PROMPT,
    PARSER_PROMPT,
)
from lexoid.core.utils import convert_image_to_pdf
from loguru import logger
from openai import OpenAI


def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    if "model" not in kwargs:
        kwargs["model"] = "gemini-1.5-flash"
    model = kwargs.get("model")
    if model.startswith("gemini"):
        return parse_with_gemini(path, raw, **kwargs)
    elif model.startswith("gpt"):
        return parse_with_gpt(path, raw, **kwargs)
    else:
        raise ValueError(f"Unsupported model: {model}")


def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable is not set")

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"

    # Check if the file is an image and convert to PDF if necessary
    mime_type, _ = mimetypes.guess_type(path)
    if mime_type and mime_type.startswith("image"):
        pdf_content = convert_image_to_pdf(path)
        mime_type = "application/pdf"
        base64_file = base64.b64encode(pdf_content).decode("utf-8")
    else:
        with open(path, "rb") as file:
            file_content = file.read()
        base64_file = base64.b64encode(file_content).decode("utf-8")

    # Ideally, we would do this ourselves, but for now this might be good enough.
    custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
    if kwargs["pages_per_split_"] == 1:
        custom_instruction = ""

    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "text": PARSER_PROMPT.format(
                            custom_instructions=custom_instruction
                        )
                    },
                    {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                ]
            }
        ],
        "generationConfig": {
            "temperature": kwargs.get("temperature", 0.7),
        },
    }

    headers = {"Content-Type": "application/json"}

    response = requests.post(url, json=payload, headers=headers)
    response.raise_for_status()

    result = response.json()

    raw_text = "".join(
        part["text"]
        for candidate in result.get("candidates", [])
        for part in candidate.get("content", {}).get("parts", [])
        if "text" in part
    )

    result = ""
    if "<output>" in raw_text:
        result = raw_text.split("<output>")[1].strip()
    if "</output>" in result:
        result = result.split("</output>")[0].strip()

    if raw:
        return result

    return [
        {
            "metadata": {
                "title": kwargs["title"],
                "page": kwargs.get("start", 0) + page_no,
            },
            "content": page,
        }
        for page_no, page in enumerate(result.split("<page-break>"), start=1)
        if page.strip()
    ]


def convert_pdf_page_to_base64(
    pdf_document: pdfium.PdfDocument, page_number: int
) -> str:
    """Convert a PDF page to a base64-encoded PNG string."""
    page = pdf_document[page_number]
    # Render with 4x scaling for better quality
    pil_image = page.render(scale=4).to_pil()

    # Convert to base64
    img_byte_arr = io.BytesIO()
    pil_image.save(img_byte_arr, format="PNG")
    img_byte_arr.seek(0)
    return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    client = OpenAI()

    # Handle different input types
    mime_type, _ = mimetypes.guess_type(path)
    if mime_type and mime_type.startswith("image"):
        # Single image processing
        with open(path, "rb") as img_file:
            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
        images = [(0, image_base64)]
    else:
        # PDF processing
        pdf_document = pdfium.PdfDocument(path)
        images = [
            (page_num, convert_pdf_page_to_base64(pdf_document, page_num))
            for page_num in range(len(pdf_document))
        ]

    # Process each page/image
    all_results = []
    for page_num, image_base64 in images:
        messages = [
            {
                "role": "system",
                "content": PARSER_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                ],
            },
        ]

        # Get completion from GPT-4 Vision
        response = client.chat.completions.create(
            model=kwargs["model"],
            temperature=kwargs.get("temperature", 0.7),
            messages=messages,
        )

        # Extract the response text
        page_text = response.choices[0].message.content
        if kwargs.get("verbose", None):
            logger.debug(f"Page {page_num + 1} response: {page_text}")
        result = ""
        if "<output>" in page_text:
            result = page_text.split("<output>")[1].strip()
        if "</output>" in result:
            result = result.split("</output>")[0].strip()
        all_results.append((page_num, result))

    # Sort results by page number and combine
    all_results.sort(key=lambda x: x[0])
    all_texts = [text for _, text in all_results]
    combined_text = "<page-break>".join(all_texts)

    if raw:
        return combined_text

    return [
        {
            "metadata": {
                "title": kwargs["title"],
                "page": kwargs.get("start", 0) + page_no,
            },
            "content": page,
        }
        for page_no, page in enumerate(all_texts, start=1)
        if page.strip()
    ]
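For context, a sketch of how parse_llm_doc dispatches on the model name; the paths and model identifiers below are placeholders, not taken from the package. parse_with_gemini posts to the Gemini generateContent REST endpoint using GOOGLE_API_KEY, while parse_with_gpt uses the OpenAI client, which reads OPENAI_API_KEY from the environment by default.

# Hypothetical dispatch examples; "invoice.pdf" is a placeholder file.
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# Default model "gemini-1.5-flash" -> parse_with_gemini (requires GOOGLE_API_KEY).
pages = parse_llm_doc("invoice.pdf", raw=False, title="invoice.pdf", pages_per_split_=4)

# A model name starting with "gpt" -> parse_with_gpt, which renders each PDF page
# to a PNG with pypdfium2 and sends it as an image_url part (requires OPENAI_API_KEY).
text = parse_llm_doc("invoice.pdf", raw=True, model="gpt-4o-mini", title="invoice.pdf", pages_per_split_=4)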