lexoid 0.1.8.post1__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +84 -47
- lexoid/core/parse_type/llm_parser.py +79 -132
- lexoid/core/parse_type/static_parser.py +83 -67
- lexoid/core/utils.py +54 -46
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/METADATA +4 -4
- lexoid-0.1.9.dist-info/RECORD +9 -0
- lexoid-0.1.8.post1.dist-info/RECORD +0 -9
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/LICENSE +0 -0
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -28,20 +28,24 @@ class ParserType(Enum):
     AUTO = "AUTO"


-def parse_chunk(
-    path: str, parser_type: ParserType, raw: bool, **kwargs
-) -> List[Dict] | str:
+def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.

     Args:
         path (str): The file path or URL.
         parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
-        raw (bool): Whether to return raw text or structured data.
         **kwargs: Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
         parser_type = ParserType[router(path)]
@@ -52,63 +56,80 @@ def parse_chunk(
         )
     if parser_type == ParserType.STATIC_PARSE:
         logger.debug("Using static parser")
-        return parse_static_doc(path,
+        return parse_static_doc(path, **kwargs)
     else:
         logger.debug("Using LLM parser")
-        return parse_llm_doc(path,
+        return parse_llm_doc(path, **kwargs)


 def parse_chunk_list(
-    file_paths: List[str], parser_type: ParserType,
-) ->
+    file_paths: List[str], parser_type: ParserType, kwargs: Dict
+) -> Dict:
     """
     Parses a list of files using the specified parser type.

     Args:
         file_paths (list): List of file paths.
         parser_type (ParserType): The type of parser to use.
-        raw (bool): Whether to return raw text or structured data.
         kwargs (dict): Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
-
+    combined_segments = []
+    raw_texts = []
+    token_usage = {"input": 0, "output": 0}
     for file_path in file_paths:
-        result = parse_chunk(file_path, parser_type,
-
-
-
-
-
+        result = parse_chunk(file_path, parser_type, **kwargs)
+        combined_segments.extend(result["segments"])
+        raw_texts.append(result["raw"])
+        token_usage["input"] += result["token_usage"]["input"]
+        token_usage["output"] += result["token_usage"]["output"]
+    token_usage["total"] = token_usage["input"] + token_usage["output"]
+
+    return {
+        "raw": "\n\n".join(raw_texts),
+        "segments": combined_segments,
+        "title": kwargs.get("title", ""),
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": token_usage,
+    }


 def parse(
     path: str,
     parser_type: Union[str, ParserType] = "LLM_PARSE",
-    raw: bool = False,
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
-) ->
+) -> Dict:
     """
     Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

     Args:
         path (str): The file path or URL.
-        parser_type (Union[str, ParserType], optional):
-
-
-        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+        parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO").
+        pages_per_split (int, optional): Number of pages per split for chunking.
+        max_processes (int, optional): Maximum number of processes for parallel processing.
         **kwargs: Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     kwargs["title"] = os.path.basename(path)
     kwargs["pages_per_split_"] = pages_per_split
     as_pdf = kwargs.get("as_pdf", False)
     depth = kwargs.get("depth", 1)
+
     if type(parser_type) == str:
         parser_type = ParserType[parser_type]

@@ -120,15 +141,19 @@ def parse(
         as_pdf = True

     if path.startswith(("http://", "https://")):
-
+        kwargs["url"] = path
+        download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
         os.makedirs(download_dir, exist_ok=True)
         if is_supported_url_file_type(path):
             path = download_file(path, download_dir)
         elif as_pdf:
-
+            pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
+            if not pdf_filename.endswith(".pdf"):
+                pdf_filename += ".pdf"
+            pdf_path = os.path.join(download_dir, pdf_filename)
             path = convert_to_pdf(path, pdf_path)
         else:
-            return recursive_read_html(path, depth
+            return recursive_read_html(path, depth)

     assert is_supported_file_type(
         path
@@ -140,9 +165,7 @@ def parse(

     if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
         kwargs["split"] = False
-
-        if raw:
-            all_docs = [all_docs]
+        result = parse_chunk(path, parser_type, **kwargs)
     else:
         kwargs["split"] = True
         split_dir = os.path.join(temp_dir, "splits/")
@@ -156,22 +179,39 @@ def parse(
             for i in range(0, len(split_files), chunk_size)
         ]

-        process_args = [(chunk, parser_type,
+        process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks]

         if max_processes == 1 or len(file_chunks) == 1:
-
+            chunk_results = [parse_chunk_list(*args) for args in process_args]
         else:
             with ProcessPoolExecutor(max_workers=max_processes) as executor:
-
-
-
+                chunk_results = list(
+                    executor.map(parse_chunk_list, *zip(*process_args))
+                )
+
+        # Combine results from all chunks
+        result = {
+            "raw": "\n\n".join(r["raw"] for r in chunk_results),
+            "segments": [seg for r in chunk_results for seg in r["segments"]],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+            "token_usage": {
+                "input": sum(r["token_usage"]["input"] for r in chunk_results),
+                "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "total": sum(r["token_usage"]["total"] for r in chunk_results),
+            },
+        }
+    if as_pdf:
+        result["pdf_path"] = path

     if depth > 1:
-
-        for
+        recursive_docs = []
+        for segment in result["segments"]:
             urls = re.findall(
                 r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
-
+                segment["content"],
             )
             for url in urls:
                 if "](" in url:
@@ -182,19 +222,16 @@ def parse(

             kwargs_cp = kwargs.copy()
             kwargs_cp["depth"] = depth - 1
-
+            kwargs_cp["parent_title"] = result["title"]
+            sub_doc = parse(
                 url,
                 parser_type=parser_type,
-                raw=raw,
                 pages_per_split=pages_per_split,
                 max_processes=max_processes,
                 **kwargs_cp,
             )
+            recursive_docs.append(sub_doc)

-
-                new_docs.append(res)
-            else:
-                new_docs.extend(res)
-        all_docs = new_docs
+        result["recursive_docs"] = recursive_docs

-    return
+    return result
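The net effect of these api.py changes is that `parse()` and `parse_chunk()` no longer switch on a `raw` flag; they always return a single dictionary. A minimal consumption sketch (the input path and the assumption that API keys are configured are ours, not part of the diff):

```python
from lexoid.api import parse

# Hypothetical input; any supported file or URL works the same way.
result = parse("report.pdf", parser_type="AUTO", pages_per_split=4)

print(result["title"])            # file name, set by parse()
print(result["raw"][:200])        # combined markdown across all pages
for segment in result["segments"]:
    print(segment["metadata"]["page"], len(segment["content"]))
print(result.get("token_usage"))  # populated by the LLM parsing path
```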
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -18,6 +18,7 @@ from lexoid.core.prompt_templates import (
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from together import Together
 from huggingface_hub import InferenceClient


@@ -33,38 +34,36 @@ def retry_on_http_error(func):
             return func(*args, **kwargs)
         except HTTPError as e:
             logger.error(f"Retry failed: {e}")
-
-
-
-
-
-
-
-
-
-                }
-            ]
+            return {
+                "raw": "",
+                "segments": [],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
+            }

     return wrapper


 @retry_on_http_error
-def parse_llm_doc(path: str,
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
-        return parse_with_gemini(path,
+        return parse_with_gemini(path, **kwargs)
     if model.startswith("gpt"):
-        return parse_with_api(path,
+        return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return
-        return parse_with_api(path,
+            return parse_with_api(path, api="together", **kwargs)
+        return parse_with_api(path, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


-def parse_with_gemini(path: str,
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     api_key = os.environ.get("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -119,25 +118,30 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         if "text" in part
     )

-
+    combined_text = ""
     if "<output>" in raw_text:
-
+        combined_text = raw_text.split("<output>")[1].strip()
         if "</output>" in result:
-
-
-
-
-
-
-
-                "metadata": {
-
-
-
-
-
-
-
+            combined_text = result.split("</output>")[0].strip()
+
+    token_usage = result["usageMetadata"]
+
+    return {
+        "raw": combined_text,
+        "segments": [
+            {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
+            for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": token_usage["promptTokenCount"],
+            "output": token_usage["candidatesTokenCount"],
+            "total": token_usage["totalTokenCount"],
+        },
+    }


 def convert_pdf_page_to_base64(
@@ -155,97 +159,17 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def
-    api_key = os.environ.get("TOGETHER_API_KEY")
-    if not api_key:
-        raise ValueError("TOGETHER_API_KEY environment variable is not set")
-
-    url = "https://api.together.xyz/v1/chat/completions"
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-        images = [(0, f"data:{mime_type};base64,{image_base64}")]
-    else:
-        pdf_document = pdfium.PdfDocument(path)
-        images = [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-
-    all_results = []
-    for page_num, image_url in images:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
-                    {"type": "image_url", "image_url": {"url": image_url}},
-                ],
-            }
-        ]
-
-        payload = {
-            "model": kwargs["model"],
-            "messages": messages,
-            "max_tokens": kwargs.get("max_tokens", 1024),
-            "temperature": kwargs.get("temperature", 0.7),
-        }
-
-        response = requests.post(url, json=payload, headers=headers)
-        response.raise_for_status()
-        response_data = response.json()
-
-        page_text = response_data["choices"][0]["message"]["content"]
-        if kwargs.get("verbose", None):
-            logger.debug(f"Page {page_num + 1} response: {page_text}")
-
-        result = page_text
-        if "<output>" in page_text:
-            result = page_text.split("<output>")[1].strip()
-            if "</output>" in result:
-                result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
-
-    all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
-    combined_text = "<page-break>".join(all_texts)
-
-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
-
-
-def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.

     Args:
         path (str): Path to the document to parse
-
-        api (str): Which API to use ("openai" or "huggingface")
+        api (str): Which API to use ("openai", "huggingface", or "together")
         **kwargs: Additional arguments including model, temperature, title, etc.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     # Initialize appropriate client
     clients = {
@@ -253,6 +177,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
+        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -329,6 +254,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str

     # Get completion from selected API
     response = client.chat.completions.create(**completion_params)
+    token_usage = response.usage

     # Extract the response text
     page_text = response.choices[0].message.content
@@ -341,23 +267,44 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
                 result = result.split("</output>")[0].strip()
-        all_results.append(
+        all_results.append(
+            (
+                page_num,
+                result,
+                token_usage.prompt_tokens,
+                token_usage.completion_tokens,
+                token_usage.total_tokens,
+            )
+        )

     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
+    all_texts = [text for _, text, _, _, _ in all_results]
     combined_text = "<page-break>".join(all_texts)

-
-
-
-
-
-
-
-
-
-
-
-
-
+    return {
+        "raw": combined_text,
+        "segments": [
+            {
+                "metadata": {
+                    "page": kwargs.get("start", 0) + page_no + 1,
+                    "token_usage": {
+                        "input": input_tokens,
+                        "output": output_tokens,
+                        "total": total_tokens,
+                    },
+                },
+                "content": page,
+            }
+            for page_no, page, input_tokens, output_tokens, total_tokens in all_results
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results),
+            "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results),
+            "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
+        },
+    }
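In `parse_with_api`, each page now contributes a `(page_num, text, input_tokens, output_tokens, total_tokens)` tuple, and the document-level `token_usage` is simply the column-wise sum. A standalone sketch of that aggregation with hypothetical token counts:

```python
# Each entry mirrors the tuples collected per page in parse_with_api:
# (page_num, text, input_tokens, output_tokens, total_tokens).
all_results = [
    (0, "# Page 1 ...", 1200, 350, 1550),  # hypothetical counts
    (1, "# Page 2 ...", 1100, 420, 1520),
]

token_usage = {
    "input": sum(r[2] for r in all_results),
    "output": sum(r[3] for r in all_results),
    "total": sum(r[4] for r in all_results),
}
print(token_usage)  # {'input': 2300, 'output': 770, 'total': 3070}
```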
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -9,73 +9,89 @@ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
 from docx import Document


-def parse_static_doc(path: str,
+def parse_static_doc(path: str, **kwargs) -> Dict:
+    """
+    Parses a document using static parsing methods.
+
+    Args:
+        path (str): The file path.
+        **kwargs: Additional arguments for parsing.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     framework = kwargs.get("framework", "pdfplumber")

     file_type = get_file_type(path)
     if file_type == "application/pdf":
         if framework == "pdfplumber":
-            return parse_with_pdfplumber(path,
+            return parse_with_pdfplumber(path, **kwargs)
         elif framework == "pdfminer":
-            return parse_with_pdfminer(path,
+            return parse_with_pdfminer(path, **kwargs)
         else:
             raise ValueError(f"Unsupported framework: {framework}")
     elif "wordprocessing" in file_type:
-        return parse_with_docx(path,
+        return parse_with_docx(path, **kwargs)
     elif file_type == "text/html":
         with open(path, "r") as f:
             html_content = f.read()
-        return html_to_markdown(html_content,
+        return html_to_markdown(html_content, kwargs["title"])
     elif file_type == "text/plain":
         with open(path, "r") as f:
             content = f.read()
-
-
-
-
-
-
-
-
-        ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     elif file_type == "text/csv":
         df = pd.read_csv(path)
         content = df.to_markdown(index=False)
-
-
-
-
-
-
-
-
-        ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")


-def parse_with_pdfminer(path: str,
+def parse_with_pdfminer(path: str, **kwargs) -> Dict:
+    """
+    Parse PDF using pdfminer.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     pages = list(extract_pages(path))
-
+    segments = []
+    raw_texts = []
+
     for page_num, page_layout in enumerate(pages, start=1):
         page_text = "".join(
             element.get_text()
             for element in page_layout
             if isinstance(element, LTTextContainer)
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
+        raw_texts.append(page_text)
+        segments.append(
+            {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
+        )
+
+    return {
+        "raw": "\n".join(raw_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }


 def process_table(table) -> str:
@@ -359,44 +375,44 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
     return page_texts


-def parse_with_pdfplumber(path: str,
+def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     """
-    Parse PDF
-
-    Args:
-        path (str): Path to the PDF file
-        raw (bool): If True, return raw text with page breaks; if False, return structured data
-        **kwargs: Additional arguments including 'title' and 'start' page number
+    Parse PDF using pdfplumber.

     Returns:
-
-            or a string of raw text with page breaks
+        Dict: Dictionary containing parsed document data
     """
     page_texts = process_pdf_with_pdfplumber(path)
-
-
-    return [
-        {
-            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
-            "content": page_text,
-        }
+    segments = [
+        {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
         for page_num, page_text in enumerate(page_texts, start=1)
     ]

+    return {
+        "raw": "<page-break>".join(page_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
+

-def parse_with_docx(path: str,
+def parse_with_docx(path: str, **kwargs) -> Dict:
+    """
+    Parse DOCX document.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     doc = Document(path)
     full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

-
-
-
-
-
-
-
-
-            },
-            "content": full_text,
-        }
-    ]
+    return {
+        "raw": full_text,
+        "segments": [{"metadata": {"page": kwargs["start"] + 1}, "content": full_text}],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
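Every static parser now converges on the same dictionary shape, differing only in how `raw` and `segments` are produced. A hedged sketch of that shape as a standalone helper (the helper name is ours and does not exist in lexoid):

```python
from typing import Dict, List


def build_doc_dict(raw: str, segments: List[Dict], **kwargs) -> Dict:
    # Hypothetical helper mirroring the shape returned by the static parsers.
    return {
        "raw": raw,
        "segments": segments,
        "title": kwargs.get("title", ""),
        "url": kwargs.get("url", ""),
        "parent_title": kwargs.get("parent_title", ""),
        "recursive_docs": [],
    }


# A plain-text file parsed as a single page, for example:
doc = build_doc_dict(
    "Hello world",
    [{"metadata": {"page": 1}, "content": "Hello world"}],
    title="hello.txt",
)
```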
lexoid/core/utils.py
CHANGED
@@ -5,7 +5,8 @@ import os
 import re
 import sys
 from difflib import SequenceMatcher
-from
+from hashlib import md5
+from typing import Dict, List
 from urllib.parse import urlparse

 import nest_asyncio
@@ -184,14 +185,11 @@ def find_dominant_heading_level(markdown_content: str) -> str:
     return min(heading_counts.keys(), key=len)


-def split_md_by_headings(
-    markdown_content: str, heading_pattern: str, title: str
-) -> List[Dict]:
+def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]:
     """
     Splits markdown content by the specified heading pattern and structures it.

     Args:
-        url (str): The URL of the HTML page
         markdown_content (str): The markdown content to split
         heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')

@@ -211,7 +209,7 @@ def split_md_by_headings(
     if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -221,7 +219,7 @@ def split_md_by_headings(
         if i + 1 < len(sections):
             structured_content.append(
                 {
-                    "metadata": {"
+                    "metadata": {"page": sections[i]},
                     "content": sections[i + 1],
                 }
             )
@@ -238,7 +236,7 @@ def split_md_by_headings(
     if len(sections) > len(headings):
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -248,7 +246,7 @@ def split_md_by_headings(
         clean_heading = heading.replace(heading_pattern, "").strip()
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": clean_heading},
                 "content": content,
             }
         )
@@ -256,39 +254,47 @@ def split_md_by_headings(
     return structured_content


-def html_to_markdown(html: str,
+def html_to_markdown(html: str, title: str, url: str) -> str:
     """
     Converts HTML content to markdown.

     Args:
         html (str): The HTML content to convert.
-
+        title (str): The title of the HTML page
+        url (str): The URL of the HTML page

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     markdown_content = md(html)

-    if raw:
-        return markdown_content
-
     # Find the dominant heading level
     heading_pattern = find_dominant_heading_level(markdown_content)

     # Split content by headings and structure it
-
+    split_md = split_md_by_headings(markdown_content, heading_pattern)
+
+    content = {
+        "raw": markdown_content,
+        "segments": split_md,
+        "title": title,
+        "url": url,
+        "parent_title": "",
+        "recursive_docs": [],
+    }
+
+    return content


-def read_html_content(url: str
+def read_html_content(url: str) -> Dict:
     """
     Reads the content of an HTML page from the given URL and converts it to markdown or structured content.

     Args:
         url (str): The URL of the HTML page.
-        raw (bool): Whether to return raw markdown text or structured data.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """

     try:
@@ -351,7 +357,10 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
-
+    title = soup.title.string.strip() if soup.title else "No title"
+    url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
+    full_title = f"{title} - {url_hash}"
+    return html_to_markdown(str(soup), title=full_title, url=url)


 def extract_urls_from_markdown(content: str) -> List[str]:
@@ -378,61 +387,60 @@ def extract_urls_from_markdown(content: str) -> List[str]:
     return list(set(urls))  # Remove duplicates


-def recursive_read_html(
-    url: str, depth: int, raw: bool, visited_urls: set = None
-) -> Union[str, List[Dict]]:
+def recursive_read_html(url: str, depth: int, visited_urls: set = None) -> Dict:
     """
     Recursively reads HTML content from URLs up to specified depth.

     Args:
         url (str): The URL to parse
         depth (int): How many levels deep to recursively parse
-        raw (bool): Whether to return raw text or structured data
         visited_urls (set): Set of already visited URLs to prevent cycles

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     if visited_urls is None:
         visited_urls = set()

     if url in visited_urls:
-        return
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     visited_urls.add(url)

     try:
-        content = read_html_content(url
+        content = read_html_content(url)
     except Exception as e:
         print(f"Error processing URL {url}: {str(e)}")
-        return
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     if depth <= 1:
         return content

-    # Extract URLs from
-
-        urls = extract_urls_from_markdown(content)
-    else:
-        # Extract URLs from all content sections
-        urls = []
-        for doc in content:
-            urls.extend(extract_urls_from_markdown(doc["content"]))
+    # Extract URLs from all content sections
+    urls = extract_urls_from_markdown(content["raw"])

     # Recursively process each URL
+    recursive_docs = []
     for sub_url in urls:
         if sub_url not in visited_urls:
-            sub_content = recursive_read_html(sub_url, depth - 1,
-
-            if raw:
-                if sub_content:
-                    content += f"\n\n--- Begin content from {sub_url} ---\n\n"
-                    content += sub_content
-                    content += f"\n\n--- End content from {sub_url} ---\n\n"
-            else:
-                if isinstance(sub_content, list):
-                    content.extend(sub_content)
+            sub_content = recursive_read_html(sub_url, depth - 1, visited_urls)
+            recursive_docs.append(sub_content)

+    content["recursive_docs"] = recursive_docs
     return content

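With the `raw` flag gone, `recursive_read_html` no longer concatenates sub-page text into one string; each sub-page becomes a full document dictionary nested under `recursive_docs`. A small sketch of walking that tree (the walker is ours, and the call is commented out because it needs network access):

```python
from typing import Dict


def walk_docs(doc: Dict, level: int = 0) -> None:
    # Hypothetical walker over the nested structure returned by recursive_read_html.
    print("  " * level + (doc.get("title") or doc.get("url", "")))
    for sub in doc.get("recursive_docs", []):
        walk_docs(sub, level + 1)


# from lexoid.core.utils import recursive_read_html
# tree = recursive_read_html("https://example.com", depth=2)
# walk_docs(tree)
```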
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.8.post1
+Version: 0.1.9
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -93,10 +94,10 @@ Here's a quick example to parse documents using Lexoid:
 from lexoid.api import parse
 from lexoid.api import ParserType

-parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE"
+parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
 # or
 pdf_path = "path/to/immigration-law-advisor.pdf"
-parsed_md = parse(pdf_path, parser_type="LLM_PARSE"
+parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

 print(parsed_md)
 ```
@@ -104,7 +105,6 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
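For callers upgrading from 0.1.8.post1, the removed `raw` parameter maps onto the keys of the returned dictionary. A hedged migration sketch using the README's example path:

```python
from lexoid.api import parse

result = parse("path/to/immigration-law-advisor.pdf", parser_type="LLM_PARSE")
markdown = result["raw"]         # previously parse(..., raw=True)
structured = result["segments"]  # previously the default raw=False output
```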
lexoid-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
+lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
+lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
+lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.9.dist-info/RECORD,,
lexoid-0.1.8.post1.dist-info/RECORD
REMOVED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
-lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
-lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.8.post1.dist-info/RECORD,,
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/LICENSE
File without changes
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.9.dist-info}/WHEEL
File without changes