lexoid 0.1.8.post1__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +92 -47
- lexoid/core/parse_type/llm_parser.py +82 -132
- lexoid/core/parse_type/static_parser.py +83 -67
- lexoid/core/utils.py +68 -46
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/METADATA +4 -4
- lexoid-0.1.10.dist-info/RECORD +9 -0
- lexoid-0.1.8.post1.dist-info/RECORD +0 -9
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/LICENSE +0 -0
- {lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
 )


@@ -28,20 +29,24 @@ class ParserType(Enum):
     AUTO = "AUTO"


-def parse_chunk(
-    path: str, parser_type: ParserType, raw: bool, **kwargs
-) -> List[Dict] | str:
+def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.

     Args:
         path (str): The file path or URL.
         parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
-        raw (bool): Whether to return raw text or structured data.
         **kwargs: Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
         parser_type = ParserType[router(path)]
@@ -52,63 +57,81 @@ def parse_chunk(
         )
     if parser_type == ParserType.STATIC_PARSE:
         logger.debug("Using static parser")
-        return parse_static_doc(path,
+        return parse_static_doc(path, **kwargs)
     else:
         logger.debug("Using LLM parser")
-        return parse_llm_doc(path,
+        return parse_llm_doc(path, **kwargs)


 def parse_chunk_list(
-    file_paths: List[str], parser_type: ParserType,
-) ->
+    file_paths: List[str], parser_type: ParserType, kwargs: Dict
+) -> Dict:
     """
     Parses a list of files using the specified parser type.

     Args:
         file_paths (list): List of file paths.
         parser_type (ParserType): The type of parser to use.
-        raw (bool): Whether to return raw text or structured data.
         kwargs (dict): Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
-
+    combined_segments = []
+    raw_texts = []
+    token_usage = {"input": 0, "output": 0}
     for file_path in file_paths:
-        result = parse_chunk(file_path, parser_type,
-
-
-
-
-
+        result = parse_chunk(file_path, parser_type, **kwargs)
+        combined_segments.extend(result["segments"])
+        raw_texts.append(result["raw"])
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
+    token_usage["total"] = token_usage["input"] + token_usage["output"]
+
+    return {
+        "raw": "\n\n".join(raw_texts),
+        "segments": combined_segments,
+        "title": kwargs.get("title", ""),
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": token_usage,
+    }


 def parse(
     path: str,
     parser_type: Union[str, ParserType] = "LLM_PARSE",
-    raw: bool = False,
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
-) ->
+) -> Dict:
     """
     Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

     Args:
         path (str): The file path or URL.
-        parser_type (Union[str, ParserType], optional):
-
-
-        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+        parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO").
+        pages_per_split (int, optional): Number of pages per split for chunking.
+        max_processes (int, optional): Maximum number of processes for parallel processing.
         **kwargs: Additional arguments for the parser.

     Returns:
-
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     kwargs["title"] = os.path.basename(path)
     kwargs["pages_per_split_"] = pages_per_split
     as_pdf = kwargs.get("as_pdf", False)
     depth = kwargs.get("depth", 1)
+
     if type(parser_type) == str:
         parser_type = ParserType[parser_type]

@@ -120,15 +143,19 @@ def parse(
         as_pdf = True

     if path.startswith(("http://", "https://")):
-
+        kwargs["url"] = path
+        download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
         os.makedirs(download_dir, exist_ok=True)
         if is_supported_url_file_type(path):
             path = download_file(path, download_dir)
         elif as_pdf:
-
+            pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
+            if not pdf_filename.endswith(".pdf"):
+                pdf_filename += ".pdf"
+            pdf_path = os.path.join(download_dir, pdf_filename)
             path = convert_to_pdf(path, pdf_path)
         else:
-            return recursive_read_html(path, depth
+            return recursive_read_html(path, depth)

     assert is_supported_file_type(
         path
@@ -138,11 +165,15 @@ def parse(
         pdf_path = os.path.join(temp_dir, "converted.pdf")
         path = convert_to_pdf(path, pdf_path)

+    if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+        sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+        os.makedirs(sub_pdf_dir, exist_ok=True)
+        sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+        path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
     if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
         kwargs["split"] = False
-
-        if raw:
-            all_docs = [all_docs]
+        result = parse_chunk(path, parser_type, **kwargs)
     else:
         kwargs["split"] = True
         split_dir = os.path.join(temp_dir, "splits/")
@@ -156,22 +187,39 @@ def parse(
             for i in range(0, len(split_files), chunk_size)
         ]

-        process_args = [(chunk, parser_type,
+        process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks]

         if max_processes == 1 or len(file_chunks) == 1:
-
+            chunk_results = [parse_chunk_list(*args) for args in process_args]
         else:
             with ProcessPoolExecutor(max_workers=max_processes) as executor:
-
-
-
+                chunk_results = list(
+                    executor.map(parse_chunk_list, *zip(*process_args))
+                )
+
+        # Combine results from all chunks
+        result = {
+            "raw": "\n\n".join(r["raw"] for r in chunk_results),
+            "segments": [seg for r in chunk_results for seg in r["segments"]],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+            "token_usage": {
+                "input": sum(r["token_usage"]["input"] for r in chunk_results),
+                "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "total": sum(r["token_usage"]["total"] for r in chunk_results),
+            },
+        }
+    if as_pdf:
+        result["pdf_path"] = path

     if depth > 1:
-
-        for
+        recursive_docs = []
+        for segment in result["segments"]:
             urls = re.findall(
                 r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
-
+                segment["content"],
             )
             for url in urls:
                 if "](" in url:
@@ -182,19 +230,16 @@ def parse(

                 kwargs_cp = kwargs.copy()
                 kwargs_cp["depth"] = depth - 1
-
+                kwargs_cp["parent_title"] = result["title"]
+                sub_doc = parse(
                     url,
                     parser_type=parser_type,
-                    raw=raw,
                     pages_per_split=pages_per_split,
                     max_processes=max_processes,
                     **kwargs_cp,
                 )
+                recursive_docs.append(sub_doc)

-
-                new_docs.append(res)
-            else:
-                new_docs.extend(res)
-        all_docs = new_docs
+        result["recursive_docs"] = recursive_docs

-    return
+    return result
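For orientation, here is a minimal sketch of how a caller might consume the dictionary that `parse()` now returns in 0.1.10 (replacing the old `raw` flag). The file name is a placeholder, and with the default `gemini-1.5-flash` model a `GOOGLE_API_KEY` must be set in the environment.

```python
from lexoid.api import parse

result = parse("sample.pdf", parser_type="LLM_PARSE", pages_per_split=4)

print(result["title"])          # file name used as the document title
print(result["raw"][:200])      # combined markdown for the whole document
for segment in result["segments"]:
    # each segment carries its page number (or heading, for HTML input) and content
    print(segment["metadata"]["page"], len(segment["content"]))
print(result["token_usage"])    # {"input": ..., "output": ..., "total": ...}
```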
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -18,6 +18,7 @@ from lexoid.core.prompt_templates import (
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from together import Together
 from huggingface_hub import InferenceClient


@@ -33,38 +34,36 @@ def retry_on_http_error(func):
             return func(*args, **kwargs)
         except HTTPError as e:
             logger.error(f"Retry failed: {e}")
-
-
-
-
-
-
-
-
-
-                }
-            ]
+            return {
+                "raw": "",
+                "segments": [],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
+            }

     return wrapper


 @retry_on_http_error
-def parse_llm_doc(path: str,
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
-        return parse_with_gemini(path,
+        return parse_with_gemini(path, **kwargs)
     if model.startswith("gpt"):
-        return parse_with_api(path,
+        return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return
-        return parse_with_api(path,
+            return parse_with_api(path, api="together", **kwargs)
+        return parse_with_api(path, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


-def parse_with_gemini(path: str,
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     api_key = os.environ.get("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -119,25 +118,33 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         if "text" in part
     )

-
+    combined_text = ""
     if "<output>" in raw_text:
-
+        combined_text = raw_text.split("<output>")[1].strip()
         if "</output>" in result:
-
-
-
-
-
-
-
-
-
-
-
-            },
-            "
-
-
+            combined_text = result.split("</output>")[0].strip()
+
+    token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
+
+    return {
+        "raw": combined_text,
+        "segments": [
+            {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
+            for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
+        },
+    }


 def convert_pdf_page_to_base64(
@@ -155,97 +162,17 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def
-    api_key = os.environ.get("TOGETHER_API_KEY")
-    if not api_key:
-        raise ValueError("TOGETHER_API_KEY environment variable is not set")
-
-    url = "https://api.together.xyz/v1/chat/completions"
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-        images = [(0, f"data:{mime_type};base64,{image_base64}")]
-    else:
-        pdf_document = pdfium.PdfDocument(path)
-        images = [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-
-    all_results = []
-    for page_num, image_url in images:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
-                    {"type": "image_url", "image_url": {"url": image_url}},
-                ],
-            }
-        ]
-
-        payload = {
-            "model": kwargs["model"],
-            "messages": messages,
-            "max_tokens": kwargs.get("max_tokens", 1024),
-            "temperature": kwargs.get("temperature", 0.7),
-        }
-
-        response = requests.post(url, json=payload, headers=headers)
-        response.raise_for_status()
-        response_data = response.json()
-
-        page_text = response_data["choices"][0]["message"]["content"]
-        if kwargs.get("verbose", None):
-            logger.debug(f"Page {page_num + 1} response: {page_text}")
-
-        result = page_text
-        if "<output>" in page_text:
-            result = page_text.split("<output>")[1].strip()
-            if "</output>" in result:
-                result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
-
-    all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
-    combined_text = "<page-break>".join(all_texts)
-
-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
-
-
-def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.

     Args:
         path (str): Path to the document to parse
-
-        api (str): Which API to use ("openai" or "huggingface")
+        api (str): Which API to use ("openai", "huggingface", or "together")
         **kwargs: Additional arguments including model, temperature, title, etc.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     # Initialize appropriate client
     clients = {
@@ -253,6 +180,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
+        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -329,6 +257,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str

         # Get completion from selected API
         response = client.chat.completions.create(**completion_params)
+        token_usage = response.usage

         # Extract the response text
         page_text = response.choices[0].message.content
@@ -341,23 +270,44 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
                 result = result.split("</output>")[0].strip()
-        all_results.append(
+        all_results.append(
+            (
+                page_num,
+                result,
+                token_usage.prompt_tokens,
+                token_usage.completion_tokens,
+                token_usage.total_tokens,
+            )
+        )

     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
+    all_texts = [text for _, text, _, _, _ in all_results]
     combined_text = "<page-break>".join(all_texts)

-
-
-
-
-
-
-
-
-
-
-
-
-
+    return {
+        "raw": combined_text,
+        "segments": [
+            {
+                "metadata": {
+                    "page": kwargs.get("start", 0) + page_no + 1,
+                    "token_usage": {
+                        "input": input_tokens,
+                        "output": output_tokens,
+                        "total": total_tokens,
+                    },
+                },
+                "content": page,
+            }
+            for page_no, page, input_tokens, output_tokens, total_tokens in all_results
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results),
+            "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results),
+            "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
+        },
+    }
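An illustrative sketch of the new model routing in `parse_llm_doc` and the token accounting added in this release. The model name and document path are placeholders; a `meta-llama/...Turbo` model is routed to the Together client, which is assumed to read `TOGETHER_API_KEY` from the environment.

```python
import os
from lexoid.api import parse

# Assumption: the Together client picks up TOGETHER_API_KEY from the environment.
os.environ["TOGETHER_API_KEY"] = "<your key>"

result = parse(
    "sample.pdf",
    parser_type="LLM_PARSE",
    # Placeholder model name: "meta-llama/...Turbo" routes to the Together API,
    # other meta-llama models to Hugging Face, "gpt*" to OpenAI, "gemini*" to Gemini.
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
)

print(result["token_usage"])  # document-level totals
for seg in result["segments"]:
    # parse_with_api also records per-page usage in each segment's metadata
    print(seg["metadata"]["page"], seg["metadata"]["token_usage"]["total"])
```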
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -9,73 +9,89 @@ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
 from docx import Document


-def parse_static_doc(path: str,
+def parse_static_doc(path: str, **kwargs) -> Dict:
+    """
+    Parses a document using static parsing methods.
+
+    Args:
+        path (str): The file path.
+        **kwargs: Additional arguments for parsing.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     framework = kwargs.get("framework", "pdfplumber")

     file_type = get_file_type(path)
     if file_type == "application/pdf":
         if framework == "pdfplumber":
-            return parse_with_pdfplumber(path,
+            return parse_with_pdfplumber(path, **kwargs)
         elif framework == "pdfminer":
-            return parse_with_pdfminer(path,
+            return parse_with_pdfminer(path, **kwargs)
         else:
             raise ValueError(f"Unsupported framework: {framework}")
     elif "wordprocessing" in file_type:
-        return parse_with_docx(path,
+        return parse_with_docx(path, **kwargs)
     elif file_type == "text/html":
         with open(path, "r") as f:
             html_content = f.read()
-        return html_to_markdown(html_content,
+        return html_to_markdown(html_content, kwargs["title"])
     elif file_type == "text/plain":
         with open(path, "r") as f:
             content = f.read()
-
-
-
-
-
-
-
-
-        ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     elif file_type == "text/csv":
         df = pd.read_csv(path)
         content = df.to_markdown(index=False)
-
-
-
-
-
-
-
-
-        ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")


-def parse_with_pdfminer(path: str,
+def parse_with_pdfminer(path: str, **kwargs) -> Dict:
+    """
+    Parse PDF using pdfminer.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     pages = list(extract_pages(path))
-
+    segments = []
+    raw_texts = []
+
     for page_num, page_layout in enumerate(pages, start=1):
         page_text = "".join(
             element.get_text()
             for element in page_layout
             if isinstance(element, LTTextContainer)
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
+        raw_texts.append(page_text)
+        segments.append(
+            {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
+        )
+
+    return {
+        "raw": "\n".join(raw_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }


 def process_table(table) -> str:
@@ -359,44 +375,44 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
     return page_texts


-def parse_with_pdfplumber(path: str,
+def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     """
-    Parse PDF
-
-    Args:
-        path (str): Path to the PDF file
-        raw (bool): If True, return raw text with page breaks; if False, return structured data
-        **kwargs: Additional arguments including 'title' and 'start' page number
+    Parse PDF using pdfplumber.

     Returns:
-
-        or a string of raw text with page breaks
+        Dict: Dictionary containing parsed document data
     """
     page_texts = process_pdf_with_pdfplumber(path)
-
-
-    return [
-        {
-            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
-            "content": page_text,
-        }
+    segments = [
+        {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
         for page_num, page_text in enumerate(page_texts, start=1)
     ]

+    return {
+        "raw": "<page-break>".join(page_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
+

-def parse_with_docx(path: str,
+def parse_with_docx(path: str, **kwargs) -> Dict:
+    """
+    Parse DOCX document.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     doc = Document(path)
     full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

-
-
-
-
-
-
-
-
-            },
-            "content": full_text,
-        }
-    ]
+    return {
+        "raw": full_text,
+        "segments": [{"metadata": {"page": kwargs["start"] + 1}, "content": full_text}],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
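A hedged sketch of the static path, which now returns the same dictionary shape as the LLM path, so downstream code can treat both uniformly. The file name and the explicit `framework` value are illustrative.

```python
from lexoid.api import parse

doc = parse("report.pdf", parser_type="STATIC_PARSE", framework="pdfplumber")

# pdfplumber joins pages with "<page-break>" in the raw markdown
print(doc["raw"].count("<page-break>") + 1, "pages")
first = doc["segments"][0]
print(first["metadata"]["page"], first["content"][:100])
```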
lexoid/core/utils.py
CHANGED
@@ -5,7 +5,8 @@ import os
 import re
 import sys
 from difflib import SequenceMatcher
-from
+from hashlib import md5
+from typing import Dict, List, Optional
 from urllib.parse import urlparse

 import nest_asyncio
@@ -44,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths


+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
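A hypothetical usage sketch of the new `create_sub_pdf` helper (file names are placeholders). Page numbers are 1-based, and the same helper backs the new `page_nums` argument of `parse()`.

```python
from lexoid.core.utils import create_sub_pdf

# Copy pages 2-4 (1-based) of a source PDF into a new file.
sub_path = create_sub_pdf("full_report.pdf", "pages_2_to_4.pdf", page_nums=(2, 3, 4))
print(sub_path)  # "pages_2_to_4.pdf"

# The same helper is reached through the public API via the page_nums kwarg, e.g.:
# parse("full_report.pdf", parser_type="LLM_PARSE", page_nums=(2, 3, 4))
```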
@@ -184,14 +199,11 @@ def find_dominant_heading_level(markdown_content: str) -> str:
     return min(heading_counts.keys(), key=len)


-def split_md_by_headings(
-    markdown_content: str, heading_pattern: str, title: str
-) -> List[Dict]:
+def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]:
     """
     Splits markdown content by the specified heading pattern and structures it.

     Args:
-        url (str): The URL of the HTML page
         markdown_content (str): The markdown content to split
         heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')

@@ -211,7 +223,7 @@ def split_md_by_headings(
     if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -221,7 +233,7 @@ def split_md_by_headings(
         if i + 1 < len(sections):
             structured_content.append(
                 {
-                    "metadata": {"
+                    "metadata": {"page": sections[i]},
                     "content": sections[i + 1],
                 }
             )
@@ -238,7 +250,7 @@ def split_md_by_headings(
     if len(sections) > len(headings):
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -248,7 +260,7 @@ def split_md_by_headings(
         clean_heading = heading.replace(heading_pattern, "").strip()
         structured_content.append(
             {
-                "metadata": {"
+                "metadata": {"page": clean_heading},
                 "content": content,
             }
         )
@@ -256,39 +268,47 @@ def split_md_by_headings(
     return structured_content


-def html_to_markdown(html: str,
+def html_to_markdown(html: str, title: str, url: str) -> str:
     """
     Converts HTML content to markdown.

     Args:
         html (str): The HTML content to convert.
-
+        title (str): The title of the HTML page
+        url (str): The URL of the HTML page

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     markdown_content = md(html)

-    if raw:
-        return markdown_content
-
     # Find the dominant heading level
     heading_pattern = find_dominant_heading_level(markdown_content)

     # Split content by headings and structure it
-
+    split_md = split_md_by_headings(markdown_content, heading_pattern)
+
+    content = {
+        "raw": markdown_content,
+        "segments": split_md,
+        "title": title,
+        "url": url,
+        "parent_title": "",
+        "recursive_docs": [],
+    }
+
+    return content


-def read_html_content(url: str
+def read_html_content(url: str) -> Dict:
     """
     Reads the content of an HTML page from the given URL and converts it to markdown or structured content.

     Args:
         url (str): The URL of the HTML page.
-        raw (bool): Whether to return raw markdown text or structured data.

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """

     try:
@@ -351,7 +371,10 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
-
+        title = soup.title.string.strip() if soup.title else "No title"
+        url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
+        full_title = f"{title} - {url_hash}"
+        return html_to_markdown(str(soup), title=full_title, url=url)


 def extract_urls_from_markdown(content: str) -> List[str]:
@@ -378,61 +401,60 @@ def extract_urls_from_markdown(content: str) -> List[str]:
     return list(set(urls))  # Remove duplicates


-def recursive_read_html(
-    url: str, depth: int, raw: bool, visited_urls: set = None
-) -> Union[str, List[Dict]]:
+def recursive_read_html(url: str, depth: int, visited_urls: set = None) -> Dict:
     """
     Recursively reads HTML content from URLs up to specified depth.

     Args:
         url (str): The URL to parse
         depth (int): How many levels deep to recursively parse
-        raw (bool): Whether to return raw text or structured data
         visited_urls (set): Set of already visited URLs to prevent cycles

     Returns:
-
+        Dict: Dictionary containing parsed document data
     """
     if visited_urls is None:
         visited_urls = set()

     if url in visited_urls:
-        return
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     visited_urls.add(url)

     try:
-        content = read_html_content(url
+        content = read_html_content(url)
     except Exception as e:
         print(f"Error processing URL {url}: {str(e)}")
-        return
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     if depth <= 1:
         return content

-    # Extract URLs from
-
-        urls = extract_urls_from_markdown(content)
-    else:
-        # Extract URLs from all content sections
-        urls = []
-        for doc in content:
-            urls.extend(extract_urls_from_markdown(doc["content"]))
+    # Extract URLs from all content sections
+    urls = extract_urls_from_markdown(content["raw"])

     # Recursively process each URL
+    recursive_docs = []
     for sub_url in urls:
         if sub_url not in visited_urls:
-            sub_content = recursive_read_html(sub_url, depth - 1,
-
-            if raw:
-                if sub_content:
-                    content += f"\n\n--- Begin content from {sub_url} ---\n\n"
-                    content += sub_content
-                    content += f"\n\n--- End content from {sub_url} ---\n\n"
-            else:
-                if isinstance(sub_content, list):
-                    content.extend(sub_content)
+            sub_content = recursive_read_html(sub_url, depth - 1, visited_urls)
+            recursive_docs.append(sub_content)

+    content["recursive_docs"] = recursive_docs
     return content


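A sketch of how the new `recursive_docs` field could be traversed after a recursive parse. The URL and depth are illustrative; whichever branch handles the page (the HTML reader or PDF conversion), every node in the result carries the same dictionary keys.

```python
from lexoid.api import parse

doc = parse("https://example.com/docs", depth=2)

def walk(node, level=0):
    # every node (top-level or recursively parsed) has the same keys:
    # raw, segments, title, url, parent_title, recursive_docs
    print("  " * level + (node["title"] or node["url"]))
    for child in node["recursive_docs"]:
        walk(child, level + 1)

walk(doc)
```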
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.8.post1
+Version: 0.1.10
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -93,10 +94,10 @@ Here's a quick example to parse documents using Lexoid:
 from lexoid.api import parse
 from lexoid.api import ParserType

-parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE"
+parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
 # or
 pdf_path = "path/to/immigration-law-advisor.pdf"
-parsed_md = parse(pdf_path, parser_type="LLM_PARSE"
+parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

 print(parsed_md)
 ```
@@ -104,7 +105,6 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
lexoid-0.1.10.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=45nkTuQcxdppeUiRsiyioJtvlVeWeoq_WgKtGCthIBY,9193
+lexoid/core/parse_type/llm_parser.py,sha256=tH19B0w78OowkDdqJg3rom0kQmyuTaTfDP98Qnwufo0,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=HT37qmdhPpUNN6O571G7ItE5K2Mv8SreBHmxrhdiXA8,18951
+lexoid-0.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.10.dist-info/METADATA,sha256=4uhJ_IaHEKPl9lxKg8RRrBQ5dn7oB23XCnJNG5sNpH4,4576
+lexoid-0.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.10.dist-info/RECORD,,
lexoid-0.1.8.post1.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
-lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
-lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.8.post1.dist-info/RECORD,,
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/LICENSE
File without changes
{lexoid-0.1.8.post1.dist-info → lexoid-0.1.10.dist-info}/WHEEL
File without changes