lexoid 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +200 -0
- lexoid/core/parse_type/llm_parser.py +200 -0
- lexoid/core/parse_type/static_parser.py +350 -0
- lexoid/core/prompt_templates.py +78 -0
- lexoid/core/utils.py +534 -0
- lexoid-0.1.6.dist-info/LICENSE +201 -0
- lexoid-0.1.6.dist-info/METADATA +102 -0
- lexoid-0.1.6.dist-info/RECORD +9 -0
- lexoid-0.1.6.dist-info/WHEEL +4 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
import tempfile
|
2
|
+
import pandas as pd
|
3
|
+
import pdfplumber
|
4
|
+
from typing import List, Dict
|
5
|
+
from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
|
6
|
+
from pdfminer.high_level import extract_pages
|
7
|
+
from pdfminer.layout import LTTextContainer
|
8
|
+
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
|
9
|
+
from docx import Document
|
10
|
+
|
11
|
+
|
12
|
+
def parse_static_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse a document with a static (non-LLM) parser, dispatching on file type.

    Args:
        path (str): Path to the input document.
        raw (bool): If True, return a single string of content; otherwise a
            list of dicts with "metadata" and "content" keys.
        **kwargs: Must include "title". May include "framework" for PDFs
            ("pdfplumber" by default, or "pdfminer") plus framework-specific
            options, and "start" (starting page offset) for paged formats.

    Returns:
        Union[List[Dict], str]: Parsed content.

    Raises:
        ValueError: If the file type or requested PDF framework is unsupported.
    """
    framework = kwargs.get("framework", "pdfplumber")

    file_type = get_file_type(path)
    if file_type == "application/pdf":
        if framework == "pdfplumber":
            return parse_with_pdfplumber(path, raw, **kwargs)
        elif framework == "pdfminer":
            return parse_with_pdfminer(path, raw, **kwargs)
        else:
            raise ValueError(f"Unsupported framework: {framework}")
    elif "wordprocessing" in file_type:
        return parse_with_docx(path, raw, **kwargs)
    elif file_type == "text/html":
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # can corrupt or fail on UTF-8 documents.
        with open(path, "r", encoding="utf-8") as f:
            html_content = f.read()
        return html_to_markdown(html_content, raw, kwargs["title"])
    elif file_type == "text/plain":
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        if raw:
            return content
        return [
            {
                "metadata": {"title": kwargs["title"], "page": 1},
                "content": content,
            }
        ]
    elif file_type == "text/csv":
        df = pd.read_csv(path)
        content = df.to_markdown(index=False)
        if raw:
            return content
        return [
            {
                "metadata": {"title": kwargs["title"], "page": 1},
                "content": content,
            }
        ]
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
def parse_with_pdfminer(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse a PDF with pdfminer, extracting the text of each page.

    Args:
        path (str): Path to the PDF file.
        raw (bool): If True, return all page texts joined with newlines;
            otherwise return one dict per page with metadata and content.
        **kwargs: Must include "title" and "start" (page offset) when
            raw is False.

    Returns:
        Union[List[Dict], str]: Parsed content.
    """
    parsed = []
    for page_num, layout in enumerate(extract_pages(path), start=1):
        # Concatenate the text of every text container on the page.
        page_text = "".join(
            element.get_text()
            for element in layout
            if isinstance(element, LTTextContainer)
        )
        if raw:
            parsed.append(page_text)
            continue
        parsed.append(
            {
                "metadata": {
                    "title": kwargs["title"],
                    "page": kwargs["start"] + page_num,
                },
                "content": page_text,
            }
        )
    return "\n".join(parsed) if raw else parsed
def process_table(table) -> str:
    """
    Render a pdfplumber table object as a markdown (pipe-style) table.

    Returns an empty string when the table has no rows or an empty header
    row; otherwise the markdown text padded with blank lines so it renders
    correctly inside the surrounding markdown.
    """
    rows = table.extract()
    # Bail out on tables with no data or an empty first (header) row.
    if not rows or not rows[0]:
        return ""

    # Build a DataFrame, normalizing missing cells to empty strings.
    frame = pd.DataFrame(rows).fillna("")

    # Promote the first row to the header, then drop it from the body.
    frame.columns = frame.iloc[0]
    frame = frame.drop(0)

    rendered = frame.to_markdown(index=False, tablefmt="pipe")
    # Surrounding newlines ensure proper markdown rendering.
    return f"\n{rendered}\n\n"
def embed_links_in_text(page, text, links):
    """
    Embed hyperlinks inline within the text, matching their position based on rectangles.

    Args:
        page (pdfplumber.page.Page): The page containing the links.
        text (str): The full text extracted from the page.
        links (list of tuples): List of (rect, uri) pairs, where rect is
            (left, top, right, bottom) in PDF coordinates (origin bottom-left).

    Returns:
        str: The text with hyperlinks embedded inline as markdown links.
    """
    words = page.extract_words(x_tolerance=1)

    # Map each extracted word to its character offset in `text` so that link
    # rectangles can later be translated into text positions.
    words_with_positions = []
    cur_position = 0
    for word in words:
        try:
            rel_pos = text[cur_position:].index(word["text"])
        except ValueError:
            # Word not found in the remaining text (e.g. reflowed content).
            continue
        # BUG FIX: store the *absolute* offset. The original stored the
        # offset relative to the previous scan point, so start positions
        # were wrong for any link that did not begin near the text start.
        abs_pos = cur_position + rel_pos
        words_with_positions.append(
            # Flip the y-axis: pdfplumber measures "top" from the page top,
            # while link rects use PDF coordinates (origin bottom-left).
            (word["text"], word["x0"], page.mediabox[-1] - word["top"], abs_pos)
        )
        cur_position = abs_pos + len(word["text"])

    for rect, uri in links:
        rect_left, rect_top, rect_right, rect_bottom = rect
        text_span = []
        start_pos = None

        for word, x0, word_top, word_pos in words_with_positions:
            if rect_left <= x0 <= rect_right and rect_top <= word_top <= rect_bottom:
                # BUG FIX: compare against None explicitly. `if not start_pos`
                # treated a legitimate offset of 0 as "unset", so a link whose
                # first word starts at position 0 had its start overwritten by
                # the next matching word and the replacement silently failed.
                if start_pos is None:
                    start_pos = word_pos
                text_span.append(word)

        if text_span:
            original_text = " ".join(text_span)
            text = text[:start_pos] + text[start_pos:].replace(
                original_text, f"[{original_text}]({uri})"
            )

    return text
def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
    """
    Process a single page's content and return formatted markdown text.

    Tables are rendered as markdown and merged back into the text stream at
    their original vertical position; the remaining words are grouped into
    headings and paragraphs via font-size and line-spacing heuristics.
    Hyperlink annotations found in `uri_rects` are embedded inline.

    Args:
        page (pdfplumber.page.Page): Page to process.
        uri_rects (dict): Mapping of URI -> link rectangle for this page.
        **kwargs: Optional "x_tolerance" (default 1) and "y_tolerance"
            (default 5) forwarded to word extraction.

    Returns:
        str: Markdown content of the page.
    """
    markdown_content = []
    current_paragraph = []
    current_heading = []
    last_y = None
    x_tolerance = kwargs.get("x_tolerance", 1)
    y_tolerance = kwargs.get("y_tolerance", 5)

    # First, identify tables and their positions
    tables = page.find_tables()
    table_zones = [(table.bbox, process_table(table)) for table in tables]

    # Create a filtered page excluding table areas.
    # BUG FIX: bind the bbox as a default argument. A plain closure over the
    # loop variable is late-binding, and pdfplumber applies filters lazily
    # (at extract_words below), so every lambda would otherwise see only the
    # *last* table's bbox and earlier tables' words would be duplicated.
    filtered_page = page
    for table_bbox, _ in table_zones:
        filtered_page = filtered_page.filter(
            lambda obj, bbox=table_bbox: get_bbox_overlap(obj_to_bbox(obj), bbox)
            is None
        )

    words = filtered_page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        extra_attrs=["size", "top", "bottom"],
    )

    def format_paragraph(text):
        # Collapse runs of whitespace and terminate the paragraph.
        text = " ".join(text.split())
        return f"{text}\n\n"

    def detect_heading_level(font_size):
        # Heuristic: larger fonts indicate higher-level headings.
        if font_size >= 24:
            return 1
        elif font_size >= 20:
            return 2
        elif font_size >= 16:
            return 3
        return None

    def flush_heading():
        # Emit any pending heading buffer as a markdown heading and reset it.
        if current_heading:
            level = detect_heading_level(current_heading[0]["size"])
            heading_text = " ".join(w["text"] for w in current_heading)
            markdown_content.append(f"{'#' * level} {heading_text}\n\n")
            current_heading.clear()

    def flush_paragraph():
        # Emit any pending paragraph buffer and reset it.
        if current_paragraph:
            markdown_content.append(format_paragraph(" ".join(current_paragraph)))
            current_paragraph.clear()

    # Interleave table markers with words by vertical position.
    pending_tables = [
        (
            "table",
            {
                "top": bbox[1],
                "bottom": bbox[3],
                "content": table_md,
            },
        )
        for bbox, table_md in table_zones
    ]
    pending_tables.sort(key=lambda item: item[1]["bottom"])

    content_elements = []
    for word in words:
        while pending_tables and word["bottom"] > pending_tables[0][1]["bottom"]:
            content_elements.append(pending_tables.pop(0))
        content_elements.append(("word", word))
    # BUG FIX: tables positioned below the last word (or on a page with no
    # words at all) were previously dropped; append the leftovers.
    content_elements.extend(pending_tables)

    for element_type, element in content_elements:
        if element_type == "table":
            # Flush any pending heading/paragraph before inserting the table.
            flush_heading()
            flush_paragraph()
            markdown_content.append(element["content"])
            last_y = element["bottom"]
        else:
            word = element
            # Check if this might be a heading.
            heading_level = detect_heading_level(word["size"])

            # Detect new line based on vertical position.
            is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
            if is_new_line:
                flush_heading()
                flush_paragraph()

            # Route the word into the heading or paragraph buffer.
            if heading_level:
                flush_paragraph()
                current_heading.append({"text": word["text"], "size": word["size"]})
            else:
                flush_heading()
                current_paragraph.append(word["text"])

            last_y = word["top"]

    # Handle remaining content.
    flush_heading()
    flush_paragraph()

    # Embed hyperlink annotations into the assembled markdown.
    content = "".join(markdown_content)
    if page.annots:
        links = []
        for annot in page.annots:
            uri = annot.get("uri")
            if uri and uri_rects.get(uri):
                links.append((uri_rects[uri], uri))

        if links:
            content = embed_links_in_text(page, content, links)

    return content
def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
    """
    Process a PDF and return a list of markdown-formatted strings, one per page.

    The PDF is first split into single-page files inside a temporary
    directory so that link rectangles can be resolved per split before each
    page is converted to markdown.
    """
    page_texts = []

    with tempfile.TemporaryDirectory() as temp_dir:
        for split_path in split_pdf(path, temp_dir, pages_per_split=1):
            uri_rects = get_uri_rect(split_path)
            with pdfplumber.open(split_path) as pdf:
                page_texts.extend(
                    process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs).strip()
                    for page in pdf.pages
                )

    return page_texts
def parse_with_pdfplumber(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse PDF and return either raw text or structured data.

    Args:
        path (str): Path to the PDF file
        raw (bool): If True, return raw text with page breaks; if False, return structured data
        **kwargs: Additional arguments including 'title' and 'start' page number,
            plus optional 'x_tolerance'/'y_tolerance' word-extraction settings

    Returns:
        Union[List[Dict], str]: Either a list of dictionaries containing page metadata and content,
            or a string of raw text with page breaks
    """
    # BUG FIX: forward kwargs so per-call options such as "x_tolerance" and
    # "y_tolerance" actually reach the page processor; previously they were
    # silently ignored.
    page_texts = process_pdf_with_pdfplumber(path, **kwargs)
    if raw:
        return "<page-break>".join(page_texts)
    return [
        {
            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
            "content": page_text,
        }
        for page_num, page_text in enumerate(page_texts, start=1)
    ]
def parse_with_docx(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse a Word document by concatenating the text of all paragraphs.

    Args:
        path (str): Path to the .docx file.
        raw (bool): If True, return the plain concatenated text; otherwise a
            single-element list with metadata and content.
        **kwargs: Must include "title" and "start" when raw is False.

    Returns:
        Union[List[Dict], str]: Parsed content.
    """
    document = Document(path)
    full_text = "\n".join(paragraph.text for paragraph in document.paragraphs)

    if raw:
        return full_text

    metadata = {
        "title": kwargs["title"],
        "page": kwargs["start"] + 1,
    }
    return [{"metadata": metadata, "content": full_text}]
@@ -0,0 +1,78 @@
|
|
1
|
+
# Initial prompt.
# This might go through further changes as the library evolves.
# NOTE: {custom_instructions} is a format placeholder filled in by the caller.
PARSER_PROMPT = """\
You are a specialized document parsing (including OCR) and conversion agent.
Your primary task is to analyze various types of documents and reproduce their content in a format that, when rendered, visually replicates the original input as closely as possible.
Your output should use a combination of Markdown and HTML to achieve this goal.
Think step-by-step.

**Instructions:**
- Analyze the given document thoroughly, identify formatting patterns, choose optimal markup, implement conversion and verify quality.
- Your primary goal is to ensure structural fidelity of the input is replicated. Preserve all content without loss.
- Use a combination of Markdown and HTML in your output. HTML can be used anywhere in the document, not just for complex structures. Choose the format that best replicates the original structural appearance. However, keep the font colors black and the background colors white.
- When reproducing tables, use HTML tables (<table>, <tr>, <td>) if they better represent the original layout. Utilize `colspan` and `rowspan` attributes as necessary to accurately represent merged cells.
- Preserve all formatting elements such as bold, italic, underline, strikethrough text, font sizes, and colors using appropriate HTML tags and inline styles if needed.
- Maintain the hierarchy (h1-h6) and styling of headings and subheadings using appropriate HTML tags or Markdown.
- Visual Elements:
  * Images: If there is text within the image, try to recreate the structure within the image. If there is no text, describe the image content and position, and use placeholder `<img>` tags to represent their location in the document. Capture the image meaning in the alt attribute. Don't specify src if not known.
  * Emojis: Use Unicode characters instead of images.
  * Charts/Diagrams: For content that cannot be accurately represented in text format, provide a detailed textual description within an HTML element that visually represents its position in the document.
  * Complex visuals: Mark with [?] and make a note for ambiguities or uncertain interpretations in the document. Use HTML comments <!-- --> for conversion notes. Only output notes with comment tags.
- Special Characters:
  * Letters with ascenders are usually: b, d, f, h, k, l, t
  * Letters with descenders are usually: g, j, p, q, y. Lowercase f and z also have descenders in many typefaces.
  * Pay special attention to these commonly confused character pairs,
    Letter 'l' vs number '1' vs exclamation mark '!'
    Number '2' vs letter 'Z'
    Number '5' vs letter 'S'
    Number '51' vs number '±1'
    Number '6' vs letter 'G' vs letter 'b'
    Number '0' vs letter 'O'
    Number '8' vs letter 'B'
    Letter 'f' vs letter 't'
  * Contextual clues to differentiate:
    - If in a numeric column, interpret 'O' as '0'
    - If preceded/followed by numbers, interpret 'l' as '1'
    - Consider font characteristics, e.g.
      '1' typically has no serif
      '2' has a curved bottom vs 'Z's straight line
      '5' has more rounded features than 'S'
      '6' has a closed loop vs 'G's open curve
      '0' is typically more oval than 'O'
      '8' has a more angular top than 'B'
{custom_instructions}
- Return only the correct markdown without additional text or explanations. Do not include any additional text (such as "```html" or "```markdown") in the output.
- Think before generating the output in <thinking></thinking> tags.

Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
Prioritize replicating structure above all else.
Use tables without borders to represent column-like structures.
Keep the font color black (#000000) and the background white (#ffffff).

OUTPUT FORMAT:
Enclose the response within XML tags as follows:
<thinking>
[Step-by-step analysis and generation strategy]
</thinking>
<output>
"Your converted document content here in markdown format"
</output>

Quality Checks:
1. Verify structural and layout accuracy
2. Verify content completeness
3. Visual element handling
4. Hierarchy preservation
5. Confirm table alignment and cell merging accuracy
6. Spacing fidelity
7. Verify that numbers fall within expected ranges for their column
8. Flag any suspicious characters that could be OCR errors
9. Validate markdown syntax
"""

# User-turn prompt for OpenAI-style chat models.
OPENAI_USER_PROMPT = """\
Convert the following document to markdown.
Ensure accurate representation of all content, including tables and visual elements, per your instructions.
"""

# Appended to the system prompt when page boundaries must be preserved.
INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
|