lexoid 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ import tempfile
2
+ import pandas as pd
3
+ import pdfplumber
4
+ from typing import List, Dict
5
+ from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
6
+ from pdfminer.high_level import extract_pages
7
+ from pdfminer.layout import LTTextContainer
8
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
9
+ from docx import Document
10
+
11
+
12
def parse_static_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse a document with a static (non-LLM) parser, dispatching on MIME type.

    Args:
        path (str): Path to the document.
        raw (bool): If True, return plain text; otherwise structured page dicts.
        **kwargs: Must include "title"; may include "framework" ("pdfplumber"
            or "pdfminer", default "pdfplumber") and "start" (page offset).

    Returns:
        List[Dict] | str: Raw text, or a list of {"metadata", "content"} dicts.

    Raises:
        ValueError: If the file type or the requested PDF framework is
            unsupported.
    """
    framework = kwargs.get("framework", "pdfplumber")

    def _single_page(content: str) -> List[Dict]:
        # Shared structured shape for formats that yield a single "page"
        # (previously duplicated for text/plain and text/csv).
        return [
            {
                "metadata": {"title": kwargs["title"], "page": 1},
                "content": content,
            }
        ]

    file_type = get_file_type(path)
    if file_type == "application/pdf":
        if framework == "pdfplumber":
            return parse_with_pdfplumber(path, raw, **kwargs)
        elif framework == "pdfminer":
            return parse_with_pdfminer(path, raw, **kwargs)
        else:
            raise ValueError(f"Unsupported framework: {framework}")
    elif "wordprocessing" in file_type:
        return parse_with_docx(path, raw, **kwargs)
    elif file_type == "text/html":
        # Explicit encoding: the platform default may not be UTF-8.
        with open(path, "r", encoding="utf-8") as f:
            html_content = f.read()
        return html_to_markdown(html_content, raw, kwargs["title"])
    elif file_type == "text/plain":
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        return content if raw else _single_page(content)
    elif file_type == "text/csv":
        df = pd.read_csv(path)
        content = df.to_markdown(index=False)
        return content if raw else _single_page(content)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
55
+
56
+
57
def parse_with_pdfminer(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Extract text from a PDF with pdfminer, one entry per page.

    Args:
        path (str): Path to the PDF file.
        raw (bool): If True, return all pages joined by newlines; otherwise
            return one metadata/content dict per page.
        **kwargs: Must include "title" and "start" (page offset) when raw is
            False.

    Returns:
        List[Dict] | str: Raw text or structured per-page documents.
    """
    results = []
    for index, layout in enumerate(extract_pages(path), start=1):
        # Concatenate only the text containers; other layout objects
        # (figures, lines, ...) carry no extractable text.
        chunks = [
            element.get_text()
            for element in layout
            if isinstance(element, LTTextContainer)
        ]
        text = "".join(chunks)
        if raw:
            results.append(text)
        else:
            results.append(
                {
                    "metadata": {
                        "title": kwargs["title"],
                        "page": kwargs["start"] + index,
                    },
                    "content": text,
                }
            )
    if raw:
        return "\n".join(results)
    return results
79
+
80
+
81
def process_table(table) -> str:
    """
    Render a pdfplumber table object as a markdown (pipe) table.

    Returns an empty string when extraction yields no rows or an empty
    first (header) row.
    """
    rows = table.extract()
    # Nothing to render for empty extractions or an empty header row.
    if not rows or not rows[0]:
        return ""

    # Promote the first extracted row to the header; blank out missing cells.
    frame = pd.DataFrame(rows).fillna("")
    frame.columns = frame.iloc[0]
    frame = frame.drop(0)

    rendered = frame.to_markdown(index=False, tablefmt="pipe")
    # Surrounding newlines keep the table separate from adjacent markdown.
    return f"\n{rendered}\n\n"
101
+
102
+
103
def embed_links_in_text(page, text, links):
    """
    Embed hyperlinks inline within the text, matching their position based on rectangles.

    Args:
        page (pdfplumber.page.Page): The page containing the links.
        text (str): The full text extracted from the page.
        links (list of tuples): List of (rect, uri) pairs.

    Returns:
        str: The text with hyperlinks embedded inline.
    """
    words = page.extract_words(x_tolerance=1)

    # Map each word to (text, x0, top in PDF coords, absolute offset in `text`).
    words_with_positions = []
    cur_position = 0
    for word in words:
        try:
            # Offset of this word within the not-yet-consumed tail of `text`.
            rel_pos = text[cur_position:].index(word["text"])
        except ValueError:
            # Word not found (e.g. whitespace normalization) -- skip it.
            continue
        # BUGFIX: store the absolute offset, not the slice-relative one, so
        # `start_pos` below actually points at the span inside `text`.
        abs_pos = cur_position + rel_pos
        words_with_positions.append(
            (word["text"], word["x0"], page.mediabox[-1] - word["top"], abs_pos)
        )
        cur_position = abs_pos + len(word["text"])

    for rect, uri in links:
        rect_left, rect_top, rect_right, rect_bottom = rect
        text_span = []
        start_pos = None

        for word, x0, word_top, word_pos in words_with_positions:
            if rect_left <= x0 <= rect_right and rect_top <= word_top <= rect_bottom:
                # BUGFIX: compare against None -- `not start_pos` also matched
                # offset 0, clobbering the span start for links whose first
                # word sits at the very beginning of the text.
                if start_pos is None:
                    start_pos = word_pos
                text_span.append(word)

        if text_span:
            original_text = " ".join(text_span)
            # Replace only from the span start onward so an identical earlier
            # occurrence of the same text is left untouched.
            text = text[:start_pos] + text[start_pos:].replace(
                original_text, f"[{original_text}]({uri})"
            )

    return text
147
+
148
+
149
def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
    """
    Process a single page's content and return formatted markdown text.

    Args:
        page (pdfplumber.page.Page): Page to convert.
        uri_rects (dict): Mapping of URI -> link rectangle for this page.
        **kwargs: Optional "x_tolerance" (default 1) and "y_tolerance"
            (default 5) forwarded to word extraction.

    Returns:
        str: Markdown for the page with tables, headings and links embedded.
    """
    markdown_content = []
    current_paragraph = []
    current_heading = []
    last_y = None
    x_tolerance = kwargs.get("x_tolerance", 1)
    y_tolerance = kwargs.get("y_tolerance", 5)

    # First, identify tables and their positions
    found_tables = page.find_tables()
    table_zones = [(table.bbox, process_table(table)) for table in found_tables]

    # Create a filtered page excluding table areas.
    # BUGFIX: bind the bbox as a default argument. pdfplumber evaluates these
    # filters lazily; a plain closure is late-bound, so every filter would
    # have tested against the *last* table's bbox only.
    filtered_page = page
    for table_bbox, _ in table_zones:
        filtered_page = filtered_page.filter(
            lambda obj, bbox=table_bbox: get_bbox_overlap(obj_to_bbox(obj), bbox)
            is None
        )

    words = filtered_page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        extra_attrs=["size", "top", "bottom"],
    )

    def format_paragraph(text):
        # Collapse runs of whitespace and terminate the paragraph.
        text = " ".join(text.split())
        return f"{text}\n\n"

    def detect_heading_level(font_size):
        # Font-size thresholds for markdown heading levels (None = body text).
        if font_size >= 24:
            return 1
        elif font_size >= 20:
            return 2
        elif font_size >= 16:
            return 3
        return None

    def flush_heading():
        # Emit any pending heading words as one markdown heading line.
        nonlocal current_heading
        if current_heading:
            level = detect_heading_level(current_heading[0]["size"])
            heading_text = " ".join(w["text"] for w in current_heading)
            markdown_content.append(f"{'#' * level} {heading_text}\n\n")
            current_heading = []

    def flush_paragraph():
        # Emit any pending paragraph words as one markdown paragraph.
        nonlocal current_paragraph
        if current_paragraph:
            markdown_content.append(format_paragraph(" ".join(current_paragraph)))
            current_paragraph = []

    tables = []
    for bbox, table_md in table_zones:
        tables.append(
            (
                "table",
                {
                    "top": bbox[1],
                    "bottom": bbox[3],
                    "content": table_md,
                },
            )
        )
    tables.sort(key=lambda x: x[1]["bottom"])

    # Interleave words and tables in top-to-bottom reading order.
    content_elements = []
    for word in words:
        while tables and word["bottom"] > tables[0][1]["bottom"]:
            content_elements.append(tables.pop(0))
        content_elements.append(("word", word))
    # BUGFIX: tables positioned below the last word (or on a page with no
    # words at all) were previously dropped; append the leftovers.
    content_elements.extend(tables)

    for element_type, element in content_elements:
        if element_type == "table":
            # If there are any pending paragraphs or headings, add them first
            flush_heading()
            flush_paragraph()
            # Add the table
            markdown_content.append(element["content"])
            last_y = element["bottom"]
        else:
            # Process word
            word = element
            # Check if this might be a heading
            heading_level = detect_heading_level(word["size"])

            # Detect new line based on vertical position
            is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance

            if is_new_line:
                flush_heading()
                flush_paragraph()

            # Add word to the appropriate collection, closing out the other.
            if heading_level:
                flush_paragraph()
                current_heading.append({"text": word["text"], "size": word["size"]})
            else:
                flush_heading()
                current_paragraph.append(word["text"])

            last_y = word["top"]

    # Handle remaining content
    flush_heading()
    flush_paragraph()

    # Process links for the page
    content = "".join(markdown_content)
    if page.annots:
        links = []
        for annot in page.annots:
            uri = annot.get("uri")
            if uri and uri_rects.get(uri):
                links.append((uri_rects[uri], uri))

        if links:
            content = embed_links_in_text(page, content, links)

    return content
287
+
288
+
289
def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
    """
    Process PDF and return a list of markdown-formatted strings, one per page.

    The PDF is first split into single-page files in a temporary directory so
    that link rectangles can be resolved per split before rendering.
    """
    rendered_pages = []

    with tempfile.TemporaryDirectory() as workdir:
        for split_path in split_pdf(path, workdir, pages_per_split=1):
            # URI rectangles are looked up per split file.
            uri_rects = get_uri_rect(split_path)
            with pdfplumber.open(split_path) as pdf:
                for page in pdf.pages:
                    rendered = process_pdf_page_with_pdfplumber(
                        page, uri_rects, **kwargs
                    )
                    rendered_pages.append(rendered.strip())

    return rendered_pages
308
+
309
+
310
def parse_with_pdfplumber(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse PDF and return either raw text or structured data.

    Args:
        path (str): Path to the PDF file
        raw (bool): If True, return raw text with page breaks; if False, return structured data
        **kwargs: Additional arguments including 'title' and 'start' page number

    Returns:
        Union[List[Dict], str]: Either a list of dictionaries containing page metadata and content,
                                or a string of raw text with page breaks
    """
    # BUGFIX: forward **kwargs so tuning options (x_tolerance/y_tolerance)
    # reach the page processor -- they were previously silently dropped.
    page_texts = process_pdf_with_pdfplumber(path, **kwargs)
    if raw:
        return "<page-break>".join(page_texts)
    return [
        {
            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
            "content": page_text,
        }
        for page_num, page_text in enumerate(page_texts, start=1)
    ]
333
+
334
+
335
def parse_with_docx(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """
    Parse a Word (.docx) document.

    Args:
        path (str): Path to the .docx file.
        raw (bool): If True, return the joined paragraph text; otherwise a
            single structured document entry.
        **kwargs: Must include "title" and "start" when raw is False.

    Returns:
        List[Dict] | str: Raw text, or a one-element list of page documents.
    """
    document = Document(path)
    text = "\n".join(paragraph.text for paragraph in document.paragraphs)

    if raw:
        return text

    metadata = {
        "title": kwargs["title"],
        "page": kwargs["start"] + 1,
    }
    return [{"metadata": metadata, "content": text}]
@@ -0,0 +1,78 @@
1
# Initial prompt.
# This might go through further changes as the library evolves.
PARSER_PROMPT = """\
You are a specialized document parsing (including OCR) and conversion agent.
Your primary task is to analyze various types of documents and reproduce their content in a format that, when rendered, visually replicates the original input as closely as possible.
Your output should use a combination of Markdown and HTML to achieve this goal.
Think step-by-step.

**Instructions:**
- Analyze the given document thoroughly, identify formatting patterns, choose optimal markup, implement conversion and verify quality.
- Your primary goal is to ensure structural fidelity of the input is replicated. Preserve all content without loss.
- Use a combination of Markdown and HTML in your output. HTML can be used anywhere in the document, not just for complex structures. Choose the format that best replicates the original structural appearance. However, keep the font colors black and the background colors white.
- When reproducing tables, use HTML tables (<table>, <tr>, <td>) if they better represent the original layout. Utilize `colspan` and `rowspan` attributes as necessary to accurately represent merged cells.
- Preserve all formatting elements such as bold, italic, underline, strikethrough text, font sizes, and colors using appropriate HTML tags and inline styles if needed.
- Maintain the hierarchy (h1-h6) and styling of headings and subheadings using appropriate HTML tags or Markdown.
- Visual Elements:
  * Images: If there is text within the image, try to recreate the structure within the image. If there is no text, describe the image content and position, and use placeholder `<img>` tags to represent their location in the document. Capture the image meaning in the alt attribute. Don't specify src if not known.
  * Emojis: Use Unicode characters instead of images.
  * Charts/Diagrams: For content that cannot be accurately represented in text format, provide a detailed textual description within an HTML element that visually represents its position in the document.
  * Complex visuals: Mark with [?] and make a note for ambiguities or uncertain interpretations in the document. Use HTML comments <!-- --> for conversion notes. Only output notes with comment tags.
- Special Characters:
  * Letters with ascenders are usually: b, d, f, h, k, l, t
  * Letters with descenders are usually: g, j, p, q, y. Lowercase f and z also have descenders in many typefaces.
  * Pay special attention to these commonly confused character pairs,
    Letter 'l' vs number '1' vs exclamation mark '!'
    Number '2' vs letter 'Z'
    Number '5' vs letter 'S'
    Number '51' vs number '±1'
    Number '6' vs letter 'G' vs letter 'b'
    Number '0' vs letter 'O'
    Number '8' vs letter 'B'
    Letter 'f' vs letter 't'
  * Contextual clues to differentiate:
    - If in a numeric column, interpret 'O' as '0'
    - If preceded/followed by numbers, interpret 'l' as '1'
    - Consider font characteristics, e.g.
      '1' typically has no serif
      '2' has a curved bottom vs 'Z's straight line
      '5' has more rounded features than 'S'
      '6' has a closed loop vs 'G's open curve
      '0' is typically more oval than 'O'
      '8' has a more angular top than 'B'
{custom_instructions}
- Return only the correct markdown without additional text or explanations. Do not include any additional text (such as "```html" or "```markdown") in the output.
- Think before generating the output in <thinking></thinking> tags.

Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
Prioritize replicating structure above all else.
Use tables without borders to represent column-like structures.
Keep the font color black (#000000) and the background white (#ffffff).

OUTPUT FORMAT:
Enclose the response within XML tags as follows:
<thinking>
[Step-by-step analysis and generation strategy]
</thinking>
<output>
"Your converted document content here in markdown format"
</output>

Quality Checks:
1. Verify structural and layout accuracy
2. Verify content completeness
3. Visual element handling
4. Hierarchy preservation
5. Confirm table alignment and cell merging accuracy
6. Spacing fidelity
7. Verify that numbers fall within expected ranges for their column
8. Flag any suspicious characters that could be OCR errors
9. Validate markdown syntax
"""
72
+
73
+ OPENAI_USER_PROMPT = """\
74
+ Convert the following document to markdown.
75
+ Ensure accurate representation of all content, including tables and visual elements, per your instructions.
76
+ """
77
+
78
# Appended to the parser prompt when per-page structure must be preserved
# (the `<page-break>` marker mirrors the one used by parse_with_pdfplumber).
INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."