vlmparse 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/METADATA +12 -1
  39. vlmparse-0.1.3.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
1
+ import asyncio
2
+ import io
3
+ import os
4
+
5
+ import orjson
6
+ from loguru import logger
7
+ from pydantic import Field
8
+
9
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
10
+ from vlmparse.clients.pipe_utils.utils import clean_response
11
+ from vlmparse.converter import BaseConverter, ConverterConfig
12
+ from vlmparse.data_model.document import BoundingBox, Item, Page
13
+ from vlmparse.servers.docker_server import DockerServerConfig
14
+
15
+
16
class MinerUDockerServerConfig(DockerServerConfig):
    """Configuration for MinerU Docker server."""

    model_name: str = "mineru25"
    # Docker image that bundles the MinerU HTTP API.
    docker_image: str = "pulsia/mineru25apipulsia:latest"
    # Host port the container port is published on.
    docker_port: int = 4299
    # Port the API listens on inside the container.
    container_port: int = 8000

    @property
    def client_config(self):
        """Converter config pointing at the locally published server port."""
        # NOTE(review): passes ``api_url`` although MinerUConverterConfig
        # declares a ``base_url`` field — confirm the intended field name.
        return MinerUConverterConfig(api_url=f"http://localhost:{self.docker_port}")
27
+
28
+
29
class MinerUConverterConfig(ConverterConfig):
    """Configuration for MinerU API converter."""

    # Base URL of the MinerU HTTP server; overridable via MINERU_API_URL.
    base_url: str = Field(
        default_factory=lambda: os.getenv("MINERU_API_URL", "http://localhost:4299")
    )
    # HTTP request timeout in seconds.
    timeout: int = 600

    def get_client(self, **kwargs) -> "MinerUConverter":
        """Instantiate the converter bound to this configuration."""
        return MinerUConverter(config=self, **kwargs)
39
+
40
+
41
def to_bytes_io(image):
    """Serialize a PIL image into an in-memory PNG buffer, rewound to 0."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    buffer.seek(0)
    return buffer
46
+
47
+
48
class MinerUConverter(BaseConverter):
    """MinerU HTTP API converter.

    Sends each page image to a MinerU server (``/process-image``), rescales
    the returned layout cells to pixel coordinates and stores them on the
    page as ``Item`` objects plus a markdown text rendering.
    """

    config: MinerUConverterConfig

    def __init__(self, config: MinerUConverterConfig, **kwargs):
        super().__init__(config=config, **kwargs)
        from httpx import AsyncClient

        # Fix: the config declares ``base_url`` but some call sites (see
        # MinerUDockerServerConfig.client_config) construct it with an extra
        # ``api_url`` keyword. Prefer ``api_url`` when present, otherwise
        # fall back to the declared field instead of raising AttributeError.
        base_url = getattr(config, "api_url", None) or config.base_url
        self.client = AsyncClient(base_url=base_url, timeout=config.timeout)

    async def _async_inference_with_api(self, image) -> list:
        """Run async inference with MinerU API and return the parsed JSON cells."""
        # PNG encoding is CPU-bound; keep it off the event loop.
        img_byte_arr = await asyncio.to_thread(to_bytes_io, image)

        response = await self.client.post(
            "process-image",
            files={"image": ("image.png", img_byte_arr, "image/png")},
        )
        response.raise_for_status()
        return orjson.loads(response.content)

    async def _parse_image_with_api(self, origin_image):
        """Call the API and rescale each cell bbox to pixel coordinates.

        Bboxes come back normalized to [0, 1] — TODO confirm against the
        server; they are multiplied by the image size in place.
        """
        response = await self._async_inference_with_api(origin_image)

        original_width, original_height = origin_image.size

        for cell in response:
            bbox = cell["bbox"]
            cell["bbox"] = [
                bbox[0] * original_width,
                bbox[1] * original_height,
                bbox[2] * original_width,
                bbox[3] * original_height,
            ]
        return response

    async def async_call_inside_page(self, page: Page) -> Page:
        """Populate ``page.items`` and ``page.text`` from the MinerU response."""
        image = page.image

        # Call MinerU API
        response = await self._parse_image_with_api(image)
        logger.info("Response: " + str(response))

        # Cells may carry content=None; skip those in the joined text.
        contents = [item.get("content", "") for item in response]
        text = "\n\n".join(content for content in contents if content is not None)

        items = []
        for item in response:
            l, t, r, b = item["bbox"]
            txt = item.get("content", "")
            items.append(
                Item(
                    text=txt if txt is not None else "",
                    box=BoundingBox(l=l, t=t, r=r, b=b),
                    category=item["type"],
                )
            )
        page.items = items

        text = clean_response(text)
        text = html_to_md_keep_tables(text)
        page.text = text
        return page
@@ -0,0 +1,29 @@
1
+ from pydantic import Field
2
+
3
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
4
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
5
+
6
+
7
class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):
    """Configuration for NanonetOCR2 model."""

    # HuggingFace model id served by vLLM.
    model_name: str = "nanonets/Nanonets-OCR2-3B"
    # Short names under which this server config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["nanonetsocr2"])

    @property
    def client_config(self):
        """Converter config reusing this server's connection parameters."""
        return NanonetOCR2ConverterConfig(llm_params=self.llm_params)
16
+
17
+
18
class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
    """Configuration for NanonetOCR2 model."""

    model_name: str = "nanonets/Nanonets-OCR2-3B"
    # No system prompt; the full instruction is sent after the image.
    preprompt: str | None = None
    postprompt: str | None = (
        "Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."
    )
    completion_kwargs: dict | None = {"temperature": 0.0, "max_tokens": 15000}
    # presumably the longest image side before downscaling; None disables —
    # confirm against BaseConverter's preprocessing.
    max_image_size: int | None = None
    dpi: int = 200
    # Short names under which this converter config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["nanonetsocr2"])
@@ -0,0 +1,46 @@
1
+ from pydantic import Field
2
+
3
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
4
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
5
+
6
+
7
class OlmOCRDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for OlmOCR model."""

    model_name: str = "allenai/olmOCR-2-7B-1025-FP8"
    # Extra vLLM CLI flags: one image (no video) per prompt, quieter
    # logging, and a 16k-token context window.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1, "video": 0}',
            "--disable-log-requests",
            "--uvicorn-log-level",
            "warning",
            "--max-model-len",
            "16384",
        ]
    )
    # Short names under which this server config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])

    @property
    def client_config(self):
        """Converter config reusing this server's connection parameters."""
        return OlmOCRConverterConfig(llm_params=self.llm_params)
27
+
28
+
29
class OlmOCRConverterConfig(OpenAIConverterConfig):
    """OlmOCR converter"""

    model_name: str = "allenai/olmOCR-2-7B-1025-FP8"
    # Upstream olmOCR system prompt ("LateX" spelling kept verbatim — it is
    # part of the runtime prompt, not a comment).
    preprompt: str | None = (
        "Attached is one page of a document that you must process. "
        "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to HTML.\n"
        "If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\n"
        "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
    )
    postprompt: str | None = None
    completion_kwargs: dict | None = {
        "temperature": 0.1,
        "max_tokens": 8000,
    }
    # presumably the longest image side before downscaling — confirm
    # against BaseConverter's preprocessing.
    max_image_size: int | None = 1288
    dpi: int = 200
    # Short names under which this converter config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])
@@ -0,0 +1,173 @@
1
+ import os
2
+ from typing import Literal
3
+
4
+ from loguru import logger
5
+ from pydantic import Field
6
+
7
+ from vlmparse.base_model import VLMParseBaseModel
8
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
9
+ from vlmparse.clients.pipe_utils.utils import clean_response
10
+ from vlmparse.converter import BaseConverter, ConverterConfig
11
+ from vlmparse.data_model.document import Page
12
+ from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
13
+ from vlmparse.utils import to_base64
14
+
15
+ from .prompts import PDF2MD_PROMPT
16
+
17
# OpenAI-compatible endpoint exposed by Google's Generative Language API.
GOOGLE_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
18
+
19
+
20
class LLMParams(VLMParseBaseModel):
    """Connection parameters for an OpenAI-compatible LLM endpoint."""

    # API key; empty string means "no key required" (e.g. local vLLM).
    api_key: str = ""
    # Endpoint base URL; None lets the client use the provider default.
    base_url: str | None = None
    model_name: str = DEFAULT_MODEL_NAME
    # Request timeout in seconds forwarded to the OpenAI client.
    timeout: int | None = 500
    max_retries: int = 1
26
+
27
+
28
+ def get_llm_params(model_name: str, uri: str | None = None):
29
+ if uri is not None:
30
+ return LLMParams(base_url=uri, model_name="vllm-model", api_key="")
31
+ if model_name in [
32
+ "gpt-4o",
33
+ "gpt-4o-mini",
34
+ "gpt-4.1",
35
+ "gpt-4.1-mini",
36
+ "gpt-4.1-nano",
37
+ "gpt-5",
38
+ "gpt-5-mini",
39
+ "gpt-5-nano",
40
+ ]:
41
+ base_url = None
42
+ api_key = os.getenv("OPENAI_API_KEY")
43
+ else:
44
+ if model_name in [
45
+ "gemini-2.5-flash-lite",
46
+ "gemini-2.5-flash",
47
+ "gemini-2.5-pro",
48
+ ]:
49
+ base_url = GOOGLE_API_BASE_URL
50
+ api_key = os.getenv("GOOGLE_API_KEY")
51
+ else:
52
+ return None
53
+ return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
54
+
55
+
56
class OpenAIConverterConfig(ConverterConfig):
    """Configuration for converters that speak the OpenAI chat API."""

    # Connection parameters (base_url, api_key, model name, timeout).
    llm_params: LLMParams
    # Optional system prompt sent before the page image.
    preprompt: str | None = None
    # Optional user prompt sent after the page image.
    postprompt: str | None = PDF2MD_PROMPT
    # Extra kwargs forwarded to chat.completions.create (temperature, ...).
    completion_kwargs: dict = Field(default_factory=dict)
    # When True, responses are streamed and reassembled client-side.
    stream: bool = False

    def get_client(self, **kwargs) -> "OpenAIConverterClient":
        """Instantiate the client bound to this configuration."""
        return OpenAIConverterClient(config=self, **kwargs)
65
+
66
+
67
class OpenAIConverterClient(BaseConverter):
    """Client for OpenAI-compatible API servers.

    Sends each page image (plus configured pre/post prompts) to the
    endpoint and stores the cleaned markdown back on the page.
    """

    def __init__(
        self,
        config: OpenAIConverterConfig,
        num_concurrent_files: int = 10,
        num_concurrent_pages: int = 10,
        save_folder: str | None = None,
        save_mode: Literal["document", "md", "md_page"] = "document",
        debug: bool = False,
        return_documents_in_batch_mode: bool = False,
    ):
        super().__init__(
            config=config,
            num_concurrent_files=num_concurrent_files,
            num_concurrent_pages=num_concurrent_pages,
            save_folder=save_folder,
            save_mode=save_mode,
            debug=debug,
            return_documents_in_batch_mode=return_documents_in_batch_mode,
        )
        from openai import AsyncOpenAI

        # One async client per converter, configured from llm_params.
        self.model = AsyncOpenAI(
            base_url=self.config.llm_params.base_url,
            api_key=self.config.llm_params.api_key,
            timeout=self.config.llm_params.timeout,
        )

    async def _get_chat_completion(
        self, messages: list[dict], completion_kwargs: dict | None = None
    ) -> str:
        """Run one chat completion (streaming or not) and return its text.

        Falls back to the config-level completion kwargs when none are
        passed explicitly.
        """
        kwargs = (
            self.config.completion_kwargs
            if completion_kwargs is None
            else completion_kwargs
        )

        if self.config.stream:
            # Streamed variant: accumulate the delta chunks client-side.
            stream = await self.model.chat.completions.create(
                model=self.config.llm_params.model_name,
                messages=messages,
                stream=True,
                **kwargs,
            )
            pieces = []
            async for chunk in stream:
                if chunk.choices and chunk.choices[0].delta.content:
                    pieces.append(chunk.choices[0].delta.content)
            return "".join(pieces)

        completion = await self.model.chat.completions.create(
            model=self.config.llm_params.model_name,
            messages=messages,
            **kwargs,
        )
        choice = completion.choices[0]
        if choice.message.content is None:
            raise ValueError(
                "Response is None, finish reason: " + choice.finish_reason
            )
        return choice.message.content

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page using OpenAI-compatible API."""
        image = page.image

        # Optional system message first, then the user message with the
        # page image and (optionally) the post-prompt text.
        messages: list[dict] = []
        if self.config.preprompt:
            messages.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.config.preprompt}],
                }
            )

        user_content: list[dict] = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{to_base64(image)}"},
            }
        ]
        if self.config.postprompt:
            user_content.append({"type": "text", "text": self.config.postprompt})
        messages.append({"role": "user", "content": user_content})

        response = await self._get_chat_completion(messages)
        logger.info("Response: " + str(response))
        page.raw_response = response

        cleaned = clean_response(response)
        page.text = html_to_md_keep_tables(cleaned)
        return page
@@ -0,0 +1,48 @@
1
+ from pydantic import Field
2
+
3
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
4
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
5
+
6
+
7
class PaddleOCRVLDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for PaddleOCRVL model."""

    model_name: str = "PaddlePaddle/PaddleOCR-VL"
    # Extra vLLM CLI flags: one image per prompt, async scheduling,
    # remote code enabled, and the multimodal processor cache disabled.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--async-scheduling",
            "--trust-remote-code",
            "--mm-processor-cache-gb",
            "0",
        ]
    )
    # Short names under which this server config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])

    @property
    def client_config(self):
        """Converter config reusing this server's connection parameters."""
        return PaddleOCRVLConverterConfig(llm_params=self.llm_params)
26
+
27
+
28
# Task-specific base prompts
# PaddleOCR-VL selects its behavior from the prompt text alone; each entry
# maps a task id to the exact prompt string the model expects.
TASKS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
}
35
+
36
+
37
class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
    """PaddleOCRVL converter"""

    model_name: str = "PaddlePaddle/PaddleOCR-VL"
    preprompt: str | None = None
    # Plain-text extraction prompt ("OCR:"); see TASKS for other modes.
    postprompt: str | None = TASKS["ocr"]
    completion_kwargs: dict | None = {
        "temperature": 0.0,
    }
    # presumably the longest image side before downscaling — confirm
    # against BaseConverter's preprocessing.
    max_image_size: int | None = 1540
    dpi: int = 200
    # Short names under which this converter config can be looked up.
    aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
@@ -0,0 +1,74 @@
1
+ import re
2
+ import unicodedata
3
+
4
+ # Code adapted from olmocr.bench.tests.normalize_text
5
+
6
+
7
+ def normalize_text(
8
+ md_content: str,
9
+ additional_replacements: dict = {},
10
+ only_alphanum: bool = False,
11
+ remove_md_images: bool = True,
12
+ ) -> str:
13
+ """Normalise md text"""
14
+
15
+ if md_content is None:
16
+ return None
17
+
18
+ # Normalize <br> and <br/> to newlines
19
+ md_content = re.sub(r"<br/?>", " ", md_content)
20
+
21
+ # Normalize whitespace in the md_content
22
+ md_content = re.sub(r"[ \t]+", " ", md_content)
23
+
24
+ # remove_title_emphasis:
25
+ md_content = re.sub(r"[-=]{3,}", "", md_content)
26
+
27
+ # remove_space_with_newlines:
28
+ md_content = re.sub(r"\n *", "\n", md_content)
29
+
30
+ # remove_more_than_2_newlines:
31
+ md_content = re.sub(r"\n{2,}", "\n\n", md_content)
32
+
33
+ # Remove markdown bold formatting (** or __ for bold)
34
+ md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
35
+ # md_content = re.sub(r"__(.*?)__", r"\1", md_content)
36
+ md_content = re.sub(r"</?b>", "", md_content) # Remove <b> tags if they exist
37
+ md_content = re.sub(r"</?i>", "", md_content) # Remove <i> tags if they exist
38
+
39
+ # Remove markdown italics formatting (* or _ for italics)
40
+ md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
41
+ # md_content = re.sub(r"_(.*?)_", r"\1", md_content)
42
+
43
+ # remove_more_than_1_spaces:
44
+ md_content = re.sub(r" {2,}", " ", md_content)
45
+
46
+ # Convert down to a consistent unicode form, so é == e + accent, unicode forms
47
+ md_content = unicodedata.normalize("NFC", md_content)
48
+
49
+ # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
50
+ replacements = {
51
+ "‘": "'",
52
+ "’": "'",
53
+ "‚": "'",
54
+ "“": '"',
55
+ "”": '"',
56
+ "„": '"',
57
+ "_": "_",
58
+ "–": "-",
59
+ "—": "-",
60
+ "‑": "-",
61
+ "‒": "-",
62
+ "−": "-",
63
+ "\u00b5": "\u03bc",
64
+ } | additional_replacements
65
+
66
+ # Apply all replacements from the dictionary
67
+ for fancy_char, ascii_char in replacements.items():
68
+ md_content = md_content.replace(fancy_char, ascii_char)
69
+
70
+ if only_alphanum:
71
+ md_content = re.sub(r"[^a-zA-Z0-9]", "", md_content)
72
+ if remove_md_images:
73
+ md_content = re.sub(r"!\[[\s\S]*?]\([\s\S]*?\)", "", md_content, flags=re.S)
74
+ return md_content.strip()
@@ -0,0 +1,136 @@
1
+ import re
2
+
3
+ from bs4 import BeautifulSoup
4
+ from bs4.element import NavigableString
5
+ from html_to_markdown import convert_to_markdown
6
+
7
+
8
def html_to_md_keep_tables(html: str, remove_head: bool = False) -> str:
    """Convert HTML to markdown while keeping <table> elements as raw HTML.

    Table cells are themselves converted to markdown (recursively); whole
    tables are swapped for placeholder tokens during the main conversion
    and spliced back in unchanged afterwards.
    """
    # Drop whole <img>…</img> blocks (tag and contents).
    html = re.sub(r"<img>([\s\S]*?)</img>", "", html, flags=re.S)

    soup = BeautifulSoup(html, "lxml")
    if remove_head and soup.head:
        # Remove a global <head> element when requested.
        soup.head.decompose()

    # Recurse into table cells first so their contents become markdown.
    for table in soup.find_all("table"):
        for cell in table.find_all(["td", "th"]):
            inner_html = "".join(str(child) for child in cell.contents)
            converted = html_to_md_keep_tables(inner_html).strip()  # recursion
            cell.clear()
            cell.append(NavigableString(converted))

    # Replace each current-level table with a unique placeholder token.
    placeholders = {}
    for index, table in enumerate(soup.find_all("table")):
        token = f"%%%TAB{index}%%%"
        placeholders[token] = str(table)
        table.replace_with(token)

    # Convert everything else to markdown (tables are now just tokens).
    remaining = str(soup)
    if remaining:
        md_txt = convert_to_markdown(
            remaining,
            strip=["table", "b", "strong", "i", "em"],
            heading_style="atx",
            escape_misc=False,
            bullets="-",
        )
    else:
        md_txt = ""

    # Splice the untouched table HTML back in.
    for token, table_html in placeholders.items():
        md_txt = md_txt.replace(token, table_html)

    return md_txt
51
+
52
+
53
+ # %%
54
def md_tables_to_html(md_text: str) -> str:
    """
    Convert all Markdown tables in the text to HTML tables.

    Table lines may carry leading indentation (the previous version
    silently rejected indented tables — including this module's own demo).
    Cell text is inserted verbatim, without HTML escaping.

    Args:
        md_text: Markdown text containing tables

    Returns:
        Text with Markdown tables replaced by HTML tables
    """
    # Header line, optional separator line, then data rows; each line may
    # be preceded by spaces/tabs.
    table_pattern = (
        r"((?:[ \t]*\|[^\n]*\|[ \t]*\n)"
        r"(?:[ \t]*\|[-\s:]*\|[ \t]*\n)?"
        r"(?:[ \t]*\|[^\n]*\|[ \t]*\n?)*)"
    )

    def convert_md_table_to_html(match):
        table_text = match.group(1)

        # Strip per-line indentation and drop blank lines so parsing sees
        # bare "|...|" rows.
        lines = [line.strip() for line in table_text.strip().split("\n")]
        lines = [line for line in lines if line]

        if len(lines) < 2:
            # Not a valid table: return the span untouched (the previous
            # version returned it stripped, mangling surrounding whitespace).
            return table_text

        # Second line must be a separator of only |, -, :, and spaces.
        if not re.match(r"^\|[-:\s|]+\|$", lines[1]):
            return table_text  # Not a valid table

        # Parse header and data rows ("|a|b|" -> ["a", "b"]).
        header_cells = [cell.strip() for cell in lines[0].split("|")[1:-1]]
        data_rows = [
            [cell.strip() for cell in line.split("|")[1:-1]] for line in lines[2:]
        ]

        # Build HTML table
        html_parts = ["<table>"]
        if header_cells:
            html_parts.append("<thead>")
            html_parts.append("<tr>")
            html_parts.extend(f"<th>{cell}</th>" for cell in header_cells)
            html_parts.append("</tr>")
            html_parts.append("</thead>")
        if data_rows:
            html_parts.append("<tbody>")
            for row in data_rows:
                html_parts.append("<tr>")
                html_parts.extend(f"<td>{cell}</td>" for cell in row)
                html_parts.append("</tr>")
            html_parts.append("</tbody>")
        html_parts.append("</table>")
        return "".join(html_parts)

    # Replace all Markdown tables with HTML tables.
    return re.sub(table_pattern, convert_md_table_to_html, md_text)
127
+
128
+
129
+ if __name__ == "__main__":
130
+ md_text = """| Name | Age | City |
131
+ |------|-----|------|
132
+ | John | 25 | NYC |
133
+ | Jane | 30 | LA |"""
134
+
135
+ html_result = md_tables_to_html(md_text)
136
+ print(html_result)
@@ -0,0 +1,12 @@
1
def clean_response(text):
    """Strip markdown/html code-fence markers from a model response.

    Removes surrounding ``` fences and a leading "markdown"/"html"
    language tag, then trims whitespace. The previous chain included
    no-op ``removeprefix("")``/``removesuffix("")`` calls, now dropped;
    behavior is unchanged.
    """
    return (
        text.strip()
        .removeprefix("```")
        .removesuffix("```")
        .removeprefix("markdown")
        .removeprefix("html")
        .strip()
    )