PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show

helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py ADDED Viewed

@@ -0,0 +1,347 @@
+from typing import Optional, Tuple, List, Dict, Any
+import io
+import os
+import re
+from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
+try:
+    from latex import build_pdf
+    from pdf2image import convert_from_bytes
+    from PIL import ImageOps
+    from PIL.Image import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["image2structure"])
+# LaTeX preamble
+# Make sure to install "latex-full".
+# NOTE: this string is intentionally left byte-for-byte unchanged (including the repeated graphicx/amsmath
+# lines) because other code in this module looks for it as an exact substring of the LaTeX source.
+TEX_INCLUDES = r"""
+\usepackage{amsmath,amssymb,amsfonts}
+\usepackage{graphicx}
+\usepackage{graphicx}
+\usepackage{amsmath}
+\usepackage{xcolor}
+\usepackage{algorithm}
+\usepackage{algorithmicx}
+\usepackage{algpseudocode}
+\usepackage{listings}
+\usepackage{stfloats}
+\usepackage{epstopdf}
+\usepackage{pgfplots}
+\usepackage{tikz}
+\usepackage{tikz-cd}
+\usepackage{tikz-qtree}
+\usepackage{tikz-dependency}
+\usepackage{tikz-3dplot}
+\usepackage{tikz-network}
+\usepackage[flushleft]{threeparttable}
+\usepackage{adjustbox}
+"""
+# LaTeX delimiters (each defined exactly once)
+TEX_BEGIN_FILE = r"""\documentclass{article}"""
+TEX_BEGIN_DOCUMENT = r"""\begin{document}"""
+TEX_END_DOCUMENT = r"""\end{document}"""
+# Number of times to try to fix the LaTeX code
+MAX_NUM_TRIES: int = 3
+# Pairs of (numbered environment, unnumbered starred counterpart), applied as plain string replacements.
+TEX_REPLACE_NUMBERING: List[Tuple[str, str]] = [
+    ("{equation}", "{equation*}"),
+    ("{align}", "{align*}"),
+    ("{alignat}", "{alignat*}"),
+    ("{gather}", "{gather*}"),
+    ("{flalign}", "{flalign*}"),
+    ("{multline}", "{multline*}"),
+    ("{eqnarray}", "{eqnarray*}"),
+    ("{subeqnarray}", "{subeqnarray*}"),
+    ("{aligneq}", "{aligneq*}"),
+]
+def latex_to_pdf(latex_code: str, assets_path: str) -> io.BytesIO:
+    """Compile LaTeX source and return the resulting PDF as an in-memory byte stream.
+    Args:
+        latex_code (str): The LaTeX document to compile.
+        assets_path (str): Path joined onto this module's directory and passed to the builder as a TeX input path.
+    Returns:
+        io.BytesIO: A stream over the bytes of the compiled PDF.
+    """
+    module_dir = os.path.abspath(os.path.dirname(__file__))
+    assets_dir = os.path.join(module_dir, assets_path)
+    # The empty second entry is forwarded as-is; presumably it keeps the default TeX search path (TODO confirm)
+    compiled = build_pdf(latex_code, texinputs=[assets_dir, ""])
+    return io.BytesIO(compiled.data)
+def pdf_to_image(
+    pdf_stream: io.BytesIO,
+    crop: bool = False,
+    resize_to: Optional[Tuple[int, int]] = None,
+) -> Image:
+    """Render the first page of a PDF stream as an image.
+    Args:
+        pdf_stream (io.BytesIO): Stream holding the PDF bytes; it is read from its current position.
+        crop (bool, optional): If True, drop the bottom 20% of the page and then trim to the
+            bounding box of the inverted image. Defaults to False.
+        resize_to (Optional[Tuple[int, int]], optional): Size to resize the final image to. Defaults to None.
+    Returns:
+        Image: The rendered (and optionally cropped / resized) first page.
+    Raises:
+        Exception: If the conversion yields no page.
+    """
+    pages = convert_from_bytes(pdf_stream.read(), first_page=1, last_page=1)
+    if not pages:
+        raise Exception("PDF to Image conversion failed")
+    page = pages[0]
+    if crop:
+        width, height = page.size
+        # Cut off the bottom fifth of the page, where the page number is printed
+        page = page.crop((0, 0, width, height - int(height * 0.2)))
+        # On the inverted image the white margin becomes black, so getbbox() bounds the actual content
+        content_box = ImageOps.invert(page).getbbox()
+        page = page.crop(content_box)
+    if resize_to:
+        page = page.resize(resize_to)
+    return page
+def strip_unnecessary_latex_parts(latex_code: str) -> str:
+    """Strip unnecessary parts of the LaTeX code and normalize its whitespace.
+    Comments, the documentclass line, usepackage lines and the begin/end document markers are removed,
+    then spaces, tabs and empty lines are collapsed.
+    Args:
+        latex_code (str): The LaTeX code to minimize.
+    Returns:
+        str: The minimized LaTeX code, without leading or trailing whitespace.
+    """
+    # Remove comments up to (but not including) the end of the line.
+    # A "%" escaped as "\%" is literal text and is kept; a comment on a last line without "\n" is removed too.
+    minimal_latex_code = re.sub(r"(?<!\\)%[^\n]*", "", latex_code)
+    # Remove \documentclass and any \usepackage lines
+    # (this must continue from minimal_latex_code, otherwise the comment removal above is thrown away)
+    minimal_latex_code = re.sub(r"\\documentclass\{.*?\}\n", "", minimal_latex_code)
+    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}\n", "", minimal_latex_code)
+    # Remove the \begin{document} marker (and the newlines right after it),
+    # then \end{document} together with everything after it
+    minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
+    minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)
+    # Ensure \begin{...} is followed by a \n
+    minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
+    # Ensure \end{...} is followed by a \n
+    minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)
+    # Normalize space sequences to a single space globally
+    minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
+    # Replace tabs with a single space
+    minimal_latex_code = re.sub(r"\t", " ", minimal_latex_code)
+    # Remove leading and trailing spaces on each line
+    minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
+    # Remove unnecessary whitespace - multiple empty lines and tabulations
+    minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)
+    return minimal_latex_code.strip()
+def handle_latex_error(
+    e: Exception,
+    original_latex_code: str,
+    assets_path: str,
+    crop: bool,
+    resize_to: Optional[Tuple[int, int]],
+    num_try_remaining: int,
+) -> Tuple[Image, Dict[str, Any]]:
+    """Decide whether a failed LaTeX compilation is fatal or can be retried with patched code.
+    Errors that stem from the generated LaTeX itself are re-raised immediately. A small set of
+    errors (missing math mode, undefined environment) is patched and compiled again through
+    latex_to_image, as long as tries remain.
+    Args:
+        e (Exception): The exception raised by the failed compilation.
+        original_latex_code (str): The LaTeX code that failed to compile.
+        assets_path (str): Forwarded to latex_to_image on retry.
+        crop (bool): Forwarded to latex_to_image on retry.
+        resize_to (Optional[Tuple[int, int]]): Forwarded to latex_to_image on retry.
+        num_try_remaining (int): Number of retries still allowed.
+    Returns:
+        Tuple[Image, Dict[str, Any]]: The result of latex_to_image on the patched code.
+    Raises:
+        RuntimeError: If the error is not fixable, no patch changed the code, or no tries remain.
+    """
+    # Check for error that are caused by the original LaTeX code itself
+    # and should not be fixed by trying again with a different code
+    # TODO #2346: Make this list more exhaustive
+    # Newlines are removed so that a message wrapped over two log lines is still found
+    str_e: str = str(e).replace("\n", "")
+    # Source of the descriptions:
+    # - https://www.overleaf.com/learn/latex/Errors
+    # - https://tex.stackexchange.com/
+    for error_message in [
+        # This error occurs when LaTeX encounters an undefined control sequence
+        # Example: \blabla
+        r"""Undefined control sequence""",
+        # This error appears when you have forgotten to include an \item command.
+        # It can also appear from trying to use lists inside a table incorrectly.
+        # Example:
+        #     \begin{itemize}
+        #     First item without the \item command
+        #     \end{itemize}
+        r"""LaTeX Error: Lonely \item--perhaps a missing list environment.""",
+        # This error occurs when a { or } is missing.
+        # Example: \sum_{i=1 ^n
+        r"""Missing } inserted""",
+        r"""Missing { inserted""",
+        # This error occurs when LaTeX encounters a double subscript.
+        # Example: a_b_c
+        r"""Double subscript.""",
+        # This error occurs when an environment or $ is added around something that cannot be typeset
+        # in the given mode.
+        # Example:
+        #      $
+        #      \begin{table}
+        #      ...
+        #      \end{table}
+        #      $
+        r"""LaTeX Error: Not in outer par mode.""",
+        # This error occurs when LaTeX is typesetting a table and detects
+        # an alignment character ( & ) where it did not expect to find one
+        r"""Extra alignment tab has been changed to \cr.""",
+        # Missing control sequence other than $ (which is handled elsewhere).
+        # Example: \left( without
+        "Missing \\",
+        # LaTeX Error: \begin{<env>} on input line <line> ended by \end{<diff_env>}
+        # This error occurs when LaTeX encounters an environment that is not properly closed.
+        # Example:
+        #     \begin{table}
+        #     ...
+        #     \end{document}
+        r"""LaTeX Error: \begin{""",
+        # This error occurs when LaTeX encounters a \noalign command in the wrong place.
+        # Example:
+        #     \begin{tabular}
+        #     \noalign{\hrule}
+        #     ...
+        #     \end{tabular}
+        r"""Misplaced \noalign""",
+        # LaTeX Error: Command <command> already defined.
+        # This errors occurs when two packages define the same command.
+        # We cannot fix this as we would have to try to find the conflicting packages.
+        # Example:
+        #     \usepackage{algorithmic}
+        #     \usepackage{algorithmicx}
+        r""" already defined.""",
+    ]:
+        if error_message in str_e:
+            raise RuntimeError(str(e)) from e
+    if num_try_remaining > 0:
+        # Check if the error is easily fixable
+        fixed_code: str = original_latex_code
+        # Equation not in math mode
+        # We correct this error as the prompt might not be obvious if the output should be:
+        # <EQUATION_CODE> or $<EQUATION_CODE>$.
+        # We only handle this case and that is why we add the $ at the beginning and end of the equation.
+        # The missing $ might come from elsewhere but then, it is a problem of the generated code,
+        # and not some unclear instructions, so we do not handle it.
+        # Error format: "Missing $ inserted" or "<command> allowed only in math mode"
+        # Both checks use the newline-free message so that they behave the same way
+        if "Missing $ inserted" in str_e or " allowed only in math mode" in str_e:
+            # Only wrap the content after \begin{document} and before \end{document}
+            fixed_code = re.sub(
+                r"(?<=\\begin{document})(.*?)(?=\\end{document})",
+                r"$$\1$$",
+                fixed_code,
+                flags=re.DOTALL,
+            )  # Use display math ($$...$$) rather than a single $ to avoid inline mode
+        # Missing include
+        # Missing includes are tolerated as the prompt suggests that it is not necessary to include them,
+        # and our TEX_INCLUDES might lack some packages.
+        # Error format: "LaTeX Error: Environment <env> undefined."
+        # The capture is non-greedy: str_e is the whole log on one line, so a greedy match would
+        # run up to the LAST " undefined" of the log instead of stopping after the environment name.
+        undefined_search = re.search(r"LaTeX Error: Environment (.*?) undefined", str_e)
+        if undefined_search:
+            # If a package is missing and this is our first retry, then simply include TEX_INCLUDES
+            if num_try_remaining == MAX_NUM_TRIES:
+                fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+            if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
+                # Here we try to manually solve the missing environment.
+                # This is either executed on the second retry or the first if no changes
+                # were made in the first retry.
+                if TEX_INCLUDES not in fixed_code:
+                    # An earlier retry may have fixed something else (e.g. math mode) on code that brought
+                    # its own packages, in which case TEX_INCLUDES was never added. Add it now rather than
+                    # asserting, which would surface as an AssertionError instead of a RuntimeError.
+                    fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+                else:
+                    # TEX_INCLUDES is already present, so we add the missing package
+                    # Since we cannot know the name of the package that contains the missing environment,
+                    # we simply hope that they are named the same way.
+                    env_undefined: str = undefined_search.group(1)
+                    if f"\\usepackage{{{env_undefined}}}" in fixed_code:
+                        # We already tried to include the missing package, but it probably
+                        # does not exist, so we raise an error
+                        raise RuntimeError(str(e)) from e
+                    fixed_code = fixed_code.replace(
+                        TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n"
+                    )
+        # Try again with the fixed code (if the fixed code is different from the original code)
+        if fixed_code != original_latex_code:
+            return latex_to_image(
+                fixed_code,
+                assets_path=assets_path,
+                crop=crop,
+                resize_to=resize_to,
+                num_try_remaining=num_try_remaining - 1,
+            )
+    # TODO #2346: Ideally we should never reach this point
+    # All errors should be either detected as:
+    # - generation error: should not be fixed and raised
+    # - easily fixable: should be fixed and tried again
+    # If we reach this point, it means that none of the above cases were detected.
+    raise RuntimeError(str(e)) from e
+def latex_to_image(
+    original_latex_code: str,
+    assets_path: str,
+    crop: bool = False,
+    resize_to: Optional[Tuple[int, int]] = None,
+    num_try_remaining: int = MAX_NUM_TRIES,
+) -> Tuple[Image, Dict[str, Any]]:
+    """Convert a LaTeX code to an image.
+    Args:
+        original_latex_code (str): The LaTeX code to convert to an image.
+        assets_path (str): The path to the assets.
+        crop (bool, optional): Whether to crop the image. Defaults to False.
+        resize_to (Optional[Tuple[int, int]], optional): The size to resize the image to. Defaults to None.
+        num_try_remaining (int, optional): The number of tries remaining. Defaults to MAX_NUM_TRIES.
+    Returns:
+        image (Image): The image of the LaTeX code.
+        infos (Dict[str, Any]): a dictionary containing:
+            image_size (Tuple[int, int]): The size of the image.
+            latex_code (str): The modified LaTeX code that was successfully compiled.
+    Raises:
+        OptionalDependencyNotInstalled: If LaTeX is not installed.
+        RuntimeError: If the LaTeX code cannot be converted to an image.
+    """
+    # Basic LaTeX processing
+    # These changes cannot break the original LaTeX code
+    # Other processing will be done in the handle_latex_error function
+    # but these might break the original LaTeX code so they are only applied
+    # if the original LaTeX code does not compile.
+    # 0. Remove all environments that might cause numbering
+    # This is important because the numbering of the equations might change
+    # the bounding box of the image.
+    for numbered, unnumbered in TEX_REPLACE_NUMBERING:
+        original_latex_code = original_latex_code.replace(numbered, unnumbered)
+    # Also removes all \label commands
+    # If it is followed by a \n, it should be removed as well
+    original_latex_code = re.sub(r"\\label\{.*?\}[\t ]*(\n)?", "", original_latex_code)
+    # 1. Add begin/end document if not present
+    if TEX_BEGIN_DOCUMENT not in original_latex_code and TEX_BEGIN_FILE not in original_latex_code:
+        original_latex_code = TEX_BEGIN_DOCUMENT + original_latex_code
+    if TEX_END_DOCUMENT not in original_latex_code:
+        original_latex_code = original_latex_code + TEX_END_DOCUMENT
+    # 2. Add preamble
+    # 2.1. Replace \documentclass if present to make sure we use our own.
+    # The pattern accepts optional [options] (possibly spanning lines) and stops at the first closing
+    # brace, so it neither misses \documentclass[12pt]{article} nor swallows commands on the same line.
+    documentclass_pattern = r"\\documentclass(?:\[[^\]]*\])?\{[^}]*\}"
+    if re.search(documentclass_pattern, original_latex_code):
+        # A callable replacement keeps re.sub from interpreting the backslash of TEX_BEGIN_FILE as an escape
+        original_latex_code = re.sub(documentclass_pattern, lambda _match: TEX_BEGIN_FILE, original_latex_code)
+    else:
+        # If there is no \documentclass, we add our own
+        original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code
+    # 2.2. Add includes. In this first step, we only add includes if none are present.
+    # We do this because if some are present, we might define them twice which can cause errors
+    # and this section should not make the original LaTeX code fail if it was compilable.
+    # If there are missing packages, handle_latex_error will add TEX_INCLUDES right after \documentclass,
+    # which might define some packages twice, but often solves the problem.
+    # \usepackage[options]{...} counts as an existing include as well.
+    if not re.search(r"\\usepackage(\[.*?\])?\{.*?\}", original_latex_code):
+        original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+    latex_code: str = original_latex_code
+    try:
+        pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
+        image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
+        return image, {"image_size": image.size, "latex_code": latex_code}
+    except RuntimeError as e:
+        # This exact message is matched to tell "no LaTeX builder available" apart from compilation errors
+        if str(e) == "No available builder could be instantiated. Please make sure LaTeX is installed.":
+            raise OptionalDependencyNotInstalled(
+                "Optional dependency LaTeX is not installed. "
+                "Please install LaTeX and make sure it is available in your PATH. "
+                "You can install LaTeX on Ubuntu with `sudo apt-get install texlive-full`."
+            ) from e
+        else:
+            return handle_latex_error(e, original_latex_code, assets_path, crop, resize_to, num_try_remaining)
+    except Exception as e:
+        return handle_latex_error(e, original_latex_code, assets_path, crop, resize_to, num_try_remaining)

helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py ADDED Viewed

File without changes

helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py ADDED Viewed

@@ -0,0 +1,84 @@
+from typing import Tuple, Dict, Any
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from selenium import webdriver
+    import selenium.common.exceptions
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["image2structure"])
+def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:
+    """Create a headless Chrome WebDriver and load a page in it.
+    Args:
+        url (str): The URL to open. Usually "http://localhost:{port}".
+        resolution (tuple[int, int], optional): Window size passed to Chrome. Defaults to (1920, 1080).
+    Returns:
+        webdriver.Chrome: The Chrome WebDriver, with `url` already loaded.
+    """
+    chrome_flags = (
+        f"--window-size={resolution[0]},{resolution[1]}",
+        "--headless",  # Optional: run in headless mode
+        "--no-sandbox",  # Optional: for certain environments
+        "--disable-dev-shm-usage",  # Optional: overcome limited resource problems
+    )
+    chrome_options = webdriver.ChromeOptions()
+    for flag in chrome_flags:
+        chrome_options.add_argument(flag)
+    browser = webdriver.Chrome(options=chrome_options)
+    browser.get(url)
+    return browser
+def close_driver(driver: webdriver.Chrome):
+    """Shut down a WebDriver by calling its quit() method.
+    Args:
+        driver (webdriver.Chrome): The Chrome WebDriver to shut down
+    """
+    driver.quit()
+class ScreenshotOptions:
+    """Parameters used when taking a screenshot."""
+    # The resolution of the screenshot
+    resolution: Tuple[int, int] = (1920, 1080)
+    # The delay between each action in milliseconds
+    delay_between_each_action_ms: int = 1000
+def save_random_screenshot(path: str, port: int, options: ScreenshotOptions = ScreenshotOptions()) -> Dict[str, Any]:
+    """Save a screenshot of the page served at http://localhost:{port}
+    Args:
+        path (str): The path to save the screenshot
+        port (int): The port to use for the website.
+        options (ScreenshotOptions, optional): The options to use for taking the screenshot.
+            Defaults to ScreenshotOptions().
+    Returns:
+        infos (Dict[str, Any]): Additional information about the screenshot: the page source under "html"
+    Raises:
+        ValueError: If the path does not end with .png
+        Exception: If the driver cannot be initialized
+    """
+    if not path.endswith(".png"):
+        raise ValueError("The path should end with .png")
+    driver: webdriver.Chrome
+    try:
+        driver = init_driver(url=f"http://localhost:{port}", resolution=options.resolution)
+    except selenium.common.exceptions.WebDriverException as e:
+        # Chain the cause so the original traceback is not lost
+        raise Exception(f"Failed to initialize the driver: {e}") from e
+    except Exception as e:
+        raise Exception(f"An unknown error occurred while initializing the driver: {e}") from e
+    try:
+        # Extract the HTML of the page
+        html = driver.page_source
+        # Take a screenshot of the page
+        driver.save_screenshot(path)
+    finally:
+        # Always quit the browser, even if reading the page or saving the screenshot fails
+        close_driver(driver)
+    return {"html": html}

helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py ADDED Viewed

@@ -0,0 +1,182 @@
+import os
+import shlex
+import shutil
+import signal
+import socket
+import subprocess
+import threading
+import time
+from typing import Optional
+from helm.common.hierarchical_logger import hlog
+class JekyllServer:
+    """A class to start and stop a Jekyll server in a separate process."""
+    def __init__(self, repo_path: str, port: int, verbose: bool = False):
+        """Store the configuration; no process is started until start() is called.
+        Args:
+            repo_path (str): Path of the repository to serve.
+            port (int): Port the server should listen on.
+            verbose (bool, optional): Whether to log progress with hlog. Defaults to False.
+        """
+        self.repo_path: str = repo_path
+        self.verbose: bool = verbose
+        self.port: int = port
+        self.process: Optional[subprocess.Popen] = None
+        self.success: bool = False  # Shared flag to indicate if the server started successfully
+    def __del__(self):
+        # Stop our own process first, then free the port if something is still listening on it
+        self.stop()
+        if JekyllServer.is_port_in_use(self.port):
+            if self.verbose:
+                hlog(f"Port {self.port} is in use. Attempting to free it.")
+            self.kill_process_using_port(self.port)
+        if self.verbose:
+            hlog("JekyllServer object deleted.")
+    def setup_gemfile(self):
+        """Make sure the repository has a Gemfile that mentions the jekyll gem."""
+        gemfile_path = os.path.join(self.repo_path, "Gemfile")
+        # Check if Gemfile exists, if not, copy Gemfile.default to Gemfile
+        if not os.path.exists(gemfile_path):
+            default_gemfile_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "Gemfile.default")
+            # Copy without going through a shell, so paths containing spaces or metacharacters are safe
+            shutil.copyfile(default_gemfile_path, gemfile_path)
+            if self.verbose:
+                hlog("Copied Gemfile.default to Gemfile")
+            return
+        # Gemfile exists, check if it has the jekyll gem
+        with open(gemfile_path) as file:
+            gemfile_content = file.read()
+        if "jekyll" in gemfile_content:
+            # TODO: figure out if we need to do anything here
+            return
+        # Gemfile exists, but doesn't have jekyll gem
+        with open(gemfile_path, "a") as file:
+            # Start on a fresh line so the gem is not glued onto an unterminated last line
+            if gemfile_content and not gemfile_content.endswith("\n"):
+                file.write("\n")
+            file.write('gem "jekyll", "~> 4.3.3"\n')
+            if self.verbose:
+                hlog("Added jekyll gem to Gemfile")
+    def setup_config(self):
+        """Make sure the repository has a _config.yml whose "port:" line (if any) is set to self.port."""
+        config_path = os.path.join(self.repo_path, "_config.yml")
+        # Check if _config.yml exists, if not, copy _config.default.yml to _config.yml
+        if not os.path.exists(config_path):
+            default_config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "_config.default.yml")
+            shutil.copyfile(default_config_path, config_path)
+            if self.verbose:
+                hlog("Copied _config.default.yml to _config.yml")
+        # Search for line starting with "port:" and replace it with "port: <port>"
+        with open(config_path, "r") as file:
+            lines = file.readlines()
+        with open(config_path, "w") as file:
+            for line in lines:
+                # Match the key including its colon so that keys such as "portfolio:" are left alone
+                if line.startswith("port:"):
+                    file.write(f"port: {self.port}\n")
+                else:
+                    file.write(line)
+    @staticmethod
+    def is_port_in_use(port: int) -> bool:
+        """Check if a port is in use on localhost."""
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            # connect_ex returns 0 when the connection succeeds, i.e. something is listening
+            return s.connect_ex(("localhost", port)) == 0
+    def kill_process_using_port(self, port: int):
+        """Find and kill the process using the specified port."""
+        command = f"lsof -ti:{port} | grep '[0-9]' | xargs -r kill -9"
+        os.system(command)
+        if self.verbose:
+            hlog(f"Killed process using port {port}.")
+    def _signal_process_group(self, sig: int) -> None:
+        """Send `sig` to the process group of the server process; a group that no longer exists is ignored."""
+        if self.process is None:
+            return
+        try:
+            os.killpg(os.getpgid(self.process.pid), sig)
+        except ProcessLookupError:
+            # The process has already exited, so there is nothing left to signal
+            pass
+    def stream_output(self, process: subprocess.Popen):
+        """Read from stdout and stderr streams and hlog.
+        Sets self.success to True once the "Server running" line is seen on stdout, and to False
+        if stdout is exhausted and a line can be read from stderr. Returns in both cases.
+        """
+        assert process.stdout is not None
+        assert process.stderr is not None
+        while True:
+            output = process.stdout.readline()
+            if not output:
+                err = process.stderr.readline()
+                if err:
+                    decoded_line = err.decode("utf-8").strip()
+                    if self.verbose:
+                        hlog(f"\t> \033[91mStderr: {decoded_line}\033[0m")
+                    self.success = False
+                    break
+                else:
+                    # No more output
+                    break
+            else:
+                decoded_line = output.decode("utf-8").strip()
+                if self.verbose:
+                    hlog(f"\t> Stdout: {decoded_line}")
+                if "Server running... press ctrl-c to stop." in decoded_line:
+                    self.success = True
+                    break
+    def start(self, timeout: int = 30) -> bool:
+        """Start the Jekyll server in a separate process and monitor the output.
+        Args:
+            timeout (int, optional): Seconds to wait for the server to report that it is running. Defaults to 30.
+        Returns:
+            bool: True if the server reported a successful start, False otherwise.
+        """
+        if JekyllServer.is_port_in_use(self.port):
+            if self.verbose:
+                hlog(f"Port {self.port} is in use. Attempting to free it.")
+            self.kill_process_using_port(self.port)
+        self.setup_gemfile()
+        self.setup_config()
+        # The repository path is quoted because the commands are interpreted by a shell
+        quoted_repo_path = shlex.quote(self.repo_path)
+        command_install = f"cd {quoted_repo_path} && bundle install"
+        subprocess.run(command_install, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        command_serve = f"cd {quoted_repo_path} && bundle exec jekyll serve --port {self.port}"
+        self.process = subprocess.Popen(
+            command_serve,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            preexec_fn=os.setsid,  # New session, so the shell and its children can be signalled as one group
+        )
+        # Start thread to read output
+        output_thread = threading.Thread(target=self.stream_output, args=(self.process,))
+        output_thread.start()
+        # Wait for the thread to complete or timeout
+        output_thread.join(timeout=timeout)
+        if output_thread.is_alive():
+            # If the thread is still alive after the timeout, the server did not start
+            # successfully within the timeout period
+            hlog("Timeout reached without detecting server start.")
+            # Signal the whole group: terminating only the shell would leave its children running
+            # with the pipes open, and the reader thread joined below would never finish.
+            self._signal_process_group(signal.SIGTERM)
+            self.process.terminate()  # Terminate the process if it's still running
+            output_thread.join()  # Ensure the thread is cleaned up
+            return False
+        else:
+            if self.verbose:
+                if self.success:
+                    hlog("Jekyll server started successfully.")
+                else:
+                    hlog("Jekyll server failed to start.")
+            return self.success  # Return the success flag
+    def stop(self, timeout=5):
+        """Stop the Jekyll server and terminate the process with a timeout.
+        Args:
+            timeout (int, optional): Time to wait for the server to gracefully shut down. Defaults to 5 seconds.
+        """
+        if self.process:
+            try:
+                # Try to terminate the process group gracefully
+                self._signal_process_group(signal.SIGTERM)
+                self.process.terminate()
+                # Wait up to `timeout` seconds for process to terminate, checking periodically
+                for _ in range(timeout):
+                    if self.process.poll() is not None:  # Process has terminated
+                        break
+                    time.sleep(1)  # Wait a bit before checking again
+                else:
+                    # If the process is still alive after the timeout, kill it
+                    self._signal_process_group(signal.SIGKILL)
+                    self.process.kill()
+                    self.process.wait()  # Wait for process to be killed
+                    if self.verbose:
+                        hlog("Jekyll server forcefully stopped.")
+            except Exception as e:
+                if self.verbose:
+                    hlog(f"Error stopping the Jekyll server: {e}")
+            self.process = None
+            if self.verbose:
+                hlog("Jekyll server stopped.")
+        elif self.verbose:
+            hlog("Jekyll server is not running.")

helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py ADDED Viewed

@@ -0,0 +1,31 @@
+import re
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from html2text import HTML2Text
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["image2structure"])
+def convert_html_to_text(handler: HTML2Text, html: str) -> str:
+    """Turn HTML into whitespace-normalized plain text
+    Args:
+        handler (HTML2Text): The HTML2Text handler used for the conversion
+        html (str): The HTML to convert
+    Returns:
+        str: The converted text, without leading or trailing whitespace
+    """
+    # (pattern, replacement, flags) triples, applied in this order
+    cleanup_steps = (
+        (r" +", " ", 0),  # Normalize space sequences to a single space globally
+        (r"\t", " ", 0),  # Replace tabs with a single space
+        (r"^[ \t]+|[ \t]+$", "", re.MULTILINE),  # Remove leading and trailing spaces on each line
+        (r"\n\s*\n", "\n", 0),  # Remove unnecessary whitespace - multiple empty lines and tabulations
+    )
+    text: str = handler.handle(html)
+    for pattern, replacement, flags in cleanup_steps:
+        text = re.sub(pattern, replacement, text, flags=flags)
+    return text.strip()

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl