unstructured-ingest 1.2.32 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/utils/compression.py
@@ -0,0 +1,72 @@
+ import os
+ import sys
+ import tarfile
+ import zipfile
+ from pathlib import Path
+ from typing import Optional
+
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
+
+ ZIP_FILE_EXT = [".zip"]
+ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
+
+
+ def uncompress_file(filename: str, path: Optional[str] = None) -> str:
+     """
+     Takes in a compressed zip or tar file and decompresses it
+     """
+     # Create path if it doesn't already exist
+     if path:
+         mkdir_concurrent_safe(Path(path))
+
+     if any(filename.endswith(ext) for ext in ZIP_FILE_EXT):
+         return uncompress_zip_file(zip_filename=filename, path=path)
+     elif any(filename.endswith(ext) for ext in TAR_FILE_EXT):
+         return uncompress_tar_file(tar_filename=filename, path=path)
+     else:
+         raise ValueError(
+             "filename {} not a recognized compressed extension: {}".format(
+                 filename,
+                 ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT),
+             ),
+         )
+
+
+ def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
+     head, tail = os.path.split(zip_filename)
+     for ext in ZIP_FILE_EXT:
+         if tail.endswith(ext):
+             tail = tail[: -(len(ext))]
+             break
+     path = path if path else os.path.join(head, f"{tail}-zip-uncompressed")
+     logger.info(f"extracting zip {zip_filename} -> {path}")
+     with zipfile.ZipFile(zip_filename) as zfile:
+         zfile.extractall(path=path)
+     return path
+
+
+ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
+     head, tail = os.path.split(tar_filename)
+     for ext in TAR_FILE_EXT:
+         if tail.endswith(ext):
+             tail = tail[: -(len(ext))]
+             break
+
+     path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
+     logger.info(f"extracting tar {tar_filename} -> {path}")
+     # NOTE: "r:*" mode opens both compressed (e.g. ".tar.gz") and uncompressed ".tar" archives
+     with tarfile.open(tar_filename, "r:*") as tfile:
+         # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
+         # This was added in Python 3.12
+         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
+         if sys.version_info >= (3, 12):
+             tfile.extraction_filter = tarfile.tar_filter
+         else:
+             logger.warning(
+                 "Extraction filtering for tar files is available for Python 3.12 and above. "
+                 "Consider upgrading your Python version to improve security. "
+                 "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
+             )
+         tfile.extractall(path=path)
+     return path
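
For context, a minimal usage sketch of uncompress_file follows; the archive paths and destination directory are hypothetical:

    from unstructured_ingest.utils.compression import uncompress_file

    # With no explicit path, extraction lands next to the archive,
    # e.g. /tmp/reports.zip -> /tmp/reports-zip-uncompressed
    extracted = uncompress_file(filename="/tmp/reports.zip")

    # An explicit destination is created (concurrent-safely) if missing
    extracted = uncompress_file(filename="/tmp/logs.tar.gz", path="/tmp/logs-out")
    print(extracted)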
unstructured_ingest/utils/constants.py
@@ -0,0 +1,2 @@
+ # Used to append to metadata for uploaders that store element-level data
+ RECORD_ID_LABEL = "record_id"
unstructured_ingest/utils/data_prep.py
@@ -0,0 +1,216 @@
+ import itertools
+ import json
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from unstructured_ingest.data_types.file_data import FileData
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils import ndjson
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from pandas import DataFrame
+
+ DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
+
+ T = TypeVar("T")
+ IterableT = Iterable[T]
+
+
+ def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
+     num_chunks = len(df) // chunk_size + 1
+     for i in range(num_chunks):
+         yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
+ def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
+     """A helper function to break an iterable into batches of size batch_size."""
+     it = iter(iterable)
+     chunk = tuple(itertools.islice(it, batch_size))
+     while chunk:
+         yield chunk
+         chunk = tuple(itertools.islice(it, batch_size))
+
+
+ def generator_batching_wbytes(
+     iterable: IterableT,
+     batch_size_limit_bytes: Optional[int] = None,
+     max_batch_size: Optional[int] = None,
+ ) -> IterableT:
+     """A helper function to break an iterable into chunks of specified bytes."""
+     if not batch_size_limit_bytes and not max_batch_size:
+         # NOTE: a bare `return iterable` inside a generator would yield nothing;
+         # with no limits configured, emit the whole iterable as a single batch.
+         yield list(iterable)
+         return
+     current_batch, current_batch_size = [], 0
+
+     for item in iterable:
+         item_size_bytes = len(json.dumps(item).encode("utf-8"))
+         if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+             yield current_batch
+             current_batch, current_batch_size = [item], item_size_bytes
+             continue
+         if max_batch_size and len(current_batch) + 1 > max_batch_size:
+             yield current_batch
+             current_batch, current_batch_size = [item], item_size_bytes
+             continue
+
+         current_batch.append(item)
+         current_batch_size += item_size_bytes
+
+     if current_batch:
+         yield current_batch
+
+
+ def flatten_dict(
+     dictionary: dict[str, Any],
+     parent_key: str = "",
+     separator: str = "_",
+     flatten_lists: bool = False,
+     remove_none: bool = False,
+     keys_to_omit: Optional[Sequence[str]] = None,
+ ) -> dict[str, Any]:
+     """Flattens a nested dictionary into a single level dictionary.
+
+     keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as
+     {parent_key}{separator}{key}. If flatten_lists is True, then lists and tuples are flattened as
+     well. If remove_none is True, then None keys/values are removed from the flattened
+     dictionary.
+     """
+     keys_to_omit = keys_to_omit if keys_to_omit else []
+     flattened_dict: dict[str, Any] = {}
+     for key, value in dictionary.items():
+         new_key = f"{parent_key}{separator}{key}" if parent_key else key
+         if new_key in keys_to_omit:
+             flattened_dict[new_key] = value
+         elif value is None and remove_none:
+             continue
+         elif isinstance(value, dict):
+             value = cast("dict[str, Any]", value)
+             flattened_dict.update(
+                 flatten_dict(
+                     value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
+                 ),
+             )
+         elif isinstance(value, (list, tuple)) and flatten_lists:
+             value = cast("list[Any] | tuple[Any]", value)
+             for index, item in enumerate(value):
+                 flattened_dict.update(
+                     flatten_dict(
+                         {f"{new_key}{separator}{index}": item},
+                         "",
+                         separator,
+                         flatten_lists,
+                         remove_none,
+                         keys_to_omit=keys_to_omit,
+                     )
+                 )
+         else:
+             flattened_dict[new_key] = value
+
+     return flattened_dict
+
+
+ def validate_date_args(date: Optional[str] = None) -> bool:
+     """Validate whether the provided date string satisfies any of the supported date formats.
+
+     Used by unstructured/ingest/connector/biomed.py
+
+     Returns `True` if the date string satisfies any of the supported formats, otherwise raises
+     `ValueError`.
+
+     Supported Date Formats:
+         - 'YYYY-MM-DD'
+         - 'YYYY-MM-DDTHH:MM:SS'
+         - 'YYYY-MM-DD+HH:MM:SS'
+         - 'YYYY-MM-DDTHH:MM:SS±HHMM'
+     """
+     if not date:
+         raise ValueError("The argument date is None.")
+
+     for fmt in DATE_FORMATS:
+         try:
+             datetime.strptime(date, fmt)
+             return True
+         except ValueError:
+             pass
+
+     raise ValueError(
+         f"The argument {date} does not satisfy the format:"
+         " YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
+     )
+
+
+ def get_data_by_suffix(path: Path) -> list[dict]:
+     with path.open() as f:
+         if path.suffix == ".json":
+             return json.load(f)
+         elif path.suffix == ".ndjson":
+             return ndjson.load(f)
+         elif path.suffix == ".csv":
+             import pandas as pd
+
+             df = pd.read_csv(path)
+             return df.to_dict(orient="records")
+         elif path.suffix == ".parquet":
+             import pandas as pd
+
+             df = pd.read_parquet(path)
+             return df.to_dict(orient="records")
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
+
+
+ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+     with path.open("w") as f:
+         if path.suffix == ".json":
+             json.dump(data, f, indent=indent, ensure_ascii=False)
+         elif path.suffix == ".ndjson":
+             ndjson.dump(data, f, ensure_ascii=False)
+         else:
+             raise IOError(f"Unsupported file type: {path}")
+
+
+ def get_json_data(path: Path) -> list[dict]:
+     with path.open() as f:
+         # Attempt by suffix
+         if path.suffix == ".json":
+             return json.load(f)
+         elif path.suffix == ".ndjson":
+             return ndjson.load(f)
+         try:
+             return json.load(f)
+         except Exception as e:
+             logger.warning(f"failed to read {path} as json: {e}")
+         try:
+             f.seek(0)  # rewind after the failed json attempt before retrying as ndjson
+             return ndjson.load(f)
+         except Exception as e:
+             logger.warning(f"failed to read {path} as ndjson: {e}")
+     raise ValueError(f"Unsupported json file: {path}")
+
+
+ @requires_dependencies(["pandas"])
+ def get_data_df(path: Path) -> "DataFrame":
+     import pandas as pd
+
+     with path.open() as f:
+         if path.suffix == ".json":
+             data = json.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".ndjson":
+             data = ndjson.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".csv":
+             df = pd.read_csv(path)
+             return df
+         elif path.suffix == ".parquet":
+             df = pd.read_parquet(path)
+             return df
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
+
+
+ def get_enhanced_element_id(element_dict: dict, file_data: FileData) -> str:
+     element_id = element_dict.get("element_id")
+     new_data = f"{element_id}{file_data.identifier}"
+     return str(uuid5(NAMESPACE_DNS, new_data))
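
The batching helpers and flatten_dict are easiest to read with concrete values; a short sketch with illustrative data:

    from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict

    # Nested dicts collapse to separator-joined keys; with flatten_lists=True,
    # list items are keyed by index.
    nested = {"metadata": {"page": 1, "tags": ["a", "b"]}}
    print(flatten_dict(nested, flatten_lists=True))
    # {'metadata_page': 1, 'metadata_tags_0': 'a', 'metadata_tags_1': 'b'}

    # batch_generator yields tuples of at most batch_size items.
    for batch in batch_generator(range(5), batch_size=2):
        print(batch)  # (0, 1) then (2, 3) then (4,)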
unstructured_ingest/utils/dep_check.py
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ import asyncio
+ import importlib
+ from functools import wraps
+ from typing import (
+     Callable,
+     List,
+     Optional,
+     TypeVar,
+ )
+
+ from typing_extensions import ParamSpec
+
+ _T = TypeVar("_T")
+ _P = ParamSpec("_P")
+
+
+ def requires_dependencies(
+     dependencies: str | list[str],
+     extras: Optional[str] = None,
+ ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+     """Decorator ensuring required modules are installed.
+
+     Use on functions with local imports to ensure required modules are available and log
+     an installation instruction if they're not.
+
+     Args:
+         dependencies: Name(s) of module(s) required by the decorated function.
+         extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+     Raises:
+         ImportError: When at least one of the `dependencies` is not available.
+     """
+     if isinstance(dependencies, str):
+         dependencies = [dependencies]
+
+     def decorator(func: Callable[_P, _T]) -> Callable[_P, _T]:
+         def run_check():
+             missing_deps: List[str] = []
+             for dep in dependencies:
+                 if not dependency_exists(dep):
+                     missing_deps.append(dep)
+             if len(missing_deps) > 0:
+                 raise ImportError(
+                     f"Following dependencies are missing: {', '.join(missing_deps)}. "
+                     + (
+                         f"""Please install them using `pip install "unstructured-ingest[{extras}]"`."""  # noqa: E501
+                         if extras
+                         else f"Please install them using `pip install {' '.join(missing_deps)}`."
+                     ),
+                 )
+
+         @wraps(func)
+         def wrapper(*args: _P.args, **kwargs: _P.kwargs):
+             run_check()
+             return func(*args, **kwargs)
+
+         @wraps(func)
+         async def wrapper_async(*args: _P.args, **kwargs: _P.kwargs):
+             run_check()
+             return await func(*args, **kwargs)
+
+         if asyncio.iscoroutinefunction(func):
+             return wrapper_async
+         return wrapper
+
+     return decorator
+
+
+ def dependency_exists(dependency: str):
+     try:
+         importlib.import_module(dependency)
+     except ImportError as e:
+         # Check to make sure this isn't some unrelated import error.
+         if dependency in repr(e):
+             return False
+     return True
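
A sketch of how a caller might guard an optional import with this decorator; the function name and the "pandas" extra below are illustrative, not necessarily a real extra of this package:

    from unstructured_ingest.utils.dep_check import requires_dependencies

    @requires_dependencies(["pandas"], extras="pandas")
    def load_table(path: str):
        # Safe: the decorator has already verified the module is importable
        import pandas as pd

        return pd.read_csv(path)

    # If pandas is absent, calling load_table(...) raises ImportError suggesting
    # pip install "unstructured-ingest[pandas]".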
unstructured_ingest/utils/filesystem.py
@@ -0,0 +1,27 @@
+ """
+ Filesystem utilities for concurrent operations.
+
+ This module provides race-condition-safe filesystem operations that are needed
+ when multiple processes operate on the same directory structures simultaneously.
+ """
+
+ from pathlib import Path
+
+
+ def mkdir_concurrent_safe(path: Path) -> None:
+     """
+     Create directory safely in concurrent environments, handling race conditions.
+
+     This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
+     raise FileExistsError when multiple processes attempt to create overlapping
+     directory structures simultaneously. In this codebase, this occurs when multiple
+     files are being downloaded in parallel and archive extraction is happening in parallel.
+
+     Related: https://github.com/python/cpython/pull/112966/files
+     Python core team used the same approach to fix zipfile race conditions.
+     """
+     try:
+         path.mkdir(parents=True, exist_ok=True)
+     except FileExistsError:
+         if not (path.exists() and path.is_dir()):
+             raise
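
A minimal sketch of the race this guards against, with hypothetical paths: several workers creating overlapping directory trees at once can trip FileExistsError out of Path.mkdir even with exist_ok=True.

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe

    target = Path("/tmp/downloads/archive/nested")

    # All four workers race to create the same parents; the helper treats a
    # losing race as success as long as the directory ends up existing.
    with ProcessPoolExecutor(max_workers=4) as pool:
        for _ in range(4):
            pool.submit(mkdir_concurrent_safe, target)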
unstructured_ingest/utils/html.py
@@ -0,0 +1,174 @@
+ import base64
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Optional
+ from urllib.parse import urlparse
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from pydantic import BaseModel, Field
+
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.interfaces import DownloadResponse
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from bs4 import BeautifulSoup
+     from bs4.element import Tag
+     from requests import Session
+
+
+ class HtmlMixin(BaseModel):
+     extract_images: bool = Field(
+         default=False,
+         description="if true, will download images and replace "
+         "the html content with base64 encoded images",
+     )
+     extract_files: bool = Field(
+         default=False, description="if true, will download any embedded files"
+     )
+     force_download: bool = Field(
+         default=False,
+         description="if true, will redownload extracted files even if they already exist locally",
+     )
+     allow_list: Optional[list[str]] = Field(
+         default=None,
+         description="list of allowed urls to download, if not set, "
+         "will default to the base url the original HTML came from",
+     )
+
+     @requires_dependencies(["requests"])
+     def get_default_session(self) -> "Session":
+         import requests
+
+         return requests.Session()
+
+     def get_absolute_url(self, tag_link: str, url: str) -> str:
+         parsed_url = urlparse(url)
+         base_url = parsed_url.scheme + "://" + parsed_url.netloc
+         if tag_link.startswith("//"):
+             return f"{parsed_url.scheme}:{tag_link}"
+         elif tag_link.startswith("http"):
+             return tag_link
+         else:
+             tag_link = tag_link.lstrip("/")
+             return f"{base_url}/{tag_link}"
+
+     def download_content(self, url: str, session: "Session") -> bytes:
+         response = session.get(url)
+         response.raise_for_status()
+         return response.content
+
+     def can_download(self, url_to_download: str, original_url: str) -> bool:
+         parsed_original_url = urlparse(original_url)
+         base_url = parsed_original_url.scheme + "://" + parsed_original_url.netloc
+         allow_list = self.allow_list or [base_url]
+         for allowed_url in allow_list:
+             if url_to_download.startswith(allowed_url):
+                 return True
+         logger.info(f"Skipping url because it does not match the allow list: {url_to_download}")
+         return False
+
+     def extract_image_src(self, image: "Tag", url: str, session: "Session") -> "Tag":
+         current_src = image["src"]
+         if current_src.startswith("data:image/png;base64"):
+             # already base64 encoded
+             return image
+         absolute_url = self.get_absolute_url(tag_link=image["src"], url=url)
+         if not self.can_download(url_to_download=absolute_url, original_url=url):
+             return image
+         image_content = self.download_content(url=absolute_url, session=session)
+         logger.debug("img tag having src updated from {} to base64 content".format(image["src"]))
+         image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+         return image
+
+     @requires_dependencies(["bs4"])
+     def extract_html_images(self, url: str, html: str, session: Optional["Session"] = None) -> str:
+         from bs4 import BeautifulSoup
+
+         session = session or self.get_default_session()
+         soup = BeautifulSoup(html, "html.parser")
+         images = soup.find_all("img")
+         for image in images:
+             self.extract_image_src(image=image, url=url, session=session)
+         return str(soup)
+
+     @requires_dependencies(["bs4"])
+     def get_hrefs(self, url: str, html: str) -> list:
+         from bs4 import BeautifulSoup
+
+         soup = BeautifulSoup(html, "html.parser")
+         tags = self._find_hyperlink_tags(soup)
+         hrefs = [
+             tag["href"]
+             for tag in tags
+             if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+         ]
+         absolute_urls = [self.get_absolute_url(tag_link=href, url=url) for href in hrefs]
+         allowed_urls = [
+             url_to_download
+             for url_to_download in absolute_urls
+             if self.can_download(url_to_download=url_to_download, original_url=url)
+         ]
+         return allowed_urls
+
+     def write_content(self, content: bytes, path: Path) -> None:
+         if path.exists() and path.is_file() and not self.force_download:
+             return
+         if not path.parent.exists():
+             path.parent.mkdir(parents=True)
+         with path.open("wb") as f:
+             f.write(content)
+
+     def get_download_response(
+         self, url: str, download_dir: Path, file_data: FileData, session: "Session"
+     ) -> DownloadResponse:
+         filename = Path(urlparse(url=url).path).name
+         download_path = download_dir / filename
+         self.write_content(
+             content=self.download_content(url=url, session=session), path=download_path
+         )
+         result_file_data = file_data.model_copy(deep=True)
+         result_file_data.metadata.url = url
+         result_file_data.display_name = filename
+         if result_file_data.metadata.record_locator is None:
+             result_file_data.metadata.record_locator = {}
+         result_file_data.metadata.record_locator["parent_url"] = url
+         result_file_data.identifier = str(uuid5(NAMESPACE_DNS, url + file_data.identifier))
+         filename = Path(urlparse(url=url).path).name
+         result_file_data.source_identifiers = SourceIdentifiers(
+             filename=filename, fullpath=filename
+         )
+         result_file_data.local_download_path = download_path.as_posix()
+         return DownloadResponse(file_data=result_file_data, path=download_path)
+
+     def extract_embedded_files(
+         self,
+         url: str,
+         html: str,
+         download_dir: Path,
+         original_filedata: FileData,
+         session: Optional["Session"] = None,
+     ) -> list[DownloadResponse]:
+         session = session or self.get_default_session()
+         urls_to_download = self.get_hrefs(url=url, html=html)
+         return [
+             self.get_download_response(
+                 url=url_to_download,
+                 download_dir=download_dir,
+                 file_data=original_filedata,
+                 session=session,
+             )
+             for url_to_download in urls_to_download
+         ]
+
+     @requires_dependencies(["bs4"])
+     def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
+         """Find hyperlink tags in the HTML.
+
+         Overwrite this method to customize the tag search.
+         """
+         from bs4.element import Tag
+
+         return [
+             element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
+         ]
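
The URL-resolution rules in get_absolute_url and can_download can be exercised without any network access; the URLs below are illustrative:

    from unstructured_ingest.utils.html import HtmlMixin

    mixin = HtmlMixin()
    base = "https://example.com/docs/index.html"

    # Relative, protocol-relative, and absolute links all resolve against the host
    print(mixin.get_absolute_url("assets/logo.png", base))         # https://example.com/assets/logo.png
    print(mixin.get_absolute_url("//cdn.example.com/x.js", base))  # https://cdn.example.com/x.js

    # With no allow_list configured, only URLs under the original host pass
    print(mixin.can_download("https://example.com/a.pdf", base))   # True
    print(mixin.can_download("https://other.com/a.pdf", base))     # False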
unstructured_ingest/utils/ndjson.py
@@ -0,0 +1,52 @@
+ import json
+ from typing import IO, Any
+
+
+ def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+     return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+ def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+     # Indent breaks ndjson formatting
+     kwargs["indent"] = None
+     text = dumps(obj, **kwargs)
+     fp.write(text)
+
+
+ def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+     return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+ def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+     return loads(fp.read(), **kwargs)
+
+
+ class writer(object):
+     def __init__(self, f, **kwargs):
+         self.f = f
+         self.kwargs = kwargs
+
+     def write(self, row):
+         stringified = json.dumps(row, **self.kwargs)
+         self.f.write(stringified + "\n")
+
+
+ class reader(object):
+     def __init__(self, f, **kwargs):
+         self.f = f
+         self.kwargs = kwargs
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         line = ""
+
+         while line == "":
+             line = next(self.f).strip()
+
+         return json.loads(line, **self.kwargs)
+
+     # NOTE: this is necessary to comply with py27
+     def next(self):
+         return self.__next__()
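
A quick round trip through an in-memory buffer shows the format: one JSON object per line, with no trailing newline from dump:

    import io

    from unstructured_ingest.utils import ndjson

    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

    buf = io.StringIO()
    ndjson.dump(records, buf)
    assert buf.getvalue() == '{"id": 1, "text": "hello"}\n{"id": 2, "text": "world"}'

    buf.seek(0)
    assert ndjson.load(buf) == records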
unstructured_ingest/utils/pydantic_models.py
@@ -0,0 +1,52 @@
+ import json
+ from datetime import datetime
+ from inspect import isclass
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel
+ from pydantic.types import _SecretBase
+
+
+ def is_secret(value: Any) -> bool:
+     # Case Secret[int]
+     if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+         origin = value.__origin__
+         return isclass(origin) and issubclass(origin, _SecretBase)
+     # Case SecretStr
+     return isclass(value) and issubclass(value, _SecretBase)
+
+
+ def serialize_base_model(model: BaseModel) -> dict:
+     # To get the full serialized dict regardless of if values are marked as Secret
+     model_dict = model.model_dump()
+     return serialize_base_dict(model_dict=model_dict)
+
+
+ def serialize_base_dict(model_dict: dict) -> dict:
+     model_dict = model_dict.copy()
+     for k, v in model_dict.items():
+         if isinstance(v, _SecretBase):
+             secret_value = v.get_secret_value()
+             if isinstance(secret_value, BaseModel):
+                 model_dict[k] = serialize_base_model(model=secret_value)
+             else:
+                 model_dict[k] = secret_value
+         if isinstance(v, dict):
+             model_dict[k] = serialize_base_dict(model_dict=v)
+
+     return model_dict
+
+
+ def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
+     model_dict = serialize_base_model(model=model)
+
+     def json_serial(obj):
+         if isinstance(obj, Path):
+             return obj.as_posix()
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         raise TypeError("Type %s not serializable" % type(obj))
+
+     # Support json dumps kwargs such as sort_keys
+     return json.dumps(model_dict, default=json_serial, **json_kwargs)
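
A sketch of what these helpers buy over a plain model_dump; the AccessConfig model here is hypothetical:

    from pydantic import BaseModel, SecretStr

    from unstructured_ingest.utils.pydantic_models import (
        serialize_base_model,
        serialize_base_model_json,
    )

    class AccessConfig(BaseModel):  # hypothetical config model
        api_key: SecretStr

    config = AccessConfig(api_key=SecretStr("sk-123"))
    print(config.model_dump())           # {'api_key': SecretStr('**********')}
    print(serialize_base_model(config))  # {'api_key': 'sk-123'}
    print(serialize_base_model_json(config, sort_keys=True))  # {"api_key": "sk-123"}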