unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
- test/unit/v2/connectors/sql/test_sql.py +4 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -7
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/METADATA +23 -20
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/RECORD +35 -30
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/top_level.txt +0 -0

test/integration/connectors/sql/test_databricks_delta_tables.py

@@ -17,11 +17,11 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.logger import logger
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    CONNECTOR_TYPE,
-
-
-
-
-
+    DatabricksDeltaTablesAccessConfig,
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploader,
+    DatabricksDeltaTablesUploaderConfig,
+    DatabricksDeltaTablesUploadStager,
)

CATALOG = "utic-dev-tech-fixtures"

@@ -112,7 +112,7 @@ async def test_databricks_delta_tables_destination(
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
    )
-    stager =
+    stager = DatabricksDeltaTablesUploadStager()
    staged_path = stager.run(
        elements_filepath=upload_file,
        file_data=mock_file_data,

@@ -122,15 +122,15 @@ async def test_databricks_delta_tables_destination(

    assert staged_path.suffix == upload_file.suffix

-    uploader =
-        connection_config=
-            access_config=
+    uploader = DatabricksDeltaTablesUploader(
+        connection_config=DatabricksDeltaTablesConnectionConfig(
+            access_config=DatabricksDeltaTablesAccessConfig(
                token=env_data.access_token.get_secret_value()
            ),
            http_path=env_data.http_path,
            server_hostname=env_data.server_hostname,
        ),
-        upload_config=
+        upload_config=DatabricksDeltaTablesUploaderConfig(
            catalog=CATALOG, database="default", table_name=destination_table
        ),
    )

test/integration/connectors/utils/validation/equality.py

@@ -1,10 +1,11 @@
import json
from pathlib import Path

-import ndjson
from bs4 import BeautifulSoup
from deepdiff import DeepDiff

+from unstructured_ingest.utils import ndjson
+

def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    with expected_filepath.open() as f:

test/unit/v2/connectors/databricks/__init__.py

File without changes

test/unit/v2/connectors/databricks/test_volumes_table.py

@@ -0,0 +1,44 @@
+from pathlib import Path
+
+import pytest
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
+    DatabricksVolumeDeltaTableStager,
+)
+
+
+@pytest.fixture
+def stager():
+    return DatabricksVolumeDeltaTableStager()
+
+
+@pytest.mark.parametrize(
+    ("output_path", "called_output_path"),
+    [
+        (
+            Path("/fake/path/output"),
+            Path("/fake/path/output.json"),
+        ),
+        (
+            Path("/fake/path/output.ndjson"),
+            Path("/fake/path/output.json"),
+        ),
+    ],
+)
+def test_write_output(
+    mocker: MockerFixture,
+    stager: DatabricksVolumeDeltaTableStager,
+    output_path: Path,
+    called_output_path: Path,
+):
+    data = [{"key1": "value1", "key2": "value2"}]
+
+    mock_get_data = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
+        return_value=None,
+    )
+
+    stager.write_output(output_path, data)
+
+    mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)

test/unit/v2/connectors/sql/test_sql.py

@@ -47,7 +47,9 @@ def test_run_output_filename_suffix(
    mock_get_output_path = mocker.patch.object(
        SQLUploadStager, "get_output_path", return_value=output_dir / expected
    )
-    mock_write_output = mocker.patch
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )

    # Act
    result = mock_instance.run(

@@ -67,6 +69,6 @@ def test_run_output_filename_suffix(
    mock_conform_dataframe.assert_called_once()
    mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
    mock_write_output.assert_called_once_with(
-
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
    )
    assert result.name == expected

unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.2"  # pragma: no cover

unstructured_ingest/utils/data_prep.py

@@ -4,9 +4,9 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

-import ndjson
import pandas as pd

+from unstructured_ingest.utils import ndjson
from unstructured_ingest.v2.logger import logger

DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
    raise ValueError(f"Unsupported file type: {path}")


+def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
def get_data(path: Path) -> list[dict]:
    try:
        return get_data_by_suffix(path=path)

@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
    except Exception as e:
        logger.warning(f"failed to read {path} as parquet: {e}")

-    raise IOError(f"File could not be parsed: {path}")
-

def get_data_df(path: Path) -> pd.DataFrame:
    with path.open() as f:
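
For context, a minimal usage sketch of the new write_data helper (the paths and element dicts below are illustrative, not taken from the diff): it picks the serialization format from the output suffix and delegates .ndjson output to the bundled ndjson module.

from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data

elements = [{"type": "Title", "text": "Hello"}, {"type": "NarrativeText", "text": "World"}]

# ".json" writes a single indented JSON array (indent defaults to 2)
write_data(path=Path("/tmp/elements.json"), data=elements)

# ".ndjson" writes one JSON object per line via the bundled ndjson helper
write_data(path=Path("/tmp/elements.ndjson"), data=elements)

# any other suffix raises IOError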

unstructured_ingest/utils/html.py

@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
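
A hedged usage sketch for convert_image_tags (the URL and HTML below are made up): each <img> src is resolved against the page URL, fetched over HTTP, and inlined as a base64 data URI; failed downloads are logged and the tag is left untouched.

from unstructured_ingest.utils.html import convert_image_tags

page_html = '<p>Logo:</p><img src="/static/logo.png">'
inlined = convert_image_tags(url="https://wiki.example.com/spaces/DOC/page", original_html=page_html)
# on success the img tag now reads: <img src="data:image/png;base64,...">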

unstructured_ingest/utils/ndjson.py

@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
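
For reference, a small round-trip sketch with this bundled ndjson helper (the rows are illustrative); it shows why dump forces indent=None and how reader skips blank lines.

import io

from unstructured_ingest.utils import ndjson

rows = [{"id": 1, "text": "first"}, {"id": 2, "text": "second"}]

buf = io.StringIO()
ndjson.dump(rows, buf)  # one JSON object per line; indent is forced to None
assert ndjson.loads(buf.getvalue()) == rows

buf.seek(0)
assert list(ndjson.reader(buf)) == rows  # reader skips blank lines while iterating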

unstructured_ingest/v2/interfaces/upload_stager.py

@@ -2,11 +2,11 @@ import json
from abc import ABC
from dataclasses import dataclass
from pathlib import Path
-from typing import Any,
+from typing import Any, TypeVar

-import ndjson
from pydantic import BaseModel

+from unstructured_ingest.utils import ndjson
from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.process import BaseProcess

@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
class UploadStager(BaseProcess, ABC):
    upload_stager_config: UploadStagerConfigT

-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        return element_dict

@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
            writer = ndjson.writer(out_f)
            for element in reader:
                conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                writer.
+                writer.write(row=conformed_element)
            writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-
-
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-
-
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)

    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
        path = Path(path)

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-
-
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -1,3 +1,4 @@
+from contextlib import contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Generator, List, Optional

@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
    Indexer,
    IndexerConfig,
    SourceIdentifiers,
+    download_responses,
)
from unstructured_ingest.v2.logger import logger
from unstructured_ingest.v2.processes.connector_registry import (

@@ -71,17 +73,19 @@ class ConfluenceConnectionConfig(ConnectionConfig):
    )

    @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
    def get_client(self) -> "Confluence":
        from atlassian import Confluence

        access_configs = self.access_config.get_secret_value()
-
+        with Confluence(
            url=self.url,
            username=self.username,
            password=access_configs.password,
            token=access_configs.token,
            cloud=self.cloud,
-        )
+        ) as client:
+            yield client


class ConfluenceIndexerConfig(IndexerConfig):

@@ -103,8 +107,8 @@ class ConfluenceIndexer(Indexer):

            # Attempt to retrieve a list of spaces with limit=1.
            # This should only succeed if all creds are valid
-
-
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
            logger.info("Connection to Confluence successful.")
            return True
        except Exception as e:

@@ -116,21 +120,21 @@ class ConfluenceIndexer(Indexer):
        if spaces:
            return spaces
        else:
-
-
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
            space_ids = [space["key"] for space in all_spaces["results"]]
            return space_ids

    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-
-
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
        return doc_ids

@@ -177,7 +181,18 @@ class ConfluenceIndexer(Indexer):


class ConfluenceDownloaderConfig(DownloaderConfig):
-
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )


@dataclass

@@ -186,14 +201,37 @@ class ConfluenceDownloader(Downloader):
    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
    connector_type: str = CONNECTOR_TYPE

-    def
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
        doc_id = file_data.identifier
        try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
        except Exception as e:
            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")

@@ -202,20 +240,52 @@ class ConfluenceDownloader(Downloader):
            raise ValueError(f"Page with ID {doc_id} does not exist.")

        content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )

        filepath = file_data.source_identifiers.relative_path
        download_path = Path(self.download_dir) / filepath
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with open(download_path, "w", encoding="utf8") as f:
-
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())

        # Update file_data with metadata
        file_data.metadata.date_created = page["history"]["createdDate"]
        file_data.metadata.date_modified = page["version"]["when"]
        file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name =
+        file_data.display_name = title

-
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+            return extracted_download_responses
+        return download_response


confluence_source_entry = SourceRegistryEntry(
|