unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/interfaces.py
ADDED
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from pydantic import BaseModel
+
+
+class EmbeddingConfig(BaseModel):
+    pass
+
+
+@dataclass
+class BaseEmbeddingEncoder(ABC):
+    config: EmbeddingConfig
+
+    def initialize(self):
+        """Initializes the embedding encoder class. Should also validate the instance
+        is properly configured: e.g., embed a single a element"""
+
+    @property
+    @abstractmethod
+    def num_of_dimensions(self) -> Tuple[int]:
+        """Number of dimensions for the embedding vector."""
+
+    @property
+    @abstractmethod
+    def is_unit_vector(self) -> bool:
+        """Denotes if the embedding vector is a unit vector."""
+
+    @abstractmethod
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        pass
+
+    @abstractmethod
+    def embed_query(self, query: str) -> List[float]:
+        pass
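The new `unstructured_ingest.embed.interfaces` module defines the contract every embedder in this release implements: a pydantic `EmbeddingConfig`, two introspection properties, and `embed_documents`/`embed_query`. The following is a hypothetical toy subclass, not part of the package, sketched only to illustrate how the pieces fit together:

```python
from dataclasses import dataclass
from typing import List, Tuple

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig


class ToyEmbeddingConfig(EmbeddingConfig):
    dimensions: int = 2  # hypothetical knob, just for this sketch


@dataclass
class ToyEmbeddingEncoder(BaseEmbeddingEncoder):
    config: ToyEmbeddingConfig

    def initialize(self):
        """Nothing to validate for this toy encoder."""

    @property
    def num_of_dimensions(self) -> Tuple[int]:
        return (self.config.dimensions,)

    @property
    def is_unit_vector(self) -> bool:
        return False  # the placeholder vectors below are not normalized

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        # Real encoders batch elements through a model API; here we reuse embed_query.
        for element in elements:
            element["embeddings"] = self.embed_query(element.get("text", ""))
        return elements

    def embed_query(self, query: str) -> List[float]:
        # Placeholder "embedding" derived from the text length.
        return [float(len(query))] * self.config.dimensions


encoder = ToyEmbeddingEncoder(config=ToyEmbeddingConfig())
print(encoder.embed_documents([{"text": "hello"}]))
```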
unstructured_ingest/embed/mixedbreadai.py
ADDED
@@ -0,0 +1,177 @@
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+USER_AGENT = "@mixedbread-ai/unstructured"
+BATCH_SIZE = 128
+TIMEOUT = 60
+MAX_RETRIES = 3
+ENCODING_FORMAT = "float"
+TRUNCATION_STRATEGY = "end"
+
+
+if TYPE_CHECKING:
+    from mixedbread_ai.client import MixedbreadAI
+    from mixedbread_ai.core import RequestOptions
+
+
+class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
+    """
+    Configuration class for Mixedbread AI Embedding Encoder.
+
+    Attributes:
+        api_key (str): API key for accessing Mixedbread AI..
+        embedder_model_name (str): Name of the model to use for embeddings.
+    """
+
+    api_key: SecretStr = Field(
+        default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
+    )
+
+    embedder_model_name: str = Field(
+        default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
+    )
+
+    @requires_dependencies(
+        ["mixedbread_ai"],
+        extras="embed-mixedbreadai",
+    )
+    def get_client(self) -> "MixedbreadAI":
+        """
+        Create the Mixedbread AI client.
+
+        Returns:
+            MixedbreadAI: Initialized client.
+        """
+        from mixedbread_ai.client import MixedbreadAI
+
+        return MixedbreadAI(
+            api_key=self.api_key.get_secret_value(),
+        )
+
+
+@dataclass
+class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    """
+    Embedding encoder for Mixedbread AI.
+
+    Attributes:
+        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
+    """
+
+    config: MixedbreadAIEmbeddingConfig
+
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+    _request_options: Optional["RequestOptions"] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        """Get an exemplary embedding to determine dimensions and unit vector status."""
+        return self._embed(["Q"])[0]
+
+    def initialize(self):
+        if self.config.api_key is None:
+            raise ValueError(
+                "The Mixedbread AI API key must be specified."
+                + "You either pass it in the constructor using 'api_key'"
+                + "or via the 'MXBAI_API_KEY' environment variable."
+            )
+
+        from mixedbread_ai.core import RequestOptions
+
+        self._request_options = RequestOptions(
+            max_retries=MAX_RETRIES,
+            timeout_in_seconds=TIMEOUT,
+            additional_headers={"User-Agent": USER_AGENT},
+        )
+
+    @property
+    def num_of_dimensions(self):
+        """Get the number of dimensions for the embeddings."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        """Check if the embedding is a unit vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def _embed(self, texts: List[str]) -> List[List[float]]:
+        """
+        Embed a list of texts using the Mixedbread AI API.
+
+        Args:
+            texts (List[str]): List of texts to embed.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        batch_size = BATCH_SIZE
+        batch_itr = range(0, len(texts), batch_size)
+
+        responses = []
+        client = self.config.get_client()
+        for i in batch_itr:
+            batch = texts[i : i + batch_size]
+            response = client.embeddings(
+                model=self.config.embedder_model_name,
+                normalized=True,
+                encoding_format=ENCODING_FORMAT,
+                truncation_strategy=TRUNCATION_STRATEGY,
+                request_options=self._request_options,
+                input=batch,
+            )
+            responses.append(response)
+        return [item.embedding for response in responses for item in response.data]
+
+    @staticmethod
+    def _add_embeddings_to_elements(
+        elements: List[dict], embeddings: List[List[float]]
+    ) -> List[dict]:
+        """
+        Add embeddings to elements.
+
+        Args:
+            elements (List[Element]): List of elements.
+            embeddings (List[List[float]]): List of embeddings.
+
+        Returns:
+            List[Element]: Elements with embeddings added.
+        """
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        """
+        Embed a list of document elements.
+
+        Args:
+            elements (List[Element]): List of document elements.
+
+        Returns:
+            List[Element]: Elements with embeddings.
+        """
+        embeddings = self._embed([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        """
+        Embed a query string.
+
+        Args:
+            query (str): Query string to embed.
+
+        Returns:
+            List[float]: Embedding of the query.
+        """
+        return self._embed([query])[0]
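A minimal usage sketch for the Mixedbread AI encoder might look like the following (assuming the `embed-mixedbreadai` extra is installed and `MXBAI_API_KEY` is set; the element dicts only need a `text` key):

```python
import os

from unstructured_ingest.embed.mixedbreadai import (
    MixedbreadAIEmbeddingConfig,
    MixedbreadAIEmbeddingEncoder,
)

# api_key falls back to MXBAI_API_KEY when omitted; model_name is the
# pydantic alias for the embedder_model_name field.
encoder = MixedbreadAIEmbeddingEncoder(
    config=MixedbreadAIEmbeddingConfig(
        api_key=os.environ["MXBAI_API_KEY"],
        model_name="mixedbread-ai/mxbai-embed-large-v1",
    )
)
encoder.initialize()  # validates the key and builds the shared RequestOptions

elements = [{"text": "Hello world"}, {"text": "Another element"}]
embedded = encoder.embed_documents(elements)  # each dict gains an "embeddings" key
print(len(embedded[0]["embeddings"]))
```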
unstructured_ingest/embed/octoai.py
ADDED
@@ -0,0 +1,63 @@
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from openai import OpenAI
+
+
+class OctoAiEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
+    base_url: str = Field(default="https://text.octoai.run/v1")
+
+    @requires_dependencies(
+        ["openai", "tiktoken"],
+        extras="embed-octoai",
+    )
+    def get_client(self) -> "OpenAI":
+        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
+        from openai import OpenAI
+
+        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+
+
+@dataclass
+class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OctoAiEmbeddingConfig
+    # Uses the OpenAI SDK
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query("Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query: str):
+        client = self.config.get_client()
+        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        return response.data[0].embedding
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        embeddings = [self.embed_query(e.get("text", "")) for e in elements]
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
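OctoAI is driven through the stock OpenAI SDK pointed at `https://text.octoai.run/v1` (the default `base_url`). A brief sketch, with the environment-variable name below assumed for illustration only:

```python
import os

from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

encoder = OctoAIEmbeddingEncoder(
    config=OctoAiEmbeddingConfig(
        api_key=os.environ["OCTOAI_API_KEY"],  # assumed variable name, not read by the config itself
        model_name="thenlper/gte-large",
    )
)
vector = encoder.embed_query("What does this package do?")
```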
unstructured_ingest/embed/openai.py
ADDED
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_openai.embeddings import OpenAIEmbeddings
+
+
+class OpenAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+    @requires_dependencies(["langchain_openai"], extras="openai")
+    def get_client(self) -> "OpenAIEmbeddings":
+        """Creates a langchain OpenAI python client to embed elements."""
+        from langchain_openai import OpenAIEmbeddings
+
+        openai_client = OpenAIEmbeddings(
+            openai_api_key=self.api_key.get_secret_value(),
+            model=self.embedder_model_name,  # type:ignore
+        )
+        return openai_client
+
+
+@dataclass
+class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OpenAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        return client.embed_query(str(query))
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
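The OpenAI encoder wraps `langchain_openai.OpenAIEmbeddings`, so both the per-query and per-document paths delegate batching to that client. A minimal sketch, assuming the `openai` extra and an `OPENAI_API_KEY` in the environment:

```python
import os

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(
    config=OpenAIEmbeddingConfig(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-ada-002",  # alias for embedder_model_name
    )
)
query_vector = encoder.embed_query("a single query")
embedded = encoder.embed_documents([{"text": "first"}, {"text": "second"}])
```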
unstructured_ingest/embed/vertexai.py
ADDED
@@ -0,0 +1,88 @@
+# type: ignore
+import json
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+
+import numpy as np
+from pydantic import Field, Secret, ValidationError
+from pydantic.functional_validators import BeforeValidator
+from unstructured.utils import FileHandler
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_google_vertexai import VertexAIEmbeddings
+
+
+def conform_string_to_dict(value: Any) -> dict:
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        return json.loads(value)
+    raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
+
+
+class VertexAIEmbeddingConfig(EmbeddingConfig):
+    api_key: ApiKeyType
+    embedder_model_name: Optional[str] = Field(
+        default="textembedding-gecko@001", alias="model_name"
+    )
+
+    def register_application_credentials(self):
+        # TODO look into passing credentials in directly, rather than via env var and tmp file
+        application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json")
+        credentials_file = FileHandler(application_credentials_path)
+        credentials_file.write_file(json.dumps(self.api_key.get_secret_value()))
+        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path
+
+    @requires_dependencies(
+        ["langchain", "langchain_google_vertexai"],
+        extras="embed-vertexai",
+    )
+    def get_client(self) -> "VertexAIEmbeddings":
+        """Creates a Langchain VertexAI python client to embed elements."""
+        from langchain_google_vertexai import VertexAIEmbeddings
+
+        self.register_application_credentials()
+        vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
+        return vertexai_client
+
+
+@dataclass
+class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VertexAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        result = client.embed_query(str(query))
+        return result
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
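The Vertex AI config accepts its service-account credentials as either a dict or a JSON string; `conform_string_to_dict` parses the string form before it is wrapped as a pydantic `Secret`. `get_client()` then writes the credentials to `/tmp/google-vertex-app-credentials.json` and exports `GOOGLE_APPLICATION_CREDENTIALS`. A sketch (the local credentials path below is hypothetical):

```python
from pathlib import Path

from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder

# Either form works for api_key: a dict, or the raw JSON text of a service-account file.
credentials_json = Path("service-account.json").read_text()  # hypothetical path

encoder = VertexAIEmbeddingEncoder(
    config=VertexAIEmbeddingConfig(api_key=credentials_json)
)
embedded = encoder.embed_documents([{"text": "vertex example"}])
```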
unstructured_ingest/embed/voyageai.py
ADDED
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_voyageai import VoyageAIEmbeddings
+
+
+class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(alias="model_name")
+    batch_size: Optional[int] = Field(default=None)
+    truncation: Optional[bool] = Field(default=None)
+
+    @requires_dependencies(
+        ["langchain", "langchain_voyageai"],
+        extras="embed-voyageai",
+    )
+    def get_client(self) -> "VoyageAIEmbeddings":
+        """Creates a Langchain VoyageAI python client to embed elements."""
+        from langchain_voyageai import VoyageAIEmbeddings
+
+        return VoyageAIEmbeddings(
+            voyage_api_key=self.api_key,
+            model=self.embedder_model_name,
+            batch_size=self.batch_size,
+            truncation=self.truncation,
+        )
+
+
+@dataclass
+class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VoyageAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    @property
+    def num_of_dimensions(self) -> tuple[int, ...]:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        client = self.config.get_client()
+        return client.embed_query(query)
+
+    @staticmethod
+    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
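Unlike the other configs, `VoyageAIEmbeddingConfig` gives `model_name` no default, so it must be supplied, and `batch_size`/`truncation` are passed straight through to `langchain_voyageai.VoyageAIEmbeddings`. A sketch (the environment-variable name and model id below are assumptions for illustration):

```python
import os

from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(
        api_key=os.environ["VOYAGE_API_KEY"],  # assumed variable name
        model_name="voyage-2",                 # assumed model id
        batch_size=32,
        truncation=True,
    )
)
print(encoder.num_of_dimensions)  # exposed as a property on this encoder
```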
unstructured_ingest/interfaces.py
CHANGED
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
-
+
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
 
 A = TypeVar("A", bound="DataClassJsonMixin")
 
@@ -204,22 +205,31 @@ class EmbeddingConfig(BaseConfig):
         kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
         if self.provider == "langchain-openai":
-            from
+            from unstructured_ingest.embed.openai import (
+                OpenAIEmbeddingConfig,
+                OpenAIEmbeddingEncoder,
+            )
 
             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-huggingface":
-            from
+            from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
             )
 
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
-            from
+            from unstructured_ingest.embed.octoai import (
+                OctoAiEmbeddingConfig,
+                OctoAIEmbeddingEncoder,
+            )
 
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-aws-bedrock":
-            from
+            from unstructured_ingest.embed.bedrock import (
+                BedrockEmbeddingConfig,
+                BedrockEmbeddingEncoder,
+            )
 
             return BedrockEmbeddingEncoder(
                 config=BedrockEmbeddingConfig(
@@ -229,14 +239,14 @@ class EmbeddingConfig(BaseConfig):
                 )
             )
         elif self.provider == "langchain-vertexai":
-            from
+            from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )
 
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-voyageai":
-            from
+            from unstructured_ingest.embed.voyageai import (
                 VoyageAIEmbeddingConfig,
                 VoyageAIEmbeddingEncoder,
             )
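The two hunks above rewire the legacy v1 `EmbeddingConfig` so that each `provider` string now resolves to the corresponding encoder in the new `unstructured_ingest.embed` package (the removed `from ...` lines are truncated in this view). A condensed, hypothetical sketch of that dispatch pattern:

```python
def encoder_for(provider: str, kwargs: dict):
    # Condensed illustration of the branch structure shown above; the real method
    # lives on EmbeddingConfig and covers more providers.
    if provider == "langchain-openai":
        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
    elif provider == "octoai":
        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
    raise ValueError(f"unsupported embedding provider: {provider}")
```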
@@ -519,7 +529,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 and self.filename.is_file()
                 and self.filename.stat().st_size
             ):
-                logger.debug(f"
+                logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
                 return None
             return func(self, *args, **kwargs)
 
@@ -576,7 +586,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 
         endpoint = partition_config.partition_endpoint
 
-        logger.debug(f"
+        logger.debug(f"using remote partition ({endpoint})")
 
         elements = partition_via_api(
             filename=str(self.filename),
@@ -596,7 +606,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
         self._date_processed = datetime.utcnow().isoformat()
         if self.read_config.download_only:
             return None
-        logger.info(f"
+        logger.info(f"processing {self.filename}")
 
         elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
         element_dicts = [e.to_dict() for e in elements]
@@ -814,7 +824,7 @@ class IngestDocCleanupMixin:
             and self.filename.is_file()
            and not self.read_config.download_only
         ):
-            logger.debug(f"
+            logger.debug(f"cleaning up {self}")
             os.unlink(self.filename)
 
 
unstructured_ingest/logger.py
CHANGED
@@ -95,7 +95,7 @@ class SensitiveFormatter(logging.Formatter):
 
 
 def remove_root_handlers(logger: logging.Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():
unstructured_ingest/pipeline/copy.py
CHANGED
@@ -15,5 +15,5 @@ class Copier(CopyNode):
         ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
         desired_output = ingest_doc._output_filename
         Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"copying {json_path} -> {desired_output}")
         shutil.copy(json_path, desired_output)
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -57,7 +57,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
         iterable = iterable if iterable else []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
 
         self.initialize()
@@ -92,7 +92,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
 
     def initialize(self):
         if path := self.get_path():
-            logger.info(f"
+            logger.info(f"creating {path}")
             path.mkdir(parents=True, exist_ok=True)
         ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
 
unstructured_ingest/pipeline/partition.py
CHANGED
@@ -30,7 +30,7 @@ class Partitioner(PartitionNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.info(f"
+            logger.info(f"file exists: {json_path}, skipping partition")
             return str(json_path)
         partition_kwargs: t.Dict[str, t.Any] = {
             "strategy": self.partition_config.strategy,
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -96,7 +96,7 @@ class Pipeline(DataClassJsonMixin):
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
             if not reformatted_jsons:
-                logger.info(f"
+                logger.info(f"no files to process after {reformat_node.__class__.__name__}")
                 return
             partitioned_jsons = reformatted_jsons
 
unstructured_ingest/pipeline/reformat/chunking.py
CHANGED
@@ -58,7 +58,7 @@ class Chunker(ReformatNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.debug(f"
+            logger.debug(f"file exists: {json_path}, skipping chunking")
             return str(json_path)
 
         chunked_elements = self.chunk(elements_json)
@@ -112,7 +112,7 @@ class Chunker(ReformatNode):
 
         return partition_via_api(
             filename=elements_json_file,
-            # -- (jennings) If api_key or api_url are None, partition_via_api will raise an
+            # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
             # -- error, which will be caught and logged by Chunker.run()
             api_key=self.partition_config.api_key, # type: ignore
             api_url=self.partition_config.partition_endpoint, # type: ignore