unstructured-ingest 1.0.15__py3-none-any.whl → 1.0.17__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.15" # pragma: no cover
1
+ __version__ = "1.0.17" # pragma: no cover
@@ -16,15 +16,18 @@ if TYPE_CHECKING:
16
16
 
17
17
  class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
18
18
  api_version: str = Field(description="Azure API version", default="2024-06-01")
19
- azure_endpoint: str
20
- embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
19
+ azure_endpoint: str = Field(description="Azure endpoint")
20
+ embedder_model_name: str = Field(
21
+ default="text-embedding-ada-002", alias="model_name", description="Azure OpenAI model name"
22
+ )
21
23
 
22
24
  @requires_dependencies(["openai"], extras="openai")
23
25
  def get_client(self) -> "AzureOpenAI":
24
26
  from openai import AzureOpenAI
25
27
 
28
+ api_key = self.api_key.get_secret_value() if self.api_key else None
26
29
  return AzureOpenAI(
27
- api_key=self.api_key.get_secret_value(),
30
+ api_key=api_key,
28
31
  api_version=self.api_version,
29
32
  azure_endpoint=self.azure_endpoint,
30
33
  )
@@ -33,8 +36,9 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
33
36
  def get_async_client(self) -> "AsyncAzureOpenAI":
34
37
  from openai import AsyncAzureOpenAI
35
38
 
39
+ api_key = self.api_key.get_secret_value() if self.api_key else None
36
40
  return AsyncAzureOpenAI(
37
- api_key=self.api_key.get_secret_value(),
41
+ api_key=api_key,
38
42
  api_version=self.api_version,
39
43
  azure_endpoint=self.azure_endpoint,
40
44
  )
@@ -58,10 +58,14 @@ def conform_query(query: str, provider: str) -> dict:
58
58
 
59
59
 
60
60
  class BedrockEmbeddingConfig(EmbeddingConfig):
61
- aws_access_key_id: SecretStr
62
- aws_secret_access_key: SecretStr
63
- region_name: str = "us-west-2"
64
- embedder_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
61
+ aws_access_key_id: SecretStr = Field(description="aws access key id")
62
+ aws_secret_access_key: SecretStr = Field(description="aws secret access key")
63
+ region_name: str = Field(description="aws region name", default="us-west-2")
64
+ embedder_model_name: str = Field(
65
+ default="amazon.titan-embed-text-v1",
66
+ alias="model_name",
67
+ description="AWS Bedrock model name",
68
+ )
65
69
 
66
70
  def wrap_error(self, e: Exception) -> Exception:
67
71
  if is_internal_error(e=e):
@@ -15,11 +15,18 @@ if TYPE_CHECKING:
15
15
 
16
16
 
17
17
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
18
- embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
18
+ embedder_model_name: Optional[str] = Field(
19
+ default="all-MiniLM-L6-v2", alias="model_name", description="HuggingFace model name"
20
+ )
19
21
  embedder_model_kwargs: Optional[dict] = Field(
20
- default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
22
+ default_factory=lambda: {"device": "cpu"},
23
+ alias="model_kwargs",
24
+ description="additional model parameters",
25
+ )
26
+ encode_kwargs: Optional[dict] = Field(
27
+ default_factory=lambda: {"normalize_embeddings": False},
28
+ description="additional embedding parameters",
21
29
  )
22
- encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
23
30
 
24
31
  @requires_dependencies(
25
32
  ["sentence_transformers"],
@@ -34,10 +34,13 @@ class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
34
34
 
35
35
  api_key: SecretStr = Field(
36
36
  default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
37
+ description="API key for Mixedbread AI",
37
38
  )
38
39
 
39
40
  embedder_model_name: str = Field(
40
- default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
41
+ default="mixedbread-ai/mxbai-embed-large-v1",
42
+ alias="model_name",
43
+ description="Mixedbread AI model name",
41
44
  )
42
45
 
43
46
  @requires_dependencies(
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING
2
+ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from pydantic import Field, SecretStr
5
5
 
@@ -24,9 +24,13 @@ if TYPE_CHECKING:
24
24
 
25
25
 
26
26
  class OctoAiEmbeddingConfig(EmbeddingConfig):
27
- api_key: SecretStr
28
- embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
29
- base_url: str = Field(default="https://text.octoai.run/v1")
27
+ api_key: Optional[SecretStr] = Field(description="API key for OctoAI", default=None)
28
+ embedder_model_name: str = Field(
29
+ default="thenlper/gte-large", alias="model_name", description="octoai model name"
30
+ )
31
+ base_url: str = Field(
32
+ default="https://text.octoai.run/v1", description="optional override for the base url"
33
+ )
30
34
 
31
35
  def wrap_error(self, e: Exception) -> Exception:
32
36
  if is_internal_error(e=e):
@@ -73,7 +77,8 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
73
77
  """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
74
78
  from openai import OpenAI
75
79
 
76
- return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
80
+ api_key = self.api_key.get_secret_value() if self.api_key else None
81
+ return OpenAI(api_key=api_key, base_url=self.base_url)
77
82
 
78
83
  @requires_dependencies(
79
84
  ["openai", "tiktoken"],
@@ -83,7 +88,8 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
83
88
  """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
84
89
  from openai import AsyncOpenAI
85
90
 
86
- return AsyncOpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
91
+ api_key = self.api_key.get_secret_value() if self.api_key else None
92
+ return AsyncOpenAI(api_key=api_key, base_url=self.base_url)
87
93
 
88
94
 
89
95
  @dataclass
@@ -24,9 +24,11 @@ if TYPE_CHECKING:
24
24
 
25
25
 
26
26
  class OpenAIEmbeddingConfig(EmbeddingConfig):
27
- api_key: SecretStr
28
- embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
29
- base_url: Optional[str] = None
27
+ api_key: Optional[SecretStr] = Field(description="API key for OpenAI", default=None)
28
+ embedder_model_name: str = Field(
29
+ default="text-embedding-ada-002", alias="model_name", description="OpenAI model name"
30
+ )
31
+ base_url: Optional[str] = Field(default=None, description="optional override for the base url")
30
32
 
31
33
  @requires_dependencies(["openai"], extras="openai")
32
34
  def wrap_error(self, e: Exception) -> Exception:
@@ -86,13 +88,15 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
86
88
  def get_client(self) -> "OpenAI":
87
89
  from openai import OpenAI
88
90
 
89
- return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
91
+ api_key = self.api_key.get_secret_value() if self.api_key else None
92
+ return OpenAI(api_key=api_key, base_url=self.base_url)
90
93
 
91
94
  @requires_dependencies(["openai"], extras="openai")
92
95
  def get_async_client(self) -> "AsyncOpenAI":
93
96
  from openai import AsyncOpenAI
94
97
 
95
- return AsyncOpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
98
+ api_key = self.api_key.get_secret_value() if self.api_key else None
99
+ return AsyncOpenAI(api_key=api_key, base_url=self.base_url)
96
100
 
97
101
 
98
102
  @dataclass
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, Any
2
+ from typing import TYPE_CHECKING, Any, Optional
3
3
 
4
4
  from pydantic import Field, SecretStr
5
5
 
@@ -20,9 +20,11 @@ if TYPE_CHECKING:
20
20
 
21
21
 
22
22
  class TogetherAIEmbeddingConfig(EmbeddingConfig):
23
- api_key: SecretStr
23
+ api_key: Optional[SecretStr] = Field(description="API key for Together AI", default=None)
24
24
  embedder_model_name: str = Field(
25
- default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
25
+ default="togethercomputer/m2-bert-80M-8k-retrieval",
26
+ alias="model_name",
27
+ description="Together AI model name",
26
28
  )
27
29
 
28
30
  def wrap_error(self, e: Exception) -> Exception:
@@ -56,13 +58,15 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
56
58
  def get_client(self) -> "Together":
57
59
  from together import Together
58
60
 
59
- return Together(api_key=self.api_key.get_secret_value())
61
+ api_key = self.api_key.get_secret_value() if self.api_key else None
62
+ return Together(api_key=api_key)
60
63
 
61
64
  @requires_dependencies(["together"], extras="togetherai")
62
65
  def get_async_client(self) -> "AsyncTogether":
63
66
  from together import AsyncTogether
64
67
 
65
- return AsyncTogether(api_key=self.api_key.get_secret_value())
68
+ api_key = self.api_key.get_secret_value() if self.api_key else None
69
+ return AsyncTogether(api_key=api_key)
66
70
 
67
71
 
68
72
  @dataclass
@@ -32,9 +32,9 @@ ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
32
32
 
33
33
 
34
34
  class VertexAIEmbeddingConfig(EmbeddingConfig):
35
- api_key: ApiKeyType
35
+ api_key: ApiKeyType = Field(description="API key for Vertex AI")
36
36
  embedder_model_name: Optional[str] = Field(
37
- default="textembedding-gecko@001", alias="model_name"
37
+ default="textembedding-gecko@001", alias="model_name", description="Vertex AI model name"
38
38
  )
39
39
 
40
40
  def wrap_error(self, e: Exception) -> Exception:
@@ -26,11 +26,14 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
26
26
  le=128,
27
27
  description="Batch size for embedding requests. VoyageAI has a limit of 128.",
28
28
  )
29
- api_key: SecretStr
30
- embedder_model_name: str = Field(default="voyage-3", alias="model_name")
31
- truncation: Optional[bool] = Field(default=None)
32
- max_retries: int = 0
33
- timeout_in_seconds: Optional[int] = None
29
+ api_key: Optional[SecretStr] = Field(description="API key for VoyageAI", default=None)
30
+ embedder_model_name: str = Field(
31
+ default="voyage-3", alias="model_name", description="VoyageAI model name"
32
+ )
33
+ max_retries: int = Field(default=0, description="Max retries for embedding requests.")
34
+ timeout_in_seconds: Optional[int] = Field(
35
+ default=None, description="Optional timeout in seconds for embedding requests."
36
+ )
34
37
 
35
38
  def wrap_error(self, e: Exception) -> Exception:
36
39
  if is_internal_error(e=e):
@@ -62,8 +65,9 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
62
65
  """Creates a VoyageAI python client to embed elements."""
63
66
  from voyageai import Client as VoyageAIClient
64
67
 
68
+ api_key = self.api_key.get_secret_value() if self.api_key else None
65
69
  client = VoyageAIClient(
66
- api_key=self.api_key.get_secret_value(),
70
+ api_key=api_key,
67
71
  max_retries=self.max_retries,
68
72
  timeout=self.timeout_in_seconds,
69
73
  )
@@ -77,8 +81,9 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
77
81
  """Creates a VoyageAI python client to embed elements."""
78
82
  from voyageai import AsyncClient as AsyncVoyageAIClient
79
83
 
84
+ api_key = self.api_key.get_secret_value() if self.api_key else None
80
85
  client = AsyncVoyageAIClient(
81
- api_key=self.api_key.get_secret_value(),
86
+ api_key=api_key,
82
87
  max_retries=self.max_retries,
83
88
  timeout=self.timeout_in_seconds,
84
89
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.15
3
+ Version: 1.0.17
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: <3.13,>=3.9
21
21
  Requires-Dist: click
22
- Requires-Dist: dataclasses-json
23
22
  Requires-Dist: opentelemetry-sdk
24
23
  Requires-Dist: pydantic>=2.7
25
24
  Requires-Dist: python-dateutil
@@ -74,7 +73,7 @@ Provides-Extra: duckdb
74
73
  Requires-Dist: duckdb; extra == 'duckdb'
75
74
  Requires-Dist: pandas; extra == 'duckdb'
76
75
  Provides-Extra: elasticsearch
77
- Requires-Dist: elasticsearch[async]; extra == 'elasticsearch'
76
+ Requires-Dist: elasticsearch[async]<9.0.0; extra == 'elasticsearch'
78
77
  Provides-Extra: epub
79
78
  Requires-Dist: unstructured[epub]; extra == 'epub'
80
79
  Provides-Extra: gcs
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=oGOfVxqR9-1jTBJa12PKKVu0cnuxJyM_nJIiJM33B_A,43
2
+ unstructured_ingest/__version__.py,sha256=HJycVzTiDHeRdW4JUDAnGRPwiviRO2FPCxl56CUWKGY,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -22,16 +22,16 @@ unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
22
22
  unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrHYIJZfb67hUIxA,371
23
23
  unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
24
24
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
26
- unstructured_ingest/embed/bedrock.py,sha256=AjOMSZakPrGvVUEOacnTbhuE1Ka5MdRZgOnK7Qb2lhY,8931
27
- unstructured_ingest/embed/huggingface.py,sha256=vJeTVeLzPU0mksmZHz-4v9TCtdiNt-d9bsHPbzhpyV8,2253
25
+ unstructured_ingest/embed/azure_openai.py,sha256=nHEkvWH7mETrJB-C9qlci6bEpSEPiW9peG82EUDQ954,1902
26
+ unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
27
+ unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
28
28
  unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
29
- unstructured_ingest/embed/mixedbreadai.py,sha256=_rqH5sZspKbMM1naSU5zYFgEULzZivtc3-5RrgZErpE,4486
30
- unstructured_ingest/embed/octoai.py,sha256=RWMcoYvp14SPBRoqfPRl0ny_dmt8nVbCW02UGcYetbY,4387
31
- unstructured_ingest/embed/openai.py,sha256=09Lx9lqlDXTHWXRHNiUerK-g3bhK8hSuW9RLzglHWCA,4493
32
- unstructured_ingest/embed/togetherai.py,sha256=a1ko06oSgkUgQahYNtPt6pU8v8bmNkRl7WTEK5foBvc,3527
33
- unstructured_ingest/embed/vertexai.py,sha256=CjSkXYo9ZT590eM3MU3tdDa4p5gHn4HW0bAlm70MjOo,3682
34
- unstructured_ingest/embed/voyageai.py,sha256=84O49p-vQuQzhNndPIfwCYcCNBoHFIX9QHZvc8m5DdA,4496
29
+ unstructured_ingest/embed/mixedbreadai.py,sha256=pmpGQ0E-bfkkg4rvPvsFxL6Oc7H5f0mJGguHtfL7oLc,4592
30
+ unstructured_ingest/embed/octoai.py,sha256=imuH_vLlmDd3GgAgiA0AaXB1fGjaI9lPpCCBG5HbpU8,4678
31
+ unstructured_ingest/embed/openai.py,sha256=yYqpSZcD8kUZOT36oj39hN8sCFpPKShTHVyV4dI3Bbg,4775
32
+ unstructured_ingest/embed/togetherai.py,sha256=19Le-SdMLp2U1qy5mTk_kO90b-AbOG_-a7Fslp1caJA,3767
33
+ unstructured_ingest/embed/vertexai.py,sha256=jA3Y-AysVVaYwqkVd_OgRKF0JdHLAgZlRgfgddcZV2o,3763
34
+ unstructured_ingest/embed/voyageai.py,sha256=-aKSxZm6G5NcKlloA6je70HmT30WSgcnZWCGEoz9PPo,4826
35
35
  unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
36
36
  unstructured_ingest/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
37
37
  unstructured_ingest/interfaces/downloader.py,sha256=xX0ZzsFRSzZb7SAeoeQph8sIbVq13DRw-3MYkdADrY0,2918
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.15.dist-info/METADATA,sha256=FLLtR4NPVBbafV9aTVcwguRub-D_xwmHwQmPWDAGmXU,8720
235
- unstructured_ingest-1.0.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.15.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.15.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.17.dist-info/METADATA,sha256=XvwbN72yhaJKn5uYKyWePqbH5ZmptqS9I0RP___NbXQ,8694
235
+ unstructured_ingest-1.0.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.17.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.17.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.17.dist-info/RECORD,,