unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (82)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/embed/__init__.py +17 -0
  22. unstructured_ingest/embed/bedrock.py +70 -0
  23. unstructured_ingest/embed/huggingface.py +73 -0
  24. unstructured_ingest/embed/interfaces.py +36 -0
  25. unstructured_ingest/embed/mixedbreadai.py +177 -0
  26. unstructured_ingest/embed/octoai.py +63 -0
  27. unstructured_ingest/embed/openai.py +61 -0
  28. unstructured_ingest/embed/vertexai.py +88 -0
  29. unstructured_ingest/embed/voyageai.py +69 -0
  30. unstructured_ingest/interfaces.py +21 -11
  31. unstructured_ingest/logger.py +1 -1
  32. unstructured_ingest/pipeline/copy.py +1 -1
  33. unstructured_ingest/pipeline/interfaces.py +2 -2
  34. unstructured_ingest/pipeline/partition.py +1 -1
  35. unstructured_ingest/pipeline/pipeline.py +1 -1
  36. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  37. unstructured_ingest/pipeline/reformat/embedding.py +4 -6
  38. unstructured_ingest/pipeline/source.py +2 -2
  39. unstructured_ingest/utils/compression.py +3 -3
  40. unstructured_ingest/utils/data_prep.py +20 -12
  41. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  42. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  43. unstructured_ingest/v2/cli/base/dest.py +1 -1
  44. unstructured_ingest/v2/cli/base/src.py +3 -2
  45. unstructured_ingest/v2/cli/utils/click.py +1 -1
  46. unstructured_ingest/v2/interfaces/processor.py +48 -13
  47. unstructured_ingest/v2/logger.py +1 -1
  48. unstructured_ingest/v2/otel.py +1 -1
  49. unstructured_ingest/v2/pipeline/interfaces.py +12 -3
  50. unstructured_ingest/v2/pipeline/pipeline.py +42 -29
  51. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  52. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  53. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  54. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  55. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  56. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  57. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  58. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  60. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  61. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
  66. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  67. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
  69. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  70. unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
  71. unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
  72. unstructured_ingest/v2/processes/embedder.py +41 -24
  73. unstructured_ingest/v2/processes/filter.py +1 -1
  74. unstructured_ingest/v2/processes/partitioner.py +3 -3
  75. unstructured_ingest/v2/utils.py +7 -0
  76. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
  77. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
  78. unstructured_ingest/evaluate.py +0 -338
  79. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
  80. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
  81. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
  82. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import json
1
2
  from abc import ABC
2
3
  from dataclasses import dataclass
3
4
  from pathlib import Path
@@ -5,11 +6,10 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
5
6
 
6
7
  from pydantic import BaseModel, Field, SecretStr
7
8
 
8
- from unstructured_ingest.utils.dep_check import requires_dependencies
9
9
  from unstructured_ingest.v2.interfaces.process import BaseProcess
10
10
 
11
11
  if TYPE_CHECKING:
12
- from unstructured.embed.interfaces import BaseEmbeddingEncoder
12
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
13
13
 
14
14
 
15
15
  class EmbedderConfig(BaseModel):
@@ -21,6 +21,7 @@ class EmbedderConfig(BaseModel):
21
21
  "langchain-vertexai",
22
22
  "langchain-voyageai",
23
23
  "octoai",
24
+ "mixedbread-ai",
24
25
  ]
25
26
  ] = Field(default=None, description="Type of the embedding class to be used.")
26
27
  embedding_api_key: Optional[SecretStr] = Field(
@@ -42,30 +43,31 @@ class EmbedderConfig(BaseModel):
42
43
  default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
43
44
  )
44
45
 
45
- @requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
46
46
  def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
47
- from unstructured.embed.huggingface import (
47
+ from unstructured_ingest.embed.huggingface import (
48
48
  HuggingFaceEmbeddingConfig,
49
49
  HuggingFaceEmbeddingEncoder,
50
50
  )
51
51
 
52
- return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**embedding_kwargs))
52
+ return HuggingFaceEmbeddingEncoder(
53
+ config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
54
+ )
53
55
 
54
- @requires_dependencies(dependencies=["unstructured"], extras="openai")
55
56
  def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
56
- from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
57
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
57
58
 
58
- return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**embedding_kwargs))
59
+ return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
59
60
 
60
- @requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
61
61
  def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
62
- from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
62
+ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
63
63
 
64
- return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**embedding_kwargs))
64
+ return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
65
65
 
66
- @requires_dependencies(dependencies=["unstructured"], extras="bedrock")
67
66
  def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
68
- from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
67
+ from unstructured_ingest.embed.bedrock import (
68
+ BedrockEmbeddingConfig,
69
+ BedrockEmbeddingEncoder,
70
+ )
69
71
 
70
72
  return BedrockEmbeddingEncoder(
71
73
  config=BedrockEmbeddingConfig(
@@ -75,20 +77,35 @@ class EmbedderConfig(BaseModel):
75
77
  )
76
78
  )
77
79
 
78
- @requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
79
80
  def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
80
- from unstructured.embed.vertexai import (
81
+ from unstructured_ingest.embed.vertexai import (
81
82
  VertexAIEmbeddingConfig,
82
83
  VertexAIEmbeddingEncoder,
83
84
  )
84
85
 
85
- return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**embedding_kwargs))
86
+ return VertexAIEmbeddingEncoder(
87
+ config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
88
+ )
86
89
 
87
- @requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
88
90
  def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
89
- from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
91
+ from unstructured_ingest.embed.voyageai import (
92
+ VoyageAIEmbeddingConfig,
93
+ VoyageAIEmbeddingEncoder,
94
+ )
95
+
96
+ return VoyageAIEmbeddingEncoder(
97
+ config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
98
+ )
90
99
 
91
- return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**embedding_kwargs))
100
+ def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
101
+ from unstructured_ingest.embed.mixedbreadai import (
102
+ MixedbreadAIEmbeddingConfig,
103
+ MixedbreadAIEmbeddingEncoder,
104
+ )
105
+
106
+ return MixedbreadAIEmbeddingEncoder(
107
+ config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
108
+ )
92
109
 
93
110
  def get_embedder(self) -> "BaseEmbeddingEncoder":
94
111
  kwargs: dict[str, Any] = {}
@@ -114,6 +131,8 @@ class EmbedderConfig(BaseModel):
114
131
 
115
132
  if self.embedding_provider == "langchain-voyageai":
116
133
  return self.get_voyageai_embedder(embedding_kwargs=kwargs)
134
+ if self.embedding_provider == "mixedbread-ai":
135
+ return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
117
136
 
118
137
  raise ValueError(f"{self.embedding_provider} not a recognized encoder")
119
138
 
@@ -122,14 +141,12 @@ class EmbedderConfig(BaseModel):
122
141
  class Embedder(BaseProcess, ABC):
123
142
  config: EmbedderConfig
124
143
 
125
- @requires_dependencies(dependencies=["unstructured"])
126
144
  def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
127
- from unstructured.staging.base import elements_from_json
128
-
129
145
  # TODO update base embedder classes to support async
130
146
  embedder = self.config.get_embedder()
131
- elements = elements_from_json(filename=str(elements_filepath))
147
+ with elements_filepath.open("r") as elements_file:
148
+ elements = json.load(elements_file)
132
149
  if not elements:
133
150
  return [e.to_dict() for e in elements]
134
151
  embedded_elements = embedder.embed_documents(elements=elements)
135
- return [e.to_dict() for e in embedded_elements]
152
+ return embedded_elements
@@ -47,7 +47,7 @@ class Filterer(BaseProcess, ABC):
47
47
  for pattern in patterns:
48
48
  if fnmatch.filter([path], pattern):
49
49
  return True
50
- logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
50
+ logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
51
51
  return False
52
52
 
53
53
  def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
@@ -145,7 +145,7 @@ class Partitioner(BaseProcess, ABC):
145
145
  class FileDataSourceMetadata(DataSourceMetadata):
146
146
  filesize_bytes: Optional[int] = None
147
147
 
148
- logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
148
+ logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
149
149
  logger.debug(f"partitioning file {filename} with metadata {metadata}")
150
150
  elements = partition(
151
151
  filename=str(filename.resolve()),
@@ -165,7 +165,7 @@ class Partitioner(BaseProcess, ABC):
165
165
 
166
166
  partition_request = self.config.to_partition_kwargs()
167
167
 
168
- # Note(austin): PartitionParameters is a Pydantic model in v0.26.0
168
+ # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
169
169
  # Prior to this it was a dataclass which doesn't have .__fields
170
170
  try:
171
171
  possible_fields = PartitionParameters.__fields__
@@ -182,7 +182,7 @@ class Partitioner(BaseProcess, ABC):
182
182
  ", ".join([v for v in partition_request if v not in filtered_partition_request])
183
183
  )
184
184
  )
185
- logger.debug(f"Using hosted partitioner with kwargs: {partition_request}")
185
+ logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
186
186
  with open(filename, "rb") as f:
187
187
  files = Files(
188
188
  content=f.read(),
@@ -20,6 +20,11 @@ def is_secret(value: Any) -> bool:
20
20
  def serialize_base_model(model: BaseModel) -> dict:
21
21
  # To get the full serialized dict regardless of if values are marked as Secret
22
22
  model_dict = model.dict()
23
+ return serialize_base_dict(model_dict=model_dict)
24
+
25
+
26
+ def serialize_base_dict(model_dict: dict) -> dict:
27
+ model_dict = model_dict.copy()
23
28
  for k, v in model_dict.items():
24
29
  if isinstance(v, _SecretBase):
25
30
  secret_value = v.get_secret_value()
@@ -27,6 +32,8 @@ def serialize_base_model(model: BaseModel) -> dict:
27
32
  model_dict[k] = serialize_base_model(model=secret_value)
28
33
  else:
29
34
  model_dict[k] = secret_value
35
+ if isinstance(v, dict):
36
+ model_dict[k] = serialize_base_dict(model_dict=v)
30
37
 
31
38
  return model_dict
32
39