unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the package registry's advisory page for more details.

Files changed (32)
  1. test/integration/connectors/test_confluence.py +2 -2
  2. test/unit/v2/connectors/test_confluence.py +35 -3
  3. unstructured_ingest/__version__.py +1 -1
  4. unstructured_ingest/embed/huggingface.py +3 -7
  5. unstructured_ingest/utils/data_prep.py +4 -2
  6. unstructured_ingest/v2/interfaces/file_data.py +1 -1
  7. unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
  8. unstructured_ingest/v2/pipeline/pipeline.py +7 -0
  9. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  10. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  11. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  12. unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
  13. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
  14. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
  15. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
  16. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
  19. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
  21. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
  22. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
  23. unstructured_ingest/v2/processes/connectors/local.py +8 -1
  24. unstructured_ingest/v2/processes/embedder.py +3 -4
  25. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  26. unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
  27. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/METADATA +18 -18
  28. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/RECORD +32 -30
  29. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/LICENSE.md +0 -0
  30. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/WHEEL +0 -0
  31. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/entry_points.txt +0 -0
  32. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
30
30
  spaces = ["testteamsp", "MFS"]
31
31
 
32
32
  # Create connection and indexer configurations
33
- access_config = ConfluenceAccessConfig(password=api_token)
33
+ access_config = ConfluenceAccessConfig(api_token=api_token)
34
34
  connection_config = ConfluenceConnectionConfig(
35
35
  url=confluence_url,
36
36
  username=user_email,
@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
77
77
  spaces = ["testteamsp1"]
78
78
 
79
79
  # Create connection and indexer configurations
80
- access_config = ConfluenceAccessConfig(password=api_token)
80
+ access_config = ConfluenceAccessConfig(api_token=api_token)
81
81
  connection_config = ConfluenceConnectionConfig(
82
82
  url=confluence_url,
83
83
  username=user_email,
@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
11
11
  with pytest.raises(ValidationError):
12
12
  ConfluenceConnectionConfig(
13
13
  access_config=ConfluenceAccessConfig(
14
- password="api_token",
14
+ password="password",
15
15
  token="access_token",
16
16
  ),
17
17
  username="user_email",
@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
19
19
  )
20
20
 
21
21
 
22
+ def test_connection_config_multiple_auth2():
23
+ with pytest.raises(ValidationError):
24
+ ConfluenceConnectionConfig(
25
+ access_config=ConfluenceAccessConfig(
26
+ api_token="api_token",
27
+ token="access_token",
28
+ ),
29
+ username="user_email",
30
+ url="url",
31
+ )
32
+
33
+
34
+ def test_connection_config_multiple_auth3():
35
+ with pytest.raises(ValidationError):
36
+ ConfluenceConnectionConfig(
37
+ access_config=ConfluenceAccessConfig(
38
+ api_token="api_token",
39
+ password="password",
40
+ ),
41
+ username="user_email",
42
+ url="url",
43
+ )
44
+
45
+
22
46
  def test_connection_config_no_auth():
23
47
  with pytest.raises(ValidationError):
24
48
  ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
25
49
 
26
50
 
27
- def test_connection_config_basic_auth():
51
+ def test_connection_config_password_auth():
52
+ ConfluenceConnectionConfig(
53
+ access_config=ConfluenceAccessConfig(password="password"),
54
+ url="url",
55
+ username="user_email",
56
+ )
57
+
58
+
59
+ def test_connection_config_api_token_auth():
28
60
  ConfluenceConnectionConfig(
29
- access_config=ConfluenceAccessConfig(password="api_token"),
61
+ access_config=ConfluenceAccessConfig(api_token="api_token"),
30
62
  url="url",
31
63
  username="user_email",
32
64
  )
@@ -1 +1 @@
1
- __version__ = "0.5.14" # pragma: no cover
1
+ __version__ = "0.5.15" # pragma: no cover
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass
1
+ from dataclasses import dataclass, field
2
2
  from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from pydantic import Field
@@ -15,14 +15,11 @@ if TYPE_CHECKING:
15
15
 
16
16
 
17
17
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
18
- embedder_model_name: Optional[str] = Field(
19
- default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
20
- )
18
+ embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
21
19
  embedder_model_kwargs: Optional[dict] = Field(
22
20
  default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
23
21
  )
24
22
  encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
25
- cache_folder: Optional[str] = Field(default=None)
26
23
 
27
24
  @requires_dependencies(
28
25
  ["sentence_transformers"],
@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
33
30
 
34
31
  return SentenceTransformer(
35
32
  model_name_or_path=self.embedder_model_name,
36
- cache_folder=self.cache_folder,
37
33
  **self.embedder_model_kwargs,
38
34
  )
39
35
 
@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
45
41
 
46
42
  @dataclass
47
43
  class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
48
- config: HuggingFaceEmbeddingConfig
44
+ config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)
49
45
 
50
46
  def _embed_query(self, query: str) -> list[float]:
51
47
  return self._embed_documents(texts=[query])[0]
@@ -2,7 +2,7 @@ import itertools
2
2
  import json
3
3
  from datetime import datetime
4
4
  from pathlib import Path
5
- from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
5
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
6
6
 
7
7
  import pandas as pd
8
8
 
@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
163
163
  raise IOError("Unsupported file type: {path}")
164
164
 
165
165
 
166
- def get_data(path: Path) -> list[dict]:
166
+ def get_data(path: Union[Path, str]) -> list[dict]:
167
+ if isinstance(path, str):
168
+ path = Path(path)
167
169
  try:
168
170
  return get_data_by_suffix(path=path)
169
171
  except Exception as e:
@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
102
102
  try:
103
103
  return BatchFileData.from_file(path=path)
104
104
  except ValidationError:
105
- logger.debug(f"{path} not valid for batch file data")
105
+ logger.debug(f"{path} not detected as batch file data")
106
106
 
107
107
  return FileData.from_file(path=path)
108
108
 
@@ -1,4 +1,3 @@
1
- import json
2
1
  from abc import ABC
3
2
  from dataclasses import dataclass
4
3
  from pathlib import Path
@@ -7,6 +6,7 @@ from typing import Any, TypeVar
7
6
  from pydantic import BaseModel
8
7
 
9
8
  from unstructured_ingest.utils import ndjson
9
+ from unstructured_ingest.utils.data_prep import get_data, write_data
10
10
  from unstructured_ingest.v2.interfaces.file_data import FileData
11
11
  from unstructured_ingest.v2.interfaces.process import BaseProcess
12
12
 
@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
43
43
  writer.f.flush()
44
44
 
45
45
  def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
46
- with input_file.open() as in_f:
47
- elements_contents = json.load(in_f)
46
+ elements_contents = get_data(path=input_file)
48
47
 
49
48
  conformed_elements = [
50
49
  self.conform_dict(element_dict=element, file_data=file_data)
51
50
  for element in elements_contents
52
51
  ]
53
-
54
- with open(output_file, "w") as out_f:
55
- json.dump(conformed_elements, out_f, indent=2)
52
+ write_data(path=output_file, data=conformed_elements)
56
53
 
57
54
  def run(
58
55
  self,
@@ -108,6 +108,13 @@ class Pipeline:
108
108
  uploader_connector_type = self.uploader_step.process.connector_type
109
109
  registry_entry = destination_registry[uploader_connector_type]
110
110
  if registry_entry.upload_stager and self.stager_step is None:
111
+ try:
112
+ self.stager_step = UploadStageStep(
113
+ process=registry_entry.upload_stager(), context=self.context
114
+ )
115
+ return
116
+ except Exception as e:
117
+ logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
111
118
  raise ValueError(
112
119
  f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
113
120
  f"expects a stager of type {registry_entry.upload_stager.__name__} "
@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"
35
35
 
36
36
  class ConfluenceAccessConfig(AccessConfig):
37
37
  password: Optional[str] = Field(
38
- description="Confluence password or Cloud API token",
38
+ description="Confluence password",
39
+ default=None,
40
+ )
41
+ api_token: Optional[str] = Field(
42
+ description="Confluence Cloud API token",
39
43
  default=None,
40
44
  )
41
45
  token: Optional[str] = Field(
@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):
57
61
 
58
62
  def model_post_init(self, __context):
59
63
  access_configs = self.access_config.get_secret_value()
60
- basic_auth = self.username and access_configs.password
64
+ if access_configs.password and access_configs.api_token:
65
+ raise ValueError(
66
+ "both password and api_token provided, only one allowed, "
67
+ "see: https://atlassian-python-api.readthedocs.io/"
68
+ )
69
+ basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
61
70
  pat_auth = access_configs.token
62
71
  if self.cloud and not basic_auth:
63
72
  raise ValueError(
@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
74
83
  "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
75
84
  )
76
85
 
86
+ def password_or_api_token(self) -> str:
87
+ # Confluence takes either password or API token under the same field: password
88
+ # This ambiguity led to confusion, so we are making it specific what you are passing in
89
+ access_configs = self.access_config.get_secret_value()
90
+ if access_configs.password:
91
+ return access_configs.password
92
+ return access_configs.api_token
93
+
77
94
  @requires_dependencies(["atlassian"], extras="confluence")
78
95
  @contextmanager
79
96
  def get_client(self) -> "Confluence":
@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
83
100
  with Confluence(
84
101
  url=self.url,
85
102
  username=self.username,
86
- password=access_configs.password,
103
+ password=self.password_or_api_token(),
87
104
  token=access_configs.token,
88
105
  cloud=self.cloud,
89
106
  ) as client:
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_aws"
22
26
 
@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
76
80
  connection_config=DatabricksAWSVolumesConnectionConfig,
77
81
  uploader=DatabricksAWSVolumesUploader,
78
82
  uploader_config=DatabricksAWSVolumesUploaderConfig,
83
+ upload_stager_config=BlobStoreUploadStagerConfig,
84
+ upload_stager=BlobStoreUploadStager,
79
85
  )
80
86
 
81
87
  databricks_aws_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_azure"
22
26
 
@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
91
95
  connection_config=DatabricksAzureVolumesConnectionConfig,
92
96
  uploader=DatabricksAzureVolumesUploader,
93
97
  uploader_config=DatabricksAzureVolumesUploaderConfig,
98
+ upload_stager_config=BlobStoreUploadStagerConfig,
99
+ upload_stager=BlobStoreUploadStager,
94
100
  )
95
101
 
96
102
  databricks_azure_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_gcp"
22
26
 
@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
74
78
  connection_config=DatabricksGoogleVolumesConnectionConfig,
75
79
  uploader=DatabricksGoogleVolumesUploader,
76
80
  uploader_config=DatabricksGoogleVolumesUploaderConfig,
81
+ upload_stager_config=BlobStoreUploadStagerConfig,
82
+ upload_stager=BlobStoreUploadStager,
77
83
  )
78
84
 
79
85
  databricks_gcp_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes"
22
26
 
@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
75
79
  connection_config=DatabricksNativeVolumesConnectionConfig,
76
80
  uploader=DatabricksNativeVolumesUploader,
77
81
  uploader_config=DatabricksNativeVolumesUploaderConfig,
82
+ upload_stager_config=BlobStoreUploadStagerConfig,
83
+ upload_stager=BlobStoreUploadStager,
78
84
  )
79
85
 
80
86
  databricks_native_volumes_source_entry = SourceRegistryEntry(
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploaderConfig,
27
27
  )
28
28
  from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  if TYPE_CHECKING:
31
35
  from adlfs import AzureBlobFileSystem
@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
194
198
  uploader=AzureUploader,
195
199
  uploader_config=AzureUploaderConfig,
196
200
  connection_config=AzureConnectionConfig,
201
+ upload_stager_config=BlobStoreUploadStagerConfig,
202
+ upload_stager=BlobStoreUploadStager,
197
203
  )
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
28
28
  FsspecUploaderConfig,
29
29
  )
30
30
  from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
31
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
+ BlobStoreUploadStager,
33
+ BlobStoreUploadStagerConfig,
34
+ )
31
35
 
32
36
  if TYPE_CHECKING:
33
37
  from boxfs import BoxFileSystem
@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
167
171
  uploader=BoxUploader,
168
172
  uploader_config=BoxUploaderConfig,
169
173
  connection_config=BoxConnectionConfig,
174
+ upload_stager_config=BlobStoreUploadStagerConfig,
175
+ upload_stager=BlobStoreUploadStager,
170
176
  )
@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
31
31
  FsspecUploader,
32
32
  FsspecUploaderConfig,
33
33
  )
34
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
35
+ BlobStoreUploadStager,
36
+ BlobStoreUploadStagerConfig,
37
+ )
34
38
 
35
39
  if TYPE_CHECKING:
36
40
  pass
@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
228
232
  uploader=DropboxUploader,
229
233
  uploader_config=DropboxUploaderConfig,
230
234
  connection_config=DropboxConnectionConfig,
235
+ upload_stager_config=BlobStoreUploadStagerConfig,
236
+ upload_stager=BlobStoreUploadStager,
231
237
  )
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
28
28
  FsspecUploader,
29
29
  FsspecUploaderConfig,
30
30
  )
31
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
+ BlobStoreUploadStager,
33
+ BlobStoreUploadStagerConfig,
34
+ )
31
35
 
32
36
  if TYPE_CHECKING:
33
37
  from gcsfs import GCSFileSystem
@@ -194,4 +198,6 @@ gcs_destination_entry = DestinationRegistryEntry(
194
198
  uploader=GcsUploader,
195
199
  uploader_config=GcsUploaderConfig,
196
200
  connection_config=GcsConnectionConfig,
201
+ upload_stager_config=BlobStoreUploadStagerConfig,
202
+ upload_stager=BlobStoreUploadStager,
197
203
  )
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploader,
27
27
  FsspecUploaderConfig,
28
28
  )
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  CONNECTOR_TYPE = "s3"
31
35
 
@@ -182,4 +186,6 @@ s3_destination_entry = DestinationRegistryEntry(
182
186
  uploader=S3Uploader,
183
187
  uploader_config=S3UploaderConfig,
184
188
  connection_config=S3ConnectionConfig,
189
+ upload_stager_config=BlobStoreUploadStagerConfig,
190
+ upload_stager=BlobStoreUploadStager,
185
191
  )
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploader,
27
27
  FsspecUploaderConfig,
28
28
  )
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  if TYPE_CHECKING:
31
35
  from fsspec.implementations.sftp import SFTPFileSystem
@@ -168,4 +172,6 @@ sftp_destination_entry = DestinationRegistryEntry(
168
172
  uploader=SftpUploader,
169
173
  uploader_config=SftpUploaderConfig,
170
174
  connection_config=SftpConnectionConfig,
175
+ upload_stager_config=BlobStoreUploadStagerConfig,
176
+ upload_stager=BlobStoreUploadStager,
171
177
  )
@@ -27,6 +27,10 @@ from unstructured_ingest.v2.processes.connector_registry import (
27
27
  DestinationRegistryEntry,
28
28
  SourceRegistryEntry,
29
29
  )
30
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
31
+ BlobStoreUploadStager,
32
+ BlobStoreUploadStagerConfig,
33
+ )
30
34
 
31
35
  CONNECTOR_TYPE = "local"
32
36
 
@@ -213,5 +217,8 @@ local_source_entry = SourceRegistryEntry(
213
217
  )
214
218
 
215
219
  local_destination_entry = DestinationRegistryEntry(
216
- uploader=LocalUploader, uploader_config=LocalUploaderConfig
220
+ uploader=LocalUploader,
221
+ uploader_config=LocalUploaderConfig,
222
+ upload_stager_config=BlobStoreUploadStagerConfig,
223
+ upload_stager=BlobStoreUploadStager,
217
224
  )
@@ -1,4 +1,3 @@
1
- import json
2
1
  from abc import ABC
3
2
  from dataclasses import dataclass
4
3
  from pathlib import Path
@@ -6,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
6
5
 
7
6
  from pydantic import BaseModel, Field, SecretStr
8
7
 
8
+ from unstructured_ingest.utils.data_prep import get_data
9
9
  from unstructured_ingest.v2.interfaces.process import BaseProcess
10
10
 
11
11
  if TYPE_CHECKING:
@@ -192,9 +192,8 @@ class Embedder(BaseProcess, ABC):
192
192
  def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
193
193
  # TODO update base embedder classes to support async
194
194
  embedder = self.config.get_embedder()
195
- with elements_filepath.open("r") as elements_file:
196
- elements = json.load(elements_file)
195
+ elements = get_data(path=elements_filepath)
197
196
  if not elements:
198
- return [e.to_dict() for e in elements]
197
+ return []
199
198
  embedded_elements = embedder.embed_documents(elements=elements)
200
199
  return embedded_elements
File without changes
@@ -0,0 +1,31 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from unstructured_ingest.utils.data_prep import get_data, write_data
6
+ from unstructured_ingest.v2.interfaces import FileData, UploadStager, UploadStagerConfig
7
+
8
+
9
+ class BlobStoreUploadStagerConfig(UploadStagerConfig):
10
+ pass
11
+
12
+
13
+ @dataclass
14
+ class BlobStoreUploadStager(UploadStager):
15
+ upload_stager_config: BlobStoreUploadStagerConfig = field(
16
+ default_factory=BlobStoreUploadStagerConfig
17
+ )
18
+
19
+ def run(
20
+ self,
21
+ elements_filepath: Path,
22
+ file_data: FileData,
23
+ output_dir: Path,
24
+ output_filename: str,
25
+ **kwargs: Any,
26
+ ) -> Path:
27
+ output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
28
+ # Always save as json
29
+ data = get_data(elements_filepath)
30
+ write_data(path=output_file.with_suffix(".json"), data=data)
31
+ return output_file.with_suffix(".json")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.14
3
+ Version: 0.5.15
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -23,11 +23,11 @@ Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
25
  Requires-Dist: pandas
26
- Requires-Dist: opentelemetry-sdk
27
- Requires-Dist: python-dateutil
26
+ Requires-Dist: tqdm
28
27
  Requires-Dist: pydantic>=2.7
28
+ Requires-Dist: python-dateutil
29
29
  Requires-Dist: click
30
- Requires-Dist: tqdm
30
+ Requires-Dist: opentelemetry-sdk
31
31
  Requires-Dist: dataclasses_json
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
@@ -71,11 +71,11 @@ Requires-Dist: fsspec; extra == "azure"
71
71
  Provides-Extra: azure-ai-search
72
72
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
73
73
  Provides-Extra: biomed
74
- Requires-Dist: requests; extra == "biomed"
75
74
  Requires-Dist: bs4; extra == "biomed"
75
+ Requires-Dist: requests; extra == "biomed"
76
76
  Provides-Extra: box
77
- Requires-Dist: boxfs; extra == "box"
78
77
  Requires-Dist: fsspec; extra == "box"
78
+ Requires-Dist: boxfs; extra == "box"
79
79
  Provides-Extra: chroma
80
80
  Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
86
86
  Provides-Extra: couchbase
87
87
  Requires-Dist: couchbase; extra == "couchbase"
88
88
  Provides-Extra: delta-table
89
- Requires-Dist: boto3; extra == "delta-table"
90
89
  Requires-Dist: deltalake; extra == "delta-table"
90
+ Requires-Dist: boto3; extra == "delta-table"
91
91
  Provides-Extra: discord
92
92
  Requires-Dist: discord.py; extra == "discord"
93
93
  Provides-Extra: dropbox
@@ -102,15 +102,15 @@ Requires-Dist: gcsfs; extra == "gcs"
102
102
  Requires-Dist: bs4; extra == "gcs"
103
103
  Requires-Dist: fsspec; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: requests; extra == "github"
106
105
  Requires-Dist: pygithub>1.58.0; extra == "github"
106
+ Requires-Dist: requests; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
110
110
  Requires-Dist: google-api-python-client; extra == "google-drive"
111
111
  Provides-Extra: hubspot
112
- Requires-Dist: hubspot-api-client; extra == "hubspot"
113
112
  Requires-Dist: urllib3; extra == "hubspot"
113
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
114
114
  Provides-Extra: jira
115
115
  Requires-Dist: atlassian-python-api; extra == "jira"
116
116
  Provides-Extra: kafka
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
124
124
  Provides-Extra: mongodb
125
125
  Requires-Dist: pymongo; extra == "mongodb"
126
126
  Provides-Extra: neo4j
127
+ Requires-Dist: cymple; extra == "neo4j"
127
128
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
128
129
  Requires-Dist: networkx; extra == "neo4j"
129
- Requires-Dist: cymple; extra == "neo4j"
130
130
  Provides-Extra: notion
131
131
  Requires-Dist: httpx; extra == "notion"
132
- Requires-Dist: backoff; extra == "notion"
133
- Requires-Dist: notion-client; extra == "notion"
134
132
  Requires-Dist: htmlBuilder; extra == "notion"
133
+ Requires-Dist: notion-client; extra == "notion"
134
+ Requires-Dist: backoff; extra == "notion"
135
135
  Provides-Extra: onedrive
136
- Requires-Dist: msal; extra == "onedrive"
137
- Requires-Dist: bs4; extra == "onedrive"
138
136
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
+ Requires-Dist: bs4; extra == "onedrive"
138
+ Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
142
- Requires-Dist: msal; extra == "outlook"
143
142
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
143
+ Requires-Dist: msal; extra == "outlook"
144
144
  Provides-Extra: pinecone
145
145
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
146
146
  Provides-Extra: postgres
@@ -155,8 +155,8 @@ Provides-Extra: s3
155
155
  Requires-Dist: s3fs; extra == "s3"
156
156
  Requires-Dist: fsspec; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: msal; extra == "sharepoint"
159
158
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
+ Requires-Dist: msal; extra == "sharepoint"
160
160
  Provides-Extra: salesforce
161
161
  Requires-Dist: simple-salesforce; extra == "salesforce"
162
162
  Provides-Extra: sftp
@@ -182,13 +182,13 @@ Requires-Dist: httpx; extra == "vectara"
182
182
  Requires-Dist: requests; extra == "vectara"
183
183
  Requires-Dist: aiofiles; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
+ Requires-Dist: vastdb; extra == "vastdb"
185
186
  Requires-Dist: ibis; extra == "vastdb"
186
187
  Requires-Dist: pyarrow; extra == "vastdb"
187
- Requires-Dist: vastdb; extra == "vastdb"
188
188
  Provides-Extra: zendesk
189
189
  Requires-Dist: httpx; extra == "zendesk"
190
- Requires-Dist: aiofiles; extra == "zendesk"
191
190
  Requires-Dist: bs4; extra == "zendesk"
191
+ Requires-Dist: aiofiles; extra == "zendesk"
192
192
  Provides-Extra: embed-huggingface
193
193
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
194
194
  Provides-Extra: embed-octoai
@@ -8,7 +8,7 @@ test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpei
8
8
  test/integration/connectors/test_astradb.py,sha256=c9Lk0dvJVVdzHcokvsc4XMNJ4SIO1k2vGtT5py0cFVM,9753
9
9
  test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
10
10
  test/integration/connectors/test_chroma.py,sha256=1uGHbZXkXKGb8wl3p7c9G-L1MViUe283Hw5u3dg8OgI,4532
11
- test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
11
+ test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
12
12
  test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
13
13
  test/integration/connectors/test_dropbox.py,sha256=jzpZ6wawLa4sC1BVoHWZJ3cHjL4DWWUEX5ee7bXUOOM,4945
14
14
  test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
@@ -89,7 +89,7 @@ test/unit/v2/test_utils.py,sha256=TWVAeE0OrcHgPyzGPtEnQakICsVrDeVhIKPMRQPX554,26
89
89
  test/unit/v2/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
90
  test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCrSXvcMFecE,1772
91
91
  test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
- test/unit/v2/connectors/test_confluence.py,sha256=bXrn_kRb4IQdqkk4rc-P2gJAtPba7n7pNplQgfbqZDY,1047
92
+ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
93
93
  test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
94
94
  test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
95
  test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
111
111
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
113
113
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
114
- unstructured_ingest/__version__.py,sha256=-WADymCsmQMFkVZ96tUEJfzNLFjyNrB26q5Or_LtNJs,43
114
+ unstructured_ingest/__version__.py,sha256=noAC1JV7rAfkk9NQctRgYOifiiASnPhPSbtOr9y3Hkk,43
115
115
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
116
116
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
117
117
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -281,7 +281,7 @@ unstructured_ingest/connector/notion/types/database_properties/verification.py,s
281
281
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
282
282
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
283
283
  unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
284
- unstructured_ingest/embed/huggingface.py,sha256=EWU1kd5Cm6ajgCw6hP5w_4pniGSgxnR0wM9vjuPQ6Yk,2334
284
+ unstructured_ingest/embed/huggingface.py,sha256=-ZD17O_H_UnK80fqig6y6wNKJckjx0HuAkY5vgPvk8M,2259
285
285
  unstructured_ingest/embed/interfaces.py,sha256=_-CqasY6R5nnNUY-X6PS5lz8dsmGaUw5zIGRdPfx16o,4918
286
286
  unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
287
287
  unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
@@ -370,7 +370,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
370
370
  unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
371
371
  unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
372
372
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
373
- unstructured_ingest/utils/data_prep.py,sha256=X3d8Kos1zqX-HQAicF_8TB0BrstRtHrbMzu_1s7mj7M,7191
373
+ unstructured_ingest/utils/data_prep.py,sha256=AKtsdu9stYA63CV1C5B_fFWigqy-giVv-euumitos-A,7266
374
374
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
375
375
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
376
376
  unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
@@ -399,30 +399,30 @@ unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1K
399
399
  unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
400
400
  unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
401
401
  unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
402
- unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
402
+ unstructured_ingest/v2/interfaces/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
403
403
  unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
404
404
  unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
405
405
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
406
- unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
406
+ unstructured_ingest/v2/interfaces/upload_stager.py,sha256=Bzhb994gVqFI8KBd6lx3Rcr5UH0ZhU66jOD3WAUr20Q,3151
407
407
  unstructured_ingest/v2/interfaces/uploader.py,sha256=AMgp0uaJ5XeqiyURLIUnWyoIqhUT9Ak5P_LT9-qasYk,2107
408
408
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
409
409
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
410
410
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
411
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=UeOk5SywJZIn3kCnHclQ2cP7JJIXb4NDjpwzsCP_cF0,16523
411
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=m3m9F9wZsCEhsFK_0WZv5_ENl2M42VHBV6Vc39t90v8,16842
412
412
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
413
- unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
413
+ unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=s2BY2v1cs_ImTsPrg8J-92k-fV73b61nDiSy4p9736k,3223
414
414
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
415
- unstructured_ingest/v2/pipeline/steps/embed.py,sha256=iL6X0G5AvKnlfI-3XRWudlb0-6rD_PqyzA3MFmmcn6M,3199
415
+ unstructured_ingest/v2/pipeline/steps/embed.py,sha256=HPQgEWvVrpThUD1FB9k7XNiARXkd6rb4lnpxTGmEQxI,3201
416
416
  unstructured_ingest/v2/pipeline/steps/filter.py,sha256=pju7knTSbB2ll1jC9DPePRDnHlOlvEcU1-sjk6xYGGc,1211
417
417
  unstructured_ingest/v2/pipeline/steps/index.py,sha256=m0BbUwe_7s_gFxR9K31IJdAf3_GgKXXajGJec5jcSXA,3557
418
- unstructured_ingest/v2/pipeline/steps/partition.py,sha256=IJQWaOTcyFlH2bz8WbmynE5Zkd5D8ELOKTnSCnt9Wcc,3282
418
+ unstructured_ingest/v2/pipeline/steps/partition.py,sha256=yE4HFFyORhnzH25PoJG6MNquMXqpzAznyf9NoZYBV5E,3284
419
419
  unstructured_ingest/v2/pipeline/steps/stage.py,sha256=VR8SLUJdVva61aieVKyxUHzupTCQbQeaMA0CKu4Fx7o,2347
420
420
  unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=p2nPFGbcpivPAZO5jDogTfn0iaL5bCFsgBNMejxVbzE,1768
421
421
  unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_vjQt5hgsy_jRCxPzxo4,2010
422
422
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
423
423
  unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
424
424
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
425
- unstructured_ingest/v2/processes/embedder.py,sha256=4x-Rt5UCvwdgihDAr24hvTGDEd1CdKF9xJrf3aMU-ck,7926
425
+ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8gJwIS-imgloE-UOc,7887
426
426
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
427
427
  unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
428
428
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
@@ -431,7 +431,7 @@ unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XE
431
431
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=E6fB4anCd_gtSzVUsZ5pDrfdxs5AWERQM_NEfeenfEs,18202
432
432
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
433
433
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
434
- unstructured_ingest/v2/processes/connectors/confluence.py,sha256=wTZewdbmCHaQuEJZ7Wf0NBOo8fS_n1I0DDwlhN96woE,11243
434
+ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=gSs4-AxL0gfeWdJfP7JfCrQSQNLoJRkvHquKK9RJvpQ,12043
435
435
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
436
436
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=xvLWTSFEC3gyGTwEISXxWmUoAfCgzdgZkETMMBOPHuI,7153
437
437
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
@@ -439,7 +439,7 @@ unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5B
439
439
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
440
440
  unstructured_ingest/v2/processes/connectors/jira.py,sha256=-f_vIWNw6Xr8rMNdAcfCC2cmhB-QndnZk5XymHo60FU,17094
441
441
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
442
- unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
442
+ unstructured_ingest/v2/processes/connectors/local.py,sha256=FWPRjjUsnQjyZMChuZGuMU04AB5X0sFEOcAXhx1r9sk,7381
443
443
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
444
444
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
445
445
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
@@ -456,10 +456,10 @@ unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HB
456
456
  unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
457
457
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
458
458
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
459
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
460
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=cb-EUW0T-linZMkbU6AcKEGWnFHQvhpO5Abtps4P2X0,3532
461
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8NubkyHw49IpW_42g6w1Koxlm56EPiPf1lB-eoRSI,2783
462
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
459
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
460
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
461
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
462
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
463
463
  unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
464
464
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
465
465
  unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
@@ -469,13 +469,13 @@ unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc
469
469
  unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
470
470
  unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
471
471
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
472
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=fwbHYoRrN0ZRuLdLb5X-Z7nr11rMSY8VhWMhfR3ljQo,6933
473
- unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=VXxEfgJbW8DCOrqLW7mQkSeWqH-HczidTNIE28SgERY,5658
474
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=DUAxkMDosLhRYfITuXh-Jpupwd20_3VK4P-FK-wgg7k,8137
472
+ unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
473
+ unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
474
+ unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
475
475
  unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
476
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=uOfm2tLc0r5U3CNkfauuwhGOhP7RJpjyBpHWMDXCk7c,6954
477
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=LcfIU-QgW5oVMF4jMUVm7HSgVcSrQamY6mgXdQuiSjc,6400
478
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=3cVwVH3fT_JEYzIbl48-NDXdbo7XWX4C4_eqTvgWIro,6150
476
+ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
477
+ unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
478
+ unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
479
479
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
480
480
  unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
481
481
  unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
@@ -575,9 +575,11 @@ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnV
575
575
  unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
576
576
  unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=MNyI6SUuxZHf_6zONoC6jR2f9BvyTYoMyGKDOhl4kgs,7897
577
577
  unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=vQHZa5YYiDPXXPRAPMnPXhh0QzXeiBVx_YIWskZBQIc,15465
578
- unstructured_ingest-0.5.14.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
579
- unstructured_ingest-0.5.14.dist-info/METADATA,sha256=MWLSj6JZslpZjjlZhvn34zn_AJdlVHZibG2RPEbaMjE,8465
580
- unstructured_ingest-0.5.14.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
581
- unstructured_ingest-0.5.14.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
582
- unstructured_ingest-0.5.14.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
583
- unstructured_ingest-0.5.14.dist-info/RECORD,,
578
+ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
579
+ unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
580
+ unstructured_ingest-0.5.15.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
+ unstructured_ingest-0.5.15.dist-info/METADATA,sha256=TimVS8ZngyfFUMhuD317dXl6nlI9acBRC_LxZWZczuU,8465
582
+ unstructured_ingest-0.5.15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
+ unstructured_ingest-0.5.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
+ unstructured_ingest-0.5.15.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
+ unstructured_ingest-0.5.15.dist-info/RECORD,,