unstructured-ingest 0.5.13__py3-none-any.whl → 0.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the release advisory for more details.

Files changed (35)
  1. test/integration/connectors/test_confluence.py +2 -2
  2. test/integration/connectors/test_zendesk.py +1 -1
  3. test/unit/v2/connectors/test_confluence.py +35 -3
  4. unstructured_ingest/__version__.py +1 -1
  5. unstructured_ingest/embed/huggingface.py +3 -7
  6. unstructured_ingest/utils/data_prep.py +4 -2
  7. unstructured_ingest/v2/interfaces/file_data.py +1 -1
  8. unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
  9. unstructured_ingest/v2/pipeline/pipeline.py +7 -0
  10. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  11. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  12. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  13. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  14. unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
  15. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
  16. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
  17. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
  19. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
  20. unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
  21. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
  22. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
  24. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
  25. unstructured_ingest/v2/processes/connectors/local.py +8 -1
  26. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -31
  27. unstructured_ingest/v2/processes/embedder.py +3 -4
  28. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  29. unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
  30. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/METADATA +19 -19
  31. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/RECORD +35 -33
  32. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/LICENSE.md +0 -0
  33. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/WHEEL +0 -0
  34. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/entry_points.txt +0 -0
  35. {unstructured_ingest-0.5.13.dist-info → unstructured_ingest-0.5.15.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
30
30
  spaces = ["testteamsp", "MFS"]
31
31
 
32
32
  # Create connection and indexer configurations
33
- access_config = ConfluenceAccessConfig(password=api_token)
33
+ access_config = ConfluenceAccessConfig(api_token=api_token)
34
34
  connection_config = ConfluenceConnectionConfig(
35
35
  url=confluence_url,
36
36
  username=user_email,
@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
77
77
  spaces = ["testteamsp1"]
78
78
 
79
79
  # Create connection and indexer configurations
80
- access_config = ConfluenceAccessConfig(password=api_token)
80
+ access_config = ConfluenceAccessConfig(api_token=api_token)
81
81
  connection_config = ConfluenceConnectionConfig(
82
82
  url=confluence_url,
83
83
  username=user_email,
@@ -11,7 +11,7 @@ from test.integration.connectors.utils.validation.source import (
11
11
  )
12
12
  from test.integration.utils import requires_env
13
13
  from unstructured_ingest.v2.errors import UserAuthError
14
- from unstructured_ingest.v2.processes.connectors.zendesk import (
14
+ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
15
15
  CONNECTOR_TYPE,
16
16
  ZendeskAccessConfig,
17
17
  ZendeskConnectionConfig,
@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
11
11
  with pytest.raises(ValidationError):
12
12
  ConfluenceConnectionConfig(
13
13
  access_config=ConfluenceAccessConfig(
14
- password="api_token",
14
+ password="password",
15
15
  token="access_token",
16
16
  ),
17
17
  username="user_email",
@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
19
19
  )
20
20
 
21
21
 
22
+ def test_connection_config_multiple_auth2():
23
+ with pytest.raises(ValidationError):
24
+ ConfluenceConnectionConfig(
25
+ access_config=ConfluenceAccessConfig(
26
+ api_token="api_token",
27
+ token="access_token",
28
+ ),
29
+ username="user_email",
30
+ url="url",
31
+ )
32
+
33
+
34
+ def test_connection_config_multiple_auth3():
35
+ with pytest.raises(ValidationError):
36
+ ConfluenceConnectionConfig(
37
+ access_config=ConfluenceAccessConfig(
38
+ api_token="api_token",
39
+ password="password",
40
+ ),
41
+ username="user_email",
42
+ url="url",
43
+ )
44
+
45
+
22
46
  def test_connection_config_no_auth():
23
47
  with pytest.raises(ValidationError):
24
48
  ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
25
49
 
26
50
 
27
- def test_connection_config_basic_auth():
51
+ def test_connection_config_password_auth():
52
+ ConfluenceConnectionConfig(
53
+ access_config=ConfluenceAccessConfig(password="password"),
54
+ url="url",
55
+ username="user_email",
56
+ )
57
+
58
+
59
+ def test_connection_config_api_token_auth():
28
60
  ConfluenceConnectionConfig(
29
- access_config=ConfluenceAccessConfig(password="api_token"),
61
+ access_config=ConfluenceAccessConfig(api_token="api_token"),
30
62
  url="url",
31
63
  username="user_email",
32
64
  )
@@ -1 +1 @@
1
- __version__ = "0.5.13" # pragma: no cover
1
+ __version__ = "0.5.15" # pragma: no cover
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass
1
+ from dataclasses import dataclass, field
2
2
  from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from pydantic import Field
@@ -15,14 +15,11 @@ if TYPE_CHECKING:
15
15
 
16
16
 
17
17
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
18
- embedder_model_name: Optional[str] = Field(
19
- default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
20
- )
18
+ embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
21
19
  embedder_model_kwargs: Optional[dict] = Field(
22
20
  default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
23
21
  )
24
22
  encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
25
- cache_folder: Optional[str] = Field(default=None)
26
23
 
27
24
  @requires_dependencies(
28
25
  ["sentence_transformers"],
@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
33
30
 
34
31
  return SentenceTransformer(
35
32
  model_name_or_path=self.embedder_model_name,
36
- cache_folder=self.cache_folder,
37
33
  **self.embedder_model_kwargs,
38
34
  )
39
35
 
@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
45
41
 
46
42
  @dataclass
47
43
  class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
48
- config: HuggingFaceEmbeddingConfig
44
+ config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)
49
45
 
50
46
  def _embed_query(self, query: str) -> list[float]:
51
47
  return self._embed_documents(texts=[query])[0]
@@ -2,7 +2,7 @@ import itertools
2
2
  import json
3
3
  from datetime import datetime
4
4
  from pathlib import Path
5
- from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
5
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
6
6
 
7
7
  import pandas as pd
8
8
 
@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
163
163
  raise IOError("Unsupported file type: {path}")
164
164
 
165
165
 
166
- def get_data(path: Path) -> list[dict]:
166
+ def get_data(path: Union[Path, str]) -> list[dict]:
167
+ if isinstance(path, str):
168
+ path = Path(path)
167
169
  try:
168
170
  return get_data_by_suffix(path=path)
169
171
  except Exception as e:
@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
102
102
  try:
103
103
  return BatchFileData.from_file(path=path)
104
104
  except ValidationError:
105
- logger.debug(f"{path} not valid for batch file data")
105
+ logger.debug(f"{path} not detected as batch file data")
106
106
 
107
107
  return FileData.from_file(path=path)
108
108
 
@@ -1,4 +1,3 @@
1
- import json
2
1
  from abc import ABC
3
2
  from dataclasses import dataclass
4
3
  from pathlib import Path
@@ -7,6 +6,7 @@ from typing import Any, TypeVar
7
6
  from pydantic import BaseModel
8
7
 
9
8
  from unstructured_ingest.utils import ndjson
9
+ from unstructured_ingest.utils.data_prep import get_data, write_data
10
10
  from unstructured_ingest.v2.interfaces.file_data import FileData
11
11
  from unstructured_ingest.v2.interfaces.process import BaseProcess
12
12
 
@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
43
43
  writer.f.flush()
44
44
 
45
45
  def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
46
- with input_file.open() as in_f:
47
- elements_contents = json.load(in_f)
46
+ elements_contents = get_data(path=input_file)
48
47
 
49
48
  conformed_elements = [
50
49
  self.conform_dict(element_dict=element, file_data=file_data)
51
50
  for element in elements_contents
52
51
  ]
53
-
54
- with open(output_file, "w") as out_f:
55
- json.dump(conformed_elements, out_f, indent=2)
52
+ write_data(path=output_file, data=conformed_elements)
56
53
 
57
54
  def run(
58
55
  self,
@@ -108,6 +108,13 @@ class Pipeline:
108
108
  uploader_connector_type = self.uploader_step.process.connector_type
109
109
  registry_entry = destination_registry[uploader_connector_type]
110
110
  if registry_entry.upload_stager and self.stager_step is None:
111
+ try:
112
+ self.stager_step = UploadStageStep(
113
+ process=registry_entry.upload_stager(), context=self.context
114
+ )
115
+ return
116
+ except Exception as e:
117
+ logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
111
118
  raise ValueError(
112
119
  f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
113
120
  f"expects a stager of type {registry_entry.upload_stager.__name__} "
@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
38
38
  return not filepath.exists()
39
39
 
40
40
  def get_output_filepath(self, filename: Path) -> Path:
41
- hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
41
+ hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
42
42
  filepath = (self.cache_dir / hashed_output_file).resolve()
43
43
  filepath.parent.mkdir(parents=True, exist_ok=True)
44
44
  return filepath
@@ -64,6 +64,8 @@ from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
64
64
  from .slack import slack_source_entry
65
65
  from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
66
66
  from .vectara import vectara_destination_entry
67
+ from .zendesk.zendesk import CONNECTOR_TYPE as ZENDESK_CONNECTOR_TYPE
68
+ from .zendesk.zendesk import zendesk_source_entry
67
69
 
68
70
  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
69
71
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -119,3 +121,4 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
119
121
  add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
120
122
 
121
123
  add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
124
+ add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"
35
35
 
36
36
  class ConfluenceAccessConfig(AccessConfig):
37
37
  password: Optional[str] = Field(
38
- description="Confluence password or Cloud API token",
38
+ description="Confluence password",
39
+ default=None,
40
+ )
41
+ api_token: Optional[str] = Field(
42
+ description="Confluence Cloud API token",
39
43
  default=None,
40
44
  )
41
45
  token: Optional[str] = Field(
@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):
57
61
 
58
62
  def model_post_init(self, __context):
59
63
  access_configs = self.access_config.get_secret_value()
60
- basic_auth = self.username and access_configs.password
64
+ if access_configs.password and access_configs.api_token:
65
+ raise ValueError(
66
+ "both password and api_token provided, only one allowed, "
67
+ "see: https://atlassian-python-api.readthedocs.io/"
68
+ )
69
+ basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
61
70
  pat_auth = access_configs.token
62
71
  if self.cloud and not basic_auth:
63
72
  raise ValueError(
@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
74
83
  "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
75
84
  )
76
85
 
86
+ def password_or_api_token(self) -> str:
87
+ # Confluence takes either password or API token under the same field: password
88
+ # This ambiguity led to confusion, so we are making it specific what you are passing in
89
+ access_configs = self.access_config.get_secret_value()
90
+ if access_configs.password:
91
+ return access_configs.password
92
+ return access_configs.api_token
93
+
77
94
  @requires_dependencies(["atlassian"], extras="confluence")
78
95
  @contextmanager
79
96
  def get_client(self) -> "Confluence":
@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
83
100
  with Confluence(
84
101
  url=self.url,
85
102
  username=self.username,
86
- password=access_configs.password,
103
+ password=self.password_or_api_token(),
87
104
  token=access_configs.token,
88
105
  cloud=self.cloud,
89
106
  ) as client:
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_aws"
22
26
 
@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
76
80
  connection_config=DatabricksAWSVolumesConnectionConfig,
77
81
  uploader=DatabricksAWSVolumesUploader,
78
82
  uploader_config=DatabricksAWSVolumesUploaderConfig,
83
+ upload_stager_config=BlobStoreUploadStagerConfig,
84
+ upload_stager=BlobStoreUploadStager,
79
85
  )
80
86
 
81
87
  databricks_aws_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_azure"
22
26
 
@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
91
95
  connection_config=DatabricksAzureVolumesConnectionConfig,
92
96
  uploader=DatabricksAzureVolumesUploader,
93
97
  uploader_config=DatabricksAzureVolumesUploaderConfig,
98
+ upload_stager_config=BlobStoreUploadStagerConfig,
99
+ upload_stager=BlobStoreUploadStager,
94
100
  )
95
101
 
96
102
  databricks_azure_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes_gcp"
22
26
 
@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
74
78
  connection_config=DatabricksGoogleVolumesConnectionConfig,
75
79
  uploader=DatabricksGoogleVolumesUploader,
76
80
  uploader_config=DatabricksGoogleVolumesUploaderConfig,
81
+ upload_stager_config=BlobStoreUploadStagerConfig,
82
+ upload_stager=BlobStoreUploadStager,
77
83
  )
78
84
 
79
85
  databricks_gcp_volumes_source_entry = SourceRegistryEntry(
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
17
17
  DatabricksVolumesUploader,
18
18
  DatabricksVolumesUploaderConfig,
19
19
  )
20
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
20
24
 
21
25
  CONNECTOR_TYPE = "databricks_volumes"
22
26
 
@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
75
79
  connection_config=DatabricksNativeVolumesConnectionConfig,
76
80
  uploader=DatabricksNativeVolumesUploader,
77
81
  uploader_config=DatabricksNativeVolumesUploaderConfig,
82
+ upload_stager_config=BlobStoreUploadStagerConfig,
83
+ upload_stager=BlobStoreUploadStager,
78
84
  )
79
85
 
80
86
  databricks_native_volumes_source_entry = SourceRegistryEntry(
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploaderConfig,
27
27
  )
28
28
  from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  if TYPE_CHECKING:
31
35
  from adlfs import AzureBlobFileSystem
@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
194
198
  uploader=AzureUploader,
195
199
  uploader_config=AzureUploaderConfig,
196
200
  connection_config=AzureConnectionConfig,
201
+ upload_stager_config=BlobStoreUploadStagerConfig,
202
+ upload_stager=BlobStoreUploadStager,
197
203
  )
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
28
28
  FsspecUploaderConfig,
29
29
  )
30
30
  from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
31
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
+ BlobStoreUploadStager,
33
+ BlobStoreUploadStagerConfig,
34
+ )
31
35
 
32
36
  if TYPE_CHECKING:
33
37
  from boxfs import BoxFileSystem
@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
167
171
  uploader=BoxUploader,
168
172
  uploader_config=BoxUploaderConfig,
169
173
  connection_config=BoxConnectionConfig,
174
+ upload_stager_config=BlobStoreUploadStagerConfig,
175
+ upload_stager=BlobStoreUploadStager,
170
176
  )
@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
31
31
  FsspecUploader,
32
32
  FsspecUploaderConfig,
33
33
  )
34
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
35
+ BlobStoreUploadStager,
36
+ BlobStoreUploadStagerConfig,
37
+ )
34
38
 
35
39
  if TYPE_CHECKING:
36
40
  pass
@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
228
232
  uploader=DropboxUploader,
229
233
  uploader_config=DropboxUploaderConfig,
230
234
  connection_config=DropboxConnectionConfig,
235
+ upload_stager_config=BlobStoreUploadStagerConfig,
236
+ upload_stager=BlobStoreUploadStager,
231
237
  )
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
28
28
  FsspecUploader,
29
29
  FsspecUploaderConfig,
30
30
  )
31
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
+ BlobStoreUploadStager,
33
+ BlobStoreUploadStagerConfig,
34
+ )
31
35
 
32
36
  if TYPE_CHECKING:
33
37
  from gcsfs import GCSFileSystem
@@ -194,4 +198,6 @@ gcs_destination_entry = DestinationRegistryEntry(
194
198
  uploader=GcsUploader,
195
199
  uploader_config=GcsUploaderConfig,
196
200
  connection_config=GcsConnectionConfig,
201
+ upload_stager_config=BlobStoreUploadStagerConfig,
202
+ upload_stager=BlobStoreUploadStager,
197
203
  )
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploader,
27
27
  FsspecUploaderConfig,
28
28
  )
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  CONNECTOR_TYPE = "s3"
31
35
 
@@ -182,4 +186,6 @@ s3_destination_entry = DestinationRegistryEntry(
182
186
  uploader=S3Uploader,
183
187
  uploader_config=S3UploaderConfig,
184
188
  connection_config=S3ConnectionConfig,
189
+ upload_stager_config=BlobStoreUploadStagerConfig,
190
+ upload_stager=BlobStoreUploadStager,
185
191
  )
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploader,
27
27
  FsspecUploaderConfig,
28
28
  )
29
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
29
33
 
30
34
  if TYPE_CHECKING:
31
35
  from fsspec.implementations.sftp import SFTPFileSystem
@@ -168,4 +172,6 @@ sftp_destination_entry = DestinationRegistryEntry(
168
172
  uploader=SftpUploader,
169
173
  uploader_config=SftpUploaderConfig,
170
174
  connection_config=SftpConnectionConfig,
175
+ upload_stager_config=BlobStoreUploadStagerConfig,
176
+ upload_stager=BlobStoreUploadStager,
171
177
  )
@@ -27,6 +27,10 @@ from unstructured_ingest.v2.processes.connector_registry import (
27
27
  DestinationRegistryEntry,
28
28
  SourceRegistryEntry,
29
29
  )
30
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
31
+ BlobStoreUploadStager,
32
+ BlobStoreUploadStagerConfig,
33
+ )
30
34
 
31
35
  CONNECTOR_TYPE = "local"
32
36
 
@@ -213,5 +217,8 @@ local_source_entry = SourceRegistryEntry(
213
217
  )
214
218
 
215
219
  local_destination_entry = DestinationRegistryEntry(
216
- uploader=LocalUploader, uploader_config=LocalUploaderConfig
220
+ uploader=LocalUploader,
221
+ uploader_config=LocalUploaderConfig,
222
+ upload_stager_config=BlobStoreUploadStagerConfig,
223
+ upload_stager=BlobStoreUploadStager,
217
224
  )
@@ -1,31 +0,0 @@
1
- from unstructured_ingest.v2.processes.connector_registry import (
2
- add_source_entry,
3
- )
4
-
5
- from .zendesk import (
6
- CONNECTOR_TYPE,
7
- ZendeskAccessConfig,
8
- ZendeskClient,
9
- ZendeskConnectionConfig,
10
- ZendeskDownloader,
11
- ZendeskDownloaderConfig,
12
- ZendeskIndexer,
13
- ZendeskIndexerConfig,
14
- ZendeskTicket,
15
- zendesk_source_entry,
16
- )
17
-
18
- __all__ = [
19
- "add_source_entry",
20
- "zendesk_source_entry",
21
- "ZendeskAccessConfig",
22
- "ZendeskClient",
23
- "ZendeskConnectionConfig",
24
- "ZendeskDownloader",
25
- "ZendeskDownloaderConfig",
26
- "ZendeskIndexer",
27
- "ZendeskIndexerConfig",
28
- "ZendeskTicket",
29
- ]
30
-
31
- add_source_entry(source_type=CONNECTOR_TYPE, entry=zendesk_source_entry)
@@ -1,4 +1,3 @@
1
- import json
2
1
  from abc import ABC
3
2
  from dataclasses import dataclass
4
3
  from pathlib import Path
@@ -6,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
6
5
 
7
6
  from pydantic import BaseModel, Field, SecretStr
8
7
 
8
+ from unstructured_ingest.utils.data_prep import get_data
9
9
  from unstructured_ingest.v2.interfaces.process import BaseProcess
10
10
 
11
11
  if TYPE_CHECKING:
@@ -192,9 +192,8 @@ class Embedder(BaseProcess, ABC):
192
192
  def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
193
193
  # TODO update base embedder classes to support async
194
194
  embedder = self.config.get_embedder()
195
- with elements_filepath.open("r") as elements_file:
196
- elements = json.load(elements_file)
195
+ elements = get_data(path=elements_filepath)
197
196
  if not elements:
198
- return [e.to_dict() for e in elements]
197
+ return []
199
198
  embedded_elements = embedder.embed_documents(elements=elements)
200
199
  return embedded_elements
File without changes
@@ -0,0 +1,31 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from unstructured_ingest.utils.data_prep import get_data, write_data
6
+ from unstructured_ingest.v2.interfaces import FileData, UploadStager, UploadStagerConfig
7
+
8
+
9
+ class BlobStoreUploadStagerConfig(UploadStagerConfig):
10
+ pass
11
+
12
+
13
+ @dataclass
14
+ class BlobStoreUploadStager(UploadStager):
15
+ upload_stager_config: BlobStoreUploadStagerConfig = field(
16
+ default_factory=BlobStoreUploadStagerConfig
17
+ )
18
+
19
+ def run(
20
+ self,
21
+ elements_filepath: Path,
22
+ file_data: FileData,
23
+ output_dir: Path,
24
+ output_filename: str,
25
+ **kwargs: Any,
26
+ ) -> Path:
27
+ output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
28
+ # Always save as json
29
+ data = get_data(elements_filepath)
30
+ write_data(path=output_file.with_suffix(".json"), data=data)
31
+ return output_file.with_suffix(".json")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.13
3
+ Version: 0.5.15
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: python-dateutil
26
- Requires-Dist: dataclasses_json
27
25
  Requires-Dist: pandas
26
+ Requires-Dist: tqdm
28
27
  Requires-Dist: pydantic>=2.7
28
+ Requires-Dist: python-dateutil
29
29
  Requires-Dist: click
30
30
  Requires-Dist: opentelemetry-sdk
31
- Requires-Dist: tqdm
31
+ Requires-Dist: dataclasses_json
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
34
34
  Provides-Extra: csv
@@ -71,11 +71,11 @@ Requires-Dist: fsspec; extra == "azure"
71
71
  Provides-Extra: azure-ai-search
72
72
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
73
73
  Provides-Extra: biomed
74
- Requires-Dist: requests; extra == "biomed"
75
74
  Requires-Dist: bs4; extra == "biomed"
75
+ Requires-Dist: requests; extra == "biomed"
76
76
  Provides-Extra: box
77
- Requires-Dist: boxfs; extra == "box"
78
77
  Requires-Dist: fsspec; extra == "box"
78
+ Requires-Dist: boxfs; extra == "box"
79
79
  Provides-Extra: chroma
80
80
  Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
@@ -102,15 +102,15 @@ Requires-Dist: gcsfs; extra == "gcs"
102
102
  Requires-Dist: bs4; extra == "gcs"
103
103
  Requires-Dist: fsspec; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: requests; extra == "github"
106
105
  Requires-Dist: pygithub>1.58.0; extra == "github"
106
+ Requires-Dist: requests; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
110
110
  Requires-Dist: google-api-python-client; extra == "google-drive"
111
111
  Provides-Extra: hubspot
112
- Requires-Dist: hubspot-api-client; extra == "hubspot"
113
112
  Requires-Dist: urllib3; extra == "hubspot"
113
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
114
114
  Provides-Extra: jira
115
115
  Requires-Dist: atlassian-python-api; extra == "jira"
116
116
  Provides-Extra: kafka
@@ -128,14 +128,14 @@ Requires-Dist: cymple; extra == "neo4j"
128
128
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
129
129
  Requires-Dist: networkx; extra == "neo4j"
130
130
  Provides-Extra: notion
131
- Requires-Dist: notion-client; extra == "notion"
131
+ Requires-Dist: httpx; extra == "notion"
132
132
  Requires-Dist: htmlBuilder; extra == "notion"
133
+ Requires-Dist: notion-client; extra == "notion"
133
134
  Requires-Dist: backoff; extra == "notion"
134
- Requires-Dist: httpx; extra == "notion"
135
135
  Provides-Extra: onedrive
136
136
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
- Requires-Dist: msal; extra == "onedrive"
138
137
  Requires-Dist: bs4; extra == "onedrive"
138
+ Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
@@ -152,8 +152,8 @@ Requires-Dist: praw; extra == "reddit"
152
152
  Provides-Extra: redis
153
153
  Requires-Dist: redis; extra == "redis"
154
154
  Provides-Extra: s3
155
- Requires-Dist: fsspec; extra == "s3"
156
155
  Requires-Dist: s3fs; extra == "s3"
156
+ Requires-Dist: fsspec; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
158
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
159
  Requires-Dist: msal; extra == "sharepoint"
@@ -165,8 +165,8 @@ Requires-Dist: fsspec; extra == "sftp"
165
165
  Provides-Extra: slack
166
166
  Requires-Dist: slack_sdk[optional]; extra == "slack"
167
167
  Provides-Extra: snowflake
168
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
169
168
  Requires-Dist: psycopg2-binary; extra == "snowflake"
169
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
170
170
  Provides-Extra: wikipedia
171
171
  Requires-Dist: wikipedia; extra == "wikipedia"
172
172
  Provides-Extra: weaviate
@@ -178,22 +178,22 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
178
178
  Provides-Extra: singlestore
179
179
  Requires-Dist: singlestoredb; extra == "singlestore"
180
180
  Provides-Extra: vectara
181
+ Requires-Dist: httpx; extra == "vectara"
181
182
  Requires-Dist: requests; extra == "vectara"
182
183
  Requires-Dist: aiofiles; extra == "vectara"
183
- Requires-Dist: httpx; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
- Requires-Dist: ibis; extra == "vastdb"
186
185
  Requires-Dist: vastdb; extra == "vastdb"
186
+ Requires-Dist: ibis; extra == "vastdb"
187
187
  Requires-Dist: pyarrow; extra == "vastdb"
188
188
  Provides-Extra: zendesk
189
+ Requires-Dist: httpx; extra == "zendesk"
189
190
  Requires-Dist: bs4; extra == "zendesk"
190
191
  Requires-Dist: aiofiles; extra == "zendesk"
191
- Requires-Dist: httpx; extra == "zendesk"
192
192
  Provides-Extra: embed-huggingface
193
193
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
194
194
  Provides-Extra: embed-octoai
195
- Requires-Dist: tiktoken; extra == "embed-octoai"
196
195
  Requires-Dist: openai; extra == "embed-octoai"
196
+ Requires-Dist: tiktoken; extra == "embed-octoai"
197
197
  Provides-Extra: embed-vertexai
198
198
  Requires-Dist: vertexai; extra == "embed-vertexai"
199
199
  Provides-Extra: embed-voyageai
@@ -201,11 +201,11 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
201
201
  Provides-Extra: embed-mixedbreadai
202
202
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
203
203
  Provides-Extra: openai
204
- Requires-Dist: tiktoken; extra == "openai"
205
204
  Requires-Dist: openai; extra == "openai"
205
+ Requires-Dist: tiktoken; extra == "openai"
206
206
  Provides-Extra: bedrock
207
- Requires-Dist: boto3; extra == "bedrock"
208
207
  Requires-Dist: aioboto3; extra == "bedrock"
208
+ Requires-Dist: boto3; extra == "bedrock"
209
209
  Provides-Extra: togetherai
210
210
  Requires-Dist: together; extra == "togetherai"
211
211
  Dynamic: author
@@ -8,7 +8,7 @@ test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpei
8
8
  test/integration/connectors/test_astradb.py,sha256=c9Lk0dvJVVdzHcokvsc4XMNJ4SIO1k2vGtT5py0cFVM,9753
9
9
  test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
10
10
  test/integration/connectors/test_chroma.py,sha256=1uGHbZXkXKGb8wl3p7c9G-L1MViUe283Hw5u3dg8OgI,4532
11
- test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
11
+ test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
12
12
  test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
13
13
  test/integration/connectors/test_dropbox.py,sha256=jzpZ6wawLa4sC1BVoHWZJ3cHjL4DWWUEX5ee7bXUOOM,4945
14
14
  test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
@@ -25,7 +25,7 @@ test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWT
25
25
  test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
26
26
  test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
27
27
  test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
28
- test/integration/connectors/test_zendesk.py,sha256=6Xsxxav9b1NBp_zd66S_sE4Nn5iO6Et4a5zgGR2-Y04,4159
28
+ test/integration/connectors/test_zendesk.py,sha256=Jc1GcMBnCrpzm6_6tJi-FdYxSs15xnp94a7kVwrObMc,4167
29
29
  test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
31
31
  test/integration/connectors/discord/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -89,7 +89,7 @@ test/unit/v2/test_utils.py,sha256=TWVAeE0OrcHgPyzGPtEnQakICsVrDeVhIKPMRQPX554,26
89
89
  test/unit/v2/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
90
  test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCrSXvcMFecE,1772
91
91
  test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
- test/unit/v2/connectors/test_confluence.py,sha256=bXrn_kRb4IQdqkk4rc-P2gJAtPba7n7pNplQgfbqZDY,1047
92
+ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
93
93
  test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
94
94
  test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
95
  test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
111
111
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
113
113
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
114
- unstructured_ingest/__version__.py,sha256=SoXJvWNbP_dJpjjbYHn-IaGnsULa9X8yicnEFO_W3yI,43
114
+ unstructured_ingest/__version__.py,sha256=noAC1JV7rAfkk9NQctRgYOifiiASnPhPSbtOr9y3Hkk,43
115
115
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
116
116
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
117
117
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -281,7 +281,7 @@ unstructured_ingest/connector/notion/types/database_properties/verification.py,s
281
281
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
282
282
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
283
283
  unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
284
- unstructured_ingest/embed/huggingface.py,sha256=EWU1kd5Cm6ajgCw6hP5w_4pniGSgxnR0wM9vjuPQ6Yk,2334
284
+ unstructured_ingest/embed/huggingface.py,sha256=-ZD17O_H_UnK80fqig6y6wNKJckjx0HuAkY5vgPvk8M,2259
285
285
  unstructured_ingest/embed/interfaces.py,sha256=_-CqasY6R5nnNUY-X6PS5lz8dsmGaUw5zIGRdPfx16o,4918
286
286
  unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
287
287
  unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
@@ -370,7 +370,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
370
370
  unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
371
371
  unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
372
372
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
373
- unstructured_ingest/utils/data_prep.py,sha256=X3d8Kos1zqX-HQAicF_8TB0BrstRtHrbMzu_1s7mj7M,7191
373
+ unstructured_ingest/utils/data_prep.py,sha256=AKtsdu9stYA63CV1C5B_fFWigqy-giVv-euumitos-A,7266
374
374
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
375
375
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
376
376
  unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
@@ -399,39 +399,39 @@ unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1K
399
399
  unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
400
400
  unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
401
401
  unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
402
- unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
402
+ unstructured_ingest/v2/interfaces/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
403
403
  unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
404
404
  unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
405
405
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
406
- unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
406
+ unstructured_ingest/v2/interfaces/upload_stager.py,sha256=Bzhb994gVqFI8KBd6lx3Rcr5UH0ZhU66jOD3WAUr20Q,3151
407
407
  unstructured_ingest/v2/interfaces/uploader.py,sha256=AMgp0uaJ5XeqiyURLIUnWyoIqhUT9Ak5P_LT9-qasYk,2107
408
408
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
409
409
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
410
410
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
411
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=UeOk5SywJZIn3kCnHclQ2cP7JJIXb4NDjpwzsCP_cF0,16523
411
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=m3m9F9wZsCEhsFK_0WZv5_ENl2M42VHBV6Vc39t90v8,16842
412
412
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
413
- unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
413
+ unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=s2BY2v1cs_ImTsPrg8J-92k-fV73b61nDiSy4p9736k,3223
414
414
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
415
- unstructured_ingest/v2/pipeline/steps/embed.py,sha256=iL6X0G5AvKnlfI-3XRWudlb0-6rD_PqyzA3MFmmcn6M,3199
415
+ unstructured_ingest/v2/pipeline/steps/embed.py,sha256=HPQgEWvVrpThUD1FB9k7XNiARXkd6rb4lnpxTGmEQxI,3201
416
416
  unstructured_ingest/v2/pipeline/steps/filter.py,sha256=pju7knTSbB2ll1jC9DPePRDnHlOlvEcU1-sjk6xYGGc,1211
417
417
  unstructured_ingest/v2/pipeline/steps/index.py,sha256=m0BbUwe_7s_gFxR9K31IJdAf3_GgKXXajGJec5jcSXA,3557
418
- unstructured_ingest/v2/pipeline/steps/partition.py,sha256=IJQWaOTcyFlH2bz8WbmynE5Zkd5D8ELOKTnSCnt9Wcc,3282
418
+ unstructured_ingest/v2/pipeline/steps/partition.py,sha256=yE4HFFyORhnzH25PoJG6MNquMXqpzAznyf9NoZYBV5E,3284
419
419
  unstructured_ingest/v2/pipeline/steps/stage.py,sha256=VR8SLUJdVva61aieVKyxUHzupTCQbQeaMA0CKu4Fx7o,2347
420
420
  unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=p2nPFGbcpivPAZO5jDogTfn0iaL5bCFsgBNMejxVbzE,1768
421
421
  unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_vjQt5hgsy_jRCxPzxo4,2010
422
422
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
423
423
  unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
424
424
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
425
- unstructured_ingest/v2/processes/embedder.py,sha256=4x-Rt5UCvwdgihDAr24hvTGDEd1CdKF9xJrf3aMU-ck,7926
425
+ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8gJwIS-imgloE-UOc,7887
426
426
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
427
427
  unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
428
428
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
429
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=rkEQVVgcHoY3jwgW_5PH_NzdXIEwtBLs9Dk4VzmTZMA,6387
429
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ebLvZes84qRx4eS20SkvlVH6WIIM76hifyUgkUJ-dfg,6588
430
430
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
431
431
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=E6fB4anCd_gtSzVUsZ5pDrfdxs5AWERQM_NEfeenfEs,18202
432
432
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
433
433
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
434
- unstructured_ingest/v2/processes/connectors/confluence.py,sha256=wTZewdbmCHaQuEJZ7Wf0NBOo8fS_n1I0DDwlhN96woE,11243
434
+ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=gSs4-AxL0gfeWdJfP7JfCrQSQNLoJRkvHquKK9RJvpQ,12043
435
435
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
436
436
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=xvLWTSFEC3gyGTwEISXxWmUoAfCgzdgZkETMMBOPHuI,7153
437
437
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
@@ -439,7 +439,7 @@ unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5B
439
439
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
440
440
  unstructured_ingest/v2/processes/connectors/jira.py,sha256=-f_vIWNw6Xr8rMNdAcfCC2cmhB-QndnZk5XymHo60FU,17094
441
441
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
442
- unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
442
+ unstructured_ingest/v2/processes/connectors/local.py,sha256=FWPRjjUsnQjyZMChuZGuMU04AB5X0sFEOcAXhx1r9sk,7381
443
443
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
444
444
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
445
445
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
@@ -456,10 +456,10 @@ unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HB
456
456
  unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
457
457
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
458
458
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
459
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
460
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=cb-EUW0T-linZMkbU6AcKEGWnFHQvhpO5Abtps4P2X0,3532
461
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8NubkyHw49IpW_42g6w1Koxlm56EPiPf1lB-eoRSI,2783
462
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
459
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
460
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
461
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
462
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
463
463
  unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
464
464
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
465
465
  unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
@@ -469,13 +469,13 @@ unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc
469
469
  unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
470
470
  unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
471
471
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
472
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=fwbHYoRrN0ZRuLdLb5X-Z7nr11rMSY8VhWMhfR3ljQo,6933
473
- unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=VXxEfgJbW8DCOrqLW7mQkSeWqH-HczidTNIE28SgERY,5658
474
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=DUAxkMDosLhRYfITuXh-Jpupwd20_3VK4P-FK-wgg7k,8137
472
+ unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
473
+ unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
474
+ unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
475
475
  unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
476
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=uOfm2tLc0r5U3CNkfauuwhGOhP7RJpjyBpHWMDXCk7c,6954
477
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=LcfIU-QgW5oVMF4jMUVm7HSgVcSrQamY6mgXdQuiSjc,6400
478
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=3cVwVH3fT_JEYzIbl48-NDXdbo7XWX4C4_eqTvgWIro,6150
476
+ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
477
+ unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
478
+ unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
479
479
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
480
480
  unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
481
481
  unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
@@ -572,12 +572,14 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
572
572
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
573
573
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
574
574
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnVNx9BWFG2fPah4VbQbgEDF4nP78bQeU3jg08,12821
575
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=XMNocKJ3FHDfy36p_KHhH7ALi0-ji6NhGuQNCV2E4vY,699
575
+ unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
576
576
  unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=MNyI6SUuxZHf_6zONoC6jR2f9BvyTYoMyGKDOhl4kgs,7897
577
577
  unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=vQHZa5YYiDPXXPRAPMnPXhh0QzXeiBVx_YIWskZBQIc,15465
578
- unstructured_ingest-0.5.13.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
579
- unstructured_ingest-0.5.13.dist-info/METADATA,sha256=K95xEzr8Tq75w90-ivlwvfFhRkNRTPnNmtIiRXDXhjs,8465
580
- unstructured_ingest-0.5.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
581
- unstructured_ingest-0.5.13.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
582
- unstructured_ingest-0.5.13.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
583
- unstructured_ingest-0.5.13.dist-info/RECORD,,
578
+ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
579
+ unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
580
+ unstructured_ingest-0.5.15.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
+ unstructured_ingest-0.5.15.dist-info/METADATA,sha256=TimVS8ZngyfFUMhuD317dXl6nlI9acBRC_LxZWZczuU,8465
582
+ unstructured_ingest-0.5.15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
+ unstructured_ingest-0.5.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
+ unstructured_ingest-0.5.15.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
+ unstructured_ingest-0.5.15.dist-info/RECORD,,