unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_confluence.py +2 -2
- test/unit/v2/connectors/test_confluence.py +35 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +3 -7
- unstructured_ingest/utils/data_prep.py +4 -2
- unstructured_ingest/v2/interfaces/file_data.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
- unstructured_ingest/v2/pipeline/pipeline.py +7 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
- unstructured_ingest/v2/processes/connectors/local.py +8 -1
- unstructured_ingest/v2/processes/embedder.py +3 -4
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/METADATA +18 -18
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/RECORD +32 -30
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/top_level.txt +0 -0
|
@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
|
|
|
30
30
|
spaces = ["testteamsp", "MFS"]
|
|
31
31
|
|
|
32
32
|
# Create connection and indexer configurations
|
|
33
|
-
access_config = ConfluenceAccessConfig(
|
|
33
|
+
access_config = ConfluenceAccessConfig(api_token=api_token)
|
|
34
34
|
connection_config = ConfluenceConnectionConfig(
|
|
35
35
|
url=confluence_url,
|
|
36
36
|
username=user_email,
|
|
@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
|
|
|
77
77
|
spaces = ["testteamsp1"]
|
|
78
78
|
|
|
79
79
|
# Create connection and indexer configurations
|
|
80
|
-
access_config = ConfluenceAccessConfig(
|
|
80
|
+
access_config = ConfluenceAccessConfig(api_token=api_token)
|
|
81
81
|
connection_config = ConfluenceConnectionConfig(
|
|
82
82
|
url=confluence_url,
|
|
83
83
|
username=user_email,
|
|
@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
|
|
|
11
11
|
with pytest.raises(ValidationError):
|
|
12
12
|
ConfluenceConnectionConfig(
|
|
13
13
|
access_config=ConfluenceAccessConfig(
|
|
14
|
-
password="
|
|
14
|
+
password="password",
|
|
15
15
|
token="access_token",
|
|
16
16
|
),
|
|
17
17
|
username="user_email",
|
|
@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
|
|
|
19
19
|
)
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
def test_connection_config_multiple_auth2():
|
|
23
|
+
with pytest.raises(ValidationError):
|
|
24
|
+
ConfluenceConnectionConfig(
|
|
25
|
+
access_config=ConfluenceAccessConfig(
|
|
26
|
+
api_token="api_token",
|
|
27
|
+
token="access_token",
|
|
28
|
+
),
|
|
29
|
+
username="user_email",
|
|
30
|
+
url="url",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_connection_config_multiple_auth3():
|
|
35
|
+
with pytest.raises(ValidationError):
|
|
36
|
+
ConfluenceConnectionConfig(
|
|
37
|
+
access_config=ConfluenceAccessConfig(
|
|
38
|
+
api_token="api_token",
|
|
39
|
+
password="password",
|
|
40
|
+
),
|
|
41
|
+
username="user_email",
|
|
42
|
+
url="url",
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
22
46
|
def test_connection_config_no_auth():
|
|
23
47
|
with pytest.raises(ValidationError):
|
|
24
48
|
ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
|
|
25
49
|
|
|
26
50
|
|
|
27
|
-
def
|
|
51
|
+
def test_connection_config_password_auth():
|
|
52
|
+
ConfluenceConnectionConfig(
|
|
53
|
+
access_config=ConfluenceAccessConfig(password="password"),
|
|
54
|
+
url="url",
|
|
55
|
+
username="user_email",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_connection_config_api_token_auth():
|
|
28
60
|
ConfluenceConnectionConfig(
|
|
29
|
-
access_config=ConfluenceAccessConfig(
|
|
61
|
+
access_config=ConfluenceAccessConfig(api_token="api_token"),
|
|
30
62
|
url="url",
|
|
31
63
|
username="user_email",
|
|
32
64
|
)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.5.14" # pragma: no cover
|
|
1
|
+
__version__ = "0.5.15" # pragma: no cover
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from typing import TYPE_CHECKING, Optional
|
|
3
3
|
|
|
4
4
|
from pydantic import Field
|
|
@@ -15,14 +15,11 @@ if TYPE_CHECKING:
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
18
|
-
embedder_model_name: Optional[str] = Field(
|
|
19
|
-
default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
|
|
20
|
-
)
|
|
18
|
+
embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
|
|
21
19
|
embedder_model_kwargs: Optional[dict] = Field(
|
|
22
20
|
default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
|
|
23
21
|
)
|
|
24
22
|
encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
|
|
25
|
-
cache_folder: Optional[str] = Field(default=None)
|
|
26
23
|
|
|
27
24
|
@requires_dependencies(
|
|
28
25
|
["sentence_transformers"],
|
|
@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
33
30
|
|
|
34
31
|
return SentenceTransformer(
|
|
35
32
|
model_name_or_path=self.embedder_model_name,
|
|
36
|
-
cache_folder=self.cache_folder,
|
|
37
33
|
**self.embedder_model_kwargs,
|
|
38
34
|
)
|
|
39
35
|
|
|
@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
45
41
|
|
|
46
42
|
@dataclass
|
|
47
43
|
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
48
|
-
config: HuggingFaceEmbeddingConfig
|
|
44
|
+
config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)
|
|
49
45
|
|
|
50
46
|
def _embed_query(self, query: str) -> list[float]:
|
|
51
47
|
return self._embed_documents(texts=[query])[0]
|
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
import json
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
|
|
5
|
+
from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
|
|
6
6
|
|
|
7
7
|
import pandas as pd
|
|
8
8
|
|
|
@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
|
|
|
163
163
|
raise IOError("Unsupported file type: {path}")
|
|
164
164
|
|
|
165
165
|
|
|
166
|
-
def get_data(path: Path) -> list[dict]:
|
|
166
|
+
def get_data(path: Union[Path, str]) -> list[dict]:
|
|
167
|
+
if isinstance(path, str):
|
|
168
|
+
path = Path(path)
|
|
167
169
|
try:
|
|
168
170
|
return get_data_by_suffix(path=path)
|
|
169
171
|
except Exception as e:
|
|
@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
|
|
|
102
102
|
try:
|
|
103
103
|
return BatchFileData.from_file(path=path)
|
|
104
104
|
except ValidationError:
|
|
105
|
-
logger.debug(f"{path} not
|
|
105
|
+
logger.debug(f"{path} not detected as batch file data")
|
|
106
106
|
|
|
107
107
|
return FileData.from_file(path=path)
|
|
108
108
|
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from abc import ABC
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from pathlib import Path
|
|
@@ -7,6 +6,7 @@ from typing import Any, TypeVar
|
|
|
7
6
|
from pydantic import BaseModel
|
|
8
7
|
|
|
9
8
|
from unstructured_ingest.utils import ndjson
|
|
9
|
+
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
10
10
|
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
11
11
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
12
12
|
|
|
@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
|
|
|
43
43
|
writer.f.flush()
|
|
44
44
|
|
|
45
45
|
def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
|
|
46
|
-
|
|
47
|
-
elements_contents = json.load(in_f)
|
|
46
|
+
elements_contents = get_data(path=input_file)
|
|
48
47
|
|
|
49
48
|
conformed_elements = [
|
|
50
49
|
self.conform_dict(element_dict=element, file_data=file_data)
|
|
51
50
|
for element in elements_contents
|
|
52
51
|
]
|
|
53
|
-
|
|
54
|
-
with open(output_file, "w") as out_f:
|
|
55
|
-
json.dump(conformed_elements, out_f, indent=2)
|
|
52
|
+
write_data(path=output_file, data=conformed_elements)
|
|
56
53
|
|
|
57
54
|
def run(
|
|
58
55
|
self,
|
|
@@ -108,6 +108,13 @@ class Pipeline:
|
|
|
108
108
|
uploader_connector_type = self.uploader_step.process.connector_type
|
|
109
109
|
registry_entry = destination_registry[uploader_connector_type]
|
|
110
110
|
if registry_entry.upload_stager and self.stager_step is None:
|
|
111
|
+
try:
|
|
112
|
+
self.stager_step = UploadStageStep(
|
|
113
|
+
process=registry_entry.upload_stager(), context=self.context
|
|
114
|
+
)
|
|
115
|
+
return
|
|
116
|
+
except Exception as e:
|
|
117
|
+
logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
|
|
111
118
|
raise ValueError(
|
|
112
119
|
f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
|
|
113
120
|
f"expects a stager of type {registry_entry.upload_stager.__name__} "
|
|
@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
|
|
|
38
38
|
return not filepath.exists()
|
|
39
39
|
|
|
40
40
|
def get_output_filepath(self, filename: Path) -> Path:
|
|
41
|
-
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
|
|
41
|
+
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
|
|
42
42
|
filepath = (self.cache_dir / hashed_output_file).resolve()
|
|
43
43
|
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
44
44
|
return filepath
|
|
@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
|
|
|
38
38
|
return not filepath.exists()
|
|
39
39
|
|
|
40
40
|
def get_output_filepath(self, filename: Path) -> Path:
|
|
41
|
-
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
|
|
41
|
+
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
|
|
42
42
|
filepath = (self.cache_dir / hashed_output_file).resolve()
|
|
43
43
|
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
44
44
|
return filepath
|
|
@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
|
|
|
38
38
|
return not filepath.exists()
|
|
39
39
|
|
|
40
40
|
def get_output_filepath(self, filename: Path) -> Path:
|
|
41
|
-
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
|
|
41
|
+
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
|
|
42
42
|
filepath = (self.cache_dir / hashed_output_file).resolve()
|
|
43
43
|
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
44
44
|
return filepath
|
|
@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"
|
|
|
35
35
|
|
|
36
36
|
class ConfluenceAccessConfig(AccessConfig):
|
|
37
37
|
password: Optional[str] = Field(
|
|
38
|
-
description="Confluence password
|
|
38
|
+
description="Confluence password",
|
|
39
|
+
default=None,
|
|
40
|
+
)
|
|
41
|
+
api_token: Optional[str] = Field(
|
|
42
|
+
description="Confluence Cloud API token",
|
|
39
43
|
default=None,
|
|
40
44
|
)
|
|
41
45
|
token: Optional[str] = Field(
|
|
@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):
|
|
|
57
61
|
|
|
58
62
|
def model_post_init(self, __context):
|
|
59
63
|
access_configs = self.access_config.get_secret_value()
|
|
60
|
-
|
|
64
|
+
if access_configs.password and access_configs.api_token:
|
|
65
|
+
raise ValueError(
|
|
66
|
+
"both password and api_token provided, only one allowed, "
|
|
67
|
+
"see: https://atlassian-python-api.readthedocs.io/"
|
|
68
|
+
)
|
|
69
|
+
basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
|
|
61
70
|
pat_auth = access_configs.token
|
|
62
71
|
if self.cloud and not basic_auth:
|
|
63
72
|
raise ValueError(
|
|
@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
|
|
|
74
83
|
"no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
|
|
75
84
|
)
|
|
76
85
|
|
|
86
|
+
def password_or_api_token(self) -> str:
|
|
87
|
+
# Confluence takes either password or API token under the same field: password
|
|
88
|
+
# This ambiguity led to confusion, so we are making it specific what you are passing in
|
|
89
|
+
access_configs = self.access_config.get_secret_value()
|
|
90
|
+
if access_configs.password:
|
|
91
|
+
return access_configs.password
|
|
92
|
+
return access_configs.api_token
|
|
93
|
+
|
|
77
94
|
@requires_dependencies(["atlassian"], extras="confluence")
|
|
78
95
|
@contextmanager
|
|
79
96
|
def get_client(self) -> "Confluence":
|
|
@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
|
|
|
83
100
|
with Confluence(
|
|
84
101
|
url=self.url,
|
|
85
102
|
username=self.username,
|
|
86
|
-
password=
|
|
103
|
+
password=self.password_or_api_token(),
|
|
87
104
|
token=access_configs.token,
|
|
88
105
|
cloud=self.cloud,
|
|
89
106
|
) as client:
|
|
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
17
17
|
DatabricksVolumesUploader,
|
|
18
18
|
DatabricksVolumesUploaderConfig,
|
|
19
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
21
|
+
BlobStoreUploadStager,
|
|
22
|
+
BlobStoreUploadStagerConfig,
|
|
23
|
+
)
|
|
20
24
|
|
|
21
25
|
CONNECTOR_TYPE = "databricks_volumes_aws"
|
|
22
26
|
|
|
@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
|
|
|
76
80
|
connection_config=DatabricksAWSVolumesConnectionConfig,
|
|
77
81
|
uploader=DatabricksAWSVolumesUploader,
|
|
78
82
|
uploader_config=DatabricksAWSVolumesUploaderConfig,
|
|
83
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
84
|
+
upload_stager=BlobStoreUploadStager,
|
|
79
85
|
)
|
|
80
86
|
|
|
81
87
|
databricks_aws_volumes_source_entry = SourceRegistryEntry(
|
|
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
17
17
|
DatabricksVolumesUploader,
|
|
18
18
|
DatabricksVolumesUploaderConfig,
|
|
19
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
21
|
+
BlobStoreUploadStager,
|
|
22
|
+
BlobStoreUploadStagerConfig,
|
|
23
|
+
)
|
|
20
24
|
|
|
21
25
|
CONNECTOR_TYPE = "databricks_volumes_azure"
|
|
22
26
|
|
|
@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
|
|
|
91
95
|
connection_config=DatabricksAzureVolumesConnectionConfig,
|
|
92
96
|
uploader=DatabricksAzureVolumesUploader,
|
|
93
97
|
uploader_config=DatabricksAzureVolumesUploaderConfig,
|
|
98
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
99
|
+
upload_stager=BlobStoreUploadStager,
|
|
94
100
|
)
|
|
95
101
|
|
|
96
102
|
databricks_azure_volumes_source_entry = SourceRegistryEntry(
|
|
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
17
17
|
DatabricksVolumesUploader,
|
|
18
18
|
DatabricksVolumesUploaderConfig,
|
|
19
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
21
|
+
BlobStoreUploadStager,
|
|
22
|
+
BlobStoreUploadStagerConfig,
|
|
23
|
+
)
|
|
20
24
|
|
|
21
25
|
CONNECTOR_TYPE = "databricks_volumes_gcp"
|
|
22
26
|
|
|
@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
|
|
|
74
78
|
connection_config=DatabricksGoogleVolumesConnectionConfig,
|
|
75
79
|
uploader=DatabricksGoogleVolumesUploader,
|
|
76
80
|
uploader_config=DatabricksGoogleVolumesUploaderConfig,
|
|
81
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
82
|
+
upload_stager=BlobStoreUploadStager,
|
|
77
83
|
)
|
|
78
84
|
|
|
79
85
|
databricks_gcp_volumes_source_entry = SourceRegistryEntry(
|
|
@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
|
17
17
|
DatabricksVolumesUploader,
|
|
18
18
|
DatabricksVolumesUploaderConfig,
|
|
19
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
21
|
+
BlobStoreUploadStager,
|
|
22
|
+
BlobStoreUploadStagerConfig,
|
|
23
|
+
)
|
|
20
24
|
|
|
21
25
|
CONNECTOR_TYPE = "databricks_volumes"
|
|
22
26
|
|
|
@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
|
|
|
75
79
|
connection_config=DatabricksNativeVolumesConnectionConfig,
|
|
76
80
|
uploader=DatabricksNativeVolumesUploader,
|
|
77
81
|
uploader_config=DatabricksNativeVolumesUploaderConfig,
|
|
82
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
83
|
+
upload_stager=BlobStoreUploadStager,
|
|
78
84
|
)
|
|
79
85
|
|
|
80
86
|
databricks_native_volumes_source_entry = SourceRegistryEntry(
|
|
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
26
|
FsspecUploaderConfig,
|
|
27
27
|
)
|
|
28
28
|
from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
|
|
29
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
30
|
+
BlobStoreUploadStager,
|
|
31
|
+
BlobStoreUploadStagerConfig,
|
|
32
|
+
)
|
|
29
33
|
|
|
30
34
|
if TYPE_CHECKING:
|
|
31
35
|
from adlfs import AzureBlobFileSystem
|
|
@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
|
|
|
194
198
|
uploader=AzureUploader,
|
|
195
199
|
uploader_config=AzureUploaderConfig,
|
|
196
200
|
connection_config=AzureConnectionConfig,
|
|
201
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
202
|
+
upload_stager=BlobStoreUploadStager,
|
|
197
203
|
)
|
|
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
28
28
|
FsspecUploaderConfig,
|
|
29
29
|
)
|
|
30
30
|
from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
|
|
31
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
32
|
+
BlobStoreUploadStager,
|
|
33
|
+
BlobStoreUploadStagerConfig,
|
|
34
|
+
)
|
|
31
35
|
|
|
32
36
|
if TYPE_CHECKING:
|
|
33
37
|
from boxfs import BoxFileSystem
|
|
@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
|
|
|
167
171
|
uploader=BoxUploader,
|
|
168
172
|
uploader_config=BoxUploaderConfig,
|
|
169
173
|
connection_config=BoxConnectionConfig,
|
|
174
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
175
|
+
upload_stager=BlobStoreUploadStager,
|
|
170
176
|
)
|
|
@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
31
31
|
FsspecUploader,
|
|
32
32
|
FsspecUploaderConfig,
|
|
33
33
|
)
|
|
34
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
35
|
+
BlobStoreUploadStager,
|
|
36
|
+
BlobStoreUploadStagerConfig,
|
|
37
|
+
)
|
|
34
38
|
|
|
35
39
|
if TYPE_CHECKING:
|
|
36
40
|
pass
|
|
@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
|
|
|
228
232
|
uploader=DropboxUploader,
|
|
229
233
|
uploader_config=DropboxUploaderConfig,
|
|
230
234
|
connection_config=DropboxConnectionConfig,
|
|
235
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
236
|
+
upload_stager=BlobStoreUploadStager,
|
|
231
237
|
)
|
|
@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
28
28
|
FsspecUploader,
|
|
29
29
|
FsspecUploaderConfig,
|
|
30
30
|
)
|
|
31
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
32
|
+
BlobStoreUploadStager,
|
|
33
|
+
BlobStoreUploadStagerConfig,
|
|
34
|
+
)
|
|
31
35
|
|
|
32
36
|
if TYPE_CHECKING:
|
|
33
37
|
from gcsfs import GCSFileSystem
|
|
@@ -194,4 +198,6 @@ gcs_destination_entry = DestinationRegistryEntry(
|
|
|
194
198
|
uploader=GcsUploader,
|
|
195
199
|
uploader_config=GcsUploaderConfig,
|
|
196
200
|
connection_config=GcsConnectionConfig,
|
|
201
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
202
|
+
upload_stager=BlobStoreUploadStager,
|
|
197
203
|
)
|
|
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
26
|
FsspecUploader,
|
|
27
27
|
FsspecUploaderConfig,
|
|
28
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
30
|
+
BlobStoreUploadStager,
|
|
31
|
+
BlobStoreUploadStagerConfig,
|
|
32
|
+
)
|
|
29
33
|
|
|
30
34
|
CONNECTOR_TYPE = "s3"
|
|
31
35
|
|
|
@@ -182,4 +186,6 @@ s3_destination_entry = DestinationRegistryEntry(
|
|
|
182
186
|
uploader=S3Uploader,
|
|
183
187
|
uploader_config=S3UploaderConfig,
|
|
184
188
|
connection_config=S3ConnectionConfig,
|
|
189
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
190
|
+
upload_stager=BlobStoreUploadStager,
|
|
185
191
|
)
|
|
@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
26
|
FsspecUploader,
|
|
27
27
|
FsspecUploaderConfig,
|
|
28
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
30
|
+
BlobStoreUploadStager,
|
|
31
|
+
BlobStoreUploadStagerConfig,
|
|
32
|
+
)
|
|
29
33
|
|
|
30
34
|
if TYPE_CHECKING:
|
|
31
35
|
from fsspec.implementations.sftp import SFTPFileSystem
|
|
@@ -168,4 +172,6 @@ sftp_destination_entry = DestinationRegistryEntry(
|
|
|
168
172
|
uploader=SftpUploader,
|
|
169
173
|
uploader_config=SftpUploaderConfig,
|
|
170
174
|
connection_config=SftpConnectionConfig,
|
|
175
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
176
|
+
upload_stager=BlobStoreUploadStager,
|
|
171
177
|
)
|
|
@@ -27,6 +27,10 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
27
27
|
DestinationRegistryEntry,
|
|
28
28
|
SourceRegistryEntry,
|
|
29
29
|
)
|
|
30
|
+
from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
31
|
+
BlobStoreUploadStager,
|
|
32
|
+
BlobStoreUploadStagerConfig,
|
|
33
|
+
)
|
|
30
34
|
|
|
31
35
|
CONNECTOR_TYPE = "local"
|
|
32
36
|
|
|
@@ -213,5 +217,8 @@ local_source_entry = SourceRegistryEntry(
|
|
|
213
217
|
)
|
|
214
218
|
|
|
215
219
|
local_destination_entry = DestinationRegistryEntry(
|
|
216
|
-
uploader=LocalUploader,
|
|
220
|
+
uploader=LocalUploader,
|
|
221
|
+
uploader_config=LocalUploaderConfig,
|
|
222
|
+
upload_stager_config=BlobStoreUploadStagerConfig,
|
|
223
|
+
upload_stager=BlobStoreUploadStager,
|
|
217
224
|
)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from abc import ABC
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from pathlib import Path
|
|
@@ -6,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
|
6
5
|
|
|
7
6
|
from pydantic import BaseModel, Field, SecretStr
|
|
8
7
|
|
|
8
|
+
from unstructured_ingest.utils.data_prep import get_data
|
|
9
9
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
@@ -192,9 +192,8 @@ class Embedder(BaseProcess, ABC):
|
|
|
192
192
|
def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
|
|
193
193
|
# TODO update base embedder classes to support async
|
|
194
194
|
embedder = self.config.get_embedder()
|
|
195
|
-
|
|
196
|
-
elements = json.load(elements_file)
|
|
195
|
+
elements = get_data(path=elements_filepath)
|
|
197
196
|
if not elements:
|
|
198
|
-
return [
|
|
197
|
+
return []
|
|
199
198
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
200
199
|
return embedded_elements
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
6
|
+
from unstructured_ingest.v2.interfaces import FileData, UploadStager, UploadStagerConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BlobStoreUploadStagerConfig(UploadStagerConfig):
|
|
10
|
+
pass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
|
|
14
|
+
class BlobStoreUploadStager(UploadStager):
|
|
15
|
+
upload_stager_config: BlobStoreUploadStagerConfig = field(
|
|
16
|
+
default_factory=BlobStoreUploadStagerConfig
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
def run(
|
|
20
|
+
self,
|
|
21
|
+
elements_filepath: Path,
|
|
22
|
+
file_data: FileData,
|
|
23
|
+
output_dir: Path,
|
|
24
|
+
output_filename: str,
|
|
25
|
+
**kwargs: Any,
|
|
26
|
+
) -> Path:
|
|
27
|
+
output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
|
|
28
|
+
# Always save as json
|
|
29
|
+
data = get_data(elements_filepath)
|
|
30
|
+
write_data(path=output_file.with_suffix(".json"), data=data)
|
|
31
|
+
return output_file.with_suffix(".json")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.5.14
|
|
3
|
+
Version: 0.5.15
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -23,11 +23,11 @@ Requires-Python: >=3.9.0,<3.14
|
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
25
|
Requires-Dist: pandas
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist: python-dateutil
|
|
26
|
+
Requires-Dist: tqdm
|
|
28
27
|
Requires-Dist: pydantic>=2.7
|
|
28
|
+
Requires-Dist: python-dateutil
|
|
29
29
|
Requires-Dist: click
|
|
30
|
-
Requires-Dist:
|
|
30
|
+
Requires-Dist: opentelemetry-sdk
|
|
31
31
|
Requires-Dist: dataclasses_json
|
|
32
32
|
Provides-Extra: remote
|
|
33
33
|
Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
|
|
@@ -71,11 +71,11 @@ Requires-Dist: fsspec; extra == "azure"
|
|
|
71
71
|
Provides-Extra: azure-ai-search
|
|
72
72
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
73
73
|
Provides-Extra: biomed
|
|
74
|
-
Requires-Dist: requests; extra == "biomed"
|
|
75
74
|
Requires-Dist: bs4; extra == "biomed"
|
|
75
|
+
Requires-Dist: requests; extra == "biomed"
|
|
76
76
|
Provides-Extra: box
|
|
77
|
-
Requires-Dist: boxfs; extra == "box"
|
|
78
77
|
Requires-Dist: fsspec; extra == "box"
|
|
78
|
+
Requires-Dist: boxfs; extra == "box"
|
|
79
79
|
Provides-Extra: chroma
|
|
80
80
|
Requires-Dist: chromadb; extra == "chroma"
|
|
81
81
|
Provides-Extra: clarifai
|
|
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
|
86
86
|
Provides-Extra: couchbase
|
|
87
87
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
88
88
|
Provides-Extra: delta-table
|
|
89
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
90
89
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
90
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
91
91
|
Provides-Extra: discord
|
|
92
92
|
Requires-Dist: discord.py; extra == "discord"
|
|
93
93
|
Provides-Extra: dropbox
|
|
@@ -102,15 +102,15 @@ Requires-Dist: gcsfs; extra == "gcs"
|
|
|
102
102
|
Requires-Dist: bs4; extra == "gcs"
|
|
103
103
|
Requires-Dist: fsspec; extra == "gcs"
|
|
104
104
|
Provides-Extra: github
|
|
105
|
-
Requires-Dist: requests; extra == "github"
|
|
106
105
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
106
|
+
Requires-Dist: requests; extra == "github"
|
|
107
107
|
Provides-Extra: gitlab
|
|
108
108
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
109
109
|
Provides-Extra: google-drive
|
|
110
110
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
111
111
|
Provides-Extra: hubspot
|
|
112
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
113
112
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
113
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
114
114
|
Provides-Extra: jira
|
|
115
115
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
116
116
|
Provides-Extra: kafka
|
|
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
|
|
|
124
124
|
Provides-Extra: mongodb
|
|
125
125
|
Requires-Dist: pymongo; extra == "mongodb"
|
|
126
126
|
Provides-Extra: neo4j
|
|
127
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
127
128
|
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
128
129
|
Requires-Dist: networkx; extra == "neo4j"
|
|
129
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
130
130
|
Provides-Extra: notion
|
|
131
131
|
Requires-Dist: httpx; extra == "notion"
|
|
132
|
-
Requires-Dist: backoff; extra == "notion"
|
|
133
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
134
132
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
133
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
134
|
+
Requires-Dist: backoff; extra == "notion"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
137
|
-
Requires-Dist: bs4; extra == "onedrive"
|
|
138
136
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
|
+
Requires-Dist: bs4; extra == "onedrive"
|
|
138
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
139
139
|
Provides-Extra: opensearch
|
|
140
140
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
141
141
|
Provides-Extra: outlook
|
|
142
|
-
Requires-Dist: msal; extra == "outlook"
|
|
143
142
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
143
|
+
Requires-Dist: msal; extra == "outlook"
|
|
144
144
|
Provides-Extra: pinecone
|
|
145
145
|
Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
|
|
146
146
|
Provides-Extra: postgres
|
|
@@ -155,8 +155,8 @@ Provides-Extra: s3
|
|
|
155
155
|
Requires-Dist: s3fs; extra == "s3"
|
|
156
156
|
Requires-Dist: fsspec; extra == "s3"
|
|
157
157
|
Provides-Extra: sharepoint
|
|
158
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
159
158
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
159
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
160
160
|
Provides-Extra: salesforce
|
|
161
161
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
162
162
|
Provides-Extra: sftp
|
|
@@ -182,13 +182,13 @@ Requires-Dist: httpx; extra == "vectara"
|
|
|
182
182
|
Requires-Dist: requests; extra == "vectara"
|
|
183
183
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
184
184
|
Provides-Extra: vastdb
|
|
185
|
+
Requires-Dist: vastdb; extra == "vastdb"
|
|
185
186
|
Requires-Dist: ibis; extra == "vastdb"
|
|
186
187
|
Requires-Dist: pyarrow; extra == "vastdb"
|
|
187
|
-
Requires-Dist: vastdb; extra == "vastdb"
|
|
188
188
|
Provides-Extra: zendesk
|
|
189
189
|
Requires-Dist: httpx; extra == "zendesk"
|
|
190
|
-
Requires-Dist: aiofiles; extra == "zendesk"
|
|
191
190
|
Requires-Dist: bs4; extra == "zendesk"
|
|
191
|
+
Requires-Dist: aiofiles; extra == "zendesk"
|
|
192
192
|
Provides-Extra: embed-huggingface
|
|
193
193
|
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
194
194
|
Provides-Extra: embed-octoai
|
|
@@ -8,7 +8,7 @@ test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpei
|
|
|
8
8
|
test/integration/connectors/test_astradb.py,sha256=c9Lk0dvJVVdzHcokvsc4XMNJ4SIO1k2vGtT5py0cFVM,9753
|
|
9
9
|
test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
|
|
10
10
|
test/integration/connectors/test_chroma.py,sha256=1uGHbZXkXKGb8wl3p7c9G-L1MViUe283Hw5u3dg8OgI,4532
|
|
11
|
-
test/integration/connectors/test_confluence.py,sha256=
|
|
11
|
+
test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
|
|
12
12
|
test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
|
|
13
13
|
test/integration/connectors/test_dropbox.py,sha256=jzpZ6wawLa4sC1BVoHWZJ3cHjL4DWWUEX5ee7bXUOOM,4945
|
|
14
14
|
test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
|
|
@@ -89,7 +89,7 @@ test/unit/v2/test_utils.py,sha256=TWVAeE0OrcHgPyzGPtEnQakICsVrDeVhIKPMRQPX554,26
|
|
|
89
89
|
test/unit/v2/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
90
90
|
test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCrSXvcMFecE,1772
|
|
91
91
|
test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
92
|
-
test/unit/v2/connectors/test_confluence.py,sha256=
|
|
92
|
+
test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
|
|
93
93
|
test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
|
|
94
94
|
test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
95
95
|
test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
|
|
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
111
111
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
112
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
113
113
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
114
|
-
unstructured_ingest/__version__.py,sha256
|
|
114
|
+
unstructured_ingest/__version__.py,sha256=noAC1JV7rAfkk9NQctRgYOifiiASnPhPSbtOr9y3Hkk,43
|
|
115
115
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
116
116
|
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
117
117
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -281,7 +281,7 @@ unstructured_ingest/connector/notion/types/database_properties/verification.py,s
|
|
|
281
281
|
unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
282
282
|
unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
|
|
283
283
|
unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
|
|
284
|
-
unstructured_ingest/embed/huggingface.py,sha256
|
|
284
|
+
unstructured_ingest/embed/huggingface.py,sha256=-ZD17O_H_UnK80fqig6y6wNKJckjx0HuAkY5vgPvk8M,2259
|
|
285
285
|
unstructured_ingest/embed/interfaces.py,sha256=_-CqasY6R5nnNUY-X6PS5lz8dsmGaUw5zIGRdPfx16o,4918
|
|
286
286
|
unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
|
|
287
287
|
unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
|
|
@@ -370,7 +370,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
|
|
|
370
370
|
unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
371
371
|
unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
|
|
372
372
|
unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
|
|
373
|
-
unstructured_ingest/utils/data_prep.py,sha256=
|
|
373
|
+
unstructured_ingest/utils/data_prep.py,sha256=AKtsdu9stYA63CV1C5B_fFWigqy-giVv-euumitos-A,7266
|
|
374
374
|
unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
|
|
375
375
|
unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
|
|
376
376
|
unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
|
|
@@ -399,30 +399,30 @@ unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1K
|
|
|
399
399
|
unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
|
|
400
400
|
unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
|
|
401
401
|
unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
|
|
402
|
-
unstructured_ingest/v2/interfaces/file_data.py,sha256=
|
|
402
|
+
unstructured_ingest/v2/interfaces/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
|
|
403
403
|
unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
|
|
404
404
|
unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
|
|
405
405
|
unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
|
|
406
|
-
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=
|
|
406
|
+
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=Bzhb994gVqFI8KBd6lx3Rcr5UH0ZhU66jOD3WAUr20Q,3151
|
|
407
407
|
unstructured_ingest/v2/interfaces/uploader.py,sha256=AMgp0uaJ5XeqiyURLIUnWyoIqhUT9Ak5P_LT9-qasYk,2107
|
|
408
408
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
409
409
|
unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
|
|
410
410
|
unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
|
|
411
|
-
unstructured_ingest/v2/pipeline/pipeline.py,sha256=
|
|
411
|
+
unstructured_ingest/v2/pipeline/pipeline.py,sha256=m3m9F9wZsCEhsFK_0WZv5_ENl2M42VHBV6Vc39t90v8,16842
|
|
412
412
|
unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
413
|
-
unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=
|
|
413
|
+
unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=s2BY2v1cs_ImTsPrg8J-92k-fV73b61nDiSy4p9736k,3223
|
|
414
414
|
unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
|
|
415
|
-
unstructured_ingest/v2/pipeline/steps/embed.py,sha256=
|
|
415
|
+
unstructured_ingest/v2/pipeline/steps/embed.py,sha256=HPQgEWvVrpThUD1FB9k7XNiARXkd6rb4lnpxTGmEQxI,3201
|
|
416
416
|
unstructured_ingest/v2/pipeline/steps/filter.py,sha256=pju7knTSbB2ll1jC9DPePRDnHlOlvEcU1-sjk6xYGGc,1211
|
|
417
417
|
unstructured_ingest/v2/pipeline/steps/index.py,sha256=m0BbUwe_7s_gFxR9K31IJdAf3_GgKXXajGJec5jcSXA,3557
|
|
418
|
-
unstructured_ingest/v2/pipeline/steps/partition.py,sha256=
|
|
418
|
+
unstructured_ingest/v2/pipeline/steps/partition.py,sha256=yE4HFFyORhnzH25PoJG6MNquMXqpzAznyf9NoZYBV5E,3284
|
|
419
419
|
unstructured_ingest/v2/pipeline/steps/stage.py,sha256=VR8SLUJdVva61aieVKyxUHzupTCQbQeaMA0CKu4Fx7o,2347
|
|
420
420
|
unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=p2nPFGbcpivPAZO5jDogTfn0iaL5bCFsgBNMejxVbzE,1768
|
|
421
421
|
unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_vjQt5hgsy_jRCxPzxo4,2010
|
|
422
422
|
unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
|
|
423
423
|
unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
|
|
424
424
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
|
|
425
|
-
unstructured_ingest/v2/processes/embedder.py,sha256=
|
|
425
|
+
unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8gJwIS-imgloE-UOc,7887
|
|
426
426
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
427
427
|
unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
|
|
428
428
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
@@ -431,7 +431,7 @@ unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XE
|
|
|
431
431
|
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=E6fB4anCd_gtSzVUsZ5pDrfdxs5AWERQM_NEfeenfEs,18202
|
|
432
432
|
unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
|
|
433
433
|
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
|
|
434
|
-
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=
|
|
434
|
+
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=gSs4-AxL0gfeWdJfP7JfCrQSQNLoJRkvHquKK9RJvpQ,12043
|
|
435
435
|
unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
|
|
436
436
|
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=xvLWTSFEC3gyGTwEISXxWmUoAfCgzdgZkETMMBOPHuI,7153
|
|
437
437
|
unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
|
|
@@ -439,7 +439,7 @@ unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5B
|
|
|
439
439
|
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
|
|
440
440
|
unstructured_ingest/v2/processes/connectors/jira.py,sha256=-f_vIWNw6Xr8rMNdAcfCC2cmhB-QndnZk5XymHo60FU,17094
|
|
441
441
|
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
|
|
442
|
-
unstructured_ingest/v2/processes/connectors/local.py,sha256=
|
|
442
|
+
unstructured_ingest/v2/processes/connectors/local.py,sha256=FWPRjjUsnQjyZMChuZGuMU04AB5X0sFEOcAXhx1r9sk,7381
|
|
443
443
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
|
|
444
444
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
|
|
445
445
|
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
|
|
@@ -456,10 +456,10 @@ unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HB
|
|
|
456
456
|
unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
|
|
457
457
|
unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
|
|
458
458
|
unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
|
|
459
|
-
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=
|
|
460
|
-
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=
|
|
461
|
-
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=
|
|
462
|
-
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=
|
|
459
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
|
|
460
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
|
|
461
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
|
|
462
|
+
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
|
|
463
463
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
|
|
464
464
|
unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
|
|
465
465
|
unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
|
|
@@ -469,13 +469,13 @@ unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc
|
|
|
469
469
|
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
|
|
470
470
|
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
|
|
471
471
|
unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
|
|
472
|
-
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=
|
|
473
|
-
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=
|
|
474
|
-
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=
|
|
472
|
+
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
|
|
473
|
+
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
|
|
474
|
+
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
|
|
475
475
|
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
|
|
476
|
-
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=
|
|
477
|
-
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=
|
|
478
|
-
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=
|
|
476
|
+
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
|
|
477
|
+
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
|
|
478
|
+
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
|
|
479
479
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
480
480
|
unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
|
|
481
481
|
unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
|
|
@@ -575,9 +575,11 @@ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnV
|
|
|
575
575
|
unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
576
576
|
unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=MNyI6SUuxZHf_6zONoC6jR2f9BvyTYoMyGKDOhl4kgs,7897
|
|
577
577
|
unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=vQHZa5YYiDPXXPRAPMnPXhh0QzXeiBVx_YIWskZBQIc,15465
|
|
578
|
-
unstructured_ingest
|
|
579
|
-
unstructured_ingest
|
|
580
|
-
unstructured_ingest-0.5.
|
|
581
|
-
unstructured_ingest-0.5.
|
|
582
|
-
unstructured_ingest-0.5.
|
|
583
|
-
unstructured_ingest-0.5.
|
|
578
|
+
unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
579
|
+
unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
|
|
580
|
+
unstructured_ingest-0.5.15.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
581
|
+
unstructured_ingest-0.5.15.dist-info/METADATA,sha256=TimVS8ZngyfFUMhuD317dXl6nlI9acBRC_LxZWZczuU,8465
|
|
582
|
+
unstructured_ingest-0.5.15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
583
|
+
unstructured_ingest-0.5.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
584
|
+
unstructured_ingest-0.5.15.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
585
|
+
unstructured_ingest-0.5.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.15.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|