unstructured-ingest 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_onedrive.py +51 -2
- test/integration/connectors/test_sharepoint.py +71 -0
- test/integration/connectors/utils/validation/source.py +45 -16
- test/integration/embedders/test_bedrock.py +1 -1
- test/integration/partitioners/test_partitioner.py +10 -9
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +1 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +4 -1
- unstructured_ingest/v2/processes/embedder.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +50 -6
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/METADATA +21 -21
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/RECORD +19 -18
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/top_level.txt +0 -0
|
@@ -5,13 +5,25 @@ from pathlib import Path
|
|
|
5
5
|
import pytest
|
|
6
6
|
from office365.graph_client import GraphClient
|
|
7
7
|
|
|
8
|
-
from test.integration.connectors.utils.constants import
|
|
8
|
+
from test.integration.connectors.utils.constants import (
|
|
9
|
+
BLOB_STORAGE_TAG,
|
|
10
|
+
DESTINATION_TAG,
|
|
11
|
+
SOURCE_TAG,
|
|
12
|
+
)
|
|
13
|
+
from test.integration.connectors.utils.validation.source import (
|
|
14
|
+
SourceValidationConfigs,
|
|
15
|
+
source_connector_validation,
|
|
16
|
+
)
|
|
9
17
|
from test.integration.utils import requires_env
|
|
10
18
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
11
19
|
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
12
20
|
CONNECTOR_TYPE,
|
|
13
21
|
OnedriveAccessConfig,
|
|
14
22
|
OnedriveConnectionConfig,
|
|
23
|
+
OnedriveDownloader,
|
|
24
|
+
OnedriveDownloaderConfig,
|
|
25
|
+
OnedriveIndexer,
|
|
26
|
+
OnedriveIndexerConfig,
|
|
15
27
|
OnedriveUploader,
|
|
16
28
|
OnedriveUploaderConfig,
|
|
17
29
|
)
|
|
@@ -62,9 +74,46 @@ def get_connection_config():
|
|
|
62
74
|
return connection_config
|
|
63
75
|
|
|
64
76
|
|
|
77
|
+
@pytest.mark.asyncio
|
|
78
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
79
|
+
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
80
|
+
async def test_onedrive_source(temp_dir):
|
|
81
|
+
connection_config = get_connection_config()
|
|
82
|
+
index_config = OnedriveIndexerConfig(recursive=True, path="eml")
|
|
83
|
+
|
|
84
|
+
download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
|
|
85
|
+
|
|
86
|
+
# Instantiate indexer and downloader
|
|
87
|
+
indexer = OnedriveIndexer(
|
|
88
|
+
connection_config=connection_config,
|
|
89
|
+
index_config=index_config,
|
|
90
|
+
)
|
|
91
|
+
downloader = OnedriveDownloader(
|
|
92
|
+
connection_config=connection_config,
|
|
93
|
+
download_config=download_config,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Run the source connector validation
|
|
97
|
+
await source_connector_validation(
|
|
98
|
+
indexer=indexer,
|
|
99
|
+
downloader=downloader,
|
|
100
|
+
configs=SourceValidationConfigs(
|
|
101
|
+
test_id="onedrive",
|
|
102
|
+
expected_num_files=1,
|
|
103
|
+
validate_downloaded_files=True,
|
|
104
|
+
exclude_fields_extend=[
|
|
105
|
+
"metadata.date_created",
|
|
106
|
+
"metadata.date_modified",
|
|
107
|
+
"additional_metadata.LastModified",
|
|
108
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
109
|
+
],
|
|
110
|
+
),
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
65
114
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
|
|
66
115
|
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
67
|
-
def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
116
|
+
def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
68
117
|
"""
|
|
69
118
|
Integration test for the OneDrive destination connector.
|
|
70
119
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
|
|
6
|
+
from test.integration.connectors.utils.validation.source import (
|
|
7
|
+
SourceValidationConfigs,
|
|
8
|
+
source_connector_validation,
|
|
9
|
+
)
|
|
10
|
+
from test.integration.utils import requires_env
|
|
11
|
+
from unstructured_ingest.v2.processes.connectors.sharepoint import (
|
|
12
|
+
CONNECTOR_TYPE,
|
|
13
|
+
SharepointAccessConfig,
|
|
14
|
+
SharepointConnectionConfig,
|
|
15
|
+
SharepointDownloader,
|
|
16
|
+
SharepointDownloaderConfig,
|
|
17
|
+
SharepointIndexer,
|
|
18
|
+
SharepointIndexerConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.mark.asyncio
|
|
23
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
24
|
+
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
25
|
+
async def test_sharepoint_source(temp_dir):
|
|
26
|
+
# Retrieve environment variables
|
|
27
|
+
site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
|
|
28
|
+
client_id = os.environ["SHAREPOINT_CLIENT_ID"]
|
|
29
|
+
client_cred = os.environ["SHAREPOINT_CRED"]
|
|
30
|
+
user_pname = os.environ["MS_USER_PNAME"]
|
|
31
|
+
tenant = os.environ["MS_TENANT_ID"]
|
|
32
|
+
|
|
33
|
+
# Create connection and indexer configurations
|
|
34
|
+
access_config = SharepointAccessConfig(client_cred=client_cred)
|
|
35
|
+
connection_config = SharepointConnectionConfig(
|
|
36
|
+
client_id=client_id,
|
|
37
|
+
site=site,
|
|
38
|
+
tenant=tenant,
|
|
39
|
+
user_pname=user_pname,
|
|
40
|
+
access_config=access_config,
|
|
41
|
+
)
|
|
42
|
+
index_config = SharepointIndexerConfig(recursive=True)
|
|
43
|
+
|
|
44
|
+
download_config = SharepointDownloaderConfig(download_dir=temp_dir)
|
|
45
|
+
|
|
46
|
+
# Instantiate indexer and downloader
|
|
47
|
+
indexer = SharepointIndexer(
|
|
48
|
+
connection_config=connection_config,
|
|
49
|
+
index_config=index_config,
|
|
50
|
+
)
|
|
51
|
+
downloader = SharepointDownloader(
|
|
52
|
+
connection_config=connection_config,
|
|
53
|
+
download_config=download_config,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Run the source connector validation
|
|
57
|
+
await source_connector_validation(
|
|
58
|
+
indexer=indexer,
|
|
59
|
+
downloader=downloader,
|
|
60
|
+
configs=SourceValidationConfigs(
|
|
61
|
+
test_id="sharepoint",
|
|
62
|
+
expected_num_files=4,
|
|
63
|
+
validate_downloaded_files=True,
|
|
64
|
+
exclude_fields_extend=[
|
|
65
|
+
"metadata.date_created",
|
|
66
|
+
"metadata.date_modified",
|
|
67
|
+
"additional_metadata.LastModified",
|
|
68
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
69
|
+
],
|
|
70
|
+
),
|
|
71
|
+
)
|
|
@@ -10,6 +10,13 @@ from pydantic import Field
|
|
|
10
10
|
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
11
11
|
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
|
|
12
12
|
|
|
13
|
+
NONSTANDARD_METADATA_FIELDS = {
|
|
14
|
+
"additional_metadata.@microsoft.graph.downloadUrl": [
|
|
15
|
+
"additional_metadata",
|
|
16
|
+
"@microsoft.graph.downloadUrl",
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
class SourceValidationConfigs(ValidationConfig):
|
|
15
22
|
expected_number_indexed_file_data: Optional[int] = None
|
|
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
26
33
|
def get_exclude_fields(self) -> list[str]:
|
|
27
34
|
exclude_fields = self.exclude_fields
|
|
28
35
|
exclude_fields.extend(self.exclude_fields_extend)
|
|
29
|
-
return exclude_fields
|
|
36
|
+
return list(set(exclude_fields))
|
|
30
37
|
|
|
31
38
|
def run_file_data_validation(
|
|
32
39
|
self, predownload_file_data: FileData, postdownload_file_data: FileData
|
|
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
45
52
|
exclude_fields = self.get_exclude_fields()
|
|
46
53
|
# Ignore fields that dynamically change every time the tests run
|
|
47
54
|
copied_data = data.copy()
|
|
55
|
+
|
|
48
56
|
for exclude_field in exclude_fields:
|
|
49
|
-
exclude_field_vals =
|
|
57
|
+
exclude_field_vals = (
|
|
58
|
+
NONSTANDARD_METADATA_FIELDS[exclude_field]
|
|
59
|
+
if exclude_field in NONSTANDARD_METADATA_FIELDS
|
|
60
|
+
else exclude_field.split(".")
|
|
61
|
+
)
|
|
50
62
|
if len(exclude_field_vals) == 1:
|
|
51
63
|
current_val = copied_data
|
|
52
64
|
drop_field = exclude_field_vals[0]
|
|
@@ -261,21 +273,38 @@ async def source_connector_validation(
|
|
|
261
273
|
indexer.precheck()
|
|
262
274
|
download_dir = downloader.download_config.download_dir
|
|
263
275
|
test_output_dir = configs.test_output_dir()
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
276
|
+
if indexer.is_async():
|
|
277
|
+
async for file_data in indexer.run_async():
|
|
278
|
+
assert file_data
|
|
279
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
280
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
281
|
+
if downloader.is_async():
|
|
282
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
283
|
+
else:
|
|
284
|
+
resp = downloader.run(file_data=file_data)
|
|
285
|
+
if isinstance(resp, list):
|
|
286
|
+
for r in resp:
|
|
287
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
288
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
289
|
+
else:
|
|
290
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
291
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
292
|
+
else:
|
|
293
|
+
for file_data in indexer.run():
|
|
294
|
+
assert file_data
|
|
295
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
296
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
297
|
+
if downloader.is_async():
|
|
298
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
299
|
+
else:
|
|
300
|
+
resp = downloader.run(file_data=file_data)
|
|
301
|
+
if isinstance(resp, list):
|
|
302
|
+
for r in resp:
|
|
303
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
304
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
305
|
+
else:
|
|
306
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
275
307
|
all_postdownload_file_data.append(postdownload_file_data)
|
|
276
|
-
else:
|
|
277
|
-
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
278
|
-
all_postdownload_file_data.append(postdownload_file_data)
|
|
279
308
|
if not overwrite_fixtures:
|
|
280
309
|
print("Running validation")
|
|
281
310
|
run_all_validations(
|
|
@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
|
|
|
31
31
|
def test_bedrock_embedder(embedder_file: Path):
|
|
32
32
|
aws_credentials = get_aws_credentials()
|
|
33
33
|
embedder_config = EmbedderConfig(
|
|
34
|
-
embedding_provider="
|
|
34
|
+
embedding_provider="bedrock",
|
|
35
35
|
embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
36
36
|
embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
37
37
|
)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import os
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
|
|
@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
|
|
|
15
14
|
non_image_partition_files = [
|
|
16
15
|
path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
|
|
17
16
|
]
|
|
17
|
+
supported_fast_partition_files = [
|
|
18
|
+
path for path in non_image_partition_files if path.suffix != ".eml"
|
|
19
|
+
]
|
|
18
20
|
image_partition_files = [
|
|
19
21
|
path for path in all_partition_files if path not in non_image_partition_files
|
|
20
22
|
]
|
|
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
|
|
|
33
35
|
)
|
|
34
36
|
partitioner = Partitioner(config=partitioner_config)
|
|
35
37
|
results = await partitioner.run_async(filename=partition_file)
|
|
36
|
-
results_dir = int_test_dir / "results"
|
|
37
|
-
results_dir.mkdir(exist_ok=True)
|
|
38
|
-
results_path = results_dir / f"{partition_file.name}.json"
|
|
39
|
-
with results_path.open("w") as f:
|
|
40
|
-
json.dump(results, f, indent=2)
|
|
41
38
|
assert results
|
|
42
39
|
|
|
43
40
|
|
|
44
41
|
@pytest.mark.parametrize(
|
|
45
42
|
"partition_file",
|
|
46
|
-
|
|
47
|
-
ids=[path.name for path in
|
|
43
|
+
supported_fast_partition_files,
|
|
44
|
+
ids=[path.name for path in supported_fast_partition_files],
|
|
48
45
|
)
|
|
49
46
|
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
50
47
|
@pytest.mark.asyncio
|
|
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
|
|
|
68
65
|
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
69
66
|
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
70
67
|
partitioner_config = PartitionerConfig(
|
|
71
|
-
strategy="fast",
|
|
68
|
+
strategy="fast",
|
|
69
|
+
partition_by_api=True,
|
|
70
|
+
api_key=api_key,
|
|
71
|
+
partition_endpoint=api_url,
|
|
72
|
+
raise_unsupported_filetype=True,
|
|
72
73
|
)
|
|
73
74
|
partitioner = Partitioner(config=partitioner_config)
|
|
74
75
|
with pytest.raises(UserError):
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.4.7" # pragma: no cover
|
|
1
|
+
__version__ = "0.5.0" # pragma: no cover
|
|
@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
|
|
|
226
226
|
)
|
|
227
227
|
|
|
228
228
|
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
|
|
229
|
-
elif self.provider == "
|
|
229
|
+
elif self.provider == "bedrock":
|
|
230
230
|
from unstructured_ingest.embed.bedrock import (
|
|
231
231
|
BedrockEmbeddingConfig,
|
|
232
232
|
BedrockEmbeddingEncoder,
|
|
@@ -268,6 +268,7 @@ class Pipeline:
|
|
|
268
268
|
|
|
269
269
|
# Partition content
|
|
270
270
|
elements = self.partitioner_step(downloaded_data)
|
|
271
|
+
elements = self.clean_results(results=elements)
|
|
271
272
|
# Download data non longer needed, delete if possible
|
|
272
273
|
self.downloader_step.delete_cache()
|
|
273
274
|
elements = self.clean_results(results=elements)
|
|
@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
|
|
|
105
105
|
class OnedriveIndexer(Indexer):
|
|
106
106
|
connection_config: OnedriveConnectionConfig
|
|
107
107
|
index_config: OnedriveIndexerConfig
|
|
108
|
+
connector_type: str = CONNECTOR_TYPE
|
|
108
109
|
|
|
109
110
|
def precheck(self) -> None:
|
|
110
111
|
try:
|
|
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
|
|
|
172
173
|
)
|
|
173
174
|
return FileData(
|
|
174
175
|
identifier=drive_item.id,
|
|
175
|
-
connector_type=
|
|
176
|
+
connector_type=self.connector_type,
|
|
176
177
|
source_identifiers=SourceIdentifiers(
|
|
177
178
|
fullpath=server_path, filename=drive_item.name, rel_path=rel_path
|
|
178
179
|
),
|
|
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
|
|
|
201
202
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
202
203
|
if "error" in token_resp:
|
|
203
204
|
raise SourceConnectionError(
|
|
204
|
-
f"[{
|
|
205
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
206
|
+
f"({token_resp.get('error_description')})"
|
|
205
207
|
)
|
|
206
208
|
|
|
207
209
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
|
|
|
221
223
|
class OnedriveDownloader(Downloader):
|
|
222
224
|
connection_config: OnedriveConnectionConfig
|
|
223
225
|
download_config: OnedriveDownloaderConfig
|
|
226
|
+
connector_type: str = CONNECTOR_TYPE
|
|
224
227
|
|
|
225
228
|
@SourceConnectionNetworkError.wrap
|
|
226
229
|
def _fetch_file(self, file_data: FileData) -> DriveItem:
|
|
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
|
|
|
260
263
|
file.download_session(f).execute_query()
|
|
261
264
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
262
265
|
except Exception as e:
|
|
263
|
-
logger.error(
|
|
266
|
+
logger.error(
|
|
267
|
+
f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
|
|
268
|
+
)
|
|
264
269
|
# Re-raise to see full stack trace locally
|
|
265
270
|
raise
|
|
266
271
|
|
|
@@ -56,6 +56,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
|
56
56
|
class SharepointIndexer(OnedriveIndexer):
|
|
57
57
|
connection_config: SharepointConnectionConfig
|
|
58
58
|
index_config: SharepointIndexerConfig
|
|
59
|
+
connector_type: str = CONNECTOR_TYPE
|
|
59
60
|
|
|
60
61
|
@requires_dependencies(["office365"], extras="sharepoint")
|
|
61
62
|
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
|
|
@@ -64,7 +65,8 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
64
65
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
65
66
|
if "error" in token_resp:
|
|
66
67
|
raise SourceConnectionError(
|
|
67
|
-
f"[{
|
|
68
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
69
|
+
f"({token_resp.get('error_description')})"
|
|
68
70
|
)
|
|
69
71
|
|
|
70
72
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
@@ -90,6 +92,7 @@ class SharepointDownloaderConfig(OnedriveDownloaderConfig):
|
|
|
90
92
|
class SharepointDownloader(OnedriveDownloader):
|
|
91
93
|
connection_config: SharepointConnectionConfig
|
|
92
94
|
download_config: SharepointDownloaderConfig
|
|
95
|
+
connector_type: str = CONNECTOR_TYPE
|
|
93
96
|
|
|
94
97
|
@SourceConnectionNetworkError.wrap
|
|
95
98
|
@requires_dependencies(["office365"], extras="onedrive")
|
|
@@ -18,7 +18,7 @@ class EmbedderConfig(BaseModel):
|
|
|
18
18
|
"openai",
|
|
19
19
|
"azure-openai",
|
|
20
20
|
"huggingface",
|
|
21
|
-
"
|
|
21
|
+
"bedrock",
|
|
22
22
|
"vertexai",
|
|
23
23
|
"voyageai",
|
|
24
24
|
"octoai",
|
|
@@ -162,7 +162,7 @@ class EmbedderConfig(BaseModel):
|
|
|
162
162
|
if self.embedding_provider == "octoai":
|
|
163
163
|
return self.get_octoai_embedder(embedding_kwargs=kwargs)
|
|
164
164
|
|
|
165
|
-
if self.embedding_provider == "
|
|
165
|
+
if self.embedding_provider == "bedrock":
|
|
166
166
|
return self.get_bedrock_embedder()
|
|
167
167
|
|
|
168
168
|
if self.embedding_provider == "vertexai":
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from abc import ABC
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field, SecretStr
|
|
|
7
8
|
|
|
8
9
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
9
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import UserError
|
|
10
12
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
13
|
from unstructured_ingest.v2.logger import logger
|
|
12
14
|
from unstructured_ingest.v2.unstructured_api import call_api_async
|
|
@@ -73,6 +75,9 @@ class PartitionerConfig(BaseModel):
|
|
|
73
75
|
hi_res_model_name: Optional[str] = Field(
|
|
74
76
|
default=None, description="Model name for hi-res strategy."
|
|
75
77
|
)
|
|
78
|
+
raise_unsupported_filetype: bool = Field(
|
|
79
|
+
default=False, description="Raise an error if the file type is not supported"
|
|
80
|
+
)
|
|
76
81
|
|
|
77
82
|
def model_post_init(self, __context: Any) -> None:
|
|
78
83
|
if self.metadata_exclude and self.metadata_include:
|
|
@@ -151,13 +156,25 @@ class Partitioner(BaseProcess, ABC):
|
|
|
151
156
|
class FileDataSourceMetadata(DataSourceMetadata):
|
|
152
157
|
filesize_bytes: Optional[int] = None
|
|
153
158
|
|
|
159
|
+
metadata = metadata or {}
|
|
154
160
|
logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
|
|
155
161
|
logger.debug(f"partitioning file {filename} with metadata {metadata}")
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
162
|
+
try:
|
|
163
|
+
elements = partition(
|
|
164
|
+
filename=str(filename.resolve()),
|
|
165
|
+
data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
|
|
166
|
+
**self.config.to_partition_kwargs(),
|
|
167
|
+
)
|
|
168
|
+
except ValueError as sdk_error:
|
|
169
|
+
if (
|
|
170
|
+
self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
|
|
171
|
+
and not self.config.raise_unsupported_filetype
|
|
172
|
+
):
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
175
|
+
)
|
|
176
|
+
return []
|
|
177
|
+
raise sdk_error
|
|
161
178
|
return self.postprocess(elements=elements_to_dicts(elements))
|
|
162
179
|
|
|
163
180
|
@requires_dependencies(dependencies=["unstructured_client"], extras="remote")
|
|
@@ -179,10 +196,37 @@ class Partitioner(BaseProcess, ABC):
|
|
|
179
196
|
element["metadata"]["data_source"] = metadata
|
|
180
197
|
return self.postprocess(elements=elements)
|
|
181
198
|
|
|
199
|
+
def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
|
|
200
|
+
error_msg = sdk_error.args[0]
|
|
201
|
+
return (
|
|
202
|
+
"Invalid file" in error_msg
|
|
203
|
+
or "Unstructured schema" in error_msg
|
|
204
|
+
or "fast strategy is not available for image files" in error_msg
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
|
|
208
|
+
error_msg = error.args[0]
|
|
209
|
+
error_dict = json.loads(error_msg)
|
|
210
|
+
details = error_dict["detail"]
|
|
211
|
+
return "fast strategy is not available for image files" in details or (
|
|
212
|
+
"file type" in details.lower() and "is not supported" in details.lower()
|
|
213
|
+
)
|
|
214
|
+
|
|
182
215
|
def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
|
|
183
216
|
return self.partition_locally(filename, metadata=metadata, **kwargs)
|
|
184
217
|
|
|
185
218
|
async def run_async(
|
|
186
219
|
self, filename: Path, metadata: Optional[dict] = None, **kwargs
|
|
187
220
|
) -> list[dict]:
|
|
188
|
-
|
|
221
|
+
try:
|
|
222
|
+
return await self.partition_via_api(filename, metadata=metadata, **kwargs)
|
|
223
|
+
except UserError as user_error:
|
|
224
|
+
if (
|
|
225
|
+
self.is_client_error_unsupported_filetype(error=user_error)
|
|
226
|
+
and not self.config.raise_unsupported_filetype
|
|
227
|
+
):
|
|
228
|
+
logger.warning(
|
|
229
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
230
|
+
)
|
|
231
|
+
return []
|
|
232
|
+
raise user_error
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.4.7
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -23,30 +23,30 @@ Requires-Python: >=3.9.0,<3.14
|
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
25
|
Requires-Dist: pandas
|
|
26
|
-
Requires-Dist: pydantic>=2.7
|
|
27
26
|
Requires-Dist: dataclasses-json
|
|
27
|
+
Requires-Dist: pydantic>=2.7
|
|
28
|
+
Requires-Dist: click
|
|
29
|
+
Requires-Dist: tqdm
|
|
28
30
|
Requires-Dist: python-dateutil
|
|
29
31
|
Requires-Dist: opentelemetry-sdk
|
|
30
|
-
Requires-Dist: tqdm
|
|
31
|
-
Requires-Dist: click
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
38
37
|
Requires-Dist: adlfs; extra == "azure"
|
|
38
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-ai-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
42
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
43
42
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
43
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
44
44
|
Provides-Extra: biomed
|
|
45
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
46
|
Requires-Dist: bs4; extra == "biomed"
|
|
47
47
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
48
|
Requires-Dist: boxfs; extra == "box"
|
|
49
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
50
|
Provides-Extra: chroma
|
|
51
51
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
52
|
Provides-Extra: clarifai
|
|
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
|
63
63
|
Provides-Extra: databricks-volumes
|
|
64
64
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
65
65
|
Provides-Extra: delta-table
|
|
66
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
67
66
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
67
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
68
68
|
Provides-Extra: discord
|
|
69
69
|
Requires-Dist: discord.py; extra == "discord"
|
|
70
70
|
Provides-Extra: doc
|
|
@@ -92,19 +92,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
92
92
|
Provides-Extra: epub
|
|
93
93
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
94
94
|
Provides-Extra: gcs
|
|
95
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
96
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
97
95
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
96
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
97
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
98
98
|
Provides-Extra: github
|
|
99
|
-
Requires-Dist: requests; extra == "github"
|
|
100
99
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
100
|
+
Requires-Dist: requests; extra == "github"
|
|
101
101
|
Provides-Extra: gitlab
|
|
102
102
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
103
103
|
Provides-Extra: google-drive
|
|
104
104
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
105
105
|
Provides-Extra: hubspot
|
|
106
|
-
Requires-Dist: urllib3; extra == "hubspot"
|
|
107
106
|
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
107
|
+
Requires-Dist: urllib3; extra == "hubspot"
|
|
108
108
|
Provides-Extra: jira
|
|
109
109
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
110
110
|
Provides-Extra: kafka
|
|
@@ -127,15 +127,15 @@ Requires-Dist: cymple; extra == "neo4j"
|
|
|
127
127
|
Requires-Dist: neo4j; extra == "neo4j"
|
|
128
128
|
Provides-Extra: notion
|
|
129
129
|
Requires-Dist: httpx; extra == "notion"
|
|
130
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
131
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
132
130
|
Requires-Dist: backoff; extra == "notion"
|
|
131
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
132
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
133
133
|
Provides-Extra: odt
|
|
134
134
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
137
136
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
138
137
|
Requires-Dist: bs4; extra == "onedrive"
|
|
138
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
139
139
|
Provides-Extra: openai
|
|
140
140
|
Requires-Dist: tiktoken; extra == "openai"
|
|
141
141
|
Requires-Dist: openai; extra == "openai"
|
|
@@ -144,8 +144,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
144
144
|
Provides-Extra: org
|
|
145
145
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
146
146
|
Provides-Extra: outlook
|
|
147
|
-
Requires-Dist: msal; extra == "outlook"
|
|
148
147
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
148
|
+
Requires-Dist: msal; extra == "outlook"
|
|
149
149
|
Provides-Extra: pdf
|
|
150
150
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
151
151
|
Provides-Extra: pinecone
|
|
@@ -169,35 +169,35 @@ Requires-Dist: unstructured[rst]; extra == "rst"
|
|
|
169
169
|
Provides-Extra: rtf
|
|
170
170
|
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
171
171
|
Provides-Extra: s3
|
|
172
|
-
Requires-Dist: fsspec; extra == "s3"
|
|
173
172
|
Requires-Dist: s3fs; extra == "s3"
|
|
173
|
+
Requires-Dist: fsspec; extra == "s3"
|
|
174
174
|
Provides-Extra: salesforce
|
|
175
175
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
176
176
|
Provides-Extra: sftp
|
|
177
177
|
Requires-Dist: paramiko; extra == "sftp"
|
|
178
178
|
Requires-Dist: fsspec; extra == "sftp"
|
|
179
179
|
Provides-Extra: sharepoint
|
|
180
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
181
180
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
181
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
182
182
|
Provides-Extra: singlestore
|
|
183
183
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
184
184
|
Provides-Extra: slack
|
|
185
185
|
Requires-Dist: slack-sdk[optional]; extra == "slack"
|
|
186
186
|
Provides-Extra: snowflake
|
|
187
|
-
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
188
187
|
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
188
|
+
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
189
189
|
Provides-Extra: togetherai
|
|
190
190
|
Requires-Dist: together; extra == "togetherai"
|
|
191
191
|
Provides-Extra: tsv
|
|
192
192
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
193
193
|
Provides-Extra: vastdb
|
|
194
194
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
195
|
-
Requires-Dist: pyarrow; extra == "vastdb"
|
|
196
195
|
Requires-Dist: ibis; extra == "vastdb"
|
|
196
|
+
Requires-Dist: pyarrow; extra == "vastdb"
|
|
197
197
|
Provides-Extra: vectara
|
|
198
|
-
Requires-Dist: aiofiles; extra == "vectara"
|
|
199
198
|
Requires-Dist: httpx; extra == "vectara"
|
|
200
199
|
Requires-Dist: requests; extra == "vectara"
|
|
200
|
+
Requires-Dist: aiofiles; extra == "vectara"
|
|
201
201
|
Provides-Extra: weaviate
|
|
202
202
|
Requires-Dist: weaviate-client; extra == "weaviate"
|
|
203
203
|
Provides-Extra: wikipedia
|
|
@@ -15,11 +15,12 @@ test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8D
|
|
|
15
15
|
test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
|
|
16
16
|
test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
|
|
17
17
|
test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
|
|
18
|
-
test/integration/connectors/test_onedrive.py,sha256=
|
|
18
|
+
test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
|
|
19
19
|
test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
|
|
20
20
|
test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
|
|
21
21
|
test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
|
|
22
22
|
test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
|
|
23
|
+
test/integration/connectors/test_sharepoint.py,sha256=8HlcnrP4K8oPUzef6AA11P2cMlxSp7tiddTkT4JOeRU,2378
|
|
23
24
|
test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
|
|
24
25
|
test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
26
|
test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
|
|
@@ -46,7 +47,7 @@ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQ
|
|
|
46
47
|
test/integration/connectors/utils/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
48
|
test/integration/connectors/utils/validation/destination.py,sha256=ZvMSvqz9in35xaoUJGx9rG8oWCU3FYlfLLQ6sfdI0pw,2649
|
|
48
49
|
test/integration/connectors/utils/validation/equality.py,sha256=R6d_1c-Si5518WJcBcshF_wBRnywnZ0ORQ-NL0xNmGo,2602
|
|
49
|
-
test/integration/connectors/utils/validation/source.py,sha256=
|
|
50
|
+
test/integration/connectors/utils/validation/source.py,sha256=xnAZI26ILdeMhgrWAGrU2N2fqK58YNGkfyUhJekZ0Ho,13541
|
|
50
51
|
test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_JjB4w2Sf8aQPvDVT5vZTs05ILs,1428
|
|
51
52
|
test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
52
53
|
test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
|
|
@@ -55,7 +56,7 @@ test/integration/connectors/weaviate/test_local.py,sha256=gXMpnzVcrNQdptDjx0haPW
|
|
|
55
56
|
test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
57
|
test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
|
|
57
58
|
test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
|
|
58
|
-
test/integration/embedders/test_bedrock.py,sha256=
|
|
59
|
+
test/integration/embedders/test_bedrock.py,sha256=vmjoi1uUk-LX4Yz0ZPn6Ry1JdVEsyIhLhPbSPmkeT9o,3553
|
|
59
60
|
test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
|
|
60
61
|
test/integration/embedders/test_mixedbread.py,sha256=lLz_cooyC38VSo-FMHbhKpHvYs3QzA20NOIvM5oooaw,1998
|
|
61
62
|
test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
|
|
@@ -65,7 +66,7 @@ test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gT
|
|
|
65
66
|
test/integration/embedders/test_voyageai.py,sha256=Gm3sVjhsym1ASIDfr-sZoCbpsNMaAk_l4E3-dtjRCQ4,1832
|
|
66
67
|
test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
|
|
67
68
|
test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
-
test/integration/partitioners/test_partitioner.py,sha256=
|
|
69
|
+
test/integration/partitioners/test_partitioner.py,sha256=6sdZhhtqEICBPqEgpKrCQIfJ-7hKcwuTFqjWs1mbQf8,2787
|
|
69
70
|
test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
70
71
|
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
71
72
|
test/unit/test_html.py,sha256=LKGi_QaH4U4gktrbd2NcURL-d-0Rm1UnG5Y6r9EvTG0,4489
|
|
@@ -102,9 +103,9 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
102
103
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
104
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
104
105
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
105
|
-
unstructured_ingest/__version__.py,sha256=
|
|
106
|
+
unstructured_ingest/__version__.py,sha256=A09Ks7MDqP-QtYP9TIQMxydOZeCTtu9i7xoq5wdy4As,42
|
|
106
107
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
107
|
-
unstructured_ingest/interfaces.py,sha256=
|
|
108
|
+
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
108
109
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
109
110
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
110
111
|
unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
|
|
@@ -112,7 +113,7 @@ unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29
|
|
|
112
113
|
unstructured_ingest/cli/cli.py,sha256=LutBTBYMqboKw8cputHVszpenyfnySzcUC15ifwuYyg,1049
|
|
113
114
|
unstructured_ingest/cli/cmd_factory.py,sha256=UdHm1KacTombpF6DxyTSwTCuApsKHUYw_kVu5Nhcy3Y,364
|
|
114
115
|
unstructured_ingest/cli/common.py,sha256=I0El08FHz5kxw7iz0VWOWPrvcJD1rBgXJSwVIpVmmwU,204
|
|
115
|
-
unstructured_ingest/cli/interfaces.py,sha256=
|
|
116
|
+
unstructured_ingest/cli/interfaces.py,sha256=pvEwNfYwINx3-TQ0LPudjpYNR3PnanUiXpEePPEtRSw,24086
|
|
116
117
|
unstructured_ingest/cli/utils.py,sha256=KNhkFNKOeEihc8HlvMz_MTbYVQNFklrBKbC8xg9h1xE,7982
|
|
117
118
|
unstructured_ingest/cli/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
118
119
|
unstructured_ingest/cli/base/cmd.py,sha256=BbfjA2v203Jh-7DL6bzxQ7fOeNixd5BsBMuzXz6h5IQ,583
|
|
@@ -399,7 +400,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
|
|
|
399
400
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
400
401
|
unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
|
|
401
402
|
unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
|
|
402
|
-
unstructured_ingest/v2/pipeline/pipeline.py,sha256=
|
|
403
|
+
unstructured_ingest/v2/pipeline/pipeline.py,sha256=b37fQGm_lGutQ3Jc0qePB15lkBiFavH9tCso3inm-3I,16564
|
|
403
404
|
unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
404
405
|
unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
|
|
405
406
|
unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
|
|
@@ -413,9 +414,9 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
|
|
|
413
414
|
unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
|
|
414
415
|
unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
|
|
415
416
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
|
|
416
|
-
unstructured_ingest/v2/processes/embedder.py,sha256=
|
|
417
|
+
unstructured_ingest/v2/processes/embedder.py,sha256=PTBlRgNbAXkSaLg7JrZzHwAoqpHmopg8jNU1TmaXguU,7804
|
|
417
418
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
418
|
-
unstructured_ingest/v2/processes/partitioner.py,sha256=
|
|
419
|
+
unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
|
|
419
420
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
420
421
|
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
|
|
421
422
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
|
|
@@ -433,12 +434,12 @@ unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWo
|
|
|
433
434
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
|
|
434
435
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
|
|
435
436
|
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
|
|
436
|
-
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=
|
|
437
|
+
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
|
|
437
438
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
438
439
|
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
|
|
439
440
|
unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
|
|
440
441
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
441
|
-
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=
|
|
442
|
+
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=f0F7KioXgucVc3tVASTa67ynlTa4s9_FKGPHop6Xm0A,4563
|
|
442
443
|
unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
|
|
443
444
|
unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
|
|
444
445
|
unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
|
|
@@ -562,9 +563,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
|
|
|
562
563
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
563
564
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
564
565
|
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
|
|
565
|
-
unstructured_ingest-0.
|
|
566
|
-
unstructured_ingest-0.
|
|
567
|
-
unstructured_ingest-0.
|
|
568
|
-
unstructured_ingest-0.
|
|
569
|
-
unstructured_ingest-0.
|
|
570
|
-
unstructured_ingest-0.
|
|
566
|
+
unstructured_ingest-0.5.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
567
|
+
unstructured_ingest-0.5.0.dist-info/METADATA,sha256=dyxZ7tfjq1tkZPJgaK6ZanQwB6pteSIznmfUhAgnT64,8051
|
|
568
|
+
unstructured_ingest-0.5.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
569
|
+
unstructured_ingest-0.5.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
570
|
+
unstructured_ingest-0.5.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
571
|
+
unstructured_ingest-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|