unstructured-ingest 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/sql/test_vastdb.py +34 -0
- test/integration/connectors/test_google_drive.py +116 -0
- test/integration/connectors/test_onedrive.py +51 -2
- test/integration/connectors/test_sharepoint.py +71 -0
- test/integration/connectors/utils/validation/source.py +45 -16
- test/integration/embedders/test_bedrock.py +1 -1
- test/integration/partitioners/test_partitioner.py +10 -9
- test/unit/v2/connectors/motherduck/__init__.py +0 -0
- test/unit/v2/connectors/motherduck/test_base.py +74 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +1 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +12 -10
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -3
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +4 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +3 -47
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +4 -12
- unstructured_ingest/v2/processes/embedder.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +50 -6
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/METADATA +23 -23
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/RECORD +29 -24
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from _pytest.fixtures import TopRequest
|
|
5
|
+
|
|
6
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
7
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
8
|
+
StagerValidationConfigs,
|
|
9
|
+
stager_validation,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.v2.processes.connectors.sql.vastdb import (
|
|
12
|
+
CONNECTOR_TYPE,
|
|
13
|
+
VastdbUploadStager,
|
|
14
|
+
VastdbUploadStagerConfig,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
19
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
20
|
+
def test_vast_stager(
|
|
21
|
+
request: TopRequest,
|
|
22
|
+
upload_file_str: str,
|
|
23
|
+
tmp_path: Path,
|
|
24
|
+
):
|
|
25
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
26
|
+
stager = VastdbUploadStager(
|
|
27
|
+
upload_stager_config=VastdbUploadStagerConfig(rename_columns_map={"page_number": "page"})
|
|
28
|
+
)
|
|
29
|
+
stager_validation(
|
|
30
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
31
|
+
input_file=upload_file,
|
|
32
|
+
stager=stager,
|
|
33
|
+
tmp_dir=tmp_path,
|
|
34
|
+
)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from test.integration.connectors.utils.constants import (
|
|
6
|
+
SOURCE_TAG,
|
|
7
|
+
UNCATEGORIZED_TAG,
|
|
8
|
+
)
|
|
9
|
+
from test.integration.connectors.utils.validation.source import (
|
|
10
|
+
SourceValidationConfigs,
|
|
11
|
+
get_all_file_data,
|
|
12
|
+
run_all_validations,
|
|
13
|
+
update_fixtures,
|
|
14
|
+
)
|
|
15
|
+
from test.integration.utils import requires_env
|
|
16
|
+
from unstructured_ingest.v2.interfaces import Downloader, Indexer
|
|
17
|
+
from unstructured_ingest.v2.processes.connectors.google_drive import (
|
|
18
|
+
CONNECTOR_TYPE,
|
|
19
|
+
GoogleDriveAccessConfig,
|
|
20
|
+
GoogleDriveConnectionConfig,
|
|
21
|
+
GoogleDriveDownloader,
|
|
22
|
+
GoogleDriveDownloaderConfig,
|
|
23
|
+
GoogleDriveIndexer,
|
|
24
|
+
GoogleDriveIndexerConfig,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@requires_env("GOOGLE_DRIVE_SERVICE_KEY")
|
|
29
|
+
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
|
|
30
|
+
def test_google_drive_source(temp_dir):
|
|
31
|
+
# Retrieve environment variables
|
|
32
|
+
service_account_key = os.environ["GOOGLE_DRIVE_SERVICE_KEY"]
|
|
33
|
+
|
|
34
|
+
# Create connection and indexer configurations
|
|
35
|
+
access_config = GoogleDriveAccessConfig(service_account_key=service_account_key)
|
|
36
|
+
connection_config = GoogleDriveConnectionConfig(
|
|
37
|
+
drive_id="1XidSOO76VpZ4m0i3gJN2m1X0Obol3UAi",
|
|
38
|
+
access_config=access_config,
|
|
39
|
+
)
|
|
40
|
+
index_config = GoogleDriveIndexerConfig(recursive=True)
|
|
41
|
+
|
|
42
|
+
download_config = GoogleDriveDownloaderConfig(download_dir=temp_dir)
|
|
43
|
+
|
|
44
|
+
# Instantiate indexer and downloader
|
|
45
|
+
indexer = GoogleDriveIndexer(
|
|
46
|
+
connection_config=connection_config,
|
|
47
|
+
index_config=index_config,
|
|
48
|
+
)
|
|
49
|
+
downloader = GoogleDriveDownloader(
|
|
50
|
+
connection_config=connection_config,
|
|
51
|
+
download_config=download_config,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Run the source connector validation
|
|
55
|
+
source_connector_validation(
|
|
56
|
+
indexer=indexer,
|
|
57
|
+
downloader=downloader,
|
|
58
|
+
configs=SourceValidationConfigs(
|
|
59
|
+
test_id="google_drive_source",
|
|
60
|
+
expected_num_files=1,
|
|
61
|
+
validate_downloaded_files=True,
|
|
62
|
+
),
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
|
|
67
|
+
def source_connector_validation(
|
|
68
|
+
indexer: Indexer,
|
|
69
|
+
downloader: Downloader,
|
|
70
|
+
configs: SourceValidationConfigs,
|
|
71
|
+
overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
|
|
72
|
+
) -> None:
|
|
73
|
+
# Run common validations on the process of running a source connector, supporting dynamic
|
|
74
|
+
# validators that get passed in along with comparisons on the saved expected values.
|
|
75
|
+
# If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the
|
|
76
|
+
# expected values with what gets generated by this test.
|
|
77
|
+
all_predownload_file_data = []
|
|
78
|
+
all_postdownload_file_data = []
|
|
79
|
+
indexer.precheck()
|
|
80
|
+
download_dir = downloader.download_config.download_dir
|
|
81
|
+
test_output_dir = configs.test_output_dir()
|
|
82
|
+
|
|
83
|
+
for file_data in indexer.run():
|
|
84
|
+
assert file_data
|
|
85
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
86
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
87
|
+
resp = downloader.run(file_data=file_data)
|
|
88
|
+
if isinstance(resp, list):
|
|
89
|
+
for r in resp:
|
|
90
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
91
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
92
|
+
else:
|
|
93
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
94
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
95
|
+
|
|
96
|
+
if not overwrite_fixtures:
|
|
97
|
+
print("Running validation")
|
|
98
|
+
run_all_validations(
|
|
99
|
+
configs=configs,
|
|
100
|
+
predownload_file_data=all_predownload_file_data,
|
|
101
|
+
postdownload_file_data=all_postdownload_file_data,
|
|
102
|
+
download_dir=download_dir,
|
|
103
|
+
test_output_dir=test_output_dir,
|
|
104
|
+
)
|
|
105
|
+
else:
|
|
106
|
+
print("Running fixtures update")
|
|
107
|
+
update_fixtures(
|
|
108
|
+
output_dir=test_output_dir,
|
|
109
|
+
download_dir=download_dir,
|
|
110
|
+
all_file_data=get_all_file_data(
|
|
111
|
+
all_predownload_file_data=all_predownload_file_data,
|
|
112
|
+
all_postdownload_file_data=all_postdownload_file_data,
|
|
113
|
+
),
|
|
114
|
+
save_downloads=configs.validate_downloaded_files,
|
|
115
|
+
save_filedata=configs.validate_file_data,
|
|
116
|
+
)
|
|
@@ -5,13 +5,25 @@ from pathlib import Path
|
|
|
5
5
|
import pytest
|
|
6
6
|
from office365.graph_client import GraphClient
|
|
7
7
|
|
|
8
|
-
from test.integration.connectors.utils.constants import
|
|
8
|
+
from test.integration.connectors.utils.constants import (
|
|
9
|
+
BLOB_STORAGE_TAG,
|
|
10
|
+
DESTINATION_TAG,
|
|
11
|
+
SOURCE_TAG,
|
|
12
|
+
)
|
|
13
|
+
from test.integration.connectors.utils.validation.source import (
|
|
14
|
+
SourceValidationConfigs,
|
|
15
|
+
source_connector_validation,
|
|
16
|
+
)
|
|
9
17
|
from test.integration.utils import requires_env
|
|
10
18
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
11
19
|
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
12
20
|
CONNECTOR_TYPE,
|
|
13
21
|
OnedriveAccessConfig,
|
|
14
22
|
OnedriveConnectionConfig,
|
|
23
|
+
OnedriveDownloader,
|
|
24
|
+
OnedriveDownloaderConfig,
|
|
25
|
+
OnedriveIndexer,
|
|
26
|
+
OnedriveIndexerConfig,
|
|
15
27
|
OnedriveUploader,
|
|
16
28
|
OnedriveUploaderConfig,
|
|
17
29
|
)
|
|
@@ -62,9 +74,46 @@ def get_connection_config():
|
|
|
62
74
|
return connection_config
|
|
63
75
|
|
|
64
76
|
|
|
77
|
+
@pytest.mark.asyncio
|
|
78
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
79
|
+
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
80
|
+
async def test_onedrive_source(temp_dir):
|
|
81
|
+
connection_config = get_connection_config()
|
|
82
|
+
index_config = OnedriveIndexerConfig(recursive=True, path="eml")
|
|
83
|
+
|
|
84
|
+
download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
|
|
85
|
+
|
|
86
|
+
# Instantiate indexer and downloader
|
|
87
|
+
indexer = OnedriveIndexer(
|
|
88
|
+
connection_config=connection_config,
|
|
89
|
+
index_config=index_config,
|
|
90
|
+
)
|
|
91
|
+
downloader = OnedriveDownloader(
|
|
92
|
+
connection_config=connection_config,
|
|
93
|
+
download_config=download_config,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Run the source connector validation
|
|
97
|
+
await source_connector_validation(
|
|
98
|
+
indexer=indexer,
|
|
99
|
+
downloader=downloader,
|
|
100
|
+
configs=SourceValidationConfigs(
|
|
101
|
+
test_id="onedrive",
|
|
102
|
+
expected_num_files=1,
|
|
103
|
+
validate_downloaded_files=True,
|
|
104
|
+
exclude_fields_extend=[
|
|
105
|
+
"metadata.date_created",
|
|
106
|
+
"metadata.date_modified",
|
|
107
|
+
"additional_metadata.LastModified",
|
|
108
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
109
|
+
],
|
|
110
|
+
),
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
65
114
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
|
|
66
115
|
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
67
|
-
def
|
|
116
|
+
def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
68
117
|
"""
|
|
69
118
|
Integration test for the OneDrive destination connector.
|
|
70
119
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
|
|
6
|
+
from test.integration.connectors.utils.validation.source import (
|
|
7
|
+
SourceValidationConfigs,
|
|
8
|
+
source_connector_validation,
|
|
9
|
+
)
|
|
10
|
+
from test.integration.utils import requires_env
|
|
11
|
+
from unstructured_ingest.v2.processes.connectors.sharepoint import (
|
|
12
|
+
CONNECTOR_TYPE,
|
|
13
|
+
SharepointAccessConfig,
|
|
14
|
+
SharepointConnectionConfig,
|
|
15
|
+
SharepointDownloader,
|
|
16
|
+
SharepointDownloaderConfig,
|
|
17
|
+
SharepointIndexer,
|
|
18
|
+
SharepointIndexerConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.mark.asyncio
|
|
23
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
24
|
+
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
25
|
+
async def test_sharepoint_source(temp_dir):
|
|
26
|
+
# Retrieve environment variables
|
|
27
|
+
site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
|
|
28
|
+
client_id = os.environ["SHAREPOINT_CLIENT_ID"]
|
|
29
|
+
client_cred = os.environ["SHAREPOINT_CRED"]
|
|
30
|
+
user_pname = os.environ["MS_USER_PNAME"]
|
|
31
|
+
tenant = os.environ["MS_TENANT_ID"]
|
|
32
|
+
|
|
33
|
+
# Create connection and indexer configurations
|
|
34
|
+
access_config = SharepointAccessConfig(client_cred=client_cred)
|
|
35
|
+
connection_config = SharepointConnectionConfig(
|
|
36
|
+
client_id=client_id,
|
|
37
|
+
site=site,
|
|
38
|
+
tenant=tenant,
|
|
39
|
+
user_pname=user_pname,
|
|
40
|
+
access_config=access_config,
|
|
41
|
+
)
|
|
42
|
+
index_config = SharepointIndexerConfig(recursive=True)
|
|
43
|
+
|
|
44
|
+
download_config = SharepointDownloaderConfig(download_dir=temp_dir)
|
|
45
|
+
|
|
46
|
+
# Instantiate indexer and downloader
|
|
47
|
+
indexer = SharepointIndexer(
|
|
48
|
+
connection_config=connection_config,
|
|
49
|
+
index_config=index_config,
|
|
50
|
+
)
|
|
51
|
+
downloader = SharepointDownloader(
|
|
52
|
+
connection_config=connection_config,
|
|
53
|
+
download_config=download_config,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Run the source connector validation
|
|
57
|
+
await source_connector_validation(
|
|
58
|
+
indexer=indexer,
|
|
59
|
+
downloader=downloader,
|
|
60
|
+
configs=SourceValidationConfigs(
|
|
61
|
+
test_id="sharepoint",
|
|
62
|
+
expected_num_files=4,
|
|
63
|
+
validate_downloaded_files=True,
|
|
64
|
+
exclude_fields_extend=[
|
|
65
|
+
"metadata.date_created",
|
|
66
|
+
"metadata.date_modified",
|
|
67
|
+
"additional_metadata.LastModified",
|
|
68
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
69
|
+
],
|
|
70
|
+
),
|
|
71
|
+
)
|
|
@@ -10,6 +10,13 @@ from pydantic import Field
|
|
|
10
10
|
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
11
11
|
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
|
|
12
12
|
|
|
13
|
+
NONSTANDARD_METADATA_FIELDS = {
|
|
14
|
+
"additional_metadata.@microsoft.graph.downloadUrl": [
|
|
15
|
+
"additional_metadata",
|
|
16
|
+
"@microsoft.graph.downloadUrl",
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
class SourceValidationConfigs(ValidationConfig):
|
|
15
22
|
expected_number_indexed_file_data: Optional[int] = None
|
|
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
26
33
|
def get_exclude_fields(self) -> list[str]:
|
|
27
34
|
exclude_fields = self.exclude_fields
|
|
28
35
|
exclude_fields.extend(self.exclude_fields_extend)
|
|
29
|
-
return exclude_fields
|
|
36
|
+
return list(set(exclude_fields))
|
|
30
37
|
|
|
31
38
|
def run_file_data_validation(
|
|
32
39
|
self, predownload_file_data: FileData, postdownload_file_data: FileData
|
|
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
45
52
|
exclude_fields = self.get_exclude_fields()
|
|
46
53
|
# Ignore fields that dynamically change every time the tests run
|
|
47
54
|
copied_data = data.copy()
|
|
55
|
+
|
|
48
56
|
for exclude_field in exclude_fields:
|
|
49
|
-
exclude_field_vals =
|
|
57
|
+
exclude_field_vals = (
|
|
58
|
+
NONSTANDARD_METADATA_FIELDS[exclude_field]
|
|
59
|
+
if exclude_field in NONSTANDARD_METADATA_FIELDS
|
|
60
|
+
else exclude_field.split(".")
|
|
61
|
+
)
|
|
50
62
|
if len(exclude_field_vals) == 1:
|
|
51
63
|
current_val = copied_data
|
|
52
64
|
drop_field = exclude_field_vals[0]
|
|
@@ -261,21 +273,38 @@ async def source_connector_validation(
|
|
|
261
273
|
indexer.precheck()
|
|
262
274
|
download_dir = downloader.download_config.download_dir
|
|
263
275
|
test_output_dir = configs.test_output_dir()
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
276
|
+
if indexer.is_async():
|
|
277
|
+
async for file_data in indexer.run_async():
|
|
278
|
+
assert file_data
|
|
279
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
280
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
281
|
+
if downloader.is_async():
|
|
282
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
283
|
+
else:
|
|
284
|
+
resp = downloader.run(file_data=file_data)
|
|
285
|
+
if isinstance(resp, list):
|
|
286
|
+
for r in resp:
|
|
287
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
288
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
289
|
+
else:
|
|
290
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
291
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
292
|
+
else:
|
|
293
|
+
for file_data in indexer.run():
|
|
294
|
+
assert file_data
|
|
295
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
296
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
297
|
+
if downloader.is_async():
|
|
298
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
299
|
+
else:
|
|
300
|
+
resp = downloader.run(file_data=file_data)
|
|
301
|
+
if isinstance(resp, list):
|
|
302
|
+
for r in resp:
|
|
303
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
304
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
305
|
+
else:
|
|
306
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
275
307
|
all_postdownload_file_data.append(postdownload_file_data)
|
|
276
|
-
else:
|
|
277
|
-
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
278
|
-
all_postdownload_file_data.append(postdownload_file_data)
|
|
279
308
|
if not overwrite_fixtures:
|
|
280
309
|
print("Running validation")
|
|
281
310
|
run_all_validations(
|
|
@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
|
|
|
31
31
|
def test_bedrock_embedder(embedder_file: Path):
|
|
32
32
|
aws_credentials = get_aws_credentials()
|
|
33
33
|
embedder_config = EmbedderConfig(
|
|
34
|
-
embedding_provider="
|
|
34
|
+
embedding_provider="bedrock",
|
|
35
35
|
embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
36
36
|
embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
37
37
|
)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import os
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
|
|
@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
|
|
|
15
14
|
non_image_partition_files = [
|
|
16
15
|
path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
|
|
17
16
|
]
|
|
17
|
+
supported_fast_partition_files = [
|
|
18
|
+
path for path in non_image_partition_files if path.suffix != ".eml"
|
|
19
|
+
]
|
|
18
20
|
image_partition_files = [
|
|
19
21
|
path for path in all_partition_files if path not in non_image_partition_files
|
|
20
22
|
]
|
|
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
|
|
|
33
35
|
)
|
|
34
36
|
partitioner = Partitioner(config=partitioner_config)
|
|
35
37
|
results = await partitioner.run_async(filename=partition_file)
|
|
36
|
-
results_dir = int_test_dir / "results"
|
|
37
|
-
results_dir.mkdir(exist_ok=True)
|
|
38
|
-
results_path = results_dir / f"{partition_file.name}.json"
|
|
39
|
-
with results_path.open("w") as f:
|
|
40
|
-
json.dump(results, f, indent=2)
|
|
41
38
|
assert results
|
|
42
39
|
|
|
43
40
|
|
|
44
41
|
@pytest.mark.parametrize(
|
|
45
42
|
"partition_file",
|
|
46
|
-
|
|
47
|
-
ids=[path.name for path in
|
|
43
|
+
supported_fast_partition_files,
|
|
44
|
+
ids=[path.name for path in supported_fast_partition_files],
|
|
48
45
|
)
|
|
49
46
|
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
50
47
|
@pytest.mark.asyncio
|
|
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
|
|
|
68
65
|
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
69
66
|
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
70
67
|
partitioner_config = PartitionerConfig(
|
|
71
|
-
strategy="fast",
|
|
68
|
+
strategy="fast",
|
|
69
|
+
partition_by_api=True,
|
|
70
|
+
api_key=api_key,
|
|
71
|
+
partition_endpoint=api_url,
|
|
72
|
+
raise_unsupported_filetype=True,
|
|
72
73
|
)
|
|
73
74
|
partitioner = Partitioner(config=partitioner_config)
|
|
74
75
|
with pytest.raises(UserError):
|
|
File without changes
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from pytest_mock import MockerFixture
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
7
|
+
from unstructured_ingest.v2.interfaces.file_data import SourceIdentifiers
|
|
8
|
+
from unstructured_ingest.v2.interfaces.upload_stager import UploadStagerConfig
|
|
9
|
+
from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.fixture
|
|
13
|
+
def mock_instance() -> BaseDuckDBUploadStager:
|
|
14
|
+
return BaseDuckDBUploadStager(UploadStagerConfig())
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.mark.parametrize(
|
|
18
|
+
("input_filepath", "output_filename", "expected"),
|
|
19
|
+
[
|
|
20
|
+
(
|
|
21
|
+
"/path/to/input_file.ndjson",
|
|
22
|
+
"output_file.ndjson",
|
|
23
|
+
"output_file.ndjson",
|
|
24
|
+
),
|
|
25
|
+
("input_file.txt", "output_file.json", "output_file.txt"),
|
|
26
|
+
("/path/to/input_file.json", "output_file", "output_file.json"),
|
|
27
|
+
],
|
|
28
|
+
)
|
|
29
|
+
def test_run_output_filename_suffix(
|
|
30
|
+
mocker: MockerFixture,
|
|
31
|
+
mock_instance: BaseDuckDBUploadStager,
|
|
32
|
+
input_filepath: str,
|
|
33
|
+
output_filename: str,
|
|
34
|
+
expected: str,
|
|
35
|
+
):
|
|
36
|
+
output_dir = Path("/tmp/test/output_dir")
|
|
37
|
+
|
|
38
|
+
# Mocks
|
|
39
|
+
mock_get_data = mocker.patch(
|
|
40
|
+
"unstructured_ingest.v2.processes.connectors.duckdb.base.get_data",
|
|
41
|
+
return_value=[{"key": "value"}, {"key": "value2"}],
|
|
42
|
+
)
|
|
43
|
+
mock_conform_dict = mocker.patch.object(
|
|
44
|
+
BaseDuckDBUploadStager,
|
|
45
|
+
"conform_dict",
|
|
46
|
+
side_effect=lambda element_dict, file_data: element_dict,
|
|
47
|
+
)
|
|
48
|
+
mock_get_output_path = mocker.patch.object(
|
|
49
|
+
BaseDuckDBUploadStager, "get_output_path", return_value=output_dir / expected
|
|
50
|
+
)
|
|
51
|
+
mock_write_output = mocker.patch(
|
|
52
|
+
"unstructured_ingest.v2.processes.connectors.duckdb.base.write_data", return_value=None
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Act
|
|
56
|
+
result = mock_instance.run(
|
|
57
|
+
elements_filepath=Path(input_filepath),
|
|
58
|
+
file_data=FileData(
|
|
59
|
+
identifier="test",
|
|
60
|
+
connector_type="test",
|
|
61
|
+
source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
|
|
62
|
+
),
|
|
63
|
+
output_dir=output_dir,
|
|
64
|
+
output_filename=output_filename,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Assert
|
|
68
|
+
mock_get_data.assert_called_once_with(path=Path(input_filepath))
|
|
69
|
+
assert mock_conform_dict.call_count == 2
|
|
70
|
+
mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
|
|
71
|
+
mock_write_output.assert_called_once_with(
|
|
72
|
+
path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
|
|
73
|
+
)
|
|
74
|
+
assert result.name == expected
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.5.1" # pragma: no cover
|
|
@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
|
|
|
226
226
|
)
|
|
227
227
|
|
|
228
228
|
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
|
|
229
|
-
elif self.provider == "
|
|
229
|
+
elif self.provider == "bedrock":
|
|
230
230
|
from unstructured_ingest.embed.bedrock import (
|
|
231
231
|
BedrockEmbeddingConfig,
|
|
232
232
|
BedrockEmbeddingEncoder,
|
|
@@ -268,6 +268,7 @@ class Pipeline:
|
|
|
268
268
|
|
|
269
269
|
# Partition content
|
|
270
270
|
elements = self.partitioner_step(downloaded_data)
|
|
271
|
+
elements = self.clean_results(results=elements)
|
|
271
272
|
# Download data no longer needed, delete if possible
|
|
272
273
|
self.downloader_step.delete_cache()
|
|
273
274
|
elements = self.clean_results(results=elements)
|
|
@@ -81,6 +81,8 @@ class BaseDuckDBUploadStager(UploadStager):
|
|
|
81
81
|
**kwargs: Any,
|
|
82
82
|
) -> Path:
|
|
83
83
|
elements_contents = get_data(path=elements_filepath)
|
|
84
|
+
output_filename_suffix = Path(elements_filepath).suffix
|
|
85
|
+
output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
|
|
84
86
|
output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
|
|
85
87
|
|
|
86
88
|
output = [
|
|
@@ -61,7 +61,7 @@ class MotherDuckConnectionConfig(ConnectionConfig):
|
|
|
61
61
|
"custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
|
|
62
62
|
},
|
|
63
63
|
) as conn:
|
|
64
|
-
conn.sql(f
|
|
64
|
+
conn.sql(f'USE "{self.database}"')
|
|
65
65
|
yield conn
|
|
66
66
|
|
|
67
67
|
@contextmanager
|
|
@@ -102,11 +102,12 @@ class MotherDuckUploader(Uploader):
|
|
|
102
102
|
|
|
103
103
|
def upload_dataframe(self, df: pd.DataFrame) -> None:
|
|
104
104
|
logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
|
|
105
|
+
database = self.connection_config.database
|
|
106
|
+
db_schema = self.connection_config.db_schema
|
|
107
|
+
table = self.connection_config.table
|
|
105
108
|
|
|
106
109
|
with self.connection_config.get_client() as conn:
|
|
107
|
-
conn.query(
|
|
108
|
-
f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df" # noqa: E501
|
|
109
|
-
)
|
|
110
|
+
conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
|
|
110
111
|
|
|
111
112
|
def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
|
|
112
113
|
df = pd.DataFrame(data=data)
|
|
@@ -310,20 +310,22 @@ class GoogleDriveDownloader(Downloader):
|
|
|
310
310
|
from googleapiclient.http import MediaIoBaseDownload
|
|
311
311
|
|
|
312
312
|
logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
|
|
313
|
-
mime_type = file_data.additional_metadata["mimeType"]
|
|
314
313
|
record_id = file_data.identifier
|
|
314
|
+
mime_type = file_data.additional_metadata["mimeType"]
|
|
315
|
+
if not mime_type:
|
|
316
|
+
raise TypeError(
|
|
317
|
+
f"File not supported. Name: {file_data.source_identifiers.filename} "
|
|
318
|
+
f"ID: {record_id} "
|
|
319
|
+
f"MimeType: {mime_type}"
|
|
320
|
+
)
|
|
315
321
|
with self.connection_config.get_client() as client:
|
|
316
|
-
if
|
|
322
|
+
if (
|
|
323
|
+
mime_type.startswith("application/vnd.google-apps")
|
|
324
|
+
and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
|
|
325
|
+
):
|
|
317
326
|
export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
|
|
318
|
-
|
|
327
|
+
mime_type, # type: ignore
|
|
319
328
|
)
|
|
320
|
-
if not export_mime:
|
|
321
|
-
raise TypeError(
|
|
322
|
-
f"File not supported. Name: {file_data.source_identifiers.filename} "
|
|
323
|
-
f"ID: {record_id} "
|
|
324
|
-
f"MimeType: {mime_type}"
|
|
325
|
-
)
|
|
326
|
-
|
|
327
329
|
request = client.export_media(
|
|
328
330
|
fileId=record_id,
|
|
329
331
|
mimeType=export_mime,
|
|
@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
|
|
|
105
105
|
class OnedriveIndexer(Indexer):
|
|
106
106
|
connection_config: OnedriveConnectionConfig
|
|
107
107
|
index_config: OnedriveIndexerConfig
|
|
108
|
+
connector_type: str = CONNECTOR_TYPE
|
|
108
109
|
|
|
109
110
|
def precheck(self) -> None:
|
|
110
111
|
try:
|
|
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
|
|
|
172
173
|
)
|
|
173
174
|
return FileData(
|
|
174
175
|
identifier=drive_item.id,
|
|
175
|
-
connector_type=
|
|
176
|
+
connector_type=self.connector_type,
|
|
176
177
|
source_identifiers=SourceIdentifiers(
|
|
177
178
|
fullpath=server_path, filename=drive_item.name, rel_path=rel_path
|
|
178
179
|
),
|
|
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
|
|
|
201
202
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
202
203
|
if "error" in token_resp:
|
|
203
204
|
raise SourceConnectionError(
|
|
204
|
-
f"[{
|
|
205
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
206
|
+
f"({token_resp.get('error_description')})"
|
|
205
207
|
)
|
|
206
208
|
|
|
207
209
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
|
|
|
221
223
|
class OnedriveDownloader(Downloader):
|
|
222
224
|
connection_config: OnedriveConnectionConfig
|
|
223
225
|
download_config: OnedriveDownloaderConfig
|
|
226
|
+
connector_type: str = CONNECTOR_TYPE
|
|
224
227
|
|
|
225
228
|
@SourceConnectionNetworkError.wrap
|
|
226
229
|
def _fetch_file(self, file_data: FileData) -> DriveItem:
|
|
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
|
|
|
260
263
|
file.download_session(f).execute_query()
|
|
261
264
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
262
265
|
except Exception as e:
|
|
263
|
-
logger.error(
|
|
266
|
+
logger.error(
|
|
267
|
+
f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
|
|
268
|
+
)
|
|
264
269
|
# Re-raise to see full stack trace locally
|
|
265
270
|
raise
|
|
266
271
|
|
|
@@ -56,6 +56,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
|
56
56
|
class SharepointIndexer(OnedriveIndexer):
|
|
57
57
|
connection_config: SharepointConnectionConfig
|
|
58
58
|
index_config: SharepointIndexerConfig
|
|
59
|
+
connector_type: str = CONNECTOR_TYPE
|
|
59
60
|
|
|
60
61
|
@requires_dependencies(["office365"], extras="sharepoint")
|
|
61
62
|
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
|
|
@@ -64,7 +65,8 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
64
65
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
65
66
|
if "error" in token_resp:
|
|
66
67
|
raise SourceConnectionError(
|
|
67
|
-
f"[{
|
|
68
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
69
|
+
f"({token_resp.get('error_description')})"
|
|
68
70
|
)
|
|
69
71
|
|
|
70
72
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
@@ -90,6 +92,7 @@ class SharepointDownloaderConfig(OnedriveDownloaderConfig):
|
|
|
90
92
|
class SharepointDownloader(OnedriveDownloader):
|
|
91
93
|
connection_config: SharepointConnectionConfig
|
|
92
94
|
download_config: SharepointDownloaderConfig
|
|
95
|
+
connector_type: str = CONNECTOR_TYPE
|
|
93
96
|
|
|
94
97
|
@SourceConnectionNetworkError.wrap
|
|
95
98
|
@requires_dependencies(["office365"], extras="onedrive")
|
|
@@ -38,48 +38,6 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
38
38
|
from unstructured_ingest.v2.logger import logger
|
|
39
39
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
40
40
|
|
|
41
|
-
_COLUMNS = (
|
|
42
|
-
"id",
|
|
43
|
-
"element_id",
|
|
44
|
-
"text",
|
|
45
|
-
"embeddings",
|
|
46
|
-
"type",
|
|
47
|
-
"system",
|
|
48
|
-
"layout_width",
|
|
49
|
-
"layout_height",
|
|
50
|
-
"points",
|
|
51
|
-
"url",
|
|
52
|
-
"version",
|
|
53
|
-
"date_created",
|
|
54
|
-
"date_modified",
|
|
55
|
-
"date_processed",
|
|
56
|
-
"permissions_data",
|
|
57
|
-
"record_locator",
|
|
58
|
-
"category_depth",
|
|
59
|
-
"parent_id",
|
|
60
|
-
"attached_filename",
|
|
61
|
-
"filetype",
|
|
62
|
-
"last_modified",
|
|
63
|
-
"file_directory",
|
|
64
|
-
"filename",
|
|
65
|
-
"languages",
|
|
66
|
-
"page_number",
|
|
67
|
-
"links",
|
|
68
|
-
"page_name",
|
|
69
|
-
"link_urls",
|
|
70
|
-
"link_texts",
|
|
71
|
-
"sent_from",
|
|
72
|
-
"sent_to",
|
|
73
|
-
"subject",
|
|
74
|
-
"section",
|
|
75
|
-
"header_footer_type",
|
|
76
|
-
"emphasized_text_contents",
|
|
77
|
-
"emphasized_text_tags",
|
|
78
|
-
"text_as_html",
|
|
79
|
-
"regex_metadata",
|
|
80
|
-
"detection_class_prob",
|
|
81
|
-
)
|
|
82
|
-
|
|
83
41
|
_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
|
|
84
42
|
|
|
85
43
|
|
|
@@ -270,10 +228,8 @@ class SQLUploadStager(UploadStager):
|
|
|
270
228
|
|
|
271
229
|
data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
|
|
272
230
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
element[RECORD_ID_LABEL] = file_data.identifier
|
|
276
|
-
return element
|
|
231
|
+
data[RECORD_ID_LABEL] = file_data.identifier
|
|
232
|
+
return data
|
|
277
233
|
|
|
278
234
|
def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
279
235
|
for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
|
|
@@ -375,7 +331,7 @@ class SQLUploader(Uploader):
|
|
|
375
331
|
missing_columns = schema_fields - columns
|
|
376
332
|
|
|
377
333
|
if columns_to_drop:
|
|
378
|
-
logger.
|
|
334
|
+
logger.info(
|
|
379
335
|
"Following columns will be dropped to match the table's schema: "
|
|
380
336
|
f"{', '.join(columns_to_drop)}"
|
|
381
337
|
)
|
|
@@ -19,7 +19,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
19
19
|
SourceRegistryEntry,
|
|
20
20
|
)
|
|
21
21
|
from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
22
|
-
_COLUMNS,
|
|
23
22
|
SQLAccessConfig,
|
|
24
23
|
SqlBatchFileData,
|
|
25
24
|
SQLConnectionConfig,
|
|
@@ -149,13 +148,11 @@ class VastdbUploadStagerConfig(SQLUploadStagerConfig):
|
|
|
149
148
|
default=None,
|
|
150
149
|
description="Map of column names to rename, ex: {'old_name': 'new_name'}",
|
|
151
150
|
)
|
|
152
|
-
additional_columns: Optional[list[str]] = Field(
|
|
153
|
-
default_factory=list, description="Additional columns to include in the upload"
|
|
154
|
-
)
|
|
155
151
|
|
|
156
152
|
|
|
153
|
+
@dataclass
|
|
157
154
|
class VastdbUploadStager(SQLUploadStager):
|
|
158
|
-
upload_stager_config: VastdbUploadStagerConfig
|
|
155
|
+
upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)
|
|
159
156
|
|
|
160
157
|
def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
|
|
161
158
|
data = element_dict.copy()
|
|
@@ -168,13 +165,8 @@ class VastdbUploadStager(SQLUploadStager):
|
|
|
168
165
|
data.update(coordinates)
|
|
169
166
|
|
|
170
167
|
data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
# but also allow for additional columns
|
|
174
|
-
approved_columns = set(_COLUMNS).union(self.upload_stager_config.additional_columns)
|
|
175
|
-
element = {k: v for k, v in data.items() if k in approved_columns}
|
|
176
|
-
element[RECORD_ID_LABEL] = file_data.identifier
|
|
177
|
-
return element
|
|
168
|
+
data[RECORD_ID_LABEL] = file_data.identifier
|
|
169
|
+
return data
|
|
178
170
|
|
|
179
171
|
def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
180
172
|
df = super().conform_dataframe(df=df)
|
|
@@ -18,7 +18,7 @@ class EmbedderConfig(BaseModel):
|
|
|
18
18
|
"openai",
|
|
19
19
|
"azure-openai",
|
|
20
20
|
"huggingface",
|
|
21
|
-
"
|
|
21
|
+
"bedrock",
|
|
22
22
|
"vertexai",
|
|
23
23
|
"voyageai",
|
|
24
24
|
"octoai",
|
|
@@ -162,7 +162,7 @@ class EmbedderConfig(BaseModel):
|
|
|
162
162
|
if self.embedding_provider == "octoai":
|
|
163
163
|
return self.get_octoai_embedder(embedding_kwargs=kwargs)
|
|
164
164
|
|
|
165
|
-
if self.embedding_provider == "
|
|
165
|
+
if self.embedding_provider == "bedrock":
|
|
166
166
|
return self.get_bedrock_embedder()
|
|
167
167
|
|
|
168
168
|
if self.embedding_provider == "vertexai":
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from abc import ABC
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field, SecretStr
|
|
|
7
8
|
|
|
8
9
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
9
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import UserError
|
|
10
12
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
13
|
from unstructured_ingest.v2.logger import logger
|
|
12
14
|
from unstructured_ingest.v2.unstructured_api import call_api_async
|
|
@@ -73,6 +75,9 @@ class PartitionerConfig(BaseModel):
|
|
|
73
75
|
hi_res_model_name: Optional[str] = Field(
|
|
74
76
|
default=None, description="Model name for hi-res strategy."
|
|
75
77
|
)
|
|
78
|
+
raise_unsupported_filetype: bool = Field(
|
|
79
|
+
default=False, description="Raise an error if the file type is not supported"
|
|
80
|
+
)
|
|
76
81
|
|
|
77
82
|
def model_post_init(self, __context: Any) -> None:
|
|
78
83
|
if self.metadata_exclude and self.metadata_include:
|
|
@@ -151,13 +156,25 @@ class Partitioner(BaseProcess, ABC):
|
|
|
151
156
|
class FileDataSourceMetadata(DataSourceMetadata):
|
|
152
157
|
filesize_bytes: Optional[int] = None
|
|
153
158
|
|
|
159
|
+
metadata = metadata or {}
|
|
154
160
|
logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
|
|
155
161
|
logger.debug(f"partitioning file {filename} with metadata {metadata}")
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
162
|
+
try:
|
|
163
|
+
elements = partition(
|
|
164
|
+
filename=str(filename.resolve()),
|
|
165
|
+
data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
|
|
166
|
+
**self.config.to_partition_kwargs(),
|
|
167
|
+
)
|
|
168
|
+
except ValueError as sdk_error:
|
|
169
|
+
if (
|
|
170
|
+
self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
|
|
171
|
+
and not self.config.raise_unsupported_filetype
|
|
172
|
+
):
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
175
|
+
)
|
|
176
|
+
return []
|
|
177
|
+
raise sdk_error
|
|
161
178
|
return self.postprocess(elements=elements_to_dicts(elements))
|
|
162
179
|
|
|
163
180
|
@requires_dependencies(dependencies=["unstructured_client"], extras="remote")
|
|
@@ -179,10 +196,37 @@ class Partitioner(BaseProcess, ABC):
|
|
|
179
196
|
element["metadata"]["data_source"] = metadata
|
|
180
197
|
return self.postprocess(elements=elements)
|
|
181
198
|
|
|
199
|
+
def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
    """Tell whether a local-partition ValueError means "unsupported file type".

    The decision is made purely by substring-matching the error's first
    argument against the known messages listed below.

    Args:
        sdk_error: The ValueError caught while partitioning a file locally.

    Returns:
        True when the message contains one of the known markers, else False.
        Never raises: an error with no args, or with a non-string first arg,
        is simply treated as "not an unsupported-filetype error" so the
        caller can re-raise the original exception untouched.
    """
    # This runs inside an ``except ValueError`` handler; indexing ``args[0]``
    # blindly (or using ``in`` on a non-string) would replace the real error
    # with an IndexError/TypeError.
    error_msg = str(sdk_error.args[0]) if sdk_error.args else ""
    markers = (
        "Invalid file",
        "Unstructured schema",
        "fast strategy is not available for image files",
    )
    return any(marker in error_msg for marker in markers)
|
|
206
|
+
|
|
207
|
+
def is_client_error_unsupported_filetype(self, error: "UserError") -> bool:
    """Tell whether an API ``UserError`` means "unsupported file type".

    The error's first argument is expected to be a JSON document with a
    ``detail`` field; that field is matched against the known
    unsupported-filetype messages.

    Args:
        error: The UserError raised by the remote partition call.

    Returns:
        True when ``detail`` is a string that either contains
        "fast strategy is not available for image files" or (case-insensitively)
        both "file type" and "is not supported". False otherwise, including
        when the message is missing, is not valid JSON, has no ``detail``
        key, or ``detail`` is not a string.
    """
    error_msg = error.args[0] if error.args else ""
    try:
        error_dict = json.loads(error_msg)
    except (TypeError, ValueError):
        # Not a JSON payload (json.JSONDecodeError is a ValueError). Report
        # "no match" instead of raising, so the caller's ``except UserError``
        # handler can re-raise the original error rather than a parsing one.
        return False

    details = error_dict.get("detail") if isinstance(error_dict, dict) else None
    if not isinstance(details, str):
        # NOTE(review): presumably ``detail`` can also be a structured
        # (list/dict) payload for other error kinds - confirm against the API.
        return False

    lowered = details.lower()
    return "fast strategy is not available for image files" in details or (
        "file type" in lowered and "is not supported" in lowered
    )
|
|
214
|
+
|
|
182
215
|
def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
|
|
183
216
|
return self.partition_locally(filename, metadata=metadata, **kwargs)
|
|
184
217
|
|
|
185
218
|
async def run_async(
|
|
186
219
|
self, filename: Path, metadata: Optional[dict] = None, **kwargs
|
|
187
220
|
) -> list[dict]:
|
|
188
|
-
|
|
221
|
+
try:
|
|
222
|
+
return await self.partition_via_api(filename, metadata=metadata, **kwargs)
|
|
223
|
+
except UserError as user_error:
|
|
224
|
+
if (
|
|
225
|
+
self.is_client_error_unsupported_filetype(error=user_error)
|
|
226
|
+
and not self.config.raise_unsupported_filetype
|
|
227
|
+
):
|
|
228
|
+
logger.warning(
|
|
229
|
+
f"Unsupported file type for strategy {self.config.strategy}: {filename}"
|
|
230
|
+
)
|
|
231
|
+
return []
|
|
232
|
+
raise user_error
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: pandas
|
|
26
|
-
Requires-Dist: pydantic>=2.7
|
|
27
25
|
Requires-Dist: dataclasses-json
|
|
28
|
-
Requires-Dist:
|
|
26
|
+
Requires-Dist: click
|
|
29
27
|
Requires-Dist: opentelemetry-sdk
|
|
28
|
+
Requires-Dist: pydantic>=2.7
|
|
29
|
+
Requires-Dist: python-dateutil
|
|
30
|
+
Requires-Dist: pandas
|
|
30
31
|
Requires-Dist: tqdm
|
|
31
|
-
Requires-Dist: click
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
38
37
|
Requires-Dist: adlfs; extra == "azure"
|
|
38
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-ai-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
42
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
43
42
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
43
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
44
44
|
Provides-Extra: biomed
|
|
45
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
46
|
Requires-Dist: bs4; extra == "biomed"
|
|
47
47
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
48
|
Requires-Dist: boxfs; extra == "box"
|
|
49
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
50
|
Provides-Extra: chroma
|
|
51
51
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
52
|
Provides-Extra: clarifai
|
|
53
53
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
54
54
|
Provides-Extra: confluence
|
|
55
|
-
Requires-Dist: requests; extra == "confluence"
|
|
56
55
|
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
56
|
+
Requires-Dist: requests; extra == "confluence"
|
|
57
57
|
Provides-Extra: couchbase
|
|
58
58
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
59
59
|
Provides-Extra: csv
|
|
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
|
83
83
|
Provides-Extra: embed-mixedbreadai
|
|
84
84
|
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
85
85
|
Provides-Extra: embed-octoai
|
|
86
|
-
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
87
86
|
Requires-Dist: openai; extra == "embed-octoai"
|
|
87
|
+
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
88
88
|
Provides-Extra: embed-vertexai
|
|
89
89
|
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
90
90
|
Provides-Extra: embed-voyageai
|
|
@@ -92,9 +92,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
92
92
|
Provides-Extra: epub
|
|
93
93
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
94
94
|
Provides-Extra: gcs
|
|
95
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
96
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
97
95
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
96
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
97
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
98
98
|
Provides-Extra: github
|
|
99
99
|
Requires-Dist: requests; extra == "github"
|
|
100
100
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
@@ -103,8 +103,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
|
|
|
103
103
|
Provides-Extra: google-drive
|
|
104
104
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
105
105
|
Provides-Extra: hubspot
|
|
106
|
-
Requires-Dist: urllib3; extra == "hubspot"
|
|
107
106
|
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
107
|
+
Requires-Dist: urllib3; extra == "hubspot"
|
|
108
108
|
Provides-Extra: jira
|
|
109
109
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
110
110
|
Provides-Extra: kafka
|
|
@@ -122,30 +122,30 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
122
122
|
Provides-Extra: msg
|
|
123
123
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
124
124
|
Provides-Extra: neo4j
|
|
125
|
-
Requires-Dist: networkx; extra == "neo4j"
|
|
126
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
127
125
|
Requires-Dist: neo4j; extra == "neo4j"
|
|
126
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
127
|
+
Requires-Dist: networkx; extra == "neo4j"
|
|
128
128
|
Provides-Extra: notion
|
|
129
|
-
Requires-Dist:
|
|
129
|
+
Requires-Dist: backoff; extra == "notion"
|
|
130
130
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
131
|
+
Requires-Dist: httpx; extra == "notion"
|
|
131
132
|
Requires-Dist: notion-client; extra == "notion"
|
|
132
|
-
Requires-Dist: backoff; extra == "notion"
|
|
133
133
|
Provides-Extra: odt
|
|
134
134
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
137
136
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
138
138
|
Requires-Dist: bs4; extra == "onedrive"
|
|
139
139
|
Provides-Extra: openai
|
|
140
|
-
Requires-Dist: tiktoken; extra == "openai"
|
|
141
140
|
Requires-Dist: openai; extra == "openai"
|
|
141
|
+
Requires-Dist: tiktoken; extra == "openai"
|
|
142
142
|
Provides-Extra: opensearch
|
|
143
143
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
144
144
|
Provides-Extra: org
|
|
145
145
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
146
146
|
Provides-Extra: outlook
|
|
147
|
-
Requires-Dist: msal; extra == "outlook"
|
|
148
147
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
148
|
+
Requires-Dist: msal; extra == "outlook"
|
|
149
149
|
Provides-Extra: pdf
|
|
150
150
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
151
151
|
Provides-Extra: pinecone
|
|
@@ -177,8 +177,8 @@ Provides-Extra: sftp
|
|
|
177
177
|
Requires-Dist: paramiko; extra == "sftp"
|
|
178
178
|
Requires-Dist: fsspec; extra == "sftp"
|
|
179
179
|
Provides-Extra: sharepoint
|
|
180
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
181
180
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
181
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
182
182
|
Provides-Extra: singlestore
|
|
183
183
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
184
184
|
Provides-Extra: slack
|
|
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
|
|
|
191
191
|
Provides-Extra: tsv
|
|
192
192
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
193
193
|
Provides-Extra: vastdb
|
|
194
|
-
Requires-Dist: vastdb; extra == "vastdb"
|
|
195
194
|
Requires-Dist: pyarrow; extra == "vastdb"
|
|
195
|
+
Requires-Dist: vastdb; extra == "vastdb"
|
|
196
196
|
Requires-Dist: ibis; extra == "vastdb"
|
|
197
197
|
Provides-Extra: vectara
|
|
198
|
-
Requires-Dist: aiofiles; extra == "vectara"
|
|
199
198
|
Requires-Dist: httpx; extra == "vectara"
|
|
200
199
|
Requires-Dist: requests; extra == "vectara"
|
|
200
|
+
Requires-Dist: aiofiles; extra == "vectara"
|
|
201
201
|
Provides-Extra: weaviate
|
|
202
202
|
Requires-Dist: weaviate-client; extra == "weaviate"
|
|
203
203
|
Provides-Extra: wikipedia
|
|
@@ -10,16 +10,18 @@ test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNp
|
|
|
10
10
|
test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
|
|
11
11
|
test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
|
|
12
12
|
test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
|
|
13
|
+
test/integration/connectors/test_google_drive.py,sha256=0zJZ4UJOq4TkfU-bkc556_abV7q6zVS9ZgIvW9qcTU4,4204
|
|
13
14
|
test/integration/connectors/test_lancedb.py,sha256=8MBxK_CUtOt87-4B7svDDK82NFII5psceo5cNN8HJMs,9228
|
|
14
15
|
test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8DgGzlpyevsFu2w,7173
|
|
15
16
|
test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
|
|
16
17
|
test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
|
|
17
18
|
test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
|
|
18
|
-
test/integration/connectors/test_onedrive.py,sha256=
|
|
19
|
+
test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
|
|
19
20
|
test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
|
|
20
21
|
test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
|
|
21
22
|
test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
|
|
22
23
|
test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
|
|
24
|
+
test/integration/connectors/test_sharepoint.py,sha256=8HlcnrP4K8oPUzef6AA11P2cMlxSp7tiddTkT4JOeRU,2378
|
|
23
25
|
test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
|
|
24
26
|
test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
27
|
test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
|
|
@@ -39,6 +41,7 @@ test/integration/connectors/sql/test_postgres.py,sha256=bGDyzLRpgrXO7nl0U8nF2zSN
|
|
|
39
41
|
test/integration/connectors/sql/test_singlestore.py,sha256=XeU2s4Kt_3tGyaDYYKTgYjdOyb8j2dnz4TgSMwFUjWs,6153
|
|
40
42
|
test/integration/connectors/sql/test_snowflake.py,sha256=LEwsRDoC6-rRiwYsqeo5B9Eo6RYygLLGAUsrtrgI9pM,7494
|
|
41
43
|
test/integration/connectors/sql/test_sqlite.py,sha256=MHvhFRx1y_LTgfS-aPz-Zn9yOGsm-TF_s0t1seBzV1k,5956
|
|
44
|
+
test/integration/connectors/sql/test_vastdb.py,sha256=66T-o_y7NaDKGmKFkT778AB-nanlLv9KdtgUGPOdnLs,1069
|
|
42
45
|
test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
46
|
test/integration/connectors/utils/constants.py,sha256=JhTk6YNw7JVpkk-Pl8zn2YYkExeL1oE9VBWm_kMYGfo,369
|
|
44
47
|
test/integration/connectors/utils/docker.py,sha256=4g1STiSbYN5qcmDTXyPxVJgwx97O6wk7n-DJ-zgzgag,4971
|
|
@@ -46,7 +49,7 @@ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQ
|
|
|
46
49
|
test/integration/connectors/utils/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
50
|
test/integration/connectors/utils/validation/destination.py,sha256=ZvMSvqz9in35xaoUJGx9rG8oWCU3FYlfLLQ6sfdI0pw,2649
|
|
48
51
|
test/integration/connectors/utils/validation/equality.py,sha256=R6d_1c-Si5518WJcBcshF_wBRnywnZ0ORQ-NL0xNmGo,2602
|
|
49
|
-
test/integration/connectors/utils/validation/source.py,sha256=
|
|
52
|
+
test/integration/connectors/utils/validation/source.py,sha256=xnAZI26ILdeMhgrWAGrU2N2fqK58YNGkfyUhJekZ0Ho,13541
|
|
50
53
|
test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_JjB4w2Sf8aQPvDVT5vZTs05ILs,1428
|
|
51
54
|
test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
52
55
|
test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
|
|
@@ -55,7 +58,7 @@ test/integration/connectors/weaviate/test_local.py,sha256=gXMpnzVcrNQdptDjx0haPW
|
|
|
55
58
|
test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
59
|
test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
|
|
57
60
|
test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
|
|
58
|
-
test/integration/embedders/test_bedrock.py,sha256=
|
|
61
|
+
test/integration/embedders/test_bedrock.py,sha256=vmjoi1uUk-LX4Yz0ZPn6Ry1JdVEsyIhLhPbSPmkeT9o,3553
|
|
59
62
|
test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
|
|
60
63
|
test/integration/embedders/test_mixedbread.py,sha256=lLz_cooyC38VSo-FMHbhKpHvYs3QzA20NOIvM5oooaw,1998
|
|
61
64
|
test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
|
|
@@ -65,7 +68,7 @@ test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gT
|
|
|
65
68
|
test/integration/embedders/test_voyageai.py,sha256=Gm3sVjhsym1ASIDfr-sZoCbpsNMaAk_l4E3-dtjRCQ4,1832
|
|
66
69
|
test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
|
|
67
70
|
test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
-
test/integration/partitioners/test_partitioner.py,sha256=
|
|
71
|
+
test/integration/partitioners/test_partitioner.py,sha256=6sdZhhtqEICBPqEgpKrCQIfJ-7hKcwuTFqjWs1mbQf8,2787
|
|
69
72
|
test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
70
73
|
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
71
74
|
test/unit/test_html.py,sha256=LKGi_QaH4U4gktrbd2NcURL-d-0Rm1UnG5Y6r9EvTG0,4489
|
|
@@ -86,6 +89,8 @@ test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
86
89
|
test/unit/v2/connectors/test_confluence.py,sha256=bXrn_kRb4IQdqkk4rc-P2gJAtPba7n7pNplQgfbqZDY,1047
|
|
87
90
|
test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
88
91
|
test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
|
|
92
|
+
test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
93
|
+
test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
|
|
89
94
|
test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
90
95
|
test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
|
|
91
96
|
test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -102,9 +107,9 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
102
107
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
108
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
104
109
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
105
|
-
unstructured_ingest/__version__.py,sha256=
|
|
110
|
+
unstructured_ingest/__version__.py,sha256=LXdgOM6QWErpDu1oCqJrypfmAkBaXzRxVPcjHL8yPrI,42
|
|
106
111
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
107
|
-
unstructured_ingest/interfaces.py,sha256=
|
|
112
|
+
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
108
113
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
109
114
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
110
115
|
unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
|
|
@@ -112,7 +117,7 @@ unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29
|
|
|
112
117
|
unstructured_ingest/cli/cli.py,sha256=LutBTBYMqboKw8cputHVszpenyfnySzcUC15ifwuYyg,1049
|
|
113
118
|
unstructured_ingest/cli/cmd_factory.py,sha256=UdHm1KacTombpF6DxyTSwTCuApsKHUYw_kVu5Nhcy3Y,364
|
|
114
119
|
unstructured_ingest/cli/common.py,sha256=I0El08FHz5kxw7iz0VWOWPrvcJD1rBgXJSwVIpVmmwU,204
|
|
115
|
-
unstructured_ingest/cli/interfaces.py,sha256=
|
|
120
|
+
unstructured_ingest/cli/interfaces.py,sha256=pvEwNfYwINx3-TQ0LPudjpYNR3PnanUiXpEePPEtRSw,24086
|
|
116
121
|
unstructured_ingest/cli/utils.py,sha256=KNhkFNKOeEihc8HlvMz_MTbYVQNFklrBKbC8xg9h1xE,7982
|
|
117
122
|
unstructured_ingest/cli/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
118
123
|
unstructured_ingest/cli/base/cmd.py,sha256=BbfjA2v203Jh-7DL6bzxQ7fOeNixd5BsBMuzXz6h5IQ,583
|
|
@@ -399,7 +404,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
|
|
|
399
404
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
400
405
|
unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
|
|
401
406
|
unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
|
|
402
|
-
unstructured_ingest/v2/pipeline/pipeline.py,sha256=
|
|
407
|
+
unstructured_ingest/v2/pipeline/pipeline.py,sha256=b37fQGm_lGutQ3Jc0qePB15lkBiFavH9tCso3inm-3I,16564
|
|
403
408
|
unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
404
409
|
unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
|
|
405
410
|
unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
|
|
@@ -413,9 +418,9 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
|
|
|
413
418
|
unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
|
|
414
419
|
unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
|
|
415
420
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
|
|
416
|
-
unstructured_ingest/v2/processes/embedder.py,sha256=
|
|
421
|
+
unstructured_ingest/v2/processes/embedder.py,sha256=PTBlRgNbAXkSaLg7JrZzHwAoqpHmopg8jNU1TmaXguU,7804
|
|
417
422
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
418
|
-
unstructured_ingest/v2/processes/partitioner.py,sha256=
|
|
423
|
+
unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
|
|
419
424
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
420
425
|
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
|
|
421
426
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
|
|
@@ -427,18 +432,18 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVm
|
|
|
427
432
|
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
|
|
428
433
|
unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
|
|
429
434
|
unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
|
|
430
|
-
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=
|
|
435
|
+
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=tSbyibwm9RQyXD-HJGZa1Y9lBSCXaEFnvxpf6bHwBSE,13394
|
|
431
436
|
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
|
|
432
437
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
|
|
433
438
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
|
|
434
439
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
|
|
435
440
|
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
|
|
436
|
-
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=
|
|
441
|
+
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
|
|
437
442
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
438
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
443
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=U5gSa8S08JvCwmAhE8aV0yxGTIFnUlKVsQDybE8Fqb8,10746
|
|
439
444
|
unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
|
|
440
445
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
441
|
-
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=
|
|
446
|
+
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=f0F7KioXgucVc3tVASTa67ynlTa4s9_FKGPHop6Xm0A,4563
|
|
442
447
|
unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
|
|
443
448
|
unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
|
|
444
449
|
unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
|
|
@@ -452,9 +457,9 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8
|
|
|
452
457
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
|
|
453
458
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
|
|
454
459
|
unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
|
|
455
|
-
unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=
|
|
460
|
+
unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
|
|
456
461
|
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=oUHHaLpO2pWW2Lu4Mc-XFjrA0ze97205WQ_xP95ua4M,4296
|
|
457
|
-
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=
|
|
462
|
+
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=OsRy-rcrP4_KSustpxlEKoZ_FmJNFMyMmIfFk6WJ3UY,4559
|
|
458
463
|
unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
|
|
459
464
|
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
|
|
460
465
|
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
|
|
@@ -554,17 +559,17 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
|
|
|
554
559
|
unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
|
|
555
560
|
unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
|
|
556
561
|
unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=QE-WBqrPVjCgcxR5EdVD9iTHBjgDSSSQgWYvq5N61qU,7746
|
|
557
|
-
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
|
|
562
|
+
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
|
|
558
563
|
unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
|
|
559
|
-
unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=
|
|
564
|
+
unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
|
|
560
565
|
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
|
|
561
566
|
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
|
|
562
567
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
563
568
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
564
569
|
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
|
|
565
|
-
unstructured_ingest-0.
|
|
566
|
-
unstructured_ingest-0.
|
|
567
|
-
unstructured_ingest-0.
|
|
568
|
-
unstructured_ingest-0.
|
|
569
|
-
unstructured_ingest-0.
|
|
570
|
-
unstructured_ingest-0.
|
|
570
|
+
unstructured_ingest-0.5.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
571
|
+
unstructured_ingest-0.5.1.dist-info/METADATA,sha256=4fo4K5ac0RNRlWGGyNumZ5gXJf-0PwknZWjS6HvAD6w,8051
|
|
572
|
+
unstructured_ingest-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
573
|
+
unstructured_ingest-0.5.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
574
|
+
unstructured_ingest-0.5.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
575
|
+
unstructured_ingest-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|