unstructured-ingest 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry, and is provided for informational purposes only.

This release has been flagged as potentially problematic.

Files changed (29)
  1. test/integration/connectors/sql/test_vastdb.py +34 -0
  2. test/integration/connectors/test_google_drive.py +116 -0
  3. test/integration/connectors/test_onedrive.py +51 -2
  4. test/integration/connectors/test_sharepoint.py +71 -0
  5. test/integration/connectors/utils/validation/source.py +45 -16
  6. test/integration/embedders/test_bedrock.py +1 -1
  7. test/integration/partitioners/test_partitioner.py +10 -9
  8. test/unit/v2/connectors/motherduck/__init__.py +0 -0
  9. test/unit/v2/connectors/motherduck/test_base.py +74 -0
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/cli/interfaces.py +1 -1
  12. unstructured_ingest/interfaces.py +1 -1
  13. unstructured_ingest/v2/pipeline/pipeline.py +1 -0
  14. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -0
  15. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -4
  16. unstructured_ingest/v2/processes/connectors/google_drive.py +12 -10
  17. unstructured_ingest/v2/processes/connectors/onedrive.py +8 -3
  18. unstructured_ingest/v2/processes/connectors/pinecone.py +1 -0
  19. unstructured_ingest/v2/processes/connectors/sharepoint.py +4 -1
  20. unstructured_ingest/v2/processes/connectors/sql/sql.py +3 -47
  21. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +4 -12
  22. unstructured_ingest/v2/processes/embedder.py +2 -2
  23. unstructured_ingest/v2/processes/partitioner.py +50 -6
  24. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/METADATA +23 -23
  25. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/RECORD +29 -24
  26. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/LICENSE.md +0 -0
  27. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/WHEEL +0 -0
  28. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/entry_points.txt +0 -0
  29. {unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/top_level.txt +0 -0

test/integration/connectors/sql/test_vastdb.py
@@ -0,0 +1,34 @@
+ from pathlib import Path
+
+ import pytest
+ from _pytest.fixtures import TopRequest
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
+ from test.integration.connectors.utils.validation.destination import (
+     StagerValidationConfigs,
+     stager_validation,
+ )
+ from unstructured_ingest.v2.processes.connectors.sql.vastdb import (
+     CONNECTOR_TYPE,
+     VastdbUploadStager,
+     VastdbUploadStagerConfig,
+ )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+ def test_vast_stager(
+     request: TopRequest,
+     upload_file_str: str,
+     tmp_path: Path,
+ ):
+     upload_file: Path = request.getfixturevalue(upload_file_str)
+     stager = VastdbUploadStager(
+         upload_stager_config=VastdbUploadStagerConfig(rename_columns_map={"page_number": "page"})
+     )
+     stager_validation(
+         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+         input_file=upload_file,
+         stager=stager,
+         tmp_dir=tmp_path,
+     )

test/integration/connectors/test_google_drive.py
@@ -0,0 +1,116 @@
+ import os
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import (
+     SOURCE_TAG,
+     UNCATEGORIZED_TAG,
+ )
+ from test.integration.connectors.utils.validation.source import (
+     SourceValidationConfigs,
+     get_all_file_data,
+     run_all_validations,
+     update_fixtures,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import Downloader, Indexer
+ from unstructured_ingest.v2.processes.connectors.google_drive import (
+     CONNECTOR_TYPE,
+     GoogleDriveAccessConfig,
+     GoogleDriveConnectionConfig,
+     GoogleDriveDownloader,
+     GoogleDriveDownloaderConfig,
+     GoogleDriveIndexer,
+     GoogleDriveIndexerConfig,
+ )
+
+
+ @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
+ def test_google_drive_source(temp_dir):
+     # Retrieve environment variables
+     service_account_key = os.environ["GOOGLE_DRIVE_SERVICE_KEY"]
+
+     # Create connection and indexer configurations
+     access_config = GoogleDriveAccessConfig(service_account_key=service_account_key)
+     connection_config = GoogleDriveConnectionConfig(
+         drive_id="1XidSOO76VpZ4m0i3gJN2m1X0Obol3UAi",
+         access_config=access_config,
+     )
+     index_config = GoogleDriveIndexerConfig(recursive=True)
+
+     download_config = GoogleDriveDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = GoogleDriveIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = GoogleDriveDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=SourceValidationConfigs(
+             test_id="google_drive_source",
+             expected_num_files=1,
+             validate_downloaded_files=True,
+         ),
+     )
+
+
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+ def source_connector_validation(
+     indexer: Indexer,
+     downloader: Downloader,
+     configs: SourceValidationConfigs,
+     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+ ) -> None:
+     # Run common validations on the process of running a source connector, supporting dynamic
+     # validators that get passed in along with comparisons on the saved expected values.
+     # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the
+     # expected values with what gets generated by this test.
+     all_predownload_file_data = []
+     all_postdownload_file_data = []
+     indexer.precheck()
+     download_dir = downloader.download_config.download_dir
+     test_output_dir = configs.test_output_dir()
+
+     for file_data in indexer.run():
+         assert file_data
+         predownload_file_data = file_data.model_copy(deep=True)
+         all_predownload_file_data.append(predownload_file_data)
+         resp = downloader.run(file_data=file_data)
+         if isinstance(resp, list):
+             for r in resp:
+                 postdownload_file_data = r["file_data"].model_copy(deep=True)
+                 all_postdownload_file_data.append(postdownload_file_data)
+         else:
+             postdownload_file_data = resp["file_data"].model_copy(deep=True)
+             all_postdownload_file_data.append(postdownload_file_data)
+
+     if not overwrite_fixtures:
+         print("Running validation")
+         run_all_validations(
+             configs=configs,
+             predownload_file_data=all_predownload_file_data,
+             postdownload_file_data=all_postdownload_file_data,
+             download_dir=download_dir,
+             test_output_dir=test_output_dir,
+         )
+     else:
+         print("Running fixtures update")
+         update_fixtures(
+             output_dir=test_output_dir,
+             download_dir=download_dir,
+             all_file_data=get_all_file_data(
+                 all_predownload_file_data=all_predownload_file_data,
+                 all_postdownload_file_data=all_postdownload_file_data,
+             ),
+             save_downloads=configs.validate_downloaded_files,
+             save_filedata=configs.validate_file_data,
+         )

test/integration/connectors/test_onedrive.py
@@ -5,13 +5,25 @@ from pathlib import Path
  import pytest
  from office365.graph_client import GraphClient

- from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
+ from test.integration.connectors.utils.constants import (
+     BLOB_STORAGE_TAG,
+     DESTINATION_TAG,
+     SOURCE_TAG,
+ )
+ from test.integration.connectors.utils.validation.source import (
+     SourceValidationConfigs,
+     source_connector_validation,
+ )
  from test.integration.utils import requires_env
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.processes.connectors.onedrive import (
      CONNECTOR_TYPE,
      OnedriveAccessConfig,
      OnedriveConnectionConfig,
+     OnedriveDownloader,
+     OnedriveDownloaderConfig,
+     OnedriveIndexer,
+     OnedriveIndexerConfig,
      OnedriveUploader,
      OnedriveUploaderConfig,
  )
@@ -62,9 +74,46 @@ def get_connection_config():
      return connection_config


+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+ @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+ async def test_onedrive_source(temp_dir):
+     connection_config = get_connection_config()
+     index_config = OnedriveIndexerConfig(recursive=True, path="eml")
+
+     download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = OnedriveIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = OnedriveDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=SourceValidationConfigs(
+             test_id="onedrive",
+             expected_num_files=1,
+             validate_downloaded_files=True,
+             exclude_fields_extend=[
+                 "metadata.date_created",
+                 "metadata.date_modified",
+                 "additional_metadata.LastModified",
+                 "additional_metadata.@microsoft.graph.downloadUrl",
+             ],
+         ),
+     )
+
+
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
  @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
- def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+ def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
      """
      Integration test for the OneDrive destination connector.


test/integration/connectors/test_sharepoint.py
@@ -0,0 +1,71 @@
+ import os
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
+ from test.integration.connectors.utils.validation.source import (
+     SourceValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.connectors.sharepoint import (
+     CONNECTOR_TYPE,
+     SharepointAccessConfig,
+     SharepointConnectionConfig,
+     SharepointDownloader,
+     SharepointDownloaderConfig,
+     SharepointIndexer,
+     SharepointIndexerConfig,
+ )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
+ @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
+ async def test_sharepoint_source(temp_dir):
+     # Retrieve environment variables
+     site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
+     client_id = os.environ["SHAREPOINT_CLIENT_ID"]
+     client_cred = os.environ["SHAREPOINT_CRED"]
+     user_pname = os.environ["MS_USER_PNAME"]
+     tenant = os.environ["MS_TENANT_ID"]
+
+     # Create connection and indexer configurations
+     access_config = SharepointAccessConfig(client_cred=client_cred)
+     connection_config = SharepointConnectionConfig(
+         client_id=client_id,
+         site=site,
+         tenant=tenant,
+         user_pname=user_pname,
+         access_config=access_config,
+     )
+     index_config = SharepointIndexerConfig(recursive=True)
+
+     download_config = SharepointDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = SharepointIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = SharepointDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=SourceValidationConfigs(
+             test_id="sharepoint",
+             expected_num_files=4,
+             validate_downloaded_files=True,
+             exclude_fields_extend=[
+                 "metadata.date_created",
+                 "metadata.date_modified",
+                 "additional_metadata.LastModified",
+                 "additional_metadata.@microsoft.graph.downloadUrl",
+             ],
+         ),
+     )

test/integration/connectors/utils/validation/source.py
@@ -10,6 +10,13 @@ from pydantic import Field
  from test.integration.connectors.utils.validation.utils import ValidationConfig
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer

+ NONSTANDARD_METADATA_FIELDS = {
+     "additional_metadata.@microsoft.graph.downloadUrl": [
+         "additional_metadata",
+         "@microsoft.graph.downloadUrl",
+     ]
+ }
+

  class SourceValidationConfigs(ValidationConfig):
      expected_number_indexed_file_data: Optional[int] = None
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
      def get_exclude_fields(self) -> list[str]:
          exclude_fields = self.exclude_fields
          exclude_fields.extend(self.exclude_fields_extend)
-         return exclude_fields
+         return list(set(exclude_fields))

      def run_file_data_validation(
          self, predownload_file_data: FileData, postdownload_file_data: FileData
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
          exclude_fields = self.get_exclude_fields()
          # Ignore fields that dynamically change every time the tests run
          copied_data = data.copy()
+
          for exclude_field in exclude_fields:
-             exclude_field_vals = exclude_field.split(".")
+             exclude_field_vals = (
+                 NONSTANDARD_METADATA_FIELDS[exclude_field]
+                 if exclude_field in NONSTANDARD_METADATA_FIELDS
+                 else exclude_field.split(".")
+             )
              if len(exclude_field_vals) == 1:
                  current_val = copied_data
                  drop_field = exclude_field_vals[0]
@@ -261,21 +273,38 @@ async def source_connector_validation(
      indexer.precheck()
      download_dir = downloader.download_config.download_dir
      test_output_dir = configs.test_output_dir()
-     for file_data in indexer.run():
-         assert file_data
-         predownload_file_data = file_data.model_copy(deep=True)
-         all_predownload_file_data.append(predownload_file_data)
-         if downloader.is_async():
-             resp = await downloader.run_async(file_data=file_data)
-         else:
-             resp = downloader.run(file_data=file_data)
-         if isinstance(resp, list):
-             for r in resp:
-                 postdownload_file_data = r["file_data"].model_copy(deep=True)
+     if indexer.is_async():
+         async for file_data in indexer.run_async():
+             assert file_data
+             predownload_file_data = file_data.model_copy(deep=True)
+             all_predownload_file_data.append(predownload_file_data)
+             if downloader.is_async():
+                 resp = await downloader.run_async(file_data=file_data)
+             else:
+                 resp = downloader.run(file_data=file_data)
+             if isinstance(resp, list):
+                 for r in resp:
+                     postdownload_file_data = r["file_data"].model_copy(deep=True)
+                     all_postdownload_file_data.append(postdownload_file_data)
+             else:
+                 postdownload_file_data = resp["file_data"].model_copy(deep=True)
+                 all_postdownload_file_data.append(postdownload_file_data)
+     else:
+         for file_data in indexer.run():
+             assert file_data
+             predownload_file_data = file_data.model_copy(deep=True)
+             all_predownload_file_data.append(predownload_file_data)
+             if downloader.is_async():
+                 resp = await downloader.run_async(file_data=file_data)
+             else:
+                 resp = downloader.run(file_data=file_data)
+             if isinstance(resp, list):
+                 for r in resp:
+                     postdownload_file_data = r["file_data"].model_copy(deep=True)
+                     all_postdownload_file_data.append(postdownload_file_data)
+             else:
+                 postdownload_file_data = resp["file_data"].model_copy(deep=True)
                  all_postdownload_file_data.append(postdownload_file_data)
-         else:
-             postdownload_file_data = resp["file_data"].model_copy(deep=True)
-             all_postdownload_file_data.append(postdownload_file_data)
      if not overwrite_fixtures:
          print("Running validation")
          run_all_validations(

test/integration/embedders/test_bedrock.py
@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
  def test_bedrock_embedder(embedder_file: Path):
      aws_credentials = get_aws_credentials()
      embedder_config = EmbedderConfig(
-         embedding_provider="aws-bedrock",
+         embedding_provider="bedrock",
          embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
          embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
      )

test/integration/partitioners/test_partitioner.py
@@ -1,4 +1,3 @@
- import json
  import os
  from pathlib import Path

@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
  non_image_partition_files = [
      path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
  ]
+ supported_fast_partition_files = [
+     path for path in non_image_partition_files if path.suffix != ".eml"
+ ]
  image_partition_files = [
      path for path in all_partition_files if path not in non_image_partition_files
  ]
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
      )
      partitioner = Partitioner(config=partitioner_config)
      results = await partitioner.run_async(filename=partition_file)
-     results_dir = int_test_dir / "results"
-     results_dir.mkdir(exist_ok=True)
-     results_path = results_dir / f"{partition_file.name}.json"
-     with results_path.open("w") as f:
-         json.dump(results, f, indent=2)
      assert results


  @pytest.mark.parametrize(
      "partition_file",
-     non_image_partition_files,
-     ids=[path.name for path in non_image_partition_files],
+     supported_fast_partition_files,
+     ids=[path.name for path in supported_fast_partition_files],
  )
  @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
  @pytest.mark.asyncio
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
      api_key = os.getenv("UNSTRUCTURED_API_KEY")
      api_url = os.getenv("UNSTRUCTURED_API_URL")
      partitioner_config = PartitionerConfig(
-         strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
+         strategy="fast",
+         partition_by_api=True,
+         api_key=api_key,
+         partition_endpoint=api_url,
+         raise_unsupported_filetype=True,
      )
      partitioner = Partitioner(config=partitioner_config)
      with pytest.raises(UserError):

test/unit/v2/connectors/motherduck/__init__.py (empty file, no diff)

test/unit/v2/connectors/motherduck/test_base.py
@@ -0,0 +1,74 @@
+ from pathlib import Path
+
+ import pytest
+ from pytest_mock import MockerFixture
+
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import SourceIdentifiers
+ from unstructured_ingest.v2.interfaces.upload_stager import UploadStagerConfig
+ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+
+
+ @pytest.fixture
+ def mock_instance() -> BaseDuckDBUploadStager:
+     return BaseDuckDBUploadStager(UploadStagerConfig())
+
+
+ @pytest.mark.parametrize(
+     ("input_filepath", "output_filename", "expected"),
+     [
+         (
+             "/path/to/input_file.ndjson",
+             "output_file.ndjson",
+             "output_file.ndjson",
+         ),
+         ("input_file.txt", "output_file.json", "output_file.txt"),
+         ("/path/to/input_file.json", "output_file", "output_file.json"),
+     ],
+ )
+ def test_run_output_filename_suffix(
+     mocker: MockerFixture,
+     mock_instance: BaseDuckDBUploadStager,
+     input_filepath: str,
+     output_filename: str,
+     expected: str,
+ ):
+     output_dir = Path("/tmp/test/output_dir")
+
+     # Mocks
+     mock_get_data = mocker.patch(
+         "unstructured_ingest.v2.processes.connectors.duckdb.base.get_data",
+         return_value=[{"key": "value"}, {"key": "value2"}],
+     )
+     mock_conform_dict = mocker.patch.object(
+         BaseDuckDBUploadStager,
+         "conform_dict",
+         side_effect=lambda element_dict, file_data: element_dict,
+     )
+     mock_get_output_path = mocker.patch.object(
+         BaseDuckDBUploadStager, "get_output_path", return_value=output_dir / expected
+     )
+     mock_write_output = mocker.patch(
+         "unstructured_ingest.v2.processes.connectors.duckdb.base.write_data", return_value=None
+     )
+
+     # Act
+     result = mock_instance.run(
+         elements_filepath=Path(input_filepath),
+         file_data=FileData(
+             identifier="test",
+             connector_type="test",
+             source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
+         ),
+         output_dir=output_dir,
+         output_filename=output_filename,
+     )
+
+     # Assert
+     mock_get_data.assert_called_once_with(path=Path(input_filepath))
+     assert mock_conform_dict.call_count == 2
+     mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
+     mock_write_output.assert_called_once_with(
+         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
+     )
+     assert result.name == expected

unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.4.7" # pragma: no cover
+ __version__ = "0.5.1" # pragma: no cover

unstructured_ingest/cli/interfaces.py
@@ -417,7 +417,7 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
      embed_providers = [
          "openai",
          "huggingface",
-         "aws-bedrock",
+         "bedrock",
          "vertexai",
          "voyageai",
          "octoai",

unstructured_ingest/interfaces.py
@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
          )

          return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
-     elif self.provider == "aws-bedrock":
+     elif self.provider == "bedrock":
          from unstructured_ingest.embed.bedrock import (
              BedrockEmbeddingConfig,
              BedrockEmbeddingEncoder,

unstructured_ingest/v2/pipeline/pipeline.py
@@ -268,6 +268,7 @@ class Pipeline:

      # Partition content
      elements = self.partitioner_step(downloaded_data)
+     elements = self.clean_results(results=elements)
      # Download data no longer needed, delete if possible
      self.downloader_step.delete_cache()
      elements = self.clean_results(results=elements)

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -81,6 +81,8 @@ class BaseDuckDBUploadStager(UploadStager):
          **kwargs: Any,
      ) -> Path:
          elements_contents = get_data(path=elements_filepath)
+         output_filename_suffix = Path(elements_filepath).suffix
+         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
          output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

          output = [
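
With this change the staged output keeps the input file's extension, overriding whatever suffix the requested name carried; the new test/unit/v2/connectors/motherduck/test_base.py above exercises exactly this. A minimal sketch of the renaming logic in isolation:

    from pathlib import Path

    # requested output name and actual staged input (values from the unit test above)
    output_filename = "output_file.json"
    elements_filepath = Path("/path/to/input_file.txt")

    # keep the stem of the requested name, but inherit the input's suffix
    output_filename = f"{Path(output_filename).stem}{elements_filepath.suffix}"
    assert output_filename == "output_file.txt"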

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -61,7 +61,7 @@ class MotherDuckConnectionConfig(ConnectionConfig):
              "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
          },
      ) as conn:
-         conn.sql(f"USE {self.database}")
+         conn.sql(f'USE "{self.database}"')
          yield conn

      @contextmanager
@@ -102,11 +102,12 @@ class MotherDuckUploader(Uploader):

      def upload_dataframe(self, df: pd.DataFrame) -> None:
          logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
+         database = self.connection_config.database
+         db_schema = self.connection_config.db_schema
+         table = self.connection_config.table

          with self.connection_config.get_client() as conn:
-             conn.query(
-                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
-             )
+             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')

      def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
          df = pd.DataFrame(data=data)
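
Both MotherDuck fixes double-quote the database, schema, and table identifiers, which matters when a name contains characters such as dashes that DuckDB's SQL parser would otherwise reject. A minimal sketch of the difference against plain DuckDB (the schema name here is a made-up example, assuming the duckdb package):

    import duckdb

    conn = duckdb.connect()  # in-memory database
    conn.sql('CREATE SCHEMA "my-schema"')  # the hyphen requires double quotes
    conn.sql('CREATE TABLE "my-schema".elements (id INTEGER)')

    # Unquoted, the hyphen is parsed as a minus operator and the query fails:
    # conn.sql("SELECT * FROM my-schema.elements")
    rows = conn.sql('SELECT * FROM "my-schema".elements').fetchall()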

unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -310,20 +310,22 @@ class GoogleDriveDownloader(Downloader):
      from googleapiclient.http import MediaIoBaseDownload

      logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
-     mime_type = file_data.additional_metadata["mimeType"]
      record_id = file_data.identifier
+     mime_type = file_data.additional_metadata["mimeType"]
+     if not mime_type:
+         raise TypeError(
+             f"File not supported. Name: {file_data.source_identifiers.filename} "
+             f"ID: {record_id} "
+             f"MimeType: {mime_type}"
+         )
      with self.connection_config.get_client() as client:
-         if mime_type.startswith("application/vnd.google-apps"):
+         if (
+             mime_type.startswith("application/vnd.google-apps")
+             and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
+         ):
              export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
-                 self.meta.get("mimeType"),  # type: ignore
+                 mime_type,  # type: ignore
              )
-             if not export_mime:
-                 raise TypeError(
-                     f"File not supported. Name: {file_data.source_identifiers.filename} "
-                     f"ID: {record_id} "
-                     f"MimeType: {mime_type}"
-                 )
-
              request = client.export_media(
                  fileId=record_id,
                  mimeType=export_mime,

unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
  class OnedriveIndexer(Indexer):
      connection_config: OnedriveConnectionConfig
      index_config: OnedriveIndexerConfig
+     connector_type: str = CONNECTOR_TYPE

      def precheck(self) -> None:
          try:
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
          )
          return FileData(
              identifier=drive_item.id,
-             connector_type=CONNECTOR_TYPE,
+             connector_type=self.connector_type,
              source_identifiers=SourceIdentifiers(
                  fullpath=server_path, filename=drive_item.name, rel_path=rel_path
              ),
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
          token_resp = await asyncio.to_thread(self.connection_config.get_token)
          if "error" in token_resp:
              raise SourceConnectionError(
-                 f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+                 f"[{self.connector_type}]: {token_resp['error']} "
+                 f"({token_resp.get('error_description')})"
              )

          client = await asyncio.to_thread(self.connection_config.get_client)
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
  class OnedriveDownloader(Downloader):
      connection_config: OnedriveConnectionConfig
      download_config: OnedriveDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE

      @SourceConnectionNetworkError.wrap
      def _fetch_file(self, file_data: FileData) -> DriveItem:
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
              file.download_session(f).execute_query()
          return self.generate_download_response(file_data=file_data, download_path=download_path)
      except Exception as e:
-         logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
+         logger.error(
+             f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
+         )
          # Re-raise to see full stack trace locally
          raise


unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -81,6 +81,7 @@ ALLOWED_FIELDS = (
      "link_urls",
      "link_texts",
      "text_as_html",
+     "entities",
  )


unstructured_ingest/v2/processes/connectors/sharepoint.py
@@ -56,6 +56,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
  class SharepointIndexer(OnedriveIndexer):
      connection_config: SharepointConnectionConfig
      index_config: SharepointIndexerConfig
+     connector_type: str = CONNECTOR_TYPE

      @requires_dependencies(["office365"], extras="sharepoint")
      async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
@@ -64,7 +65,8 @@ class SharepointIndexer(OnedriveIndexer):
          token_resp = await asyncio.to_thread(self.connection_config.get_token)
          if "error" in token_resp:
              raise SourceConnectionError(
-                 f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
+                 f"[{self.connector_type}]: {token_resp['error']} "
+                 f"({token_resp.get('error_description')})"
              )

          client = await asyncio.to_thread(self.connection_config.get_client)
@@ -90,6 +92,7 @@ class SharepointDownloaderConfig(OnedriveDownloaderConfig):
  class SharepointDownloader(OnedriveDownloader):
      connection_config: SharepointConnectionConfig
      download_config: SharepointDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE

      @SourceConnectionNetworkError.wrap
      @requires_dependencies(["office365"], extras="onedrive")

unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -38,48 +38,6 @@ from unstructured_ingest.v2.interfaces import (
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.utils import get_enhanced_element_id

- _COLUMNS = (
-     "id",
-     "element_id",
-     "text",
-     "embeddings",
-     "type",
-     "system",
-     "layout_width",
-     "layout_height",
-     "points",
-     "url",
-     "version",
-     "date_created",
-     "date_modified",
-     "date_processed",
-     "permissions_data",
-     "record_locator",
-     "category_depth",
-     "parent_id",
-     "attached_filename",
-     "filetype",
-     "last_modified",
-     "file_directory",
-     "filename",
-     "languages",
-     "page_number",
-     "links",
-     "page_name",
-     "link_urls",
-     "link_texts",
-     "sent_from",
-     "sent_to",
-     "subject",
-     "section",
-     "header_footer_type",
-     "emphasized_text_contents",
-     "emphasized_text_tags",
-     "text_as_html",
-     "regex_metadata",
-     "detection_class_prob",
- )
-
  _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")

@@ -270,10 +228,8 @@ class SQLUploadStager(UploadStager):

          data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)

-         # remove extraneous, not supported columns
-         element = {k: v for k, v in data.items() if k in _COLUMNS}
-         element[RECORD_ID_LABEL] = file_data.identifier
-         return element
+         data[RECORD_ID_LABEL] = file_data.identifier
+         return data

      def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
          for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
@@ -375,7 +331,7 @@ class SQLUploader(Uploader):
          missing_columns = schema_fields - columns

          if columns_to_drop:
-             logger.warning(
+             logger.info(
                  "Following columns will be dropped to match the table's schema: "
                  f"{', '.join(columns_to_drop)}"
              )

unstructured_ingest/v2/processes/connectors/sql/vastdb.py
@@ -19,7 +19,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
      SourceRegistryEntry,
  )
  from unstructured_ingest.v2.processes.connectors.sql.sql import (
-     _COLUMNS,
      SQLAccessConfig,
      SqlBatchFileData,
      SQLConnectionConfig,
@@ -149,13 +148,11 @@ class VastdbUploadStagerConfig(SQLUploadStagerConfig):
          default=None,
          description="Map of column names to rename, ex: {'old_name': 'new_name'}",
      )
-     additional_columns: Optional[list[str]] = Field(
-         default_factory=list, description="Additional columns to include in the upload"
-     )


+ @dataclass
  class VastdbUploadStager(SQLUploadStager):
-     upload_stager_config: VastdbUploadStagerConfig
+     upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)

      def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
          data = element_dict.copy()
@@ -168,13 +165,8 @@ class VastdbUploadStager(SQLUploadStager):
          data.update(coordinates)

          data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
-
-         # remove extraneous, not supported columns
-         # but also allow for additional columns
-         approved_columns = set(_COLUMNS).union(self.upload_stager_config.additional_columns)
-         element = {k: v for k, v in data.items() if k in approved_columns}
-         element[RECORD_ID_LABEL] = file_data.identifier
-         return element
+         data[RECORD_ID_LABEL] = file_data.identifier
+         return data

      def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
          df = super().conform_dataframe(df=df)

unstructured_ingest/v2/processes/embedder.py
@@ -18,7 +18,7 @@ class EmbedderConfig(BaseModel):
      "openai",
      "azure-openai",
      "huggingface",
-     "aws-bedrock",
+     "bedrock",
      "vertexai",
      "voyageai",
      "octoai",
@@ -162,7 +162,7 @@ class EmbedderConfig(BaseModel):
      if self.embedding_provider == "octoai":
          return self.get_octoai_embedder(embedding_kwargs=kwargs)

-     if self.embedding_provider == "aws-bedrock":
+     if self.embedding_provider == "bedrock":
          return self.get_bedrock_embedder()

      if self.embedding_provider == "vertexai":
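
The provider key rename from "aws-bedrock" to "bedrock" spans the CLI, the legacy interfaces, and the v2 embedder, so existing configurations need the new spelling. A minimal sketch of the v2 usage, mirroring the updated test_bedrock.py above (credential values are placeholders):

    from unstructured_ingest.v2.processes.embedder import EmbedderConfig

    embedder_config = EmbedderConfig(
        embedding_provider="bedrock",  # was "aws-bedrock" in 0.4.x
        embedding_aws_access_key_id="AKIA...",  # placeholder
        embedding_aws_secret_access_key="...",  # placeholder
    )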

unstructured_ingest/v2/processes/partitioner.py
@@ -1,3 +1,4 @@
+ import json
  from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field, SecretStr

  from unstructured_ingest.utils.data_prep import flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.errors import UserError
  from unstructured_ingest.v2.interfaces.process import BaseProcess
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.unstructured_api import call_api_async
@@ -73,6 +75,9 @@ class PartitionerConfig(BaseModel):
      hi_res_model_name: Optional[str] = Field(
          default=None, description="Model name for hi-res strategy."
      )
+     raise_unsupported_filetype: bool = Field(
+         default=False, description="Raise an error if the file type is not supported"
+     )

      def model_post_init(self, __context: Any) -> None:
          if self.metadata_exclude and self.metadata_include:
@@ -151,13 +156,25 @@ class Partitioner(BaseProcess, ABC):
          class FileDataSourceMetadata(DataSourceMetadata):
              filesize_bytes: Optional[int] = None

+         metadata = metadata or {}
          logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
          logger.debug(f"partitioning file {filename} with metadata {metadata}")
-         elements = partition(
-             filename=str(filename.resolve()),
-             data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
-             **self.config.to_partition_kwargs(),
-         )
+         try:
+             elements = partition(
+                 filename=str(filename.resolve()),
+                 data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
+                 **self.config.to_partition_kwargs(),
+             )
+         except ValueError as sdk_error:
+             if (
+                 self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
+                 and not self.config.raise_unsupported_filetype
+             ):
+                 logger.warning(
+                     f"Unsupported file type for strategy {self.config.strategy}: {filename}"
+                 )
+                 return []
+             raise sdk_error
          return self.postprocess(elements=elements_to_dicts(elements))

      @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
@@ -179,10 +196,37 @@ class Partitioner(BaseProcess, ABC):
              element["metadata"]["data_source"] = metadata
          return self.postprocess(elements=elements)

+     def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
+         error_msg = sdk_error.args[0]
+         return (
+             "Invalid file" in error_msg
+             or "Unstructured schema" in error_msg
+             or "fast strategy is not available for image files" in error_msg
+         )
+
+     def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
+         error_msg = error.args[0]
+         error_dict = json.loads(error_msg)
+         details = error_dict["detail"]
+         return "fast strategy is not available for image files" in details or (
+             "file type" in details.lower() and "is not supported" in details.lower()
+         )
+
      def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
          return self.partition_locally(filename, metadata=metadata, **kwargs)

      async def run_async(
          self, filename: Path, metadata: Optional[dict] = None, **kwargs
      ) -> list[dict]:
-         return await self.partition_via_api(filename, metadata=metadata, **kwargs)
+         try:
+             return await self.partition_via_api(filename, metadata=metadata, **kwargs)
+         except UserError as user_error:
+             if (
+                 self.is_client_error_unsupported_filetype(error=user_error)
+                 and not self.config.raise_unsupported_filetype
+             ):
+                 logger.warning(
+                     f"Unsupported file type for strategy {self.config.strategy}: {filename}"
+                 )
+                 return []
+             raise user_error
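
Taken together, the partitioner changes make unsupported file types non-fatal by default: both the local and the API paths catch the "unsupported filetype" errors, log a warning, and return an empty element list unless raise_unsupported_filetype=True is set. A minimal sketch of opting back into strict behavior, mirroring test_partitioner_api_fast_error above (the import path is an assumption based on the package layout, and the env vars are assumed to be set):

    import os

    from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

    partitioner_config = PartitionerConfig(
        strategy="fast",
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        raise_unsupported_filetype=True,  # default False: warn and return []
    )
    partitioner = Partitioner(config=partitioner_config)
    # with strict mode on, awaiting partitioner.run_async(filename=...) on an
    # image under the fast strategy now raises UserError instead of returning []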

{unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: unstructured-ingest
- Version: 0.4.7
+ Version: 0.5.1
  Summary: A library that prepares raw documents for downstream ML tasks.
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
  Author: Unstructured Technologies
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9.0,<3.14
  Description-Content-Type: text/markdown
  License-File: LICENSE.md
- Requires-Dist: pandas
- Requires-Dist: pydantic>=2.7
  Requires-Dist: dataclasses-json
- Requires-Dist: python-dateutil
+ Requires-Dist: click
  Requires-Dist: opentelemetry-sdk
+ Requires-Dist: pydantic>=2.7
+ Requires-Dist: python-dateutil
+ Requires-Dist: pandas
  Requires-Dist: tqdm
- Requires-Dist: click
  Provides-Extra: airtable
  Requires-Dist: pyairtable; extra == "airtable"
  Provides-Extra: astradb
  Requires-Dist: astrapy; extra == "astradb"
  Provides-Extra: azure
- Requires-Dist: fsspec; extra == "azure"
  Requires-Dist: adlfs; extra == "azure"
+ Requires-Dist: fsspec; extra == "azure"
  Provides-Extra: azure-ai-search
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
  Provides-Extra: bedrock
- Requires-Dist: boto3; extra == "bedrock"
  Requires-Dist: aioboto3; extra == "bedrock"
+ Requires-Dist: boto3; extra == "bedrock"
  Provides-Extra: biomed
  Requires-Dist: requests; extra == "biomed"
  Requires-Dist: bs4; extra == "biomed"
  Provides-Extra: box
- Requires-Dist: fsspec; extra == "box"
  Requires-Dist: boxfs; extra == "box"
+ Requires-Dist: fsspec; extra == "box"
  Provides-Extra: chroma
  Requires-Dist: chromadb; extra == "chroma"
  Provides-Extra: clarifai
  Requires-Dist: clarifai; extra == "clarifai"
  Provides-Extra: confluence
- Requires-Dist: requests; extra == "confluence"
  Requires-Dist: atlassian-python-api; extra == "confluence"
+ Requires-Dist: requests; extra == "confluence"
  Provides-Extra: couchbase
  Requires-Dist: couchbase; extra == "couchbase"
  Provides-Extra: csv
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
  Provides-Extra: embed-mixedbreadai
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
  Provides-Extra: embed-octoai
- Requires-Dist: tiktoken; extra == "embed-octoai"
  Requires-Dist: openai; extra == "embed-octoai"
+ Requires-Dist: tiktoken; extra == "embed-octoai"
  Provides-Extra: embed-vertexai
  Requires-Dist: vertexai; extra == "embed-vertexai"
  Provides-Extra: embed-voyageai
@@ -92,9 +92,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
  Provides-Extra: epub
  Requires-Dist: unstructured[epub]; extra == "epub"
  Provides-Extra: gcs
- Requires-Dist: fsspec; extra == "gcs"
- Requires-Dist: bs4; extra == "gcs"
  Requires-Dist: gcsfs; extra == "gcs"
+ Requires-Dist: bs4; extra == "gcs"
+ Requires-Dist: fsspec; extra == "gcs"
  Provides-Extra: github
  Requires-Dist: requests; extra == "github"
  Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -103,8 +103,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
  Provides-Extra: google-drive
  Requires-Dist: google-api-python-client; extra == "google-drive"
  Provides-Extra: hubspot
- Requires-Dist: urllib3; extra == "hubspot"
  Requires-Dist: hubspot-api-client; extra == "hubspot"
+ Requires-Dist: urllib3; extra == "hubspot"
  Provides-Extra: jira
  Requires-Dist: atlassian-python-api; extra == "jira"
  Provides-Extra: kafka
@@ -122,30 +122,30 @@ Requires-Dist: pymongo; extra == "mongodb"
  Provides-Extra: msg
  Requires-Dist: unstructured[msg]; extra == "msg"
  Provides-Extra: neo4j
- Requires-Dist: networkx; extra == "neo4j"
- Requires-Dist: cymple; extra == "neo4j"
  Requires-Dist: neo4j; extra == "neo4j"
+ Requires-Dist: cymple; extra == "neo4j"
+ Requires-Dist: networkx; extra == "neo4j"
  Provides-Extra: notion
- Requires-Dist: httpx; extra == "notion"
+ Requires-Dist: backoff; extra == "notion"
  Requires-Dist: htmlBuilder; extra == "notion"
+ Requires-Dist: httpx; extra == "notion"
  Requires-Dist: notion-client; extra == "notion"
- Requires-Dist: backoff; extra == "notion"
  Provides-Extra: odt
  Requires-Dist: unstructured[odt]; extra == "odt"
  Provides-Extra: onedrive
- Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+ Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: bs4; extra == "onedrive"
  Provides-Extra: openai
- Requires-Dist: tiktoken; extra == "openai"
  Requires-Dist: openai; extra == "openai"
+ Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: opensearch
  Requires-Dist: opensearch-py; extra == "opensearch"
  Provides-Extra: org
  Requires-Dist: unstructured[org]; extra == "org"
  Provides-Extra: outlook
- Requires-Dist: msal; extra == "outlook"
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+ Requires-Dist: msal; extra == "outlook"
  Provides-Extra: pdf
  Requires-Dist: unstructured[pdf]; extra == "pdf"
  Provides-Extra: pinecone
@@ -177,8 +177,8 @@ Provides-Extra: sftp
  Requires-Dist: paramiko; extra == "sftp"
  Requires-Dist: fsspec; extra == "sftp"
  Provides-Extra: sharepoint
- Requires-Dist: msal; extra == "sharepoint"
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+ Requires-Dist: msal; extra == "sharepoint"
  Provides-Extra: singlestore
  Requires-Dist: singlestoredb; extra == "singlestore"
  Provides-Extra: slack
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
  Provides-Extra: tsv
  Requires-Dist: unstructured[tsv]; extra == "tsv"
  Provides-Extra: vastdb
- Requires-Dist: vastdb; extra == "vastdb"
  Requires-Dist: pyarrow; extra == "vastdb"
+ Requires-Dist: vastdb; extra == "vastdb"
  Requires-Dist: ibis; extra == "vastdb"
  Provides-Extra: vectara
- Requires-Dist: aiofiles; extra == "vectara"
  Requires-Dist: httpx; extra == "vectara"
  Requires-Dist: requests; extra == "vectara"
+ Requires-Dist: aiofiles; extra == "vectara"
  Provides-Extra: weaviate
  Requires-Dist: weaviate-client; extra == "weaviate"
  Provides-Extra: wikipedia

{unstructured_ingest-0.4.7.dist-info → unstructured_ingest-0.5.1.dist-info}/RECORD
@@ -10,16 +10,18 @@ test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNp
  test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
  test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
  test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
+ test/integration/connectors/test_google_drive.py,sha256=0zJZ4UJOq4TkfU-bkc556_abV7q6zVS9ZgIvW9qcTU4,4204
  test/integration/connectors/test_lancedb.py,sha256=8MBxK_CUtOt87-4B7svDDK82NFII5psceo5cNN8HJMs,9228
  test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8DgGzlpyevsFu2w,7173
  test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
  test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
  test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
- test/integration/connectors/test_onedrive.py,sha256=rjgN2LhaW1htEMBJPxmlP_kcRB7p_oOeZcogFlHyJH4,3721
+ test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
  test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
  test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
  test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
  test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
+ test/integration/connectors/test_sharepoint.py,sha256=8HlcnrP4K8oPUzef6AA11P2cMlxSp7tiddTkT4JOeRU,2378
  test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
  test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -39,6 +41,7 @@ test/integration/connectors/sql/test_postgres.py,sha256=bGDyzLRpgrXO7nl0U8nF2zSN
  test/integration/connectors/sql/test_singlestore.py,sha256=XeU2s4Kt_3tGyaDYYKTgYjdOyb8j2dnz4TgSMwFUjWs,6153
  test/integration/connectors/sql/test_snowflake.py,sha256=LEwsRDoC6-rRiwYsqeo5B9Eo6RYygLLGAUsrtrgI9pM,7494
  test/integration/connectors/sql/test_sqlite.py,sha256=MHvhFRx1y_LTgfS-aPz-Zn9yOGsm-TF_s0t1seBzV1k,5956
+ test/integration/connectors/sql/test_vastdb.py,sha256=66T-o_y7NaDKGmKFkT778AB-nanlLv9KdtgUGPOdnLs,1069
  test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/connectors/utils/constants.py,sha256=JhTk6YNw7JVpkk-Pl8zn2YYkExeL1oE9VBWm_kMYGfo,369
  test/integration/connectors/utils/docker.py,sha256=4g1STiSbYN5qcmDTXyPxVJgwx97O6wk7n-DJ-zgzgag,4971
@@ -46,7 +49,7 @@ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQ
  test/integration/connectors/utils/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/connectors/utils/validation/destination.py,sha256=ZvMSvqz9in35xaoUJGx9rG8oWCU3FYlfLLQ6sfdI0pw,2649
  test/integration/connectors/utils/validation/equality.py,sha256=R6d_1c-Si5518WJcBcshF_wBRnywnZ0ORQ-NL0xNmGo,2602
- test/integration/connectors/utils/validation/source.py,sha256=VALU5ms_JBu_eFkp2WQ7oZtJKozJ8MZSJ7h7ZA3Fz_Q,12296
+ test/integration/connectors/utils/validation/source.py,sha256=xnAZI26ILdeMhgrWAGrU2N2fqK58YNGkfyUhJekZ0Ho,13541
  test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_JjB4w2Sf8aQPvDVT5vZTs05ILs,1428
  test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
@@ -55,7 +58,7 @@ test/integration/connectors/weaviate/test_local.py,sha256=gXMpnzVcrNQdptDjx0haPW
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
  test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
- test/integration/embedders/test_bedrock.py,sha256=ZehreheLgY9Bqdjk-3MQOaou9IP-H3Pcz7WWiOWAxTU,3557
+ test/integration/embedders/test_bedrock.py,sha256=vmjoi1uUk-LX4Yz0ZPn6Ry1JdVEsyIhLhPbSPmkeT9o,3553
  test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
  test/integration/embedders/test_mixedbread.py,sha256=lLz_cooyC38VSo-FMHbhKpHvYs3QzA20NOIvM5oooaw,1998
  test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
@@ -65,7 +68,7 @@ test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gT
  test/integration/embedders/test_voyageai.py,sha256=Gm3sVjhsym1ASIDfr-sZoCbpsNMaAk_l4E3-dtjRCQ4,1832
  test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
  test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/integration/partitioners/test_partitioner.py,sha256=MEQJbRoc01uPLT6O8CkXeQF_DXK21nz3KVJkzkBtsgM,2835
+ test/integration/partitioners/test_partitioner.py,sha256=6sdZhhtqEICBPqEgpKrCQIfJ-7hKcwuTFqjWs1mbQf8,2787
  test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
  test/unit/test_html.py,sha256=LKGi_QaH4U4gktrbd2NcURL-d-0Rm1UnG5Y6r9EvTG0,4489
@@ -86,6 +89,8 @@ test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
  test/unit/v2/connectors/test_confluence.py,sha256=bXrn_kRb4IQdqkk4rc-P2gJAtPba7n7pNplQgfbqZDY,1047
  test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
+ test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
  test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
  test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -102,9 +107,9 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
- unstructured_ingest/__version__.py,sha256=i2QrUEuUnVPQuTv5hg_JWbhbwm5k6KU4hPIFq0SIgdc,42
+ unstructured_ingest/__version__.py,sha256=LXdgOM6QWErpDu1oCqJrypfmAkBaXzRxVPcjHL8yPrI,42
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
- unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
+ unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
  unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
  unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
@@ -112,7 +117,7 @@ unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29
  unstructured_ingest/cli/cli.py,sha256=LutBTBYMqboKw8cputHVszpenyfnySzcUC15ifwuYyg,1049
  unstructured_ingest/cli/cmd_factory.py,sha256=UdHm1KacTombpF6DxyTSwTCuApsKHUYw_kVu5Nhcy3Y,364
  unstructured_ingest/cli/common.py,sha256=I0El08FHz5kxw7iz0VWOWPrvcJD1rBgXJSwVIpVmmwU,204
- unstructured_ingest/cli/interfaces.py,sha256=lpaaOdAQ4NMsawVaHSk5lXCcZ0Mw85kRzfElu1ODCB0,24090
+ unstructured_ingest/cli/interfaces.py,sha256=pvEwNfYwINx3-TQ0LPudjpYNR3PnanUiXpEePPEtRSw,24086
  unstructured_ingest/cli/utils.py,sha256=KNhkFNKOeEihc8HlvMz_MTbYVQNFklrBKbC8xg9h1xE,7982
  unstructured_ingest/cli/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/cli/base/cmd.py,sha256=BbfjA2v203Jh-7DL6bzxQ7fOeNixd5BsBMuzXz6h5IQ,583
@@ -399,7 +404,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=4IwCWMlBrMpZI6V82q5nzrbyQNDVM62AQsWt6MUBWa8,16508
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=b37fQGm_lGutQ3Jc0qePB15lkBiFavH9tCso3inm-3I,16564
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -413,9 +418,9 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
  unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
- unstructured_ingest/v2/processes/embedder.py,sha256=uiuCOSwwasHp4eqtewMvgnM86WVch7HDFiWqpGLahvo,7812
+ unstructured_ingest/v2/processes/embedder.py,sha256=PTBlRgNbAXkSaLg7JrZzHwAoqpHmopg8jNU1TmaXguU,7804
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
- unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
+ unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
  unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
@@ -427,18 +432,18 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVm
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=5k7pdAzJGXSdyPCzW9vu2OaAjGVTo2JevDyGaXM1Hvk,13370
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=tSbyibwm9RQyXD-HJGZa1Y9lBSCXaEFnvxpf6bHwBSE,13394
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
  unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=b616B_-9MfU6gxvpw7IBUa2szNFURA_VP8q5j2FXxnA,17632
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=U5gSa8S08JvCwmAhE8aV0yxGTIFnUlKVsQDybE8Fqb8,10746
  unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
- unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SdcbOEUzgi1sUZJA6doZDm-a8d4F3Qtud-OVbDKW7Ng,4456
+ unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=f0F7KioXgucVc3tVASTa67ynlTa4s9_FKGPHop6Xm0A,4563
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
@@ -452,9 +457,9 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
  unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
- unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=0YBdOpTX5mbRLhP00lRHSMpl2-LfuRpqB1XPMJMxn04,2647
+ unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
  unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=oUHHaLpO2pWW2Lu4Mc-XFjrA0ze97205WQ_xP95ua4M,4296
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=mU5x6SnbFgRsVicNGh4y4gtR6ek7eQFinI0dQQmzMds,4481
+ unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=OsRy-rcrP4_KSustpxlEKoZ_FmJNFMyMmIfFk6WJ3UY,4559
  unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
  unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
  unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
@@ -554,17 +559,17 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=QE-WBqrPVjCgcxR5EdVD9iTHBjgDSSSQgWYvq5N61qU,7746
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=O2XBu_E2WqNia9OUTdhTWkYo0xhoMMm6ZuanTz-0V9s,16192
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=4DckpVAXpmMTcoKrWiJbnFQQlcrwMA-GMaDsAYchTUs,9992
+ unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
  unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
  unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
- unstructured_ingest-0.4.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
- unstructured_ingest-0.4.7.dist-info/METADATA,sha256=yGcahQ8fZmoU_c1h02b76tRn5w0uj_931AAQKlFrqxs,8051
- unstructured_ingest-0.4.7.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- unstructured_ingest-0.4.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
- unstructured_ingest-0.4.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
- unstructured_ingest-0.4.7.dist-info/RECORD,,
+ unstructured_ingest-0.5.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+ unstructured_ingest-0.5.1.dist-info/METADATA,sha256=4fo4K5ac0RNRlWGGyNumZ5gXJf-0PwknZWjS6HvAD6w,8051
+ unstructured_ingest-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ unstructured_ingest-0.5.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+ unstructured_ingest-0.5.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+ unstructured_ingest-0.5.1.dist-info/RECORD,,