unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -5,22 +5,31 @@ from pathlib import Path
5
5
  import pytest
6
6
  from office365.graph_client import GraphClient
7
7
 
8
- from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
8
+ from test.integration.connectors.utils.constants import (
9
+ BLOB_STORAGE_TAG,
10
+ DESTINATION_TAG,
11
+ SOURCE_TAG,
12
+ )
13
+ from test.integration.connectors.utils.validation.source import (
14
+ SourceValidationConfigs,
15
+ source_connector_validation,
16
+ )
9
17
  from test.integration.utils import requires_env
10
18
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
11
19
  from unstructured_ingest.v2.processes.connectors.onedrive import (
12
20
  CONNECTOR_TYPE,
13
21
  OnedriveAccessConfig,
14
22
  OnedriveConnectionConfig,
23
+ OnedriveDownloader,
24
+ OnedriveDownloaderConfig,
25
+ OnedriveIndexer,
26
+ OnedriveIndexerConfig,
15
27
  OnedriveUploader,
16
28
  OnedriveUploaderConfig,
17
29
  )
18
30
 
19
31
 
20
32
  @pytest.fixture
21
- @pytest.mark.xfail(
22
- reason="Issues with test setup on the provider side."
23
- ) # TODO: remove line when issues are addressed
24
33
  def onedrive_test_folder() -> str:
25
34
  """
26
35
  Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -65,12 +74,46 @@ def get_connection_config():
65
74
  return connection_config
66
75
 
67
76
 
77
+ @pytest.mark.asyncio
78
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
79
+ @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
80
+ async def test_onedrive_source(temp_dir):
81
+ connection_config = get_connection_config()
82
+ index_config = OnedriveIndexerConfig(recursive=True, path="eml")
83
+
84
+ download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
85
+
86
+ # Instantiate indexer and downloader
87
+ indexer = OnedriveIndexer(
88
+ connection_config=connection_config,
89
+ index_config=index_config,
90
+ )
91
+ downloader = OnedriveDownloader(
92
+ connection_config=connection_config,
93
+ download_config=download_config,
94
+ )
95
+
96
+ # Run the source connector validation
97
+ await source_connector_validation(
98
+ indexer=indexer,
99
+ downloader=downloader,
100
+ configs=SourceValidationConfigs(
101
+ test_id="onedrive",
102
+ expected_num_files=1,
103
+ validate_downloaded_files=True,
104
+ exclude_fields_extend=[
105
+ "metadata.date_created",
106
+ "metadata.date_modified",
107
+ "additional_metadata.LastModified",
108
+ "additional_metadata.@microsoft.graph.downloadUrl",
109
+ ],
110
+ ),
111
+ )
112
+
113
+
68
114
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
69
115
  @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
70
- @pytest.mark.xfail(
71
- reason="Issues with test setup on the provider side."
72
- ) # TODO: remove line when issues are addressed
73
- def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
116
+ def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
74
117
  """
75
118
  Integration test for the OneDrive destination connector.
76
119
 
@@ -107,10 +150,14 @@ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
107
150
  client = connection_config.get_client()
108
151
  drive = client.users[user_pname].drive
109
152
 
153
+ # Workaround: the file should not have .json in metadata.filename; the suffix comes from the embedder
110
154
  uploaded_file = (
111
- drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
155
+ drive.root.get_by_path(f"{destination_fullpath}.json")
156
+ .select(["id", "name"])
157
+ .get()
158
+ .execute_query()
112
159
  )
113
160
 
114
161
  # Check if the file exists
115
162
  assert uploaded_file is not None
116
- assert uploaded_file.name == upload_file.name
163
+ assert uploaded_file.name == f"{upload_file.name}.json"
@@ -0,0 +1,71 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
6
+ from test.integration.connectors.utils.validation.source import (
7
+ SourceValidationConfigs,
8
+ source_connector_validation,
9
+ )
10
+ from test.integration.utils import requires_env
11
+ from unstructured_ingest.v2.processes.connectors.sharepoint import (
12
+ CONNECTOR_TYPE,
13
+ SharepointAccessConfig,
14
+ SharepointConnectionConfig,
15
+ SharepointDownloader,
16
+ SharepointDownloaderConfig,
17
+ SharepointIndexer,
18
+ SharepointIndexerConfig,
19
+ )
20
+
21
+
22
+ @pytest.mark.asyncio
23
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
24
+ @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
25
+ async def test_sharepoint_source(temp_dir):
26
+ # Retrieve environment variables
27
+ site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
28
+ client_id = os.environ["SHAREPOINT_CLIENT_ID"]
29
+ client_cred = os.environ["SHAREPOINT_CRED"]
30
+ user_pname = os.environ["MS_USER_PNAME"]
31
+ tenant = os.environ["MS_TENANT_ID"]
32
+
33
+ # Create connection and indexer configurations
34
+ access_config = SharepointAccessConfig(client_cred=client_cred)
35
+ connection_config = SharepointConnectionConfig(
36
+ client_id=client_id,
37
+ site=site,
38
+ tenant=tenant,
39
+ user_pname=user_pname,
40
+ access_config=access_config,
41
+ )
42
+ index_config = SharepointIndexerConfig(recursive=True)
43
+
44
+ download_config = SharepointDownloaderConfig(download_dir=temp_dir)
45
+
46
+ # Instantiate indexer and downloader
47
+ indexer = SharepointIndexer(
48
+ connection_config=connection_config,
49
+ index_config=index_config,
50
+ )
51
+ downloader = SharepointDownloader(
52
+ connection_config=connection_config,
53
+ download_config=download_config,
54
+ )
55
+
56
+ # Run the source connector validation
57
+ await source_connector_validation(
58
+ indexer=indexer,
59
+ downloader=downloader,
60
+ configs=SourceValidationConfigs(
61
+ test_id="sharepoint",
62
+ expected_num_files=4,
63
+ validate_downloaded_files=True,
64
+ exclude_fields_extend=[
65
+ "metadata.date_created",
66
+ "metadata.date_modified",
67
+ "additional_metadata.LastModified",
68
+ "additional_metadata.@microsoft.graph.downloadUrl",
69
+ ],
70
+ ),
71
+ )
@@ -10,6 +10,13 @@ from pydantic import Field
10
10
  from test.integration.connectors.utils.validation.utils import ValidationConfig
11
11
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
12
12
 
13
+ NONSTANDARD_METADATA_FIELDS = {
14
+ "additional_metadata.@microsoft.graph.downloadUrl": [
15
+ "additional_metadata",
16
+ "@microsoft.graph.downloadUrl",
17
+ ]
18
+ }
19
+
13
20
 
14
21
  class SourceValidationConfigs(ValidationConfig):
15
22
  expected_number_indexed_file_data: Optional[int] = None
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
26
33
  def get_exclude_fields(self) -> list[str]:
27
34
  exclude_fields = self.exclude_fields
28
35
  exclude_fields.extend(self.exclude_fields_extend)
29
- return exclude_fields
36
+ return list(set(exclude_fields))
30
37
 
31
38
  def run_file_data_validation(
32
39
  self, predownload_file_data: FileData, postdownload_file_data: FileData
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
45
52
  exclude_fields = self.get_exclude_fields()
46
53
  # Ignore fields that dynamically change every time the tests run
47
54
  copied_data = data.copy()
55
+
48
56
  for exclude_field in exclude_fields:
49
- exclude_field_vals = exclude_field.split(".")
57
+ exclude_field_vals = (
58
+ NONSTANDARD_METADATA_FIELDS[exclude_field]
59
+ if exclude_field in NONSTANDARD_METADATA_FIELDS
60
+ else exclude_field.split(".")
61
+ )
50
62
  if len(exclude_field_vals) == 1:
51
63
  current_val = copied_data
52
64
  drop_field = exclude_field_vals[0]
@@ -261,21 +273,38 @@ async def source_connector_validation(
261
273
  indexer.precheck()
262
274
  download_dir = downloader.download_config.download_dir
263
275
  test_output_dir = configs.test_output_dir()
264
- for file_data in indexer.run():
265
- assert file_data
266
- predownload_file_data = file_data.model_copy(deep=True)
267
- all_predownload_file_data.append(predownload_file_data)
268
- if downloader.is_async():
269
- resp = await downloader.run_async(file_data=file_data)
270
- else:
271
- resp = downloader.run(file_data=file_data)
272
- if isinstance(resp, list):
273
- for r in resp:
274
- postdownload_file_data = r["file_data"].model_copy(deep=True)
276
+ if indexer.is_async():
277
+ async for file_data in indexer.run_async():
278
+ assert file_data
279
+ predownload_file_data = file_data.model_copy(deep=True)
280
+ all_predownload_file_data.append(predownload_file_data)
281
+ if downloader.is_async():
282
+ resp = await downloader.run_async(file_data=file_data)
283
+ else:
284
+ resp = downloader.run(file_data=file_data)
285
+ if isinstance(resp, list):
286
+ for r in resp:
287
+ postdownload_file_data = r["file_data"].model_copy(deep=True)
288
+ all_postdownload_file_data.append(postdownload_file_data)
289
+ else:
290
+ postdownload_file_data = resp["file_data"].model_copy(deep=True)
291
+ all_postdownload_file_data.append(postdownload_file_data)
292
+ else:
293
+ for file_data in indexer.run():
294
+ assert file_data
295
+ predownload_file_data = file_data.model_copy(deep=True)
296
+ all_predownload_file_data.append(predownload_file_data)
297
+ if downloader.is_async():
298
+ resp = await downloader.run_async(file_data=file_data)
299
+ else:
300
+ resp = downloader.run(file_data=file_data)
301
+ if isinstance(resp, list):
302
+ for r in resp:
303
+ postdownload_file_data = r["file_data"].model_copy(deep=True)
304
+ all_postdownload_file_data.append(postdownload_file_data)
305
+ else:
306
+ postdownload_file_data = resp["file_data"].model_copy(deep=True)
275
307
  all_postdownload_file_data.append(postdownload_file_data)
276
- else:
277
- postdownload_file_data = resp["file_data"].model_copy(deep=True)
278
- all_postdownload_file_data.append(postdownload_file_data)
279
308
  if not overwrite_fixtures:
280
309
  print("Running validation")
281
310
  run_all_validations(
@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
31
31
  def test_bedrock_embedder(embedder_file: Path):
32
32
  aws_credentials = get_aws_credentials()
33
33
  embedder_config = EmbedderConfig(
34
- embedding_provider="aws-bedrock",
34
+ embedding_provider="bedrock",
35
35
  embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
36
36
  embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
37
37
  )
@@ -1,4 +1,3 @@
1
- import json
2
1
  import os
3
2
  from pathlib import Path
4
3
 
@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
15
14
  non_image_partition_files = [
16
15
  path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
17
16
  ]
17
+ supported_fast_partition_files = [
18
+ path for path in non_image_partition_files if path.suffix != ".eml"
19
+ ]
18
20
  image_partition_files = [
19
21
  path for path in all_partition_files if path not in non_image_partition_files
20
22
  ]
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
33
35
  )
34
36
  partitioner = Partitioner(config=partitioner_config)
35
37
  results = await partitioner.run_async(filename=partition_file)
36
- results_dir = int_test_dir / "results"
37
- results_dir.mkdir(exist_ok=True)
38
- results_path = results_dir / f"{partition_file.name}.json"
39
- with results_path.open("w") as f:
40
- json.dump(results, f, indent=2)
41
38
  assert results
42
39
 
43
40
 
44
41
  @pytest.mark.parametrize(
45
42
  "partition_file",
46
- non_image_partition_files,
47
- ids=[path.name for path in non_image_partition_files],
43
+ supported_fast_partition_files,
44
+ ids=[path.name for path in supported_fast_partition_files],
48
45
  )
49
46
  @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
50
47
  @pytest.mark.asyncio
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
68
65
  api_key = os.getenv("UNSTRUCTURED_API_KEY")
69
66
  api_url = os.getenv("UNSTRUCTURED_API_URL")
70
67
  partitioner_config = PartitionerConfig(
71
- strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
68
+ strategy="fast",
69
+ partition_by_api=True,
70
+ api_key=api_key,
71
+ partition_endpoint=api_url,
72
+ raise_unsupported_filetype=True,
72
73
  )
73
74
  partitioner = Partitioner(config=partitioner_config)
74
75
  with pytest.raises(UserError):
@@ -1 +1 @@
1
- __version__ = "0.4.6" # pragma: no cover
1
+ __version__ = "0.5.0" # pragma: no cover
@@ -417,7 +417,7 @@ class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
417
417
  embed_providers = [
418
418
  "openai",
419
419
  "huggingface",
420
- "aws-bedrock",
420
+ "bedrock",
421
421
  "vertexai",
422
422
  "voyageai",
423
423
  "octoai",
@@ -3,11 +3,15 @@ from typing import TYPE_CHECKING
3
3
 
4
4
  from pydantic import Field
5
5
 
6
- from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
6
+ from unstructured_ingest.embed.openai import (
7
+ AsyncOpenAIEmbeddingEncoder,
8
+ OpenAIEmbeddingConfig,
9
+ OpenAIEmbeddingEncoder,
10
+ )
7
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
8
12
 
9
13
  if TYPE_CHECKING:
10
- from openai import AzureOpenAI
14
+ from openai import AsyncAzureOpenAI, AzureOpenAI
11
15
 
12
16
 
13
17
  class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
@@ -25,7 +29,22 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
25
29
  azure_endpoint=self.azure_endpoint,
26
30
  )
27
31
 
32
+ @requires_dependencies(["openai"], extras="openai")
33
+ def get_async_client(self) -> "AsyncAzureOpenAI":
34
+ from openai import AsyncAzureOpenAI
35
+
36
+ return AsyncAzureOpenAI(
37
+ api_key=self.api_key.get_secret_value(),
38
+ api_version=self.api_version,
39
+ azure_endpoint=self.azure_endpoint,
40
+ )
41
+
28
42
 
29
43
  @dataclass
30
44
  class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
31
45
  config: AzureOpenAIEmbeddingConfig
46
+
47
+
48
+ @dataclass
49
+ class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
50
+ config: AzureOpenAIEmbeddingConfig
@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
226
226
  )
227
227
 
228
228
  return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
229
- elif self.provider == "aws-bedrock":
229
+ elif self.provider == "bedrock":
230
230
  from unstructured_ingest.embed.bedrock import (
231
231
  BedrockEmbeddingConfig,
232
232
  BedrockEmbeddingEncoder,
@@ -268,6 +268,7 @@ class Pipeline:
268
268
 
269
269
  # Partition content
270
270
  elements = self.partitioner_step(downloaded_data)
271
+ elements = self.clean_results(results=elements)
271
272
  # Download data no longer needed, delete if possible
272
273
  self.downloader_step.delete_cache()
273
274
  elements = self.clean_results(results=elements)
@@ -329,9 +330,9 @@ class Pipeline:
329
330
  source_entry = {
330
331
  k: v
331
332
  for k, v in source_registry.items()
332
- if isinstance(indexer_config, v.indexer_config)
333
- and isinstance(downloader_config, v.downloader_config)
334
- and isinstance(source_connection_config, v.connection_config)
333
+ if type(indexer_config) is v.indexer_config
334
+ and type(downloader_config) is v.downloader_config
335
+ and type(source_connection_config) is v.connection_config
335
336
  }
336
337
  if len(source_entry) > 1:
337
338
  raise ValueError(
@@ -0,0 +1,23 @@
1
+ {
2
+ "properties": [
3
+ {
4
+ "dataType": [
5
+ "text"
6
+ ],
7
+ "indexFilterable": true,
8
+ "indexSearchable": true,
9
+ "name": "record_id",
10
+ "tokenization": "word"
11
+ },
12
+ {
13
+ "dataType": [
14
+ "text"
15
+ ],
16
+ "indexFilterable": true,
17
+ "indexSearchable": true,
18
+ "name": "text",
19
+ "tokenization": "word"
20
+ }
21
+ ],
22
+ "vectorizer": "none"
23
+ }
@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
105
105
  class OnedriveIndexer(Indexer):
106
106
  connection_config: OnedriveConnectionConfig
107
107
  index_config: OnedriveIndexerConfig
108
+ connector_type: str = CONNECTOR_TYPE
108
109
 
109
110
  def precheck(self) -> None:
110
111
  try:
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
172
173
  )
173
174
  return FileData(
174
175
  identifier=drive_item.id,
175
- connector_type=CONNECTOR_TYPE,
176
+ connector_type=self.connector_type,
176
177
  source_identifiers=SourceIdentifiers(
177
178
  fullpath=server_path, filename=drive_item.name, rel_path=rel_path
178
179
  ),
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
201
202
  token_resp = await asyncio.to_thread(self.connection_config.get_token)
202
203
  if "error" in token_resp:
203
204
  raise SourceConnectionError(
204
- f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
205
+ f"[{self.connector_type}]: {token_resp['error']} "
206
+ f"({token_resp.get('error_description')})"
205
207
  )
206
208
 
207
209
  client = await asyncio.to_thread(self.connection_config.get_client)
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
221
223
  class OnedriveDownloader(Downloader):
222
224
  connection_config: OnedriveConnectionConfig
223
225
  download_config: OnedriveDownloaderConfig
226
+ connector_type: str = CONNECTOR_TYPE
224
227
 
225
228
  @SourceConnectionNetworkError.wrap
226
229
  def _fetch_file(self, file_data: FileData) -> DriveItem:
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
260
263
  file.download_session(f).execute_query()
261
264
  return self.generate_download_response(file_data=file_data, download_path=download_path)
262
265
  except Exception as e:
263
- logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
266
+ logger.error(
267
+ f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
268
+ )
264
269
  # Re-raise to see full stack trace locally
265
270
  raise
266
271