unstructured-ingest 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_onedrive.py +57 -10
- test/integration/connectors/test_sharepoint.py +71 -0
- test/integration/connectors/utils/validation/source.py +45 -16
- test/integration/embedders/test_bedrock.py +1 -1
- test/integration/partitioners/test_partitioner.py +10 -9
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/embed/azure_openai.py +21 -2
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +4 -3
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +70 -389
- unstructured_ingest/v2/processes/embedder.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +50 -6
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/RECORD +21 -19
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.6.dist-info → unstructured_ingest-0.5.0.dist-info}/top_level.txt +0 -0
|
@@ -5,22 +5,31 @@ from pathlib import Path
|
|
|
5
5
|
import pytest
|
|
6
6
|
from office365.graph_client import GraphClient
|
|
7
7
|
|
|
8
|
-
from test.integration.connectors.utils.constants import
|
|
8
|
+
from test.integration.connectors.utils.constants import (
|
|
9
|
+
BLOB_STORAGE_TAG,
|
|
10
|
+
DESTINATION_TAG,
|
|
11
|
+
SOURCE_TAG,
|
|
12
|
+
)
|
|
13
|
+
from test.integration.connectors.utils.validation.source import (
|
|
14
|
+
SourceValidationConfigs,
|
|
15
|
+
source_connector_validation,
|
|
16
|
+
)
|
|
9
17
|
from test.integration.utils import requires_env
|
|
10
18
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
11
19
|
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
12
20
|
CONNECTOR_TYPE,
|
|
13
21
|
OnedriveAccessConfig,
|
|
14
22
|
OnedriveConnectionConfig,
|
|
23
|
+
OnedriveDownloader,
|
|
24
|
+
OnedriveDownloaderConfig,
|
|
25
|
+
OnedriveIndexer,
|
|
26
|
+
OnedriveIndexerConfig,
|
|
15
27
|
OnedriveUploader,
|
|
16
28
|
OnedriveUploaderConfig,
|
|
17
29
|
)
|
|
18
30
|
|
|
19
31
|
|
|
20
32
|
@pytest.fixture
|
|
21
|
-
@pytest.mark.xfail(
|
|
22
|
-
reason="Issues with test setup on the provider side."
|
|
23
|
-
) # TODO: remove line when issues are addressed
|
|
24
33
|
def onedrive_test_folder() -> str:
|
|
25
34
|
"""
|
|
26
35
|
Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
|
|
@@ -65,12 +74,46 @@ def get_connection_config():
|
|
|
65
74
|
return connection_config
|
|
66
75
|
|
|
67
76
|
|
|
77
|
+
@pytest.mark.asyncio
|
|
78
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
79
|
+
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
80
|
+
async def test_onedrive_source(temp_dir):
|
|
81
|
+
connection_config = get_connection_config()
|
|
82
|
+
index_config = OnedriveIndexerConfig(recursive=True, path="eml")
|
|
83
|
+
|
|
84
|
+
download_config = OnedriveDownloaderConfig(download_dir=temp_dir)
|
|
85
|
+
|
|
86
|
+
# Instantiate indexer and downloader
|
|
87
|
+
indexer = OnedriveIndexer(
|
|
88
|
+
connection_config=connection_config,
|
|
89
|
+
index_config=index_config,
|
|
90
|
+
)
|
|
91
|
+
downloader = OnedriveDownloader(
|
|
92
|
+
connection_config=connection_config,
|
|
93
|
+
download_config=download_config,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Run the source connector validation
|
|
97
|
+
await source_connector_validation(
|
|
98
|
+
indexer=indexer,
|
|
99
|
+
downloader=downloader,
|
|
100
|
+
configs=SourceValidationConfigs(
|
|
101
|
+
test_id="onedrive",
|
|
102
|
+
expected_num_files=1,
|
|
103
|
+
validate_downloaded_files=True,
|
|
104
|
+
exclude_fields_extend=[
|
|
105
|
+
"metadata.date_created",
|
|
106
|
+
"metadata.date_modified",
|
|
107
|
+
"additional_metadata.LastModified",
|
|
108
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
109
|
+
],
|
|
110
|
+
),
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
68
114
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
|
|
69
115
|
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
70
|
-
@pytest.mark.xfail(
|
|
71
|
-
reason="Issues with test setup on the provider side."
|
|
72
|
-
) # TODO: remove line when issues are addressed
|
|
73
|
-
def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
116
|
+
def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
74
117
|
"""
|
|
75
118
|
Integration test for the OneDrive destination connector.
|
|
76
119
|
|
|
@@ -107,10 +150,14 @@ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
|
|
|
107
150
|
client = connection_config.get_client()
|
|
108
151
|
drive = client.users[user_pname].drive
|
|
109
152
|
|
|
153
|
+
# Workaround: File should not have .json in the metadata.filename it comes from embedder
|
|
110
154
|
uploaded_file = (
|
|
111
|
-
drive.root.get_by_path(destination_fullpath
|
|
155
|
+
drive.root.get_by_path(f"{destination_fullpath}.json")
|
|
156
|
+
.select(["id", "name"])
|
|
157
|
+
.get()
|
|
158
|
+
.execute_query()
|
|
112
159
|
)
|
|
113
160
|
|
|
114
161
|
# Check if the file exists
|
|
115
162
|
assert uploaded_file is not None
|
|
116
|
-
assert uploaded_file.name == upload_file.name
|
|
163
|
+
assert uploaded_file.name == f"{upload_file.name}.json"
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
|
|
6
|
+
from test.integration.connectors.utils.validation.source import (
|
|
7
|
+
SourceValidationConfigs,
|
|
8
|
+
source_connector_validation,
|
|
9
|
+
)
|
|
10
|
+
from test.integration.utils import requires_env
|
|
11
|
+
from unstructured_ingest.v2.processes.connectors.sharepoint import (
|
|
12
|
+
CONNECTOR_TYPE,
|
|
13
|
+
SharepointAccessConfig,
|
|
14
|
+
SharepointConnectionConfig,
|
|
15
|
+
SharepointDownloader,
|
|
16
|
+
SharepointDownloaderConfig,
|
|
17
|
+
SharepointIndexer,
|
|
18
|
+
SharepointIndexerConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.mark.asyncio
|
|
23
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
24
|
+
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
25
|
+
async def test_sharepoint_source(temp_dir):
|
|
26
|
+
# Retrieve environment variables
|
|
27
|
+
site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
|
|
28
|
+
client_id = os.environ["SHAREPOINT_CLIENT_ID"]
|
|
29
|
+
client_cred = os.environ["SHAREPOINT_CRED"]
|
|
30
|
+
user_pname = os.environ["MS_USER_PNAME"]
|
|
31
|
+
tenant = os.environ["MS_TENANT_ID"]
|
|
32
|
+
|
|
33
|
+
# Create connection and indexer configurations
|
|
34
|
+
access_config = SharepointAccessConfig(client_cred=client_cred)
|
|
35
|
+
connection_config = SharepointConnectionConfig(
|
|
36
|
+
client_id=client_id,
|
|
37
|
+
site=site,
|
|
38
|
+
tenant=tenant,
|
|
39
|
+
user_pname=user_pname,
|
|
40
|
+
access_config=access_config,
|
|
41
|
+
)
|
|
42
|
+
index_config = SharepointIndexerConfig(recursive=True)
|
|
43
|
+
|
|
44
|
+
download_config = SharepointDownloaderConfig(download_dir=temp_dir)
|
|
45
|
+
|
|
46
|
+
# Instantiate indexer and downloader
|
|
47
|
+
indexer = SharepointIndexer(
|
|
48
|
+
connection_config=connection_config,
|
|
49
|
+
index_config=index_config,
|
|
50
|
+
)
|
|
51
|
+
downloader = SharepointDownloader(
|
|
52
|
+
connection_config=connection_config,
|
|
53
|
+
download_config=download_config,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Run the source connector validation
|
|
57
|
+
await source_connector_validation(
|
|
58
|
+
indexer=indexer,
|
|
59
|
+
downloader=downloader,
|
|
60
|
+
configs=SourceValidationConfigs(
|
|
61
|
+
test_id="sharepoint",
|
|
62
|
+
expected_num_files=4,
|
|
63
|
+
validate_downloaded_files=True,
|
|
64
|
+
exclude_fields_extend=[
|
|
65
|
+
"metadata.date_created",
|
|
66
|
+
"metadata.date_modified",
|
|
67
|
+
"additional_metadata.LastModified",
|
|
68
|
+
"additional_metadata.@microsoft.graph.downloadUrl",
|
|
69
|
+
],
|
|
70
|
+
),
|
|
71
|
+
)
|
|
@@ -10,6 +10,13 @@ from pydantic import Field
|
|
|
10
10
|
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
11
11
|
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
|
|
12
12
|
|
|
13
|
+
NONSTANDARD_METADATA_FIELDS = {
|
|
14
|
+
"additional_metadata.@microsoft.graph.downloadUrl": [
|
|
15
|
+
"additional_metadata",
|
|
16
|
+
"@microsoft.graph.downloadUrl",
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
class SourceValidationConfigs(ValidationConfig):
|
|
15
22
|
expected_number_indexed_file_data: Optional[int] = None
|
|
@@ -26,7 +33,7 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
26
33
|
def get_exclude_fields(self) -> list[str]:
|
|
27
34
|
exclude_fields = self.exclude_fields
|
|
28
35
|
exclude_fields.extend(self.exclude_fields_extend)
|
|
29
|
-
return exclude_fields
|
|
36
|
+
return list(set(exclude_fields))
|
|
30
37
|
|
|
31
38
|
def run_file_data_validation(
|
|
32
39
|
self, predownload_file_data: FileData, postdownload_file_data: FileData
|
|
@@ -45,8 +52,13 @@ class SourceValidationConfigs(ValidationConfig):
|
|
|
45
52
|
exclude_fields = self.get_exclude_fields()
|
|
46
53
|
# Ignore fields that dynamically change every time the tests run
|
|
47
54
|
copied_data = data.copy()
|
|
55
|
+
|
|
48
56
|
for exclude_field in exclude_fields:
|
|
49
|
-
exclude_field_vals =
|
|
57
|
+
exclude_field_vals = (
|
|
58
|
+
NONSTANDARD_METADATA_FIELDS[exclude_field]
|
|
59
|
+
if exclude_field in NONSTANDARD_METADATA_FIELDS
|
|
60
|
+
else exclude_field.split(".")
|
|
61
|
+
)
|
|
50
62
|
if len(exclude_field_vals) == 1:
|
|
51
63
|
current_val = copied_data
|
|
52
64
|
drop_field = exclude_field_vals[0]
|
|
@@ -261,21 +273,38 @@ async def source_connector_validation(
|
|
|
261
273
|
indexer.precheck()
|
|
262
274
|
download_dir = downloader.download_config.download_dir
|
|
263
275
|
test_output_dir = configs.test_output_dir()
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
276
|
+
if indexer.is_async():
|
|
277
|
+
async for file_data in indexer.run_async():
|
|
278
|
+
assert file_data
|
|
279
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
280
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
281
|
+
if downloader.is_async():
|
|
282
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
283
|
+
else:
|
|
284
|
+
resp = downloader.run(file_data=file_data)
|
|
285
|
+
if isinstance(resp, list):
|
|
286
|
+
for r in resp:
|
|
287
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
288
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
289
|
+
else:
|
|
290
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
291
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
292
|
+
else:
|
|
293
|
+
for file_data in indexer.run():
|
|
294
|
+
assert file_data
|
|
295
|
+
predownload_file_data = file_data.model_copy(deep=True)
|
|
296
|
+
all_predownload_file_data.append(predownload_file_data)
|
|
297
|
+
if downloader.is_async():
|
|
298
|
+
resp = await downloader.run_async(file_data=file_data)
|
|
299
|
+
else:
|
|
300
|
+
resp = downloader.run(file_data=file_data)
|
|
301
|
+
if isinstance(resp, list):
|
|
302
|
+
for r in resp:
|
|
303
|
+
postdownload_file_data = r["file_data"].model_copy(deep=True)
|
|
304
|
+
all_postdownload_file_data.append(postdownload_file_data)
|
|
305
|
+
else:
|
|
306
|
+
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
275
307
|
all_postdownload_file_data.append(postdownload_file_data)
|
|
276
|
-
else:
|
|
277
|
-
postdownload_file_data = resp["file_data"].model_copy(deep=True)
|
|
278
|
-
all_postdownload_file_data.append(postdownload_file_data)
|
|
279
308
|
if not overwrite_fixtures:
|
|
280
309
|
print("Running validation")
|
|
281
310
|
run_all_validations(
|
|
@@ -31,7 +31,7 @@ def get_aws_credentials() -> dict:
|
|
|
31
31
|
def test_bedrock_embedder(embedder_file: Path):
|
|
32
32
|
aws_credentials = get_aws_credentials()
|
|
33
33
|
embedder_config = EmbedderConfig(
|
|
34
|
-
embedding_provider="
|
|
34
|
+
embedding_provider="bedrock",
|
|
35
35
|
embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
36
36
|
embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
37
37
|
)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import os
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
|
|
@@ -15,6 +14,9 @@ all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
|
|
|
15
14
|
non_image_partition_files = [
|
|
16
15
|
path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
|
|
17
16
|
]
|
|
17
|
+
supported_fast_partition_files = [
|
|
18
|
+
path for path in non_image_partition_files if path.suffix != ".eml"
|
|
19
|
+
]
|
|
18
20
|
image_partition_files = [
|
|
19
21
|
path for path in all_partition_files if path not in non_image_partition_files
|
|
20
22
|
]
|
|
@@ -33,18 +35,13 @@ async def test_partitioner_api_hi_res(partition_file: Path):
|
|
|
33
35
|
)
|
|
34
36
|
partitioner = Partitioner(config=partitioner_config)
|
|
35
37
|
results = await partitioner.run_async(filename=partition_file)
|
|
36
|
-
results_dir = int_test_dir / "results"
|
|
37
|
-
results_dir.mkdir(exist_ok=True)
|
|
38
|
-
results_path = results_dir / f"{partition_file.name}.json"
|
|
39
|
-
with results_path.open("w") as f:
|
|
40
|
-
json.dump(results, f, indent=2)
|
|
41
38
|
assert results
|
|
42
39
|
|
|
43
40
|
|
|
44
41
|
@pytest.mark.parametrize(
|
|
45
42
|
"partition_file",
|
|
46
|
-
|
|
47
|
-
ids=[path.name for path in
|
|
43
|
+
supported_fast_partition_files,
|
|
44
|
+
ids=[path.name for path in supported_fast_partition_files],
|
|
48
45
|
)
|
|
49
46
|
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
50
47
|
@pytest.mark.asyncio
|
|
@@ -68,7 +65,11 @@ async def test_partitioner_api_fast_error(partition_file: Path):
|
|
|
68
65
|
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
69
66
|
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
70
67
|
partitioner_config = PartitionerConfig(
|
|
71
|
-
strategy="fast",
|
|
68
|
+
strategy="fast",
|
|
69
|
+
partition_by_api=True,
|
|
70
|
+
api_key=api_key,
|
|
71
|
+
partition_endpoint=api_url,
|
|
72
|
+
raise_unsupported_filetype=True,
|
|
72
73
|
)
|
|
73
74
|
partitioner = Partitioner(config=partitioner_config)
|
|
74
75
|
with pytest.raises(UserError):
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.4.6" # pragma: no cover
|
|
1
|
+
__version__ = "0.5.0" # pragma: no cover
|
|
@@ -3,11 +3,15 @@ from typing import TYPE_CHECKING
|
|
|
3
3
|
|
|
4
4
|
from pydantic import Field
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.embed.openai import
|
|
6
|
+
from unstructured_ingest.embed.openai import (
|
|
7
|
+
AsyncOpenAIEmbeddingEncoder,
|
|
8
|
+
OpenAIEmbeddingConfig,
|
|
9
|
+
OpenAIEmbeddingEncoder,
|
|
10
|
+
)
|
|
7
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
12
|
|
|
9
13
|
if TYPE_CHECKING:
|
|
10
|
-
from openai import AzureOpenAI
|
|
14
|
+
from openai import AsyncAzureOpenAI, AzureOpenAI
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
|
|
@@ -25,7 +29,22 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
|
|
|
25
29
|
azure_endpoint=self.azure_endpoint,
|
|
26
30
|
)
|
|
27
31
|
|
|
32
|
+
@requires_dependencies(["openai"], extras="openai")
|
|
33
|
+
def get_async_client(self) -> "AsyncAzureOpenAI":
|
|
34
|
+
from openai import AsyncAzureOpenAI
|
|
35
|
+
|
|
36
|
+
return AsyncAzureOpenAI(
|
|
37
|
+
api_key=self.api_key.get_secret_value(),
|
|
38
|
+
api_version=self.api_version,
|
|
39
|
+
azure_endpoint=self.azure_endpoint,
|
|
40
|
+
)
|
|
41
|
+
|
|
28
42
|
|
|
29
43
|
@dataclass
|
|
30
44
|
class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
|
|
31
45
|
config: AzureOpenAIEmbeddingConfig
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
|
|
49
|
+
class AsyncAzureOpenAIEmbeddingEncoder(AsyncOpenAIEmbeddingEncoder):
|
|
50
|
+
config: AzureOpenAIEmbeddingConfig
|
|
@@ -226,7 +226,7 @@ class EmbeddingConfig(BaseConfig):
|
|
|
226
226
|
)
|
|
227
227
|
|
|
228
228
|
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
|
|
229
|
-
elif self.provider == "
|
|
229
|
+
elif self.provider == "bedrock":
|
|
230
230
|
from unstructured_ingest.embed.bedrock import (
|
|
231
231
|
BedrockEmbeddingConfig,
|
|
232
232
|
BedrockEmbeddingEncoder,
|
|
@@ -268,6 +268,7 @@ class Pipeline:
|
|
|
268
268
|
|
|
269
269
|
# Partition content
|
|
270
270
|
elements = self.partitioner_step(downloaded_data)
|
|
271
|
+
elements = self.clean_results(results=elements)
|
|
271
272
|
# Download data non longer needed, delete if possible
|
|
272
273
|
self.downloader_step.delete_cache()
|
|
273
274
|
elements = self.clean_results(results=elements)
|
|
@@ -329,9 +330,9 @@ class Pipeline:
|
|
|
329
330
|
source_entry = {
|
|
330
331
|
k: v
|
|
331
332
|
for k, v in source_registry.items()
|
|
332
|
-
if
|
|
333
|
-
and
|
|
334
|
-
and
|
|
333
|
+
if type(indexer_config) is v.indexer_config
|
|
334
|
+
and type(downloader_config) is v.downloader_config
|
|
335
|
+
and type(source_connection_config) is v.connection_config
|
|
335
336
|
}
|
|
336
337
|
if len(source_entry) > 1:
|
|
337
338
|
raise ValueError(
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"properties": [
|
|
3
|
+
{
|
|
4
|
+
"dataType": [
|
|
5
|
+
"text"
|
|
6
|
+
],
|
|
7
|
+
"indexFilterable": true,
|
|
8
|
+
"indexSearchable": true,
|
|
9
|
+
"name": "record_id",
|
|
10
|
+
"tokenization": "word"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"dataType": [
|
|
14
|
+
"text"
|
|
15
|
+
],
|
|
16
|
+
"indexFilterable": true,
|
|
17
|
+
"indexSearchable": true,
|
|
18
|
+
"name": "text",
|
|
19
|
+
"tokenization": "word"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"vectorizer": "none"
|
|
23
|
+
}
|
|
@@ -105,6 +105,7 @@ class OnedriveIndexerConfig(IndexerConfig):
|
|
|
105
105
|
class OnedriveIndexer(Indexer):
|
|
106
106
|
connection_config: OnedriveConnectionConfig
|
|
107
107
|
index_config: OnedriveIndexerConfig
|
|
108
|
+
connector_type: str = CONNECTOR_TYPE
|
|
108
109
|
|
|
109
110
|
def precheck(self) -> None:
|
|
110
111
|
try:
|
|
@@ -172,7 +173,7 @@ class OnedriveIndexer(Indexer):
|
|
|
172
173
|
)
|
|
173
174
|
return FileData(
|
|
174
175
|
identifier=drive_item.id,
|
|
175
|
-
connector_type=
|
|
176
|
+
connector_type=self.connector_type,
|
|
176
177
|
source_identifiers=SourceIdentifiers(
|
|
177
178
|
fullpath=server_path, filename=drive_item.name, rel_path=rel_path
|
|
178
179
|
),
|
|
@@ -201,7 +202,8 @@ class OnedriveIndexer(Indexer):
|
|
|
201
202
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
202
203
|
if "error" in token_resp:
|
|
203
204
|
raise SourceConnectionError(
|
|
204
|
-
f"[{
|
|
205
|
+
f"[{self.connector_type}]: {token_resp['error']} "
|
|
206
|
+
f"({token_resp.get('error_description')})"
|
|
205
207
|
)
|
|
206
208
|
|
|
207
209
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
@@ -221,6 +223,7 @@ class OnedriveDownloaderConfig(DownloaderConfig):
|
|
|
221
223
|
class OnedriveDownloader(Downloader):
|
|
222
224
|
connection_config: OnedriveConnectionConfig
|
|
223
225
|
download_config: OnedriveDownloaderConfig
|
|
226
|
+
connector_type: str = CONNECTOR_TYPE
|
|
224
227
|
|
|
225
228
|
@SourceConnectionNetworkError.wrap
|
|
226
229
|
def _fetch_file(self, file_data: FileData) -> DriveItem:
|
|
@@ -260,7 +263,9 @@ class OnedriveDownloader(Downloader):
|
|
|
260
263
|
file.download_session(f).execute_query()
|
|
261
264
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
262
265
|
except Exception as e:
|
|
263
|
-
logger.error(
|
|
266
|
+
logger.error(
|
|
267
|
+
f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
|
|
268
|
+
)
|
|
264
269
|
# Re-raise to see full stack trace locally
|
|
265
270
|
raise
|
|
266
271
|
|