unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- test/integration/connectors/test_confluence.py +2 -2
- test/integration/connectors/test_zendesk.py +31 -53
- test/integration/connectors/utils/validation/source.py +5 -3
- test/unit/v2/connectors/test_confluence.py +35 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +3 -7
- unstructured_ingest/utils/data_prep.py +4 -2
- unstructured_ingest/v2/interfaces/file_data.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
- unstructured_ingest/v2/pipeline/pipeline.py +7 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +3 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
- unstructured_ingest/v2/processes/connectors/local.py +8 -1
- unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
- unstructured_ingest/v2/processes/embedder.py +3 -4
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA +20 -20
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/RECORD +39 -37
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_confluence.py

@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
     spaces = ["testteamsp", "MFS"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
     spaces = ["testteamsp1"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
test/integration/connectors/test_zendesk.py

@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional

 import pytest

@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
     ZendeskIndexerConfig,
 )

+SUBDOMAIN = "unstructuredhelp"
+EMAIL = "test@unstructured.io"

-async def zendesk_source_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):

-
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_tickets(temp_dir: Path):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

-    index_config = ZendeskIndexerConfig(
+    index_config = ZendeskIndexerConfig(item_type="tickets")

     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -43,7 +42,7 @@ async def zendesk_source_test(
     )

     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir)

     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -57,26 +56,23 @@ async def zendesk_source_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-tickets",
-            expected_num_files=
+            expected_num_files=8,
             validate_file_data=False,
             validate_downloaded_files=True,
         ),
     )


-
-
-
-
-
-):
-
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

-    index_config = ZendeskIndexerConfig(
+    index_config = ZendeskIndexerConfig(item_type="articles")

     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
     )

     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)

     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-articles",
-            expected_num_files=
-            validate_file_data=
+            expected_num_files=8,
+            validate_file_data=True,
             validate_downloaded_files=True,
         ),
     )


-@pytest.mark.asyncio
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-
-
-
-
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+def test_zendesk_source_articles_fail(temp_dir):
+    access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
+    connection_config = ZendeskConnectionConfig(
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )

+    index_config = ZendeskIndexerConfig(item_type="tickets")

-
-
-
-
-    await zendesk_source_articles_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
     )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-async def test_zendesk_source_articles_fail(temp_dir):
     with pytest.raises(expected_exception=UserAuthError):
-
-            tmp_path=temp_dir,
-            token="FORCE_FAIL_TOKEN",
-            email="test@unstructured.io",
-            subdomain="unstructuredhelp",
-        )
+        indexer.precheck()
test/integration/connectors/utils/validation/source.py

@@ -103,7 +103,7 @@ def check_contents(
    file_data_path = expected_output_dir / f"{file_data.identifier}.json"
    with file_data_path.open("r") as file:
        expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.
+    current_file_data_contents = json.loads(file_data.model_dump_json())
    expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
    current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
    diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -184,7 +184,7 @@ def update_fixtures(
    for file_data in all_file_data:
        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
        with file_data_path.open(mode="w") as f:
-
+            f.write(file_data.model_dump_json(indent=2))

    # Record file structure of download directory
    download_files = get_files(dir_path=download_dir)
@@ -216,7 +216,9 @@ def run_all_validations(
        len(predownload_file_data) == expected_number_indexed_file_data
    ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
    if expected_num_files := configs.expected_num_files:
-        assert
+        assert (
+            len(postdownload_file_data) == expected_num_files
+        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"

    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
test/unit/v2/connectors/test_confluence.py

@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(
             access_config=ConfluenceAccessConfig(
-                password="
+                password="password",
                 token="access_token",
             ),
             username="user_email",
@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
         )


+def test_connection_config_multiple_auth2():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                token="access_token",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
+def test_connection_config_multiple_auth3():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                password="password",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
 def test_connection_config_no_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")


-def
+def test_connection_config_password_auth():
+    ConfluenceConnectionConfig(
+        access_config=ConfluenceAccessConfig(password="password"),
+        url="url",
+        username="user_email",
+    )
+
+
+def test_connection_config_api_token_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(
+        access_config=ConfluenceAccessConfig(api_token="api_token"),
         url="url",
         username="user_email",
     )
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.5.14"  # pragma: no cover
+__version__ = "0.5.16"  # pragma: no cover
unstructured_ingest/embed/huggingface.py

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional

 from pydantic import Field
@@ -15,14 +15,11 @@ if TYPE_CHECKING:


 class HuggingFaceEmbeddingConfig(EmbeddingConfig):
-    embedder_model_name: Optional[str] = Field(
-        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
-    )
+    embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
     embedder_model_kwargs: Optional[dict] = Field(
         default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
     )
     encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
-    cache_folder: Optional[str] = Field(default=None)

     @requires_dependencies(
         ["sentence_transformers"],
@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):

         return SentenceTransformer(
             model_name_or_path=self.embedder_model_name,
-            cache_folder=self.cache_folder,
             **self.embedder_model_kwargs,
         )

@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):

 @dataclass
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
-    config: HuggingFaceEmbeddingConfig
+    config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)

     def _embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
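Because `config` now has a `default_factory`, a `HuggingFaceEmbeddingEncoder` can be constructed without arguments and falls back to the `all-MiniLM-L6-v2` model on CPU. A minimal usage sketch (not taken from the package docs; it only uses names visible in the hunks above):

```python
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

# config now defaults to HuggingFaceEmbeddingConfig(), so no arguments are needed
encoder = HuggingFaceEmbeddingEncoder()

# equivalent explicit form; "model_name" is the field alias shown in the diff
encoder = HuggingFaceEmbeddingEncoder(
    config=HuggingFaceEmbeddingConfig(model_name="all-MiniLM-L6-v2")
)
```

Note that the `cache_folder` option was removed from the config and the default model name no longer carries the `sentence-transformers/` prefix, so callers that relied on either need to adjust.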
unstructured_ingest/utils/data_prep.py

@@ -2,7 +2,7 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast

 import pandas as pd

@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         raise IOError("Unsupported file type: {path}")


-def get_data(path: Path) -> list[dict]:
+def get_data(path: Union[Path, str]) -> list[dict]:
+    if isinstance(path, str):
+        path = Path(path)
     try:
         return get_data_by_suffix(path=path)
     except Exception as e:
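`get_data` now accepts either a `Path` or a plain string and normalizes to `Path` before dispatching on the file suffix. A small usage sketch (the file name is illustrative):

```python
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data

# both call forms now behave the same
elements = get_data("output/elements.ndjson")
elements = get_data(Path("output/elements.ndjson"))
```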
unstructured_ingest/v2/interfaces/file_data.py

@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
     try:
         return BatchFileData.from_file(path=path)
     except ValidationError:
-        logger.debug(f"{path} not
+        logger.debug(f"{path} not detected as batch file data")

     return FileData.from_file(path=path)

unstructured_ingest/v2/interfaces/upload_stager.py

@@ -1,4 +1,3 @@
-import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -7,6 +6,7 @@ from typing import Any, TypeVar
 from pydantic import BaseModel

 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess

@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()

     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-
-            elements_contents = json.load(in_f)
+        elements_contents = get_data(path=input_file)

         conformed_elements = [
             self.conform_dict(element_dict=element, file_data=file_data)
             for element in elements_contents
         ]
-
-        with open(output_file, "w") as out_f:
-            json.dump(conformed_elements, out_f, indent=2)
+        write_data(path=output_file, data=conformed_elements)

     def run(
         self,
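With `process_whole` routed through `get_data` and `write_data`, a concrete stager only has to supply `conform_dict`; reading and writing (JSON or NDJSON, chosen by the file suffix) is handled by the base class. A hedged sketch of a custom stager built on that hook; the `AddSourceIdStager` class is hypothetical, not part of the package:

```python
from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


@dataclass
class AddSourceIdStager(UploadStager):  # hypothetical example, not shipped with the package
    def conform_dict(self, element_dict: dict[str, Any], file_data: FileData) -> dict[str, Any]:
        # attach the originating record's identifier to every element before upload
        element_dict["source_identifier"] = file_data.identifier
        return element_dict
```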
unstructured_ingest/v2/pipeline/pipeline.py

@@ -108,6 +108,13 @@ class Pipeline:
         uploader_connector_type = self.uploader_step.process.connector_type
         registry_entry = destination_registry[uploader_connector_type]
         if registry_entry.upload_stager and self.stager_step is None:
+            try:
+                self.stager_step = UploadStageStep(
+                    process=registry_entry.upload_stager(), context=self.context
+                )
+                return
+            except Exception as e:
+                logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
             raise ValueError(
                 f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                 f"expects a stager of type {registry_entry.upload_stager.__name__} "
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/download.py

@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.
+            logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
             with file_data_path.open("w") as file:
-
+                file.write(file_data.model_dump_json(indent=2))

     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = file_data_from_file(path=file_data_path)
@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-
+            f.write(file_data.model_dump_json(indent=2))
         return str(filepath)

     def get_hash(self, extras: Optional[list[str]]) -> str:
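The index and download steps now persist `FileData` with pydantic's `model_dump_json` and read it back through `file_data_from_file` (see the `file_data.py` hunk above), which first tries `BatchFileData` and then falls back to plain `FileData`. A round-trip sketch, assuming `file_data` is an existing `FileData` instance and the cache path is illustrative:

```python
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

record_path = Path("cache/index/1234abcd.json")  # illustrative cache location
record_path.parent.mkdir(parents=True, exist_ok=True)
record_path.write_text(file_data.model_dump_json(indent=2))  # file_data: an existing FileData

restored = file_data_from_file(path=str(record_path))
```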
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/index.py

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):

     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
         return not filepath.exists()

     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"

 class ConfluenceAccessConfig(AccessConfig):
     password: Optional[str] = Field(
-        description="Confluence password
+        description="Confluence password",
+        default=None,
+    )
+    api_token: Optional[str] = Field(
+        description="Confluence Cloud API token",
         default=None,
     )
     token: Optional[str] = Field(
@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):

     def model_post_init(self, __context):
         access_configs = self.access_config.get_secret_value()
-
+        if access_configs.password and access_configs.api_token:
+            raise ValueError(
+                "both password and api_token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
+        basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
         pat_auth = access_configs.token
         if self.cloud and not basic_auth:
             raise ValueError(
@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
                 "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
             )

+    def password_or_api_token(self) -> str:
+        # Confluence takes either password or API token under the same field: password
+        # This ambiguity led to confusion, so we are making it specific what you are passing in
+        access_configs = self.access_config.get_secret_value()
+        if access_configs.password:
+            return access_configs.password
+        return access_configs.api_token
+
     @requires_dependencies(["atlassian"], extras="confluence")
     @contextmanager
     def get_client(self) -> "Confluence":
@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
         with Confluence(
             url=self.url,
             username=self.username,
-            password=
+            password=self.password_or_api_token(),
             token=access_configs.token,
             cloud=self.cloud,
         ) as client:
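The access config now distinguishes a Confluence Cloud API token from a server password, rejects supplying both, and `get_client` forwards whichever one is set via `password_or_api_token()`. A minimal configuration sketch, assuming Confluence Cloud basic auth (URL, user, and token values are placeholders):

```python
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

connection_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",
    username="user@example.com",
    access_config=ConfluenceAccessConfig(api_token="YOUR_API_TOKEN"),
)

# Passing both credentials now fails validation in model_post_init:
# ConfluenceAccessConfig(password="...", api_token="...") -> ValueError
```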
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_aws"

@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAWSVolumesConnectionConfig,
     uploader=DatabricksAWSVolumesUploader,
     uploader_config=DatabricksAWSVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_aws_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_azure"

@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAzureVolumesConnectionConfig,
     uploader=DatabricksAzureVolumesUploader,
     uploader_config=DatabricksAzureVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_azure_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes_gcp"

@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksGoogleVolumesConnectionConfig,
     uploader=DatabricksGoogleVolumesUploader,
     uploader_config=DatabricksGoogleVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_gcp_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 CONNECTOR_TYPE = "databricks_volumes"

@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksNativeVolumesConnectionConfig,
     uploader=DatabricksNativeVolumesUploader,
     uploader_config=DatabricksNativeVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )

 databricks_native_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute("SHOW TABLES")
+            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
             table_names = [r[1] for r in cursor.fetchall()]
             if self.upload_config.table_name not in table_names:
                 raise ValueError(
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     from adlfs import AzureBlobFileSystem
@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
     uploader=AzureUploader,
     uploader_config=AzureUploaderConfig,
     connection_config=AzureConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     from boxfs import BoxFileSystem
@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
     uploader=BoxUploader,
     uploader_config=BoxUploaderConfig,
     connection_config=BoxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)

 if TYPE_CHECKING:
     pass
@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
     uploader=DropboxUploader,
     uploader_config=DropboxUploaderConfig,
     connection_config=DropboxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
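All of the blob-storage style destinations above now register `BlobStoreUploadStager` as their default stager, which pairs with the `pipeline.py` change: when a destination declares a stager and none was configured, the pipeline first tries to instantiate the registry's stager on the user's behalf and only raises if that fails. A hedged sketch of that lookup; the registry import path and the `"s3"` key are assumptions, not confirmed by this diff:

```python
# Hypothetical illustration of the new fallback, mirroring the pipeline.py hunk above.
from unstructured_ingest.v2.processes.connector_registry import destination_registry  # assumed path

registry_entry = destination_registry["s3"]  # assumed connector key for the fsspec S3 destination
if registry_entry.upload_stager is not None:
    # the pipeline now attempts this on the user's behalf instead of failing immediately
    stager = registry_entry.upload_stager()
```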