unstructured-ingest 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_zendesk.py +31 -53
- test/integration/connectors/utils/validation/source.py +5 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +3 -3
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/processes/connectors/astradb.py +1 -1
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/METADATA +20 -20
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/RECORD +15 -15
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/top_level.txt +0 -0
--- a/test/integration/connectors/test_zendesk.py
+++ b/test/integration/connectors/test_zendesk.py
@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional
 
 import pytest
 
@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
     ZendeskIndexerConfig,
 )
 
+SUBDOMAIN = "unstructuredhelp"
+EMAIL = "test@unstructured.io"
 
-async def zendesk_source_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
 
-
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_tickets(temp_dir: Path):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=…
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(…
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -43,7 +42,7 @@ async def zendesk_source_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=…
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -57,26 +56,23 @@ async def zendesk_source_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-tickets",
-            expected_num_files=…
+            expected_num_files=8,
             validate_file_data=False,
             validate_downloaded_files=True,
         ),
     )
 
 
-async def zendesk_source_articles_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
-
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=…
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(…
+    index_config = ZendeskIndexerConfig(item_type="articles")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=…
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-articles",
-            expected_num_files=…
-            validate_file_data=…
+            expected_num_files=8,
+            validate_file_data=True,
             validate_downloaded_files=True,
         ),
     )
 
 
-@pytest.mark.asyncio
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source_tickets(temp_dir):
-    await zendesk_source_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+def test_zendesk_source_articles_fail(temp_dir):
+    access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
+    connection_config = ZendeskConnectionConfig(
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source_articles(temp_dir):
-    await zendesk_source_articles_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
     )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-async def test_zendesk_source_articles_fail(temp_dir):
     with pytest.raises(expected_exception=UserAuthError):
-        await zendesk_source_articles_test(
-            tmp_path=temp_dir,
-            token="FORCE_FAIL_TOKEN",
-            email="test@unstructured.io",
-            subdomain="unstructuredhelp",
-        )
+        indexer.precheck()
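Note on the test refactor above: the shared zendesk_source_test / zendesk_source_articles_test helpers are gone. Each scenario is now a self-contained test, the subdomain and email live in module constants, and the auth-failure case no longer drives a full pipeline run: it builds an indexer with a bogus token and asserts that the synchronous indexer.precheck() raises UserAuthError. A minimal sketch of the requires_env guard these tests rely on, assuming the usual skip-if-unset semantics (the real helper lives in the repo's test utils and may differ):

import os

import pytest


def requires_env(*names: str):
    # skip the test when any of the named environment variables is unset
    missing = [name for name in names if name not in os.environ]
    return pytest.mark.skipif(
        bool(missing), reason=f"requires env vars: {', '.join(missing)}"
    )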
--- a/test/integration/connectors/utils/validation/source.py
+++ b/test/integration/connectors/utils/validation/source.py
@@ -103,7 +103,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
        expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.…
+    current_file_data_contents = json.loads(file_data.model_dump_json())
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -184,7 +184,7 @@ def update_fixtures(
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
-            …
+            f.write(file_data.model_dump_json(indent=2))
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -216,7 +216,9 @@ def run_all_validations(
         len(predownload_file_data) == expected_number_indexed_file_data
     ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
     if expected_num_files := configs.expected_num_files:
-        assert …
+        assert (
+            len(postdownload_file_data) == expected_num_files
+        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
 
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
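Note on the validation changes above: fixture comparison now routes through pydantic v2 serialization, and the expected-file-count assert gains a failure message matching the one just above it. Comparing json.loads(file_data.model_dump_json()) against the stored fixture means both sides hold only JSON-native types, so DeepDiff compares like with like. A small illustration of why the JSON round trip matters, with FileData standing in for any pydantic v2 model (field names here are illustrative):

import json
from pathlib import Path

from pydantic import BaseModel


class FileData(BaseModel):
    identifier: str
    source_path: Path


fd = FileData(identifier="abc", source_path=Path("/tmp/a.txt"))

# model_dump() keeps the Path object, which a JSON fixture can never equal;
# the model_dump_json() round trip coerces it to a plain string
assert fd.model_dump()["source_path"] == Path("/tmp/a.txt")
assert json.loads(fd.model_dump_json()) == {
    "identifier": "abc",
    "source_path": "/tmp/a.txt",
}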
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.15"  # pragma: no cover
+__version__ = "0.5.17"  # pragma: no cover
--- a/unstructured_ingest/v2/pipeline/steps/download.py
+++ b/unstructured_ingest/v2/pipeline/steps/download.py
@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.…
+            logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
             with file_data_path.open("w") as file:
-                …
+                file.write(file_data.model_dump_json(indent=2))
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = file_data_from_file(path=file_data_path)
@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            …
+            f.write(file_data.model_dump_json(indent=2))
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
--- a/unstructured_ingest/v2/pipeline/steps/index.py
+++ b/unstructured_ingest/v2/pipeline/steps/index.py
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.…
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    …
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.…
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    …
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
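Note on the two pipeline steps above: both the download and index steps now persist FileData records via model_dump_json(indent=2) instead of hand-serializing a dict, and debug-log the same JSON. A sketch of the resulting write/read symmetry; read_file_data below is a hypothetical stand-in for the pipeline's own file_data_from_file helper:

import json

from pydantic import BaseModel


class FileData(BaseModel):
    identifier: str


def write_file_data(path: str, file_data: FileData) -> None:
    # model_dump_json applies pydantic field serializers that a json.dump
    # over a hand-built dict would bypass
    with open(path, "w") as f:
        f.write(file_data.model_dump_json(indent=2))


def read_file_data(path: str) -> FileData:
    # hypothetical reader; validates the stored JSON back into the model
    with open(path) as f:
        return FileData.model_validate(json.loads(f.read()))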
--- a/unstructured_ingest/v2/processes/connectors/astradb.py
+++ b/unstructured_ingest/v2/processes/connectors/astradb.py
@@ -112,7 +112,6 @@ def get_astra_collection(
     collection_name: str,
     keyspace: str,
 ) -> "AstraDBCollection":
-
     astra_db = get_astra_db(connection_config=connection_config, keyspace=keyspace)
 
     # astradb will return a collection object in all cases (even if it doesn't exist)
@@ -315,6 +314,7 @@ class AstraDBUploadStager(UploadStager):
                 text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
             )
         metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)
+        metadata.pop("orig_elements", None)
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         self.truncate_dict_elements(element_dict)
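Note on the stager change above: once the truncated copy is stored under "original_elements", the raw "orig_elements" key is dropped so the same payload is not uploaded twice; the None default makes the pop a no-op when the key was never present. The pattern in isolation (helper names here are illustrative, not the connector's API):

def conform_metadata(metadata: dict, truncated_copy: str) -> dict:
    metadata["original_elements"] = truncated_copy
    # drop the raw key; the None default keeps this safe when it is absent
    metadata.pop("orig_elements", None)
    return metadata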
--- a/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
+++ b/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute("SHOW TABLES")
+            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
            table_names = [r[1] for r in cursor.fetchall()]
            if self.upload_config.table_name not in table_names:
                raise ValueError(
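Note on the uploader precheck above: an unqualified SHOW TABLES lists whatever schema the session currently defaults to, so the table-existence check could inspect the wrong database; qualifying the statement with IN {database} pins it to the configured one. A sketch of the scoped check, DB-API style (cursor setup elided and names other than the SHOW TABLES statement illustrative; Spark/Databricks returns (database, tableName, isTemporary) rows, hence row[1]):

def table_exists(cursor, database: str, table_name: str) -> bool:
    cursor.execute(f"SHOW TABLES IN {database}")
    return table_name in {row[1] for row in cursor.fetchall()}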