unstructured-ingest 0.5.9__py3-none-any.whl → 0.5.11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (28)
  1. test/integration/connectors/test_astradb.py +21 -0
  2. test/integration/connectors/test_dropbox.py +151 -0
  3. test/integration/connectors/test_jira.py +67 -0
  4. test/integration/connectors/test_zendesk.py +142 -0
  5. test/integration/connectors/utils/validation/destination.py +2 -1
  6. test/unit/test_utils.py +27 -0
  7. test/unit/v2/connectors/test_jira.py +401 -0
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/embed/openai.py +4 -3
  10. unstructured_ingest/utils/string_and_date_utils.py +25 -0
  11. unstructured_ingest/v2/interfaces/downloader.py +2 -3
  12. unstructured_ingest/v2/processes/connectors/__init__.py +4 -0
  13. unstructured_ingest/v2/processes/connectors/astradb.py +36 -28
  14. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/delta_table.py +2 -0
  16. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +78 -15
  17. unstructured_ingest/v2/processes/connectors/jira.py +453 -0
  18. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +31 -0
  19. unstructured_ingest/v2/processes/connectors/zendesk/client.py +225 -0
  20. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +419 -0
  21. unstructured_ingest/v2/processes/partitioner.py +2 -5
  22. unstructured_ingest/v2/unstructured_api.py +7 -0
  23. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/METADATA +26 -26
  24. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/RECORD +28 -20
  25. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/LICENSE.md +0 -0
  26. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/WHEEL +0 -0
  27. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/entry_points.txt +0 -0
  28. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_astradb.py CHANGED
@@ -31,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.astradb import (
31
31
  AstraDBUploader,
32
32
  AstraDBUploaderConfig,
33
33
  AstraDBUploadStager,
34
+ AstraDBUploadStagerConfig,
34
35
  DestinationConnectionError,
35
36
  SourceConnectionError,
36
37
  )
@@ -258,3 +259,23 @@ def test_astra_stager(
258
259
  stager=stager,
259
260
  tmp_dir=tmp_path,
260
261
  )
262
+
263
+
264
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
265
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
266
+ def test_astra_stager_flatten_metadata(
267
+ request: TopRequest,
268
+ upload_file_str: str,
269
+ tmp_path: Path,
270
+ ):
271
+ stager_config = AstraDBUploadStagerConfig(flatten_metadata=True)
272
+ upload_file: Path = request.getfixturevalue(upload_file_str)
273
+ stager = AstraDBUploadStager(upload_stager_config=stager_config)
274
+ stager_validation(
275
+ configs=StagerValidationConfigs(
276
+ test_id=CONNECTOR_TYPE, expected_count=22, expected_folder="stager_flatten_metadata"
277
+ ),
278
+ input_file=upload_file,
279
+ stager=stager,
280
+ tmp_dir=tmp_path,
281
+ )
test/integration/connectors/test_dropbox.py ADDED
@@ -0,0 +1,151 @@
1
+ import os
2
+
3
+ import pytest
4
+ import requests
5
+
6
+ from test.integration.connectors.utils.constants import (
7
+ BLOB_STORAGE_TAG,
8
+ SOURCE_TAG,
9
+ )
10
+ from test.integration.connectors.utils.validation.source import (
11
+ SourceValidationConfigs,
12
+ source_connector_validation,
13
+ )
14
+ from test.integration.utils import requires_env
15
+ from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import (
16
+ CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE,
17
+ )
18
+ from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import (
19
+ DropboxAccessConfig,
20
+ DropboxConnectionConfig,
21
+ DropboxDownloader,
22
+ DropboxDownloaderConfig,
23
+ DropboxIndexer,
24
+ DropboxIndexerConfig,
25
+ )
26
+
27
+
28
+ @pytest.mark.asyncio
29
+ @pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
30
+ @requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
31
+ async def test_dropbox_source(temp_dir):
32
+ """
33
+ Integration test for the Dropbox source connector.
34
+
35
+ This test indexes data from dropbox://test-input/ and downloads the resulting files,
36
+ then compares them to fixture data.
37
+ """
38
+ refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
39
+ app_key = os.getenv("DROPBOX_APP_KEY")
40
+ app_secret = os.getenv("DROPBOX_APP_SECRET")
41
+
42
+ connection_config = DropboxConnectionConfig(
43
+ access_config=DropboxAccessConfig(
44
+ refresh_token=refresh_token,
45
+ app_key=app_key,
46
+ app_secret=app_secret,
47
+ )
48
+ )
49
+
50
+ index_config = DropboxIndexerConfig(
51
+ recursive=True,
52
+ remote_url="dropbox://test-input",
53
+ )
54
+ downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
55
+
56
+ indexer = DropboxIndexer(
57
+ connection_config=connection_config,
58
+ index_config=index_config,
59
+ )
60
+ downloader = DropboxDownloader(
61
+ connection_config=connection_config,
62
+ download_config=downloader_config,
63
+ )
64
+
65
+ await source_connector_validation(
66
+ indexer=indexer,
67
+ downloader=downloader,
68
+ configs=SourceValidationConfigs(
69
+ test_id="dropbox",
70
+ expected_num_files=4,
71
+ validate_downloaded_files=True,
72
+ exclude_fields_extend=[
73
+ "metadata.date_created",
74
+ "metadata.date_modified",
75
+ ],
76
+ ),
77
+ )
78
+
79
+
80
+ @pytest.mark.asyncio
81
+ @pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
82
+ @requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
83
+ async def test_dropbox_short_lived_token_via_refresh(temp_dir):
84
+ """
85
+ Demonstrates manually generating an access token from refresh credentials,
86
+ then passing ONLY the short-lived token to the Dropbox connector
87
+ (no app_key, app_secret, or refresh_token in the actual connection config).
88
+
89
+ This effectively mimics an external system that hands us a short-lived token.
90
+ """
91
+ refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
92
+ app_key = os.getenv("DROPBOX_APP_KEY")
93
+ app_secret = os.getenv("DROPBOX_APP_SECRET")
94
+
95
+ # Manually request a short-lived token from Dropbox's OAuth endpoint
96
+ # This call is basically what the connector code does internally,
97
+ # but we're doing it here in the test so we can pass only the short-lived token later.
98
+ response = requests.post(
99
+ "https://api.dropboxapi.com/oauth2/token",
100
+ data={
101
+ "grant_type": "refresh_token",
102
+ "refresh_token": refresh_token,
103
+ },
104
+ auth=(app_key, app_secret),
105
+ timeout=30, # seconds
106
+ )
107
+ response.raise_for_status()
108
+ data = response.json()
109
+ short_lived_token = data["access_token"]
110
+ print("Acquired an access token from Dropbox")
111
+
112
+ # Build connection config with ONLY the short-lived token
113
+ # We omit refresh_token, app_key, and app_secret to confirm that
114
+ # our connector can operate purely on the short-lived token.
115
+ connection_config = DropboxConnectionConfig(
116
+ access_config=DropboxAccessConfig(
117
+ token=short_lived_token,
118
+ app_key=None,
119
+ app_secret=None,
120
+ refresh_token=None,
121
+ )
122
+ )
123
+
124
+ index_config = DropboxIndexerConfig(
125
+ recursive=True,
126
+ remote_url="dropbox://test-input",
127
+ )
128
+ downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
129
+
130
+ indexer = DropboxIndexer(
131
+ connection_config=connection_config,
132
+ index_config=index_config,
133
+ )
134
+ downloader = DropboxDownloader(
135
+ connection_config=connection_config,
136
+ download_config=downloader_config,
137
+ )
138
+
139
+ await source_connector_validation(
140
+ indexer=indexer,
141
+ downloader=downloader,
142
+ configs=SourceValidationConfigs(
143
+ test_id="dropbox_short_lived_via_refresh",
144
+ expected_num_files=4,
145
+ validate_downloaded_files=True,
146
+ exclude_fields_extend=[
147
+ "metadata.date_created",
148
+ "metadata.date_modified",
149
+ ],
150
+ ),
151
+ )
test/integration/connectors/test_jira.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
6
+ from test.integration.connectors.utils.validation.source import (
7
+ SourceValidationConfigs,
8
+ source_connector_validation,
9
+ )
10
+ from test.integration.utils import requires_env
11
+ from unstructured_ingest.v2.processes.connectors.jira import (
12
+ CONNECTOR_TYPE,
13
+ JiraAccessConfig,
14
+ JiraConnectionConfig,
15
+ JiraDownloader,
16
+ JiraDownloaderConfig,
17
+ JiraIndexer,
18
+ JiraIndexerConfig,
19
+ )
20
+
21
+
22
+ @pytest.mark.asyncio
23
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
24
+ @requires_env("JIRA_INGEST_USER_EMAIL", "JIRA_INGEST_API_TOKEN")
25
+ async def test_jira_source(temp_dir):
26
+ # Retrieve environment variables
27
+ jira_url = os.environ.get(
28
+ "JIRA_INGEST_URL", "https://unstructured-jira-connector-test.atlassian.net"
29
+ )
30
+ user_email = os.environ["JIRA_INGEST_USER_EMAIL"]
31
+ api_token = os.environ["JIRA_INGEST_API_TOKEN"]
32
+ projects = ["JCTP1"]
33
+ boards = ["3"]
34
+ issues = ["JCTP2-1", "JCTP2-2", "JCTP2-3"]
35
+
36
+ # Create connection and indexer configurations
37
+ access_config = JiraAccessConfig(password=api_token)
38
+ connection_config = JiraConnectionConfig(
39
+ url=jira_url,
40
+ username=user_email,
41
+ access_config=access_config,
42
+ )
43
+ index_config = JiraIndexerConfig(projects=projects, boards=boards, issues=issues)
44
+
45
+ download_config = JiraDownloaderConfig(download_dir=temp_dir)
46
+
47
+ # Instantiate indexer and downloader
48
+ indexer = JiraIndexer(
49
+ connection_config=connection_config,
50
+ index_config=index_config,
51
+ )
52
+ downloader = JiraDownloader(
53
+ connection_config=connection_config,
54
+ download_config=download_config,
55
+ )
56
+
57
+ # Run the source connector validation
58
+ await source_connector_validation(
59
+ indexer=indexer,
60
+ downloader=downloader,
61
+ configs=SourceValidationConfigs(
62
+ test_id="jira",
63
+ expected_num_files=8,
64
+ validate_file_data=True,
65
+ validate_downloaded_files=True,
66
+ ),
67
+ )
test/integration/connectors/test_zendesk.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import pytest
6
+
7
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
8
+ from test.integration.connectors.utils.validation.source import (
9
+ SourceValidationConfigs,
10
+ source_connector_validation,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.v2.errors import UserAuthError
14
+ from unstructured_ingest.v2.processes.connectors.zendesk import (
15
+ CONNECTOR_TYPE,
16
+ ZendeskAccessConfig,
17
+ ZendeskConnectionConfig,
18
+ ZendeskDownloader,
19
+ ZendeskDownloaderConfig,
20
+ ZendeskIndexer,
21
+ ZendeskIndexerConfig,
22
+ )
23
+
24
+
25
+ async def zendesk_source_test(
26
+ tmp_path: Path,
27
+ token: Optional[str] = None,
28
+ email: Optional[str] = None,
29
+ subdomain: Optional[str] = None,
30
+ ):
31
+
32
+ access_config = ZendeskAccessConfig(api_token=token)
33
+ connection_config = ZendeskConnectionConfig(
34
+ subdomain=subdomain, email=email, access_config=access_config
35
+ )
36
+
37
+ index_config = ZendeskIndexerConfig(batch_size=2, item_type="tickets")
38
+
39
+ indexer = ZendeskIndexer(
40
+ connection_config=connection_config,
41
+ index_config=index_config,
42
+ connector_type=CONNECTOR_TYPE,
43
+ )
44
+
45
+ # handle downloader.
46
+ download_config = ZendeskDownloaderConfig(download_dir=tmp_path)
47
+
48
+ downloader = ZendeskDownloader(
49
+ connection_config=connection_config,
50
+ download_config=download_config,
51
+ connector_type=CONNECTOR_TYPE,
52
+ )
53
+
54
+ # Run the source connector validation
55
+ await source_connector_validation(
56
+ indexer=indexer,
57
+ downloader=downloader,
58
+ configs=SourceValidationConfigs(
59
+ test_id="zendesk-tickets",
60
+ expected_num_files=4,
61
+ validate_file_data=False,
62
+ validate_downloaded_files=True,
63
+ ),
64
+ )
65
+
66
+
67
+ async def zendesk_source_articles_test(
68
+ tmp_path: Path,
69
+ token: Optional[str] = None,
70
+ email: Optional[str] = None,
71
+ subdomain: Optional[str] = None,
72
+ ):
73
+
74
+ access_config = ZendeskAccessConfig(api_token=token)
75
+ connection_config = ZendeskConnectionConfig(
76
+ subdomain=subdomain, email=email, access_config=access_config
77
+ )
78
+
79
+ index_config = ZendeskIndexerConfig(batch_size=2, item_type="articles")
80
+
81
+ indexer = ZendeskIndexer(
82
+ connection_config=connection_config,
83
+ index_config=index_config,
84
+ connector_type=CONNECTOR_TYPE,
85
+ )
86
+
87
+ # handle downloader.
88
+ download_config = ZendeskDownloaderConfig(download_dir=tmp_path, extract_images=True)
89
+
90
+ downloader = ZendeskDownloader(
91
+ connection_config=connection_config,
92
+ download_config=download_config,
93
+ connector_type=CONNECTOR_TYPE,
94
+ )
95
+
96
+ # Run the source connector validation
97
+ await source_connector_validation(
98
+ indexer=indexer,
99
+ downloader=downloader,
100
+ configs=SourceValidationConfigs(
101
+ test_id="zendesk-articles",
102
+ expected_num_files=4,
103
+ validate_file_data=False,
104
+ validate_downloaded_files=True,
105
+ ),
106
+ )
107
+
108
+
109
+ @pytest.mark.asyncio
110
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
111
+ @requires_env("ZENDESK_TOKEN")
112
+ async def test_zendesk_source(temp_dir):
113
+ await zendesk_source_test(
114
+ tmp_path=temp_dir,
115
+ token=os.environ["ZENDESK_TOKEN"],
116
+ email="test@unstructured.io",
117
+ subdomain="unstructuredhelp",
118
+ )
119
+
120
+
121
+ @pytest.mark.asyncio
122
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
123
+ @requires_env("ZENDESK_TOKEN")
124
+ async def test_zendesk_source_articles(temp_dir):
125
+ await zendesk_source_articles_test(
126
+ tmp_path=temp_dir,
127
+ token=os.environ["ZENDESK_TOKEN"],
128
+ email="test@unstructured.io",
129
+ subdomain="unstructuredhelp",
130
+ )
131
+
132
+
133
+ @pytest.mark.asyncio
134
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
135
+ async def test_zendesk_source_articles_fail(temp_dir):
136
+ with pytest.raises(expected_exception=UserAuthError):
137
+ await zendesk_source_articles_test(
138
+ tmp_path=temp_dir,
139
+ token="FORCE_FAIL_TOKEN",
140
+ email="test@unstructured.io",
141
+ subdomain="unstructuredhelp",
142
+ )
test/integration/connectors/utils/validation/destination.py CHANGED
@@ -9,9 +9,10 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, Uploa
9
9
 
10
10
  class StagerValidationConfigs(ValidationConfig):
11
11
  expected_count: int
12
+ expected_folder: str = "stager"
12
13
 
13
14
  def stager_output_dir(self) -> Path:
14
- dir = self.test_output_dir() / "stager"
15
+ dir = self.test_output_dir() / self.expected_folder
15
16
  dir.mkdir(exist_ok=True, parents=True)
16
17
  return dir
17
18
 
test/unit/test_utils.py CHANGED
@@ -10,6 +10,7 @@ from unstructured_ingest.cli.utils import extract_config
10
10
  from unstructured_ingest.interfaces import BaseConfig
11
11
  from unstructured_ingest.utils.string_and_date_utils import (
12
12
  ensure_isoformat_datetime,
13
+ fix_unescaped_unicode,
13
14
  json_to_dict,
14
15
  truncate_string_bytes,
15
16
  )
@@ -182,3 +183,29 @@ def test_truncate_string_bytes_return_untouched_string():
182
183
  result = truncate_string_bytes(test_string, max_bytes)
183
184
  assert result == "abcdef"
184
185
  assert len(result.encode("utf-8")) <= max_bytes
186
+
187
+
188
+ def test_fix_unescaped_unicode_valid():
189
+ text = "This is a test with unescaped unicode: \\u0041"
190
+ expected = "This is a test with unescaped unicode: \u0041"
191
+ assert fix_unescaped_unicode(text) == expected
192
+
193
+
194
+ def test_fix_unescaped_unicode_no_unescaped_chars():
195
+ text = "This is a test with no unescaped unicode: \u0041"
196
+ expected = "This is a test with no unescaped unicode: \u0041"
197
+ assert fix_unescaped_unicode(text) == expected
198
+
199
+
200
+ def test_fix_unescaped_unicode_invalid_unicode():
201
+ text = "This is a test with invalid unescaped unicode: \\uZZZZ"
202
+ expected = "This is a test with invalid unescaped unicode: \\uZZZZ"
203
+ assert fix_unescaped_unicode(text) == expected
204
+
205
+
206
+ def test_fix_unescaped_unicode_encoding_error(caplog: pytest.LogCaptureFixture):
207
+ text = "This is a test with unescaped unicode: \\uD83D"
208
+ fix_unescaped_unicode(text)
209
+ with caplog.at_level("WARNING"):
210
+ fix_unescaped_unicode(text)
211
+ assert "Failed to fix unescaped Unicode sequences" in caplog.text