unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
from pydantic import ValidationError
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.processes.connectors.confluence import (
|
|
5
|
-
ConfluenceAccessConfig,
|
|
6
|
-
ConfluenceConnectionConfig,
|
|
7
|
-
)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def test_connection_config_multiple_auth():
|
|
11
|
-
with pytest.raises(ValidationError):
|
|
12
|
-
ConfluenceConnectionConfig(
|
|
13
|
-
access_config=ConfluenceAccessConfig(
|
|
14
|
-
password="password",
|
|
15
|
-
token="access_token",
|
|
16
|
-
),
|
|
17
|
-
username="user_email",
|
|
18
|
-
url="url",
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_connection_config_multiple_auth2():
|
|
23
|
-
with pytest.raises(ValidationError):
|
|
24
|
-
ConfluenceConnectionConfig(
|
|
25
|
-
access_config=ConfluenceAccessConfig(
|
|
26
|
-
api_token="api_token",
|
|
27
|
-
token="access_token",
|
|
28
|
-
),
|
|
29
|
-
username="user_email",
|
|
30
|
-
url="url",
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def test_connection_config_multiple_auth3():
|
|
35
|
-
with pytest.raises(ValidationError):
|
|
36
|
-
ConfluenceConnectionConfig(
|
|
37
|
-
access_config=ConfluenceAccessConfig(
|
|
38
|
-
api_token="api_token",
|
|
39
|
-
password="password",
|
|
40
|
-
),
|
|
41
|
-
username="user_email",
|
|
42
|
-
url="url",
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def test_connection_config_no_auth():
|
|
47
|
-
with pytest.raises(ValidationError):
|
|
48
|
-
ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def test_connection_config_password_auth():
|
|
52
|
-
ConfluenceConnectionConfig(
|
|
53
|
-
access_config=ConfluenceAccessConfig(password="password"),
|
|
54
|
-
url="url",
|
|
55
|
-
username="user_email",
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def test_connection_config_api_token_auth():
|
|
60
|
-
ConfluenceConnectionConfig(
|
|
61
|
-
access_config=ConfluenceAccessConfig(api_token="api_token"),
|
|
62
|
-
url="url",
|
|
63
|
-
username="user_email",
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def test_connection_config_pat_auth():
|
|
68
|
-
ConfluenceConnectionConfig(
|
|
69
|
-
access_config=ConfluenceAccessConfig(token="access_token"),
|
|
70
|
-
url="url",
|
|
71
|
-
)
|
|
@@ -1,401 +0,0 @@
|
|
|
1
|
-
from unittest.mock import MagicMock
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
from pydantic import ValidationError
|
|
5
|
-
from pytest_mock import MockerFixture
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.processes.connectors.jira import (
|
|
8
|
-
FieldGetter,
|
|
9
|
-
JiraAccessConfig,
|
|
10
|
-
JiraConnectionConfig,
|
|
11
|
-
JiraIndexer,
|
|
12
|
-
JiraIndexerConfig,
|
|
13
|
-
JiraIssueMetadata,
|
|
14
|
-
issues_fetcher_wrapper,
|
|
15
|
-
nested_object_to_field_getter,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@pytest.fixture
|
|
20
|
-
def jira_connection_config():
|
|
21
|
-
access_config = JiraAccessConfig(password="password")
|
|
22
|
-
return JiraConnectionConfig(
|
|
23
|
-
url="http://localhost:1234",
|
|
24
|
-
username="test@example.com",
|
|
25
|
-
access_config=access_config,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@pytest.fixture
|
|
30
|
-
def jira_indexer(jira_connection_config: JiraConnectionConfig):
|
|
31
|
-
indexer_config = JiraIndexerConfig(projects=["TEST1"], boards=["2"], issues=["TEST2-1"])
|
|
32
|
-
return JiraIndexer(connection_config=jira_connection_config, index_config=indexer_config)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@pytest.fixture
|
|
36
|
-
def mock_jira(mocker: MockerFixture):
|
|
37
|
-
mock_client = mocker.patch.object(JiraConnectionConfig, "get_client", autospec=True)
|
|
38
|
-
mock_jira = mocker.MagicMock()
|
|
39
|
-
mock_client.return_value.__enter__.return_value = mock_jira
|
|
40
|
-
return mock_jira
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def test_jira_indexer_precheck_success(
|
|
44
|
-
caplog: pytest.LogCaptureFixture,
|
|
45
|
-
mocker: MockerFixture,
|
|
46
|
-
jira_indexer: JiraIndexer,
|
|
47
|
-
mock_jira: MagicMock,
|
|
48
|
-
):
|
|
49
|
-
get_permissions = mocker.MagicMock()
|
|
50
|
-
get_permissions.return_value = {"permissions": {"BROWSE_PROJECTS": {"havePermission": True}}}
|
|
51
|
-
mock_jira.get_permissions = get_permissions
|
|
52
|
-
|
|
53
|
-
with caplog.at_level("INFO"):
|
|
54
|
-
jira_indexer.precheck()
|
|
55
|
-
assert "Connection to Jira successful." in caplog.text
|
|
56
|
-
|
|
57
|
-
get_permissions.assert_called_once()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def test_jira_indexer_precheck_no_permission(
|
|
61
|
-
mocker: MockerFixture,
|
|
62
|
-
jira_indexer: JiraIndexer,
|
|
63
|
-
mock_jira: MagicMock,
|
|
64
|
-
):
|
|
65
|
-
get_permissions = mocker.MagicMock()
|
|
66
|
-
get_permissions.return_value = {"permissions": {"BROWSE_PROJECTS": {"havePermission": False}}}
|
|
67
|
-
mock_jira.get_permissions = get_permissions
|
|
68
|
-
|
|
69
|
-
with pytest.raises(ValueError):
|
|
70
|
-
jira_indexer.precheck()
|
|
71
|
-
|
|
72
|
-
get_permissions.assert_called_once()
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
@pytest.mark.parametrize(
|
|
76
|
-
("project_issues_count", "expected_issues_count"), [(2, 2), ({"total": 2}, 2), (0, 0)]
|
|
77
|
-
)
|
|
78
|
-
def test_jira_indexer_get_issues_within_single_project(
|
|
79
|
-
jira_indexer: JiraIndexer,
|
|
80
|
-
mock_jira: MagicMock,
|
|
81
|
-
project_issues_count,
|
|
82
|
-
expected_issues_count,
|
|
83
|
-
):
|
|
84
|
-
mock_jira.get_project_issues_count.return_value = project_issues_count
|
|
85
|
-
mock_jira.get_all_project_issues.return_value = [
|
|
86
|
-
{"id": "1", "key": "TEST-1"},
|
|
87
|
-
{"id": "2", "key": "TEST-2"},
|
|
88
|
-
]
|
|
89
|
-
|
|
90
|
-
issues = jira_indexer._get_issues_within_single_project("TEST1")
|
|
91
|
-
assert len(issues) == expected_issues_count
|
|
92
|
-
|
|
93
|
-
if issues:
|
|
94
|
-
assert issues[0].id == "1"
|
|
95
|
-
assert issues[0].key == "TEST-1"
|
|
96
|
-
assert issues[1].id == "2"
|
|
97
|
-
assert issues[1].key == "TEST-2"
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def test_jira_indexer_get_issues_within_single_project_error(
|
|
101
|
-
jira_indexer: JiraIndexer,
|
|
102
|
-
mock_jira: MagicMock,
|
|
103
|
-
):
|
|
104
|
-
mock_jira.get_project_issues_count.return_value = {}
|
|
105
|
-
|
|
106
|
-
with pytest.raises(KeyError):
|
|
107
|
-
jira_indexer._get_issues_within_single_project("TEST1")
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def test_jira_indexer_get_issues_within_projects_with_projects(
|
|
111
|
-
jira_indexer: JiraIndexer,
|
|
112
|
-
mock_jira: MagicMock,
|
|
113
|
-
):
|
|
114
|
-
mock_jira.get_project_issues_count.return_value = 2
|
|
115
|
-
mock_jira.get_all_project_issues.return_value = [
|
|
116
|
-
{"id": "1", "key": "TEST-1"},
|
|
117
|
-
{"id": "2", "key": "TEST-2"},
|
|
118
|
-
]
|
|
119
|
-
|
|
120
|
-
issues = jira_indexer._get_issues_within_projects()
|
|
121
|
-
assert len(issues) == 2
|
|
122
|
-
assert issues[0].id == "1"
|
|
123
|
-
assert issues[0].key == "TEST-1"
|
|
124
|
-
assert issues[1].id == "2"
|
|
125
|
-
assert issues[1].key == "TEST-2"
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def test_jira_indexer_get_issues_within_projects_no_projects_with_boards_or_issues(
|
|
129
|
-
mocker: MockerFixture,
|
|
130
|
-
jira_indexer: JiraIndexer,
|
|
131
|
-
):
|
|
132
|
-
jira_indexer.index_config.projects = None
|
|
133
|
-
jira_indexer.index_config.boards = ["2"]
|
|
134
|
-
mocker.patch.object(JiraConnectionConfig, "get_client", autospec=True)
|
|
135
|
-
|
|
136
|
-
issues = jira_indexer._get_issues_within_projects()
|
|
137
|
-
assert issues == []
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def test_jira_indexer_get_issues_within_projects_no_projects_no_boards_no_issues(
|
|
141
|
-
jira_indexer: JiraIndexer,
|
|
142
|
-
mock_jira: MagicMock,
|
|
143
|
-
):
|
|
144
|
-
jira_indexer.index_config.projects = None
|
|
145
|
-
jira_indexer.index_config.boards = None
|
|
146
|
-
jira_indexer.index_config.issues = None
|
|
147
|
-
mock_jira.projects.return_value = [{"key": "TEST1"}, {"key": "TEST2"}]
|
|
148
|
-
mock_jira.get_project_issues_count.return_value = 2
|
|
149
|
-
mock_jira.get_all_project_issues.return_value = [
|
|
150
|
-
{"id": "1", "key": "TEST-1"},
|
|
151
|
-
{"id": "2", "key": "TEST-2"},
|
|
152
|
-
]
|
|
153
|
-
|
|
154
|
-
issues = jira_indexer._get_issues_within_projects()
|
|
155
|
-
assert len(issues) == 4
|
|
156
|
-
assert issues[0].id == "1"
|
|
157
|
-
assert issues[0].key == "TEST-1"
|
|
158
|
-
assert issues[1].id == "2"
|
|
159
|
-
assert issues[1].key == "TEST-2"
|
|
160
|
-
assert issues[2].id == "1"
|
|
161
|
-
assert issues[2].key == "TEST-1"
|
|
162
|
-
assert issues[3].id == "2"
|
|
163
|
-
assert issues[3].key == "TEST-2"
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
def test_jira_indexer_get_issues_within_boards(
|
|
167
|
-
jira_indexer: JiraIndexer,
|
|
168
|
-
mock_jira: MagicMock,
|
|
169
|
-
):
|
|
170
|
-
mock_jira.get_issues_for_board.return_value = [
|
|
171
|
-
{"id": "1", "key": "TEST-1"},
|
|
172
|
-
{"id": "2", "key": "TEST-2"},
|
|
173
|
-
]
|
|
174
|
-
|
|
175
|
-
issues = jira_indexer._get_issues_within_boards()
|
|
176
|
-
assert len(issues) == 2
|
|
177
|
-
assert issues[0].id == "1"
|
|
178
|
-
assert issues[0].key == "TEST-1"
|
|
179
|
-
assert issues[1].id == "2"
|
|
180
|
-
assert issues[1].key == "TEST-2"
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
def test_jira_indexer_get_issues_within_single_board(
|
|
184
|
-
jira_indexer: JiraIndexer,
|
|
185
|
-
mock_jira: MagicMock,
|
|
186
|
-
):
|
|
187
|
-
mock_jira.get_issues_for_board.return_value = [
|
|
188
|
-
{"id": "1", "key": "TEST-1"},
|
|
189
|
-
{"id": "2", "key": "TEST-2"},
|
|
190
|
-
]
|
|
191
|
-
|
|
192
|
-
issues = jira_indexer._get_issues_within_single_board("1")
|
|
193
|
-
assert len(issues) == 2
|
|
194
|
-
assert issues[0].id == "1"
|
|
195
|
-
assert issues[0].key == "TEST-1"
|
|
196
|
-
assert issues[0].board_id == "1"
|
|
197
|
-
assert issues[1].id == "2"
|
|
198
|
-
assert issues[1].key == "TEST-2"
|
|
199
|
-
assert issues[1].board_id == "1"
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
def test_jira_indexer_get_issues_within_single_board_no_issues(
|
|
203
|
-
jira_indexer: JiraIndexer,
|
|
204
|
-
mock_jira: MagicMock,
|
|
205
|
-
):
|
|
206
|
-
mock_jira.get_issues_for_board.return_value = []
|
|
207
|
-
|
|
208
|
-
issues = jira_indexer._get_issues_within_single_board("1")
|
|
209
|
-
assert len(issues) == 0
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
def test_jira_indexer_get_issues(
|
|
213
|
-
jira_indexer: JiraIndexer,
|
|
214
|
-
mock_jira: MagicMock,
|
|
215
|
-
):
|
|
216
|
-
jira_indexer.index_config.issues = ["TEST2-1", "TEST2-2"]
|
|
217
|
-
mock_jira.get_issue.return_value = {
|
|
218
|
-
"id": "ISSUE_ID",
|
|
219
|
-
"key": "ISSUE_KEY",
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
issues = jira_indexer._get_issues()
|
|
223
|
-
assert len(issues) == 2
|
|
224
|
-
assert issues[0].id == "ISSUE_ID"
|
|
225
|
-
assert issues[0].key == "ISSUE_KEY"
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
def test_jira_indexer_get_issues_unique_issues(mocker: MockerFixture, jira_indexer: JiraIndexer):
|
|
229
|
-
mocker.patch.object(
|
|
230
|
-
JiraIndexer,
|
|
231
|
-
"_get_issues_within_boards",
|
|
232
|
-
return_value=[
|
|
233
|
-
JiraIssueMetadata(id="1", key="TEST-1", board_id="1"),
|
|
234
|
-
JiraIssueMetadata(id="2", key="TEST-2", board_id="1"),
|
|
235
|
-
],
|
|
236
|
-
)
|
|
237
|
-
mocker.patch.object(
|
|
238
|
-
JiraIndexer,
|
|
239
|
-
"_get_issues_within_projects",
|
|
240
|
-
return_value=[
|
|
241
|
-
JiraIssueMetadata(id="1", key="TEST-1"),
|
|
242
|
-
JiraIssueMetadata(id="3", key="TEST-3"),
|
|
243
|
-
],
|
|
244
|
-
)
|
|
245
|
-
mocker.patch.object(
|
|
246
|
-
JiraIndexer,
|
|
247
|
-
"_get_issues",
|
|
248
|
-
return_value=[
|
|
249
|
-
JiraIssueMetadata(id="4", key="TEST-4"),
|
|
250
|
-
JiraIssueMetadata(id="2", key="TEST-2"),
|
|
251
|
-
],
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
issues = jira_indexer.get_issues()
|
|
255
|
-
assert len(issues) == 4
|
|
256
|
-
assert issues[0].id == "1"
|
|
257
|
-
assert issues[0].key == "TEST-1"
|
|
258
|
-
assert issues[0].board_id == "1"
|
|
259
|
-
assert issues[1].id == "2"
|
|
260
|
-
assert issues[1].key == "TEST-2"
|
|
261
|
-
assert issues[1].board_id == "1"
|
|
262
|
-
assert issues[2].id == "3"
|
|
263
|
-
assert issues[2].key == "TEST-3"
|
|
264
|
-
assert issues[3].id == "4"
|
|
265
|
-
assert issues[3].key == "TEST-4"
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
def test_jira_indexer_get_issues_no_duplicates(mocker: MockerFixture, jira_indexer: JiraIndexer):
|
|
269
|
-
mocker.patch.object(
|
|
270
|
-
JiraIndexer,
|
|
271
|
-
"_get_issues_within_boards",
|
|
272
|
-
return_value=[
|
|
273
|
-
JiraIssueMetadata(id="1", key="TEST-1", board_id="1"),
|
|
274
|
-
],
|
|
275
|
-
)
|
|
276
|
-
mocker.patch.object(
|
|
277
|
-
JiraIndexer,
|
|
278
|
-
"_get_issues_within_projects",
|
|
279
|
-
return_value=[
|
|
280
|
-
JiraIssueMetadata(id="2", key="TEST-2"),
|
|
281
|
-
],
|
|
282
|
-
)
|
|
283
|
-
mocker.patch.object(
|
|
284
|
-
JiraIndexer,
|
|
285
|
-
"_get_issues",
|
|
286
|
-
return_value=[
|
|
287
|
-
JiraIssueMetadata(id="3", key="TEST-3"),
|
|
288
|
-
],
|
|
289
|
-
)
|
|
290
|
-
|
|
291
|
-
issues = jira_indexer.get_issues()
|
|
292
|
-
assert len(issues) == 3
|
|
293
|
-
assert issues[0].id == "1"
|
|
294
|
-
assert issues[0].key == "TEST-1"
|
|
295
|
-
assert issues[0].board_id == "1"
|
|
296
|
-
assert issues[1].id == "2"
|
|
297
|
-
assert issues[1].key == "TEST-2"
|
|
298
|
-
assert issues[2].id == "3"
|
|
299
|
-
assert issues[2].key == "TEST-3"
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
def test_jira_indexer_get_issues_empty(mocker: MockerFixture, jira_indexer: JiraIndexer):
|
|
303
|
-
mocker.patch.object(JiraIndexer, "_get_issues_within_boards", return_value=[])
|
|
304
|
-
mocker.patch.object(JiraIndexer, "_get_issues_within_projects", return_value=[])
|
|
305
|
-
mocker.patch.object(JiraIndexer, "_get_issues", return_value=[])
|
|
306
|
-
|
|
307
|
-
issues = jira_indexer.get_issues()
|
|
308
|
-
assert len(issues) == 0
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
def test_connection_config_multiple_auth():
|
|
312
|
-
with pytest.raises(ValidationError):
|
|
313
|
-
JiraConnectionConfig(
|
|
314
|
-
access_config=JiraAccessConfig(
|
|
315
|
-
password="api_token",
|
|
316
|
-
token="access_token",
|
|
317
|
-
),
|
|
318
|
-
username="user_email",
|
|
319
|
-
url="url",
|
|
320
|
-
)
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
def test_connection_config_no_auth():
|
|
324
|
-
with pytest.raises(ValidationError):
|
|
325
|
-
JiraConnectionConfig(access_config=JiraAccessConfig(), url="url")
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
def test_connection_config_basic_auth():
|
|
329
|
-
JiraConnectionConfig(
|
|
330
|
-
access_config=JiraAccessConfig(password="api_token"),
|
|
331
|
-
url="url",
|
|
332
|
-
username="user_email",
|
|
333
|
-
)
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
def test_connection_config_pat_auth():
|
|
337
|
-
JiraConnectionConfig(
|
|
338
|
-
access_config=JiraAccessConfig(token="access_token"),
|
|
339
|
-
url="url",
|
|
340
|
-
)
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
def test_jira_issue_metadata_object():
|
|
344
|
-
expected = {"id": "10000", "key": "TEST-1", "board_id": "1", "project_id": "TEST"}
|
|
345
|
-
metadata = JiraIssueMetadata(id="10000", key="TEST-1", board_id="1")
|
|
346
|
-
assert expected == metadata.to_dict()
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
def test_nested_object_to_field_getter():
|
|
350
|
-
obj = {"a": 1, "b": {"c": 2}}
|
|
351
|
-
fg = nested_object_to_field_getter(obj)
|
|
352
|
-
assert isinstance(fg, FieldGetter)
|
|
353
|
-
assert fg["a"] == 1
|
|
354
|
-
assert isinstance(fg["b"], FieldGetter)
|
|
355
|
-
assert fg["b"]["c"] == 2
|
|
356
|
-
assert isinstance(fg["b"]["d"], FieldGetter)
|
|
357
|
-
assert fg["b"]["d"]["e"] == {}
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
def test_issues_fetcher_wrapper():
|
|
361
|
-
test_issues_to_fetch = 250
|
|
362
|
-
test_issues = [{"id": i} for i in range(0, test_issues_to_fetch)]
|
|
363
|
-
|
|
364
|
-
def mock_func(limit, start):
|
|
365
|
-
return {"results": test_issues[start : start + limit]}
|
|
366
|
-
|
|
367
|
-
wrapped_func = issues_fetcher_wrapper(mock_func, number_of_issues_to_fetch=test_issues_to_fetch)
|
|
368
|
-
results = wrapped_func()
|
|
369
|
-
assert len(results) == 250
|
|
370
|
-
assert results[0]["id"] == 0
|
|
371
|
-
assert results[-1]["id"] == 249
|
|
372
|
-
|
|
373
|
-
test_issues_to_fetch = 150
|
|
374
|
-
test_issues = [{"id": i} for i in range(0, test_issues_to_fetch)]
|
|
375
|
-
|
|
376
|
-
def mock_func_list(limit, start):
|
|
377
|
-
return test_issues[start : start + limit]
|
|
378
|
-
|
|
379
|
-
wrapped_func_list = issues_fetcher_wrapper(
|
|
380
|
-
mock_func_list, number_of_issues_to_fetch=test_issues_to_fetch
|
|
381
|
-
)
|
|
382
|
-
results_list = wrapped_func_list()
|
|
383
|
-
assert len(results_list) == 150
|
|
384
|
-
assert results_list[0]["id"] == 0
|
|
385
|
-
assert results_list[-1]["id"] == 149
|
|
386
|
-
|
|
387
|
-
def mock_func_invalid(limit, start):
|
|
388
|
-
return "invalid"
|
|
389
|
-
|
|
390
|
-
wrapped_func_invalid = issues_fetcher_wrapper(mock_func_invalid, number_of_issues_to_fetch=50)
|
|
391
|
-
with pytest.raises(TypeError):
|
|
392
|
-
wrapped_func_invalid()
|
|
393
|
-
|
|
394
|
-
def mock_func_key_error(limit, start):
|
|
395
|
-
return {"wrong_key": []}
|
|
396
|
-
|
|
397
|
-
wrapped_func_key_error = issues_fetcher_wrapper(
|
|
398
|
-
mock_func_key_error, number_of_issues_to_fetch=50
|
|
399
|
-
)
|
|
400
|
-
with pytest.raises(KeyError):
|
|
401
|
-
wrapped_func_key_error()
|
test/unit/embed/__init__.py
DELETED
|
File without changes
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from unstructured_ingest.embed.mixedbreadai import (
|
|
2
|
-
MixedbreadAIEmbeddingConfig,
|
|
3
|
-
MixedbreadAIEmbeddingEncoder,
|
|
4
|
-
)
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
8
|
-
mock_client = mocker.MagicMock()
|
|
9
|
-
|
|
10
|
-
def mock_embeddings(
|
|
11
|
-
model,
|
|
12
|
-
normalized,
|
|
13
|
-
encoding_format,
|
|
14
|
-
truncation_strategy,
|
|
15
|
-
request_options,
|
|
16
|
-
input,
|
|
17
|
-
):
|
|
18
|
-
mock_response = mocker.MagicMock()
|
|
19
|
-
mock_response.data = [mocker.MagicMock(embedding=[i, i + 1]) for i in range(len(input))]
|
|
20
|
-
return mock_response
|
|
21
|
-
|
|
22
|
-
mock_client.embeddings.side_effect = mock_embeddings
|
|
23
|
-
|
|
24
|
-
# Mock get_client to return our mock_client
|
|
25
|
-
mocker.patch.object(MixedbreadAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
26
|
-
mocker.patch.object(MixedbreadAIEmbeddingEncoder, "get_request_options", return_value={})
|
|
27
|
-
|
|
28
|
-
encoder = MixedbreadAIEmbeddingEncoder(
|
|
29
|
-
config=MixedbreadAIEmbeddingConfig(
|
|
30
|
-
api_key="api_key", model_name="mixedbread-ai/mxbai-embed-large-v1"
|
|
31
|
-
)
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
35
|
-
elements = encoder.embed_documents(
|
|
36
|
-
elements=raw_elements,
|
|
37
|
-
)
|
|
38
|
-
assert len(elements) == 2
|
|
39
|
-
assert elements[0]["text"] == "This is sentence 1"
|
|
40
|
-
assert elements[1]["text"] == "This is sentence 2"
|
|
41
|
-
assert elements[0]["embeddings"] is not None
|
|
42
|
-
assert elements[1]["embeddings"] is not None
|
test/unit/embed/test_octoai.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
|
-
# Mocked client with the desired behavior for embed_documents
|
|
6
|
-
mock_client = mocker.MagicMock()
|
|
7
|
-
mock_data = []
|
|
8
|
-
for i in range(2):
|
|
9
|
-
data = mocker.MagicMock()
|
|
10
|
-
data.embedding = [1, 2]
|
|
11
|
-
mock_data.append(data)
|
|
12
|
-
mock_response = mocker.MagicMock()
|
|
13
|
-
mock_response.data = mock_data
|
|
14
|
-
mock_client.embeddings.create.return_value = mock_response
|
|
15
|
-
|
|
16
|
-
# Mock get_client to return our mock_client
|
|
17
|
-
mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
|
|
18
|
-
|
|
19
|
-
encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key"))
|
|
20
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
21
|
-
|
|
22
|
-
elements = encoder.embed_documents(
|
|
23
|
-
elements=raw_elements,
|
|
24
|
-
)
|
|
25
|
-
assert len(elements) == 2
|
|
26
|
-
assert elements[0]["text"] == "This is sentence 1"
|
|
27
|
-
assert elements[1]["text"] == "This is sentence 2"
|
test/unit/embed/test_openai.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
|
-
# Mocked client with the desired behavior for embed_documents
|
|
6
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(4)]
|
|
7
|
-
mock_response = mocker.MagicMock()
|
|
8
|
-
mock_response_data = []
|
|
9
|
-
for i in range(2):
|
|
10
|
-
mock_response_d = mocker.MagicMock()
|
|
11
|
-
mock_response_d.embedding = [1, 2]
|
|
12
|
-
mock_response_data.append(mock_response_d)
|
|
13
|
-
mock_response.data = mock_response_data
|
|
14
|
-
mock_client = mocker.MagicMock()
|
|
15
|
-
mock_client.embeddings.create.return_value = mock_response
|
|
16
|
-
|
|
17
|
-
# Mock get_client to return our mock_client
|
|
18
|
-
mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
19
|
-
|
|
20
|
-
encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key", batch_size=2))
|
|
21
|
-
|
|
22
|
-
elements = encoder.embed_documents(
|
|
23
|
-
elements=raw_elements,
|
|
24
|
-
)
|
|
25
|
-
assert len(elements) == 4
|
|
26
|
-
assert elements[0]["text"] == "This is sentence 1"
|
|
27
|
-
assert elements[1]["text"] == "This is sentence 2"
|
|
28
|
-
assert mock_client.embeddings.create.call_count == 2
|
test/unit/embed/test_vertexai.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
|
-
# Mocked client with the desired behavior for embed_documents
|
|
6
|
-
mock_responses = []
|
|
7
|
-
for i in [1, 2]:
|
|
8
|
-
mock_response = mocker.Mock()
|
|
9
|
-
mocker.patch.object(mock_response, "values", i)
|
|
10
|
-
mock_responses.append(mock_response)
|
|
11
|
-
|
|
12
|
-
mock_client = mocker.MagicMock()
|
|
13
|
-
mock_client.get_embeddings.return_value = mock_responses
|
|
14
|
-
|
|
15
|
-
# Mock create_client to return our mock_client
|
|
16
|
-
mocker.patch.object(VertexAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
17
|
-
encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key={"api_key": "value"}))
|
|
18
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
19
|
-
|
|
20
|
-
elements = encoder.embed_documents(
|
|
21
|
-
elements=raw_elements,
|
|
22
|
-
)
|
|
23
|
-
assert len(elements) == 2
|
|
24
|
-
assert elements[0]["text"] == "This is sentence 1"
|
|
25
|
-
assert elements[1]["text"] == "This is sentence 2"
|
test/unit/embed/test_voyageai.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
|
-
# Mocked client with the desired behavior for embed_documents
|
|
6
|
-
mock_response = mocker.MagicMock()
|
|
7
|
-
mocker.patch.object(mock_response, "embeddings", [1, 2])
|
|
8
|
-
mock_client = mocker.MagicMock()
|
|
9
|
-
mock_client.embed.return_value = mock_response
|
|
10
|
-
|
|
11
|
-
# Mock get_client to return our mock_client
|
|
12
|
-
mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
13
|
-
|
|
14
|
-
encoder = VoyageAIEmbeddingEncoder(
|
|
15
|
-
config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
|
|
16
|
-
)
|
|
17
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
18
|
-
|
|
19
|
-
elements = encoder.embed_documents(
|
|
20
|
-
elements=raw_elements,
|
|
21
|
-
)
|
|
22
|
-
assert len(elements) == 2
|
|
23
|
-
assert elements[0]["text"] == "This is sentence 1"
|
|
24
|
-
assert elements[1]["text"] == "This is sentence 2"
|
test/unit/embedders/__init__.py
DELETED
|
File without changes
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
|
|
7
|
-
|
|
8
|
-
fake = faker.Faker()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def generate_embedder_config_params() -> dict:
|
|
12
|
-
params = {
|
|
13
|
-
"aws_access_key_id": fake.password(),
|
|
14
|
-
"aws_secret_access_key": fake.password(),
|
|
15
|
-
"region_name": fake.city(),
|
|
16
|
-
}
|
|
17
|
-
if random.random() < 0.5:
|
|
18
|
-
params["embedder_model_name"] = fake.word()
|
|
19
|
-
return params
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.mark.parametrize(
|
|
23
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
24
|
-
)
|
|
25
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
26
|
-
embedder_config = BedrockEmbeddingConfig.model_validate(embedder_config_params)
|
|
27
|
-
assert embedder_config
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@pytest.mark.parametrize(
|
|
31
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
32
|
-
)
|
|
33
|
-
def test_embedder(embedder_config_params: dict):
|
|
34
|
-
embedder_config = BedrockEmbeddingConfig.model_validate(embedder_config_params)
|
|
35
|
-
embedder = BedrockEmbeddingEncoder(config=embedder_config)
|
|
36
|
-
assert embedder
|