unstructured-ingest 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/weaviate/test_local.py +27 -6
- test/integration/embedders/test_azure_openai.py +1 -3
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -3
- test/integration/embedders/test_mixedbread.py +2 -2
- test/integration/embedders/test_octoai.py +2 -4
- test/integration/embedders/test_openai.py +2 -4
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +2 -4
- test/integration/embedders/test_voyageai.py +2 -4
- test/integration/embedders/utils.py +12 -14
- test/unit/embed/test_openai.py +12 -4
- test/unit/test_html.py +112 -0
- test/unit/v2/embedders/test_voyageai.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +6 -1
- unstructured_ingest/embed/interfaces.py +9 -6
- unstructured_ingest/embed/mixedbreadai.py +3 -10
- unstructured_ingest/embed/octoai.py +14 -7
- unstructured_ingest/embed/openai.py +18 -5
- unstructured_ingest/embed/togetherai.py +19 -8
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +19 -6
- unstructured_ingest/utils/html.py +143 -93
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/interfaces/uploader.py +14 -1
- unstructured_ingest/v2/pipeline/pipeline.py +20 -6
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
- unstructured_ingest/v2/processes/embedder.py +3 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +38 -36
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
|
@@ -25,7 +25,7 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
|
|
|
25
25
|
start_time = time.time()
|
|
26
26
|
while time.time() - start_time < timeout:
|
|
27
27
|
try:
|
|
28
|
-
requests.get("http://localhost:8080/v1/.well-known/read")
|
|
28
|
+
requests.get("http://localhost:8080/v1/.well-known/read", timeout=1)
|
|
29
29
|
return
|
|
30
30
|
except Exception as e:
|
|
31
31
|
print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
|
|
@@ -34,15 +34,20 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
@pytest.fixture
|
|
37
|
-
def
|
|
37
|
+
def weaviate_instance():
|
|
38
38
|
with container_context(
|
|
39
39
|
image="semitechnologies/weaviate:1.27.3",
|
|
40
40
|
ports={8080: 8080, 50051: 50051},
|
|
41
|
-
):
|
|
41
|
+
) as ctx:
|
|
42
42
|
wait_for_container()
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
yield ctx
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.fixture
|
|
47
|
+
def collection(weaviate_instance, collections_schema_config: dict) -> str:
|
|
48
|
+
with weaviate.connect_to_local() as weaviate_client:
|
|
49
|
+
weaviate_client.collections.create_from_dict(config=collections_schema_config)
|
|
50
|
+
return COLLECTION_NAME
|
|
46
51
|
|
|
47
52
|
|
|
48
53
|
def get_count(client: WeaviateClient) -> int:
|
|
@@ -129,3 +134,19 @@ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path
|
|
|
129
134
|
file_data=file_data,
|
|
130
135
|
expected_count=expected_count,
|
|
131
136
|
)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
140
|
+
def test_weaviate_local_create_destination(weaviate_instance):
|
|
141
|
+
uploader = LocalWeaviateUploader(
|
|
142
|
+
upload_config=LocalWeaviateUploaderConfig(),
|
|
143
|
+
connection_config=LocalWeaviateConnectionConfig(),
|
|
144
|
+
)
|
|
145
|
+
collection_name = "system_created"
|
|
146
|
+
created = uploader.create_destination(destination_name=collection_name)
|
|
147
|
+
assert created
|
|
148
|
+
with uploader.connection_config.get_client() as weaviate_client:
|
|
149
|
+
assert weaviate_client.collections.exists(name=collection_name)
|
|
150
|
+
|
|
151
|
+
created = uploader.create_destination(destination_name=collection_name)
|
|
152
|
+
assert not created
|
|
@@ -54,6 +54,4 @@ def test_raw_azure_openai_embedder(embedder_file: Path):
|
|
|
54
54
|
azure_endpoint=azure_data.endpoint,
|
|
55
55
|
)
|
|
56
56
|
)
|
|
57
|
-
validate_raw_embedder(
|
|
58
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
59
|
-
)
|
|
57
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
@@ -55,7 +55,7 @@ def test_raw_bedrock_embedder(embedder_file: Path):
|
|
|
55
55
|
validate_raw_embedder(
|
|
56
56
|
embedder=embedder,
|
|
57
57
|
embedder_file=embedder_file,
|
|
58
|
-
|
|
58
|
+
expected_dimension=1536,
|
|
59
59
|
expected_is_unit_vector=False,
|
|
60
60
|
)
|
|
61
61
|
|
|
@@ -98,6 +98,6 @@ async def test_raw_async_bedrock_embedder(embedder_file: Path):
|
|
|
98
98
|
await validate_raw_embedder_async(
|
|
99
99
|
embedder=embedder,
|
|
100
100
|
embedder_file=embedder_file,
|
|
101
|
-
|
|
101
|
+
expected_dimension=1536,
|
|
102
102
|
expected_is_unit_vector=False,
|
|
103
103
|
)
|
|
@@ -21,6 +21,4 @@ def test_huggingface_embedder(embedder_file: Path):
|
|
|
21
21
|
|
|
22
22
|
def test_raw_hugginface_embedder(embedder_file: Path):
|
|
23
23
|
embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
24
|
-
validate_raw_embedder(
|
|
25
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(384,)
|
|
26
|
-
)
|
|
24
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=384)
|
|
@@ -49,7 +49,7 @@ def test_raw_mixedbread_embedder(embedder_file: Path):
|
|
|
49
49
|
validate_raw_embedder(
|
|
50
50
|
embedder=embedder,
|
|
51
51
|
embedder_file=embedder_file,
|
|
52
|
-
|
|
52
|
+
expected_dimension=1024,
|
|
53
53
|
expected_is_unit_vector=False,
|
|
54
54
|
)
|
|
55
55
|
|
|
@@ -66,6 +66,6 @@ async def test_raw_async_mixedbread_embedder(embedder_file: Path):
|
|
|
66
66
|
await validate_raw_embedder_async(
|
|
67
67
|
embedder=embedder,
|
|
68
68
|
embedder_file=embedder_file,
|
|
69
|
-
|
|
69
|
+
expected_dimension=1024,
|
|
70
70
|
expected_is_unit_vector=False,
|
|
71
71
|
)
|
|
@@ -47,9 +47,7 @@ def test_raw_octoai_embedder(embedder_file: Path):
|
|
|
47
47
|
api_key=api_key,
|
|
48
48
|
)
|
|
49
49
|
)
|
|
50
|
-
validate_raw_embedder(
|
|
51
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
|
|
52
|
-
)
|
|
50
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
53
51
|
|
|
54
52
|
|
|
55
53
|
@pytest.mark.skip(reason="Unexpected connection error at the moment")
|
|
@@ -73,5 +71,5 @@ async def test_raw_async_octoai_embedder(embedder_file: Path):
|
|
|
73
71
|
)
|
|
74
72
|
)
|
|
75
73
|
await validate_raw_embedder_async(
|
|
76
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
74
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
77
75
|
)
|
|
@@ -47,9 +47,7 @@ def test_raw_openai_embedder(embedder_file: Path):
|
|
|
47
47
|
api_key=api_key,
|
|
48
48
|
)
|
|
49
49
|
)
|
|
50
|
-
validate_raw_embedder(
|
|
51
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
52
|
-
)
|
|
50
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
53
51
|
|
|
54
52
|
|
|
55
53
|
def test_raw_openai_embedder_invalid_credentials():
|
|
@@ -72,5 +70,5 @@ async def test_raw_async_openai_embedder(embedder_file: Path):
|
|
|
72
70
|
)
|
|
73
71
|
)
|
|
74
72
|
await validate_raw_embedder_async(
|
|
75
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
73
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1536
|
|
76
74
|
)
|
|
@@ -46,7 +46,7 @@ def test_raw_togetherai_embedder(embedder_file: Path):
|
|
|
46
46
|
validate_raw_embedder(
|
|
47
47
|
embedder=embedder,
|
|
48
48
|
embedder_file=embedder_file,
|
|
49
|
-
|
|
49
|
+
expected_dimension=768,
|
|
50
50
|
expected_is_unit_vector=False,
|
|
51
51
|
)
|
|
52
52
|
|
|
@@ -66,6 +66,6 @@ async def test_raw_async_togetherai_embedder(embedder_file: Path):
|
|
|
66
66
|
await validate_raw_embedder_async(
|
|
67
67
|
embedder=embedder,
|
|
68
68
|
embedder_file=embedder_file,
|
|
69
|
-
|
|
69
|
+
expected_dimension=768,
|
|
70
70
|
expected_is_unit_vector=False,
|
|
71
71
|
)
|
|
@@ -46,9 +46,7 @@ def test_raw_vertexai_embedder(embedder_file: Path):
|
|
|
46
46
|
api_key=api_key,
|
|
47
47
|
)
|
|
48
48
|
)
|
|
49
|
-
validate_raw_embedder(
|
|
50
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(768,)
|
|
51
|
-
)
|
|
49
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)
|
|
52
50
|
|
|
53
51
|
|
|
54
52
|
@requires_env(API_KEY)
|
|
@@ -61,5 +59,5 @@ async def test_raw_async_vertexai_embedder(embedder_file: Path):
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
await validate_raw_embedder_async(
|
|
64
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
62
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=768
|
|
65
63
|
)
|
|
@@ -46,9 +46,7 @@ def test_raw_voyageai_embedder(embedder_file: Path):
|
|
|
46
46
|
api_key=api_key,
|
|
47
47
|
)
|
|
48
48
|
)
|
|
49
|
-
validate_raw_embedder(
|
|
50
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
|
|
51
|
-
)
|
|
49
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
52
50
|
|
|
53
51
|
|
|
54
52
|
@requires_env(API_KEY)
|
|
@@ -61,5 +59,5 @@ async def test_raw_async_voyageai_embedder(embedder_file: Path):
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
await validate_raw_embedder_async(
|
|
64
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
62
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
65
63
|
)
|
|
@@ -23,23 +23,22 @@ def validate_embedding_output(original_elements: list[dict], output_elements: li
|
|
|
23
23
|
def validate_raw_embedder(
|
|
24
24
|
embedder: BaseEmbeddingEncoder,
|
|
25
25
|
embedder_file: Path,
|
|
26
|
-
|
|
26
|
+
expected_dimension: Optional[int] = None,
|
|
27
27
|
expected_is_unit_vector: bool = True,
|
|
28
28
|
):
|
|
29
29
|
with open(embedder_file) as f:
|
|
30
30
|
elements = json.load(f)
|
|
31
31
|
all_text = [element["text"] for element in elements]
|
|
32
32
|
single_text = all_text[0]
|
|
33
|
-
|
|
34
|
-
if
|
|
33
|
+
dimension = embedder.dimension
|
|
34
|
+
if expected_dimension:
|
|
35
35
|
assert (
|
|
36
|
-
|
|
37
|
-
), f"
|
|
36
|
+
dimension == expected_dimension
|
|
37
|
+
), f"dimensions {dimension} didn't match expected: {expected_dimension}"
|
|
38
38
|
is_unit_vector = embedder.is_unit_vector
|
|
39
39
|
assert is_unit_vector == expected_is_unit_vector
|
|
40
40
|
single_embedding = embedder.embed_query(query=single_text)
|
|
41
|
-
|
|
42
|
-
assert len(single_embedding) == expected_length
|
|
41
|
+
assert len(single_embedding) == dimension
|
|
43
42
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
44
43
|
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
|
45
44
|
|
|
@@ -47,22 +46,21 @@ def validate_raw_embedder(
|
|
|
47
46
|
async def validate_raw_embedder_async(
|
|
48
47
|
embedder: AsyncBaseEmbeddingEncoder,
|
|
49
48
|
embedder_file: Path,
|
|
50
|
-
|
|
49
|
+
expected_dimension: Optional[int] = None,
|
|
51
50
|
expected_is_unit_vector: bool = True,
|
|
52
51
|
):
|
|
53
52
|
with open(embedder_file) as f:
|
|
54
53
|
elements = json.load(f)
|
|
55
54
|
all_text = [element["text"] for element in elements]
|
|
56
55
|
single_text = all_text[0]
|
|
57
|
-
|
|
58
|
-
if
|
|
56
|
+
dimension = await embedder.dimension
|
|
57
|
+
if expected_dimension:
|
|
59
58
|
assert (
|
|
60
|
-
|
|
61
|
-
), f"
|
|
59
|
+
dimension == expected_dimension
|
|
60
|
+
), f"dimension {dimension} didn't match expected: {expected_dimension}"
|
|
62
61
|
is_unit_vector = await embedder.is_unit_vector
|
|
63
62
|
assert is_unit_vector == expected_is_unit_vector
|
|
64
63
|
single_embedding = await embedder.embed_query(query=single_text)
|
|
65
|
-
|
|
66
|
-
assert len(single_embedding) == expected_length
|
|
64
|
+
assert len(single_embedding) == dimension
|
|
67
65
|
embedded_elements = await embedder.embed_documents(elements=elements)
|
|
68
66
|
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
test/unit/embed/test_openai.py
CHANGED
|
@@ -3,18 +3,26 @@ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbedd
|
|
|
3
3
|
|
|
4
4
|
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
5
|
# Mocked client with the desired behavior for embed_documents
|
|
6
|
+
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(4)]
|
|
7
|
+
mock_response = mocker.MagicMock()
|
|
8
|
+
mock_response_data = []
|
|
9
|
+
for i in range(2):
|
|
10
|
+
mock_response_d = mocker.MagicMock()
|
|
11
|
+
mock_response_d.embedding = [1, 2]
|
|
12
|
+
mock_response_data.append(mock_response_d)
|
|
13
|
+
mock_response.data = mock_response_data
|
|
6
14
|
mock_client = mocker.MagicMock()
|
|
7
|
-
mock_client.
|
|
15
|
+
mock_client.embeddings.create.return_value = mock_response
|
|
8
16
|
|
|
9
17
|
# Mock get_client to return our mock_client
|
|
10
18
|
mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
11
19
|
|
|
12
|
-
encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
|
|
13
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
20
|
+
encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key", batch_size=2))
|
|
14
21
|
|
|
15
22
|
elements = encoder.embed_documents(
|
|
16
23
|
elements=raw_elements,
|
|
17
24
|
)
|
|
18
|
-
assert len(elements) ==
|
|
25
|
+
assert len(elements) == 4
|
|
19
26
|
assert elements[0]["text"] == "This is sentence 1"
|
|
20
27
|
assert elements[1]["text"] == "This is sentence 2"
|
|
28
|
+
assert mock_client.embeddings.create.call_count == 2
|
test/unit/test_html.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from pytest_mock import MockerFixture
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.utils.html import HtmlMixin
|
|
8
|
+
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_extract_images(mocker: MockerFixture):
|
|
12
|
+
mixin = HtmlMixin(extract_images=True)
|
|
13
|
+
mock_download_response = b"DOWNLOADED"
|
|
14
|
+
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
15
|
+
mocked_download_response = mocker.patch(
|
|
16
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
17
|
+
return_value=mock_download_response,
|
|
18
|
+
)
|
|
19
|
+
url = "http://mywebsite.com/path/to/page"
|
|
20
|
+
html = """
|
|
21
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
22
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
23
|
+
<img src="img3.jpg"/>
|
|
24
|
+
<img src="..."/>
|
|
25
|
+
"""
|
|
26
|
+
expected_html = f"""
|
|
27
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
28
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
29
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
30
|
+
<img src="..."/>
|
|
31
|
+
"""
|
|
32
|
+
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
33
|
+
result = mixin.extract_html_images(url=url, html=html)
|
|
34
|
+
result_soup = BeautifulSoup(result, "html.parser")
|
|
35
|
+
assert expected_soup == result_soup
|
|
36
|
+
assert mocked_download_response.call_count == 2
|
|
37
|
+
urls_to_download = [
|
|
38
|
+
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
39
|
+
]
|
|
40
|
+
assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_extract_images_allow_list(mocker: MockerFixture):
|
|
44
|
+
mixin = HtmlMixin(
|
|
45
|
+
extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
|
|
46
|
+
)
|
|
47
|
+
mock_download_response = b"DOWNLOADED"
|
|
48
|
+
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
49
|
+
mocked_download_response = mocker.patch(
|
|
50
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
51
|
+
return_value=mock_download_response,
|
|
52
|
+
)
|
|
53
|
+
url = "http://mywebsite.com/path/to/page"
|
|
54
|
+
html = """
|
|
55
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
56
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
57
|
+
<img src="http://allowedwebsite1.com/img2.jpg"/>
|
|
58
|
+
<img src="http://allowedwebsite2.com/img2.jpg"/>
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
expected_html = f"""
|
|
62
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
63
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
64
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
65
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
66
|
+
"""
|
|
67
|
+
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
68
|
+
result = mixin.extract_html_images(url=url, html=html)
|
|
69
|
+
result_soup = BeautifulSoup(result, "html.parser")
|
|
70
|
+
assert expected_soup == result_soup
|
|
71
|
+
assert mocked_download_response.call_count == 2
|
|
72
|
+
urls_to_download = [
|
|
73
|
+
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
74
|
+
]
|
|
75
|
+
assert urls_to_download == [
|
|
76
|
+
"http://allowedwebsite1.com/img2.jpg",
|
|
77
|
+
"http://allowedwebsite2.com/img2.jpg",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_extract_embedded_docs(mocker: MockerFixture):
|
|
82
|
+
mixin = HtmlMixin(extract_files=True)
|
|
83
|
+
mock_download_response = b"DOWNLOADED"
|
|
84
|
+
mocked_download_response = mocker.patch(
|
|
85
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
86
|
+
return_value=mock_download_response,
|
|
87
|
+
)
|
|
88
|
+
mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
|
|
89
|
+
url = "http://mywebsite.com/path/to/page"
|
|
90
|
+
html = """
|
|
91
|
+
<a href="http://mywebsite.com/file.pdf"/>
|
|
92
|
+
<a href="http://notmywebsite.com/file.pdf"/>
|
|
93
|
+
<a href="http://mywebsite.com/another/link"/>
|
|
94
|
+
<a href="another/link/2"/>
|
|
95
|
+
<a href="file.doc"/>
|
|
96
|
+
"""
|
|
97
|
+
file_data = FileData(
|
|
98
|
+
source_identifiers=SourceIdentifiers(
|
|
99
|
+
fullpath="file.txt",
|
|
100
|
+
filename="file.txt",
|
|
101
|
+
),
|
|
102
|
+
connector_type="my_connector",
|
|
103
|
+
identifier="mock_file_data",
|
|
104
|
+
)
|
|
105
|
+
results = mixin.extract_embedded_files(
|
|
106
|
+
url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
|
|
107
|
+
)
|
|
108
|
+
assert len(results) == 2
|
|
109
|
+
downloaded_urls = [r["file_data"].metadata.url for r in results]
|
|
110
|
+
assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
|
|
111
|
+
assert mocked_download_response.call_count == 2
|
|
112
|
+
assert mocked_write_content.call_count == 2
|
|
@@ -14,7 +14,7 @@ def generate_embedder_config_params() -> dict:
|
|
|
14
14
|
}
|
|
15
15
|
if random.random() < 0.5:
|
|
16
16
|
params["embedder_model_name"] = fake.word()
|
|
17
|
-
params["batch_size"] = fake.random_int()
|
|
17
|
+
params["batch_size"] = fake.random_int(max=100)
|
|
18
18
|
params["truncation"] = fake.boolean()
|
|
19
19
|
params["max_retries"] = fake.random_int()
|
|
20
20
|
params["timeout_in_seconds"] = fake.random_int()
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.4.
|
|
1
|
+
__version__ = "0.4.3" # pragma: no cover
|
|
@@ -33,6 +33,11 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
33
33
|
**self.embedder_model_kwargs,
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
+
def get_encoder_kwargs(self) -> dict:
|
|
37
|
+
encoder_kwargs = self.encode_kwargs or {}
|
|
38
|
+
encoder_kwargs["batch_size"] = self.batch_size
|
|
39
|
+
return encoder_kwargs
|
|
40
|
+
|
|
36
41
|
|
|
37
42
|
@dataclass
|
|
38
43
|
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
@@ -43,7 +48,7 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
43
48
|
|
|
44
49
|
def _embed_documents(self, texts: list[str]) -> list[list[float]]:
|
|
45
50
|
client = self.config.get_client()
|
|
46
|
-
embeddings = client.encode(texts, **self.config.
|
|
51
|
+
embeddings = client.encode(texts, **self.config.get_encoder_kwargs())
|
|
47
52
|
return embeddings.tolist()
|
|
48
53
|
|
|
49
54
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
|
-
from pydantic import BaseModel
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class EmbeddingConfig(BaseModel):
|
|
10
|
-
|
|
11
|
+
batch_size: Optional[int] = Field(
|
|
12
|
+
default=32, description="Optional batch size for embedding requests."
|
|
13
|
+
)
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
@dataclass
|
|
@@ -53,9 +56,9 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
|
|
|
53
56
|
is properly configured: e.g., embed a single a element"""
|
|
54
57
|
|
|
55
58
|
@property
|
|
56
|
-
def
|
|
59
|
+
def dimension(self):
|
|
57
60
|
exemplary_embedding = self.get_exemplary_embedding()
|
|
58
|
-
return
|
|
61
|
+
return len(exemplary_embedding)
|
|
59
62
|
|
|
60
63
|
def get_exemplary_embedding(self) -> list[float]:
|
|
61
64
|
return self.embed_query(query="Q")
|
|
@@ -91,9 +94,9 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
|
|
|
91
94
|
is properly configured: e.g., embed a single a element"""
|
|
92
95
|
|
|
93
96
|
@property
|
|
94
|
-
async def
|
|
97
|
+
async def dimension(self):
|
|
95
98
|
exemplary_embedding = await self.get_exemplary_embedding()
|
|
96
|
-
return
|
|
99
|
+
return len(exemplary_embedding)
|
|
97
100
|
|
|
98
101
|
async def get_exemplary_embedding(self) -> list[float]:
|
|
99
102
|
return await self.embed_query(query="Q")
|
|
@@ -10,10 +10,10 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
10
10
|
BaseEmbeddingEncoder,
|
|
11
11
|
EmbeddingConfig,
|
|
12
12
|
)
|
|
13
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
13
14
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
15
|
|
|
15
16
|
USER_AGENT = "@mixedbread-ai/unstructured"
|
|
16
|
-
BATCH_SIZE = 128
|
|
17
17
|
TIMEOUT = 60
|
|
18
18
|
MAX_RETRIES = 3
|
|
19
19
|
ENCODING_FORMAT = "float"
|
|
@@ -109,13 +109,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
109
109
|
Returns:
|
|
110
110
|
list[list[float]]: List of embeddings.
|
|
111
111
|
"""
|
|
112
|
-
batch_size = BATCH_SIZE
|
|
113
|
-
batch_itr = range(0, len(texts), batch_size)
|
|
114
112
|
|
|
115
113
|
responses = []
|
|
116
114
|
client = self.config.get_client()
|
|
117
|
-
for
|
|
118
|
-
batch = texts[i : i + batch_size]
|
|
115
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
119
116
|
response = client.embeddings(
|
|
120
117
|
model=self.config.embedder_model_name,
|
|
121
118
|
normalized=True,
|
|
@@ -186,13 +183,9 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
186
183
|
Returns:
|
|
187
184
|
list[list[float]]: List of embeddings.
|
|
188
185
|
"""
|
|
189
|
-
batch_size = BATCH_SIZE
|
|
190
|
-
batch_itr = range(0, len(texts), batch_size)
|
|
191
|
-
|
|
192
186
|
client = self.config.get_async_client()
|
|
193
187
|
tasks = []
|
|
194
|
-
for
|
|
195
|
-
batch = texts[i : i + batch_size]
|
|
188
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
196
189
|
tasks.append(
|
|
197
190
|
client.embeddings(
|
|
198
191
|
model=self.config.embedder_model_name,
|
|
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
11
|
from unstructured_ingest.logger import logger
|
|
12
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
12
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
14
|
from unstructured_ingest.v2.errors import (
|
|
14
15
|
ProviderError,
|
|
@@ -89,12 +90,16 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
89
90
|
|
|
90
91
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
91
92
|
texts = [e.get("text", "") for e in elements]
|
|
93
|
+
embeddings = []
|
|
94
|
+
client = self.config.get_client()
|
|
92
95
|
try:
|
|
93
|
-
|
|
94
|
-
|
|
96
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
97
|
+
response = client.embeddings.create(
|
|
98
|
+
input=batch, model=self.config.embedder_model_name
|
|
99
|
+
)
|
|
100
|
+
embeddings.extend([data.embedding for data in response.data])
|
|
95
101
|
except Exception as e:
|
|
96
102
|
raise self.wrap_error(e=e)
|
|
97
|
-
embeddings = [data.embedding for data in response.data]
|
|
98
103
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
99
104
|
return elements_with_embeddings
|
|
100
105
|
|
|
@@ -119,12 +124,14 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
119
124
|
async def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
120
125
|
texts = [e.get("text", "") for e in elements]
|
|
121
126
|
client = self.config.get_async_client()
|
|
127
|
+
embeddings = []
|
|
122
128
|
try:
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
129
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
130
|
+
response = await client.embeddings.create(
|
|
131
|
+
input=batch, model=self.config.embedder_model_name
|
|
132
|
+
)
|
|
133
|
+
embeddings.extend([data.embedding for data in response.data])
|
|
126
134
|
except Exception as e:
|
|
127
135
|
raise self.wrap_error(e=e)
|
|
128
|
-
embeddings = [data.embedding for data in response.data]
|
|
129
136
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
130
137
|
return elements_with_embeddings
|
|
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
11
|
from unstructured_ingest.logger import logger
|
|
12
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
12
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
14
|
from unstructured_ingest.v2.errors import (
|
|
14
15
|
ProviderError,
|
|
@@ -80,7 +81,17 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
80
81
|
return response.data[0].embedding
|
|
81
82
|
|
|
82
83
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
83
|
-
|
|
84
|
+
client = self.config.get_client()
|
|
85
|
+
texts = [e.get("text", "") for e in elements]
|
|
86
|
+
embeddings = []
|
|
87
|
+
try:
|
|
88
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
89
|
+
response = client.embeddings.create(
|
|
90
|
+
input=batch, model=self.config.embedder_model_name
|
|
91
|
+
)
|
|
92
|
+
embeddings.extend([data.embedding for data in response.data])
|
|
93
|
+
except Exception as e:
|
|
94
|
+
raise self.wrap_error(e=e)
|
|
84
95
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
85
96
|
return elements_with_embeddings
|
|
86
97
|
|
|
@@ -105,12 +116,14 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
105
116
|
async def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
106
117
|
client = self.config.get_async_client()
|
|
107
118
|
texts = [e.get("text", "") for e in elements]
|
|
119
|
+
embeddings = []
|
|
108
120
|
try:
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
121
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
122
|
+
response = await client.embeddings.create(
|
|
123
|
+
input=batch, model=self.config.embedder_model_name
|
|
124
|
+
)
|
|
125
|
+
embeddings.extend([data.embedding for data in response.data])
|
|
112
126
|
except Exception as e:
|
|
113
127
|
raise self.wrap_error(e=e)
|
|
114
|
-
embeddings = [data.embedding for data in response.data]
|
|
115
128
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
116
129
|
return elements_with_embeddings
|