unstructured-ingest 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of unstructured-ingest might be problematic.

Files changed (38)
  1. test/integration/connectors/weaviate/test_local.py +27 -6
  2. test/integration/embedders/test_azure_openai.py +1 -3
  3. test/integration/embedders/test_bedrock.py +2 -2
  4. test/integration/embedders/test_huggingface.py +1 -3
  5. test/integration/embedders/test_mixedbread.py +2 -2
  6. test/integration/embedders/test_octoai.py +2 -4
  7. test/integration/embedders/test_openai.py +2 -4
  8. test/integration/embedders/test_togetherai.py +2 -2
  9. test/integration/embedders/test_vertexai.py +2 -4
  10. test/integration/embedders/test_voyageai.py +2 -4
  11. test/integration/embedders/utils.py +12 -14
  12. test/unit/embed/test_openai.py +12 -4
  13. test/unit/test_html.py +112 -0
  14. test/unit/v2/embedders/test_voyageai.py +1 -1
  15. unstructured_ingest/__version__.py +1 -1
  16. unstructured_ingest/embed/huggingface.py +6 -1
  17. unstructured_ingest/embed/interfaces.py +9 -6
  18. unstructured_ingest/embed/mixedbreadai.py +3 -10
  19. unstructured_ingest/embed/octoai.py +14 -7
  20. unstructured_ingest/embed/openai.py +18 -5
  21. unstructured_ingest/embed/togetherai.py +19 -8
  22. unstructured_ingest/embed/vertexai.py +13 -6
  23. unstructured_ingest/embed/voyageai.py +19 -6
  24. unstructured_ingest/utils/html.py +143 -93
  25. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  26. unstructured_ingest/v2/interfaces/process.py +3 -0
  27. unstructured_ingest/v2/interfaces/uploader.py +14 -1
  28. unstructured_ingest/v2/pipeline/pipeline.py +20 -6
  29. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  30. unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
  31. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
  32. unstructured_ingest/v2/processes/embedder.py +3 -0
  33. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
  34. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +38 -36
  35. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
test/integration/connectors/weaviate/test_local.py
@@ -25,7 +25,7 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
     start_time = time.time()
     while time.time() - start_time < timeout:
         try:
-            requests.get("http://localhost:8080/v1/.well-known/read")
+            requests.get("http://localhost:8080/v1/.well-known/read", timeout=1)
            return
        except Exception as e:
            print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
@@ -34,15 +34,20 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:


 @pytest.fixture
-def collection(collections_schema_config: dict) -> str:
+def weaviate_instance():
     with container_context(
         image="semitechnologies/weaviate:1.27.3",
         ports={8080: 8080, 50051: 50051},
-    ):
+    ) as ctx:
         wait_for_container()
-        with weaviate.connect_to_local() as weaviate_client:
-            weaviate_client.collections.create_from_dict(config=collections_schema_config)
-            yield COLLECTION_NAME
+        yield ctx
+
+
+@pytest.fixture
+def collection(weaviate_instance, collections_schema_config: dict) -> str:
+    with weaviate.connect_to_local() as weaviate_client:
+        weaviate_client.collections.create_from_dict(config=collections_schema_config)
+        return COLLECTION_NAME


 def get_count(client: WeaviateClient) -> int:
@@ -129,3 +134,19 @@ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path
         file_data=file_data,
         expected_count=expected_count,
     )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+def test_weaviate_local_create_destination(weaviate_instance):
+    uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    collection_name = "system_created"
+    created = uploader.create_destination(destination_name=collection_name)
+    assert created
+    with uploader.connection_config.get_client() as weaviate_client:
+        assert weaviate_client.collections.exists(name=collection_name)
+
+    created = uploader.create_destination(destination_name=collection_name)
+    assert not created
test/integration/embedders/test_azure_openai.py
@@ -54,6 +54,4 @@ def test_raw_azure_openai_embedder(embedder_file: Path):
             azure_endpoint=azure_data.endpoint,
         )
     )
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
test/integration/embedders/test_bedrock.py
@@ -55,7 +55,7 @@ def test_raw_bedrock_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(1536,),
+        expected_dimension=1536,
         expected_is_unit_vector=False,
     )

@@ -98,6 +98,6 @@ async def test_raw_async_bedrock_embedder(embedder_file: Path):
     await validate_raw_embedder_async(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(1536,),
+        expected_dimension=1536,
         expected_is_unit_vector=False,
     )
test/integration/embedders/test_huggingface.py
@@ -21,6 +21,4 @@ def test_huggingface_embedder(embedder_file: Path):

 def test_raw_hugginface_embedder(embedder_file: Path):
     embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(384,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=384)
test/integration/embedders/test_mixedbread.py
@@ -49,7 +49,7 @@ def test_raw_mixedbread_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(1024,),
+        expected_dimension=1024,
         expected_is_unit_vector=False,
     )

@@ -66,6 +66,6 @@ async def test_raw_async_mixedbread_embedder(embedder_file: Path):
     await validate_raw_embedder_async(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(1024,),
+        expected_dimension=1024,
         expected_is_unit_vector=False,
     )
test/integration/embedders/test_octoai.py
@@ -47,9 +47,7 @@ def test_raw_octoai_embedder(embedder_file: Path):
             api_key=api_key,
         )
     )
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)


 @pytest.mark.skip(reason="Unexpected connection error at the moment")
@@ -73,5 +71,5 @@ async def test_raw_async_octoai_embedder(embedder_file: Path):
         )
     )
     await validate_raw_embedder_async(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
+        embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
     )
test/integration/embedders/test_openai.py
@@ -47,9 +47,7 @@ def test_raw_openai_embedder(embedder_file: Path):
             api_key=api_key,
         )
     )
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)


 def test_raw_openai_embedder_invalid_credentials():
@@ -72,5 +70,5 @@ async def test_raw_async_openai_embedder(embedder_file: Path):
         )
     )
     await validate_raw_embedder_async(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
+        embedder=embedder, embedder_file=embedder_file, expected_dimension=1536
     )
test/integration/embedders/test_togetherai.py
@@ -46,7 +46,7 @@ def test_raw_togetherai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(768,),
+        expected_dimension=768,
         expected_is_unit_vector=False,
     )

@@ -66,6 +66,6 @@ async def test_raw_async_togetherai_embedder(embedder_file: Path):
     await validate_raw_embedder_async(
         embedder=embedder,
         embedder_file=embedder_file,
-        expected_dimensions=(768,),
+        expected_dimension=768,
         expected_is_unit_vector=False,
     )
test/integration/embedders/test_vertexai.py
@@ -46,9 +46,7 @@ def test_raw_vertexai_embedder(embedder_file: Path):
             api_key=api_key,
         )
     )
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(768,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)


 @requires_env(API_KEY)
@@ -61,5 +59,5 @@ async def test_raw_async_vertexai_embedder(embedder_file: Path):
         )
     )
     await validate_raw_embedder_async(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(768,)
+        embedder=embedder, embedder_file=embedder_file, expected_dimension=768
     )
test/integration/embedders/test_voyageai.py
@@ -46,9 +46,7 @@ def test_raw_voyageai_embedder(embedder_file: Path):
             api_key=api_key,
         )
     )
-    validate_raw_embedder(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
-    )
+    validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)


 @requires_env(API_KEY)
@@ -61,5 +59,5 @@ async def test_raw_async_voyageai_embedder(embedder_file: Path):
         )
     )
     await validate_raw_embedder_async(
-        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
+        embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
     )
test/integration/embedders/utils.py
@@ -23,23 +23,22 @@ def validate_embedding_output(original_elements: list[dict], output_elements: li
 def validate_raw_embedder(
     embedder: BaseEmbeddingEncoder,
     embedder_file: Path,
-    expected_dimensions: Optional[tuple[int, ...]] = None,
+    expected_dimension: Optional[int] = None,
     expected_is_unit_vector: bool = True,
 ):
     with open(embedder_file) as f:
         elements = json.load(f)
     all_text = [element["text"] for element in elements]
     single_text = all_text[0]
-    num_of_dimensions = embedder.num_of_dimensions
-    if expected_dimensions:
+    dimension = embedder.dimension
+    if expected_dimension:
         assert (
-            num_of_dimensions == expected_dimensions
-        ), f"number of dimensions {num_of_dimensions} didn't match expected: {expected_dimensions}"
+            dimension == expected_dimension
+        ), f"dimensions {dimension} didn't match expected: {expected_dimension}"
     is_unit_vector = embedder.is_unit_vector
     assert is_unit_vector == expected_is_unit_vector
     single_embedding = embedder.embed_query(query=single_text)
-    expected_length = num_of_dimensions[0]
-    assert len(single_embedding) == expected_length
+    assert len(single_embedding) == dimension
     embedded_elements = embedder.embed_documents(elements=elements)
     validate_embedding_output(original_elements=elements, output_elements=embedded_elements)

@@ -47,22 +46,21 @@ def validate_raw_embedder(
 async def validate_raw_embedder_async(
     embedder: AsyncBaseEmbeddingEncoder,
     embedder_file: Path,
-    expected_dimensions: Optional[tuple[int, ...]] = None,
+    expected_dimension: Optional[int] = None,
     expected_is_unit_vector: bool = True,
 ):
     with open(embedder_file) as f:
         elements = json.load(f)
     all_text = [element["text"] for element in elements]
     single_text = all_text[0]
-    num_of_dimensions = await embedder.num_of_dimensions
-    if expected_dimensions:
+    dimension = await embedder.dimension
+    if expected_dimension:
         assert (
-            num_of_dimensions == expected_dimensions
-        ), f"number of dimensions {num_of_dimensions} didn't match expected: {expected_dimensions}"
+            dimension == expected_dimension
+        ), f"dimension {dimension} didn't match expected: {expected_dimension}"
     is_unit_vector = await embedder.is_unit_vector
     assert is_unit_vector == expected_is_unit_vector
     single_embedding = await embedder.embed_query(query=single_text)
-    expected_length = num_of_dimensions[0]
-    assert len(single_embedding) == expected_length
+    assert len(single_embedding) == dimension
     embedded_elements = await embedder.embed_documents(elements=elements)
     validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
test/unit/embed/test_openai.py
@@ -3,18 +3,26 @@ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbedd

 def test_embed_documents_does_not_break_element_to_dict(mocker):
     # Mocked client with the desired behavior for embed_documents
+    raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(4)]
+    mock_response = mocker.MagicMock()
+    mock_response_data = []
+    for i in range(2):
+        mock_response_d = mocker.MagicMock()
+        mock_response_d.embedding = [1, 2]
+        mock_response_data.append(mock_response_d)
+    mock_response.data = mock_response_data
     mock_client = mocker.MagicMock()
-    mock_client.embed_documents.return_value = [1, 2]
+    mock_client.embeddings.create.return_value = mock_response

     # Mock get_client to return our mock_client
     mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)

-    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
-    raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
+    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key", batch_size=2))

     elements = encoder.embed_documents(
         elements=raw_elements,
     )
-    assert len(elements) == 2
+    assert len(elements) == 4
     assert elements[0]["text"] == "This is sentence 1"
     assert elements[1]["text"] == "This is sentence 2"
+    assert mock_client.embeddings.create.call_count == 2
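Note: the reworked unit test pins down the client-side batching arithmetic: four elements with batch_size=2 should translate into exactly two embeddings.create calls. A minimal sketch of that expectation (the helper name below is illustrative, not part of the package):

import math

def expected_call_count(num_texts: int, batch_size: int) -> int:
    # One API call per chunk of at most `batch_size` inputs.
    return math.ceil(num_texts / batch_size)

assert expected_call_count(4, 2) == 2  # matches the mocked call_count asserted above
assert expected_call_count(5, 2) == 3  # a final short batch still costs one call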
test/unit/test_html.py ADDED
@@ -0,0 +1,112 @@
+import base64
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.utils.html import HtmlMixin
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+
+
+def test_extract_images(mocker: MockerFixture):
+    mixin = HtmlMixin(extract_images=True)
+    mock_download_response = b"DOWNLOADED"
+    expected_image_src = base64.b64encode(mock_download_response).decode()
+    mocked_download_response = mocker.patch(
+        "unstructured_ingest.utils.html.HtmlMixin.download_content",
+        return_value=mock_download_response,
+    )
+    url = "http://mywebsite.com/path/to/page"
+    html = """
+    <img src="http://mywebsite.com/img1.jpg"/>
+    <img src="http://notmywebsite.com/img2.jpg"/>
+    <img src="img3.jpg"/>
+    <img src="data:image/png;base64,24689654..."/>
+    """
+    expected_html = f"""
+    <img src="data:image/png;base64,{expected_image_src}"/>
+    <img src="http://notmywebsite.com/img2.jpg"/>
+    <img src="data:image/png;base64,{expected_image_src}"/>
+    <img src="data:image/png;base64,24689654..."/>
+    """
+    expected_soup = BeautifulSoup(expected_html, "html.parser")
+    result = mixin.extract_html_images(url=url, html=html)
+    result_soup = BeautifulSoup(result, "html.parser")
+    assert expected_soup == result_soup
+    assert mocked_download_response.call_count == 2
+    urls_to_download = [
+        call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
+    ]
+    assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
+
+
+def test_extract_images_allow_list(mocker: MockerFixture):
+    mixin = HtmlMixin(
+        extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
+    )
+    mock_download_response = b"DOWNLOADED"
+    expected_image_src = base64.b64encode(mock_download_response).decode()
+    mocked_download_response = mocker.patch(
+        "unstructured_ingest.utils.html.HtmlMixin.download_content",
+        return_value=mock_download_response,
+    )
+    url = "http://mywebsite.com/path/to/page"
+    html = """
+    <img src="http://mywebsite.com/img1.jpg"/>
+    <img src="http://notmywebsite.com/img2.jpg"/>
+    <img src="http://allowedwebsite1.com/img2.jpg"/>
+    <img src="http://allowedwebsite2.com/img2.jpg"/>
+    """
+
+    expected_html = f"""
+    <img src="http://mywebsite.com/img1.jpg"/>
+    <img src="http://notmywebsite.com/img2.jpg"/>
+    <img src="data:image/png;base64,{expected_image_src}"/>
+    <img src="data:image/png;base64,{expected_image_src}"/>
+    """
+    expected_soup = BeautifulSoup(expected_html, "html.parser")
+    result = mixin.extract_html_images(url=url, html=html)
+    result_soup = BeautifulSoup(result, "html.parser")
+    assert expected_soup == result_soup
+    assert mocked_download_response.call_count == 2
+    urls_to_download = [
+        call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
+    ]
+    assert urls_to_download == [
+        "http://allowedwebsite1.com/img2.jpg",
+        "http://allowedwebsite2.com/img2.jpg",
+    ]
+
+
+def test_extract_embedded_docs(mocker: MockerFixture):
+    mixin = HtmlMixin(extract_files=True)
+    mock_download_response = b"DOWNLOADED"
+    mocked_download_response = mocker.patch(
+        "unstructured_ingest.utils.html.HtmlMixin.download_content",
+        return_value=mock_download_response,
+    )
+    mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
+    url = "http://mywebsite.com/path/to/page"
+    html = """
+    <a href="http://mywebsite.com/file.pdf"/>
+    <a href="http://notmywebsite.com/file.pdf"/>
+    <a href="http://mywebsite.com/another/link"/>
+    <a href="another/link/2"/>
+    <a href="file.doc"/>
+    """
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath="file.txt",
+            filename="file.txt",
+        ),
+        connector_type="my_connector",
+        identifier="mock_file_data",
+    )
+    results = mixin.extract_embedded_files(
+        url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
+    )
+    assert len(results) == 2
+    downloaded_urls = [r["file_data"].metadata.url for r in results]
+    assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
+    assert mocked_download_response.call_count == 2
+    assert mocked_write_content.call_count == 2
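Note: the new image tests rely on the data-URI convention used by HtmlMixin: downloaded bytes are base64-encoded and inlined as the img src. A tiny standalone illustration of that encoding step, using the values from the test above:

import base64

content = b"DOWNLOADED"
src = f"data:image/png;base64,{base64.b64encode(content).decode()}"
print(src)  # data:image/png;base64,RE9XTkxPQURFRA==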
test/unit/v2/embedders/test_voyageai.py
@@ -14,7 +14,7 @@ def generate_embedder_config_params() -> dict:
     }
     if random.random() < 0.5:
         params["embedder_model_name"] = fake.word()
-        params["batch_size"] = fake.random_int()
+        params["batch_size"] = fake.random_int(max=100)
         params["truncation"] = fake.boolean()
         params["max_retries"] = fake.random_int()
         params["timeout_in_seconds"] = fake.random_int()
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.2"  # pragma: no cover
+__version__ = "0.4.3"  # pragma: no cover
unstructured_ingest/embed/huggingface.py
@@ -33,6 +33,11 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
             **self.embedder_model_kwargs,
         )

+    def get_encoder_kwargs(self) -> dict:
+        encoder_kwargs = self.encode_kwargs or {}
+        encoder_kwargs["batch_size"] = self.batch_size
+        return encoder_kwargs
+

 @dataclass
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
@@ -43,7 +48,7 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):

     def _embed_documents(self, texts: list[str]) -> list[list[float]]:
         client = self.config.get_client()
-        embeddings = client.encode(texts, **self.config.encode_kwargs)
+        embeddings = client.encode(texts, **self.config.get_encoder_kwargs())
         return embeddings.tolist()

     def embed_documents(self, elements: list[dict]) -> list[dict]:
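Note: get_encoder_kwargs merges the shared batch_size into whatever encode_kwargs the user supplied before they are splatted into SentenceTransformer.encode (which accepts a batch_size argument). A standalone mirror of that merge, offered as a sketch rather than the library code:

from typing import Optional

def get_encoder_kwargs(encode_kwargs: Optional[dict], batch_size: Optional[int]) -> dict:
    # Copy the user-supplied kwargs and thread the configured batch size through.
    kwargs = dict(encode_kwargs or {})
    kwargs["batch_size"] = batch_size
    return kwargs

assert get_encoder_kwargs({"normalize_embeddings": True}, 16) == {
    "normalize_embeddings": True,
    "batch_size": 16,
}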
unstructured_ingest/embed/interfaces.py
@@ -1,13 +1,16 @@
 import asyncio
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from typing import Optional

 import numpy as np
-from pydantic import BaseModel
+from pydantic import BaseModel, Field


 class EmbeddingConfig(BaseModel):
-    pass
+    batch_size: Optional[int] = Field(
+        default=32, description="Optional batch size for embedding requests."
+    )


 @dataclass
@@ -53,9 +56,9 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         is properly configured: e.g., embed a single a element"""

     @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
+    def dimension(self):
         exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
+        return len(exemplary_embedding)

     def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="Q")
@@ -91,9 +94,9 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         is properly configured: e.g., embed a single a element"""

     @property
-    async def num_of_dimensions(self) -> tuple[int, ...]:
+    async def dimension(self):
         exemplary_embedding = await self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
+        return len(exemplary_embedding)

     async def get_exemplary_embedding(self) -> list[float]:
         return await self.embed_query(query="Q")
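Note: two behavioural changes land here: every provider config now inherits an optional batch_size (default 32) from EmbeddingConfig, and the dimension property returns a plain int (the length of an exemplary embedding) where num_of_dimensions used to return numpy's shape tuple. A self-contained mirror of the pattern, for orientation only (ToyEncoder is illustrative, not package code):

from typing import Optional

from pydantic import BaseModel, Field

class EmbeddingConfig(BaseModel):
    batch_size: Optional[int] = Field(
        default=32, description="Optional batch size for embedding requests."
    )

class ToyEncoder:
    def embed_query(self, query: str) -> list[float]:
        return [0.0, 1.0, 2.0]  # stand-in for a real provider call

    @property
    def dimension(self) -> int:
        # Length of one exemplary embedding, where the old property returned np.shape(...)
        return len(self.embed_query("Q"))

assert EmbeddingConfig().batch_size == 32
assert ToyEncoder().dimension == 3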
unstructured_ingest/embed/mixedbreadai.py
@@ -10,10 +10,10 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies

 USER_AGENT = "@mixedbread-ai/unstructured"
-BATCH_SIZE = 128
 TIMEOUT = 60
 MAX_RETRIES = 3
 ENCODING_FORMAT = "float"
@@ -109,13 +109,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
         Returns:
             list[list[float]]: List of embeddings.
         """
-        batch_size = BATCH_SIZE
-        batch_itr = range(0, len(texts), batch_size)

         responses = []
         client = self.config.get_client()
-        for i in batch_itr:
-            batch = texts[i : i + batch_size]
+        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
             response = client.embeddings(
                 model=self.config.embedder_model_name,
                 normalized=True,
@@ -186,13 +183,9 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         Returns:
             list[list[float]]: List of embeddings.
         """
-        batch_size = BATCH_SIZE
-        batch_itr = range(0, len(texts), batch_size)
-
         client = self.config.get_async_client()
         tasks = []
-        for i in batch_itr:
-            batch = texts[i : i + batch_size]
+        for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
             tasks.append(
                 client.embeddings(
                     model=self.config.embedder_model_name,
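Note: batch_generator comes from unstructured_ingest.utils.data_prep and its body is not part of this diff; the call sites only require that it yields successive chunks of at most batch_size items. A minimal sketch of such a helper, offered as an assumption rather than the actual implementation:

from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")

def batch_generator(items: Sequence[T], batch_size: int) -> Iterator[Sequence[T]]:
    # Yield consecutive slices of at most `batch_size` items; the last slice may be shorter.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]

assert list(batch_generator(["a", "b", "c"], batch_size=2)) == [["a", "b"], ["c"]]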
unstructured_ingest/embed/octoai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -89,12 +90,16 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):

     def embed_documents(self, elements: list[dict]) -> list[dict]:
         texts = [e.get("text", "") for e in elements]
+        embeddings = []
+        client = self.config.get_client()
         try:
-            client = self.config.get_client()
-            response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings

@@ -119,12 +124,14 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         texts = [e.get("text", "") for e in elements]
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            response = await client.embeddings.create(
-                input=texts, model=self.config.embedder_model_name
-            )
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = await client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
unstructured_ingest/embed/openai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -80,7 +81,17 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return response.data[0].embedding

     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = self._embed_documents([e.get("text", "") for e in elements])
+        client = self.config.get_client()
+        texts = [e.get("text", "") for e in elements]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
+        except Exception as e:
+            raise self.wrap_error(e=e)
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings

@@ -105,12 +116,14 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         client = self.config.get_async_client()
         texts = [e.get("text", "") for e in elements]
+        embeddings = []
         try:
-            response = await client.embeddings.create(
-                input=texts, model=self.config.embedder_model_name
-            )
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = await client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
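Note: putting the pieces together, a hedged usage sketch based only on names appearing in this diff: configure the encoder with an explicit batch_size and let embed_documents split the request into batches. It assumes valid OpenAI credentials and the relevant extras installed; the key value is a placeholder.

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(
    config=OpenAIEmbeddingConfig(api_key="sk-...", batch_size=50)  # placeholder key
)
elements = [{"text": f"sentence {i}"} for i in range(120)]
# With 120 texts and batch_size=50, embed_documents should issue three embeddings.create calls.
embedded = encoder.embed_documents(elements=elements)
print(len(embedded), encoder.dimension)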