unstructured-ingest 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
- test/integration/connectors/weaviate/test_local.py +27 -6
- test/integration/embedders/test_azure_openai.py +1 -3
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -3
- test/integration/embedders/test_mixedbread.py +2 -2
- test/integration/embedders/test_octoai.py +2 -4
- test/integration/embedders/test_openai.py +2 -4
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +2 -4
- test/integration/embedders/test_voyageai.py +2 -4
- test/integration/embedders/utils.py +12 -14
- test/unit/embed/test_openai.py +12 -4
- test/unit/test_html.py +112 -0
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
- test/unit/v2/embedders/test_voyageai.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +6 -1
- unstructured_ingest/embed/interfaces.py +9 -6
- unstructured_ingest/embed/mixedbreadai.py +3 -10
- unstructured_ingest/embed/octoai.py +14 -7
- unstructured_ingest/embed/openai.py +18 -5
- unstructured_ingest/embed/togetherai.py +19 -8
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +19 -6
- unstructured_ingest/utils/data_prep.py +1 -1
- unstructured_ingest/utils/html.py +143 -93
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/interfaces/uploader.py +14 -1
- unstructured_ingest/v2/pipeline/pipeline.py +20 -6
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
- unstructured_ingest/v2/processes/connectors/sql/sql.py +4 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
- unstructured_ingest/v2/processes/embedder.py +3 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +45 -41
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
|
@@ -17,11 +17,11 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
|
17
17
|
from unstructured_ingest.v2.logger import logger
|
|
18
18
|
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
|
|
19
19
|
CONNECTOR_TYPE,
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
20
|
+
DatabricksDeltaTablesAccessConfig,
|
|
21
|
+
DatabricksDeltaTablesConnectionConfig,
|
|
22
|
+
DatabricksDeltaTablesUploader,
|
|
23
|
+
DatabricksDeltaTablesUploaderConfig,
|
|
24
|
+
DatabricksDeltaTablesUploadStager,
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
CATALOG = "utic-dev-tech-fixtures"
|
|
@@ -112,7 +112,7 @@ async def test_databricks_delta_tables_destination(
|
|
|
112
112
|
connector_type=CONNECTOR_TYPE,
|
|
113
113
|
source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
|
|
114
114
|
)
|
|
115
|
-
stager =
|
|
115
|
+
stager = DatabricksDeltaTablesUploadStager()
|
|
116
116
|
staged_path = stager.run(
|
|
117
117
|
elements_filepath=upload_file,
|
|
118
118
|
file_data=mock_file_data,
|
|
@@ -122,15 +122,15 @@ async def test_databricks_delta_tables_destination(
|
|
|
122
122
|
|
|
123
123
|
assert staged_path.suffix == upload_file.suffix
|
|
124
124
|
|
|
125
|
-
uploader =
|
|
126
|
-
connection_config=
|
|
127
|
-
access_config=
|
|
125
|
+
uploader = DatabricksDeltaTablesUploader(
|
|
126
|
+
connection_config=DatabricksDeltaTablesConnectionConfig(
|
|
127
|
+
access_config=DatabricksDeltaTablesAccessConfig(
|
|
128
128
|
token=env_data.access_token.get_secret_value()
|
|
129
129
|
),
|
|
130
130
|
http_path=env_data.http_path,
|
|
131
131
|
server_hostname=env_data.server_hostname,
|
|
132
132
|
),
|
|
133
|
-
upload_config=
|
|
133
|
+
upload_config=DatabricksDeltaTablesUploaderConfig(
|
|
134
134
|
catalog=CATALOG, database="default", table_name=destination_table
|
|
135
135
|
),
|
|
136
136
|
)
|
|
@@ -25,7 +25,7 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
|
|
|
25
25
|
start_time = time.time()
|
|
26
26
|
while time.time() - start_time < timeout:
|
|
27
27
|
try:
|
|
28
|
-
requests.get("http://localhost:8080/v1/.well-known/read")
|
|
28
|
+
requests.get("http://localhost:8080/v1/.well-known/read", timeout=1)
|
|
29
29
|
return
|
|
30
30
|
except Exception as e:
|
|
31
31
|
print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
|
|
@@ -34,15 +34,20 @@ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
@pytest.fixture
|
|
37
|
-
def
|
|
37
|
+
def weaviate_instance():
|
|
38
38
|
with container_context(
|
|
39
39
|
image="semitechnologies/weaviate:1.27.3",
|
|
40
40
|
ports={8080: 8080, 50051: 50051},
|
|
41
|
-
):
|
|
41
|
+
) as ctx:
|
|
42
42
|
wait_for_container()
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
yield ctx
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.fixture
|
|
47
|
+
def collection(weaviate_instance, collections_schema_config: dict) -> str:
|
|
48
|
+
with weaviate.connect_to_local() as weaviate_client:
|
|
49
|
+
weaviate_client.collections.create_from_dict(config=collections_schema_config)
|
|
50
|
+
return COLLECTION_NAME
|
|
46
51
|
|
|
47
52
|
|
|
48
53
|
def get_count(client: WeaviateClient) -> int:
|
|
@@ -129,3 +134,19 @@ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path
|
|
|
129
134
|
file_data=file_data,
|
|
130
135
|
expected_count=expected_count,
|
|
131
136
|
)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
140
|
+
def test_weaviate_local_create_destination(weaviate_instance):
|
|
141
|
+
uploader = LocalWeaviateUploader(
|
|
142
|
+
upload_config=LocalWeaviateUploaderConfig(),
|
|
143
|
+
connection_config=LocalWeaviateConnectionConfig(),
|
|
144
|
+
)
|
|
145
|
+
collection_name = "system_created"
|
|
146
|
+
created = uploader.create_destination(destination_name=collection_name)
|
|
147
|
+
assert created
|
|
148
|
+
with uploader.connection_config.get_client() as weaviate_client:
|
|
149
|
+
assert weaviate_client.collections.exists(name=collection_name)
|
|
150
|
+
|
|
151
|
+
created = uploader.create_destination(destination_name=collection_name)
|
|
152
|
+
assert not created
|
|
@@ -54,6 +54,4 @@ def test_raw_azure_openai_embedder(embedder_file: Path):
|
|
|
54
54
|
azure_endpoint=azure_data.endpoint,
|
|
55
55
|
)
|
|
56
56
|
)
|
|
57
|
-
validate_raw_embedder(
|
|
58
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
59
|
-
)
|
|
57
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
@@ -55,7 +55,7 @@ def test_raw_bedrock_embedder(embedder_file: Path):
|
|
|
55
55
|
validate_raw_embedder(
|
|
56
56
|
embedder=embedder,
|
|
57
57
|
embedder_file=embedder_file,
|
|
58
|
-
|
|
58
|
+
expected_dimension=1536,
|
|
59
59
|
expected_is_unit_vector=False,
|
|
60
60
|
)
|
|
61
61
|
|
|
@@ -98,6 +98,6 @@ async def test_raw_async_bedrock_embedder(embedder_file: Path):
|
|
|
98
98
|
await validate_raw_embedder_async(
|
|
99
99
|
embedder=embedder,
|
|
100
100
|
embedder_file=embedder_file,
|
|
101
|
-
|
|
101
|
+
expected_dimension=1536,
|
|
102
102
|
expected_is_unit_vector=False,
|
|
103
103
|
)
|
|
@@ -21,6 +21,4 @@ def test_huggingface_embedder(embedder_file: Path):
|
|
|
21
21
|
|
|
22
22
|
def test_raw_hugginface_embedder(embedder_file: Path):
|
|
23
23
|
embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
24
|
-
validate_raw_embedder(
|
|
25
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(384,)
|
|
26
|
-
)
|
|
24
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=384)
|
|
@@ -49,7 +49,7 @@ def test_raw_mixedbread_embedder(embedder_file: Path):
|
|
|
49
49
|
validate_raw_embedder(
|
|
50
50
|
embedder=embedder,
|
|
51
51
|
embedder_file=embedder_file,
|
|
52
|
-
|
|
52
|
+
expected_dimension=1024,
|
|
53
53
|
expected_is_unit_vector=False,
|
|
54
54
|
)
|
|
55
55
|
|
|
@@ -66,6 +66,6 @@ async def test_raw_async_mixedbread_embedder(embedder_file: Path):
|
|
|
66
66
|
await validate_raw_embedder_async(
|
|
67
67
|
embedder=embedder,
|
|
68
68
|
embedder_file=embedder_file,
|
|
69
|
-
|
|
69
|
+
expected_dimension=1024,
|
|
70
70
|
expected_is_unit_vector=False,
|
|
71
71
|
)
|
|
@@ -47,9 +47,7 @@ def test_raw_octoai_embedder(embedder_file: Path):
|
|
|
47
47
|
api_key=api_key,
|
|
48
48
|
)
|
|
49
49
|
)
|
|
50
|
-
validate_raw_embedder(
|
|
51
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
|
|
52
|
-
)
|
|
50
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
53
51
|
|
|
54
52
|
|
|
55
53
|
@pytest.mark.skip(reason="Unexpected connection error at the moment")
|
|
@@ -73,5 +71,5 @@ async def test_raw_async_octoai_embedder(embedder_file: Path):
|
|
|
73
71
|
)
|
|
74
72
|
)
|
|
75
73
|
await validate_raw_embedder_async(
|
|
76
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
74
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
77
75
|
)
|
|
@@ -47,9 +47,7 @@ def test_raw_openai_embedder(embedder_file: Path):
|
|
|
47
47
|
api_key=api_key,
|
|
48
48
|
)
|
|
49
49
|
)
|
|
50
|
-
validate_raw_embedder(
|
|
51
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
52
|
-
)
|
|
50
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
53
51
|
|
|
54
52
|
|
|
55
53
|
def test_raw_openai_embedder_invalid_credentials():
|
|
@@ -72,5 +70,5 @@ async def test_raw_async_openai_embedder(embedder_file: Path):
|
|
|
72
70
|
)
|
|
73
71
|
)
|
|
74
72
|
await validate_raw_embedder_async(
|
|
75
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
73
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1536
|
|
76
74
|
)
|
|
@@ -46,7 +46,7 @@ def test_raw_togetherai_embedder(embedder_file: Path):
|
|
|
46
46
|
validate_raw_embedder(
|
|
47
47
|
embedder=embedder,
|
|
48
48
|
embedder_file=embedder_file,
|
|
49
|
-
|
|
49
|
+
expected_dimension=768,
|
|
50
50
|
expected_is_unit_vector=False,
|
|
51
51
|
)
|
|
52
52
|
|
|
@@ -66,6 +66,6 @@ async def test_raw_async_togetherai_embedder(embedder_file: Path):
|
|
|
66
66
|
await validate_raw_embedder_async(
|
|
67
67
|
embedder=embedder,
|
|
68
68
|
embedder_file=embedder_file,
|
|
69
|
-
|
|
69
|
+
expected_dimension=768,
|
|
70
70
|
expected_is_unit_vector=False,
|
|
71
71
|
)
|
|
@@ -46,9 +46,7 @@ def test_raw_vertexai_embedder(embedder_file: Path):
|
|
|
46
46
|
api_key=api_key,
|
|
47
47
|
)
|
|
48
48
|
)
|
|
49
|
-
validate_raw_embedder(
|
|
50
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(768,)
|
|
51
|
-
)
|
|
49
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)
|
|
52
50
|
|
|
53
51
|
|
|
54
52
|
@requires_env(API_KEY)
|
|
@@ -61,5 +59,5 @@ async def test_raw_async_vertexai_embedder(embedder_file: Path):
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
await validate_raw_embedder_async(
|
|
64
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
62
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=768
|
|
65
63
|
)
|
|
@@ -46,9 +46,7 @@ def test_raw_voyageai_embedder(embedder_file: Path):
|
|
|
46
46
|
api_key=api_key,
|
|
47
47
|
)
|
|
48
48
|
)
|
|
49
|
-
validate_raw_embedder(
|
|
50
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
|
|
51
|
-
)
|
|
49
|
+
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
52
50
|
|
|
53
51
|
|
|
54
52
|
@requires_env(API_KEY)
|
|
@@ -61,5 +59,5 @@ async def test_raw_async_voyageai_embedder(embedder_file: Path):
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
await validate_raw_embedder_async(
|
|
64
|
-
embedder=embedder, embedder_file=embedder_file,
|
|
62
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
65
63
|
)
|
|
@@ -23,23 +23,22 @@ def validate_embedding_output(original_elements: list[dict], output_elements: li
|
|
|
23
23
|
def validate_raw_embedder(
|
|
24
24
|
embedder: BaseEmbeddingEncoder,
|
|
25
25
|
embedder_file: Path,
|
|
26
|
-
|
|
26
|
+
expected_dimension: Optional[int] = None,
|
|
27
27
|
expected_is_unit_vector: bool = True,
|
|
28
28
|
):
|
|
29
29
|
with open(embedder_file) as f:
|
|
30
30
|
elements = json.load(f)
|
|
31
31
|
all_text = [element["text"] for element in elements]
|
|
32
32
|
single_text = all_text[0]
|
|
33
|
-
|
|
34
|
-
if
|
|
33
|
+
dimension = embedder.dimension
|
|
34
|
+
if expected_dimension:
|
|
35
35
|
assert (
|
|
36
|
-
|
|
37
|
-
), f"
|
|
36
|
+
dimension == expected_dimension
|
|
37
|
+
), f"dimensions {dimension} didn't match expected: {expected_dimension}"
|
|
38
38
|
is_unit_vector = embedder.is_unit_vector
|
|
39
39
|
assert is_unit_vector == expected_is_unit_vector
|
|
40
40
|
single_embedding = embedder.embed_query(query=single_text)
|
|
41
|
-
|
|
42
|
-
assert len(single_embedding) == expected_length
|
|
41
|
+
assert len(single_embedding) == dimension
|
|
43
42
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
44
43
|
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
|
45
44
|
|
|
@@ -47,22 +46,21 @@ def validate_raw_embedder(
|
|
|
47
46
|
async def validate_raw_embedder_async(
|
|
48
47
|
embedder: AsyncBaseEmbeddingEncoder,
|
|
49
48
|
embedder_file: Path,
|
|
50
|
-
|
|
49
|
+
expected_dimension: Optional[int] = None,
|
|
51
50
|
expected_is_unit_vector: bool = True,
|
|
52
51
|
):
|
|
53
52
|
with open(embedder_file) as f:
|
|
54
53
|
elements = json.load(f)
|
|
55
54
|
all_text = [element["text"] for element in elements]
|
|
56
55
|
single_text = all_text[0]
|
|
57
|
-
|
|
58
|
-
if
|
|
56
|
+
dimension = await embedder.dimension
|
|
57
|
+
if expected_dimension:
|
|
59
58
|
assert (
|
|
60
|
-
|
|
61
|
-
), f"
|
|
59
|
+
dimension == expected_dimension
|
|
60
|
+
), f"dimension {dimension} didn't match expected: {expected_dimension}"
|
|
62
61
|
is_unit_vector = await embedder.is_unit_vector
|
|
63
62
|
assert is_unit_vector == expected_is_unit_vector
|
|
64
63
|
single_embedding = await embedder.embed_query(query=single_text)
|
|
65
|
-
|
|
66
|
-
assert len(single_embedding) == expected_length
|
|
64
|
+
assert len(single_embedding) == dimension
|
|
67
65
|
embedded_elements = await embedder.embed_documents(elements=elements)
|
|
68
66
|
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
test/unit/embed/test_openai.py
CHANGED
|
@@ -3,18 +3,26 @@ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbedd
|
|
|
3
3
|
|
|
4
4
|
def test_embed_documents_does_not_break_element_to_dict(mocker):
|
|
5
5
|
# Mocked client with the desired behavior for embed_documents
|
|
6
|
+
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(4)]
|
|
7
|
+
mock_response = mocker.MagicMock()
|
|
8
|
+
mock_response_data = []
|
|
9
|
+
for i in range(2):
|
|
10
|
+
mock_response_d = mocker.MagicMock()
|
|
11
|
+
mock_response_d.embedding = [1, 2]
|
|
12
|
+
mock_response_data.append(mock_response_d)
|
|
13
|
+
mock_response.data = mock_response_data
|
|
6
14
|
mock_client = mocker.MagicMock()
|
|
7
|
-
mock_client.
|
|
15
|
+
mock_client.embeddings.create.return_value = mock_response
|
|
8
16
|
|
|
9
17
|
# Mock get_client to return our mock_client
|
|
10
18
|
mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)
|
|
11
19
|
|
|
12
|
-
encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
|
|
13
|
-
raw_elements = [{"text": f"This is sentence {i + 1}"} for i in range(2)]
|
|
20
|
+
encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key", batch_size=2))
|
|
14
21
|
|
|
15
22
|
elements = encoder.embed_documents(
|
|
16
23
|
elements=raw_elements,
|
|
17
24
|
)
|
|
18
|
-
assert len(elements) ==
|
|
25
|
+
assert len(elements) == 4
|
|
19
26
|
assert elements[0]["text"] == "This is sentence 1"
|
|
20
27
|
assert elements[1]["text"] == "This is sentence 2"
|
|
28
|
+
assert mock_client.embeddings.create.call_count == 2
|
test/unit/test_html.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from pytest_mock import MockerFixture
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.utils.html import HtmlMixin
|
|
8
|
+
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_extract_images(mocker: MockerFixture):
|
|
12
|
+
mixin = HtmlMixin(extract_images=True)
|
|
13
|
+
mock_download_response = b"DOWNLOADED"
|
|
14
|
+
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
15
|
+
mocked_download_response = mocker.patch(
|
|
16
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
17
|
+
return_value=mock_download_response,
|
|
18
|
+
)
|
|
19
|
+
url = "http://mywebsite.com/path/to/page"
|
|
20
|
+
html = """
|
|
21
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
22
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
23
|
+
<img src="img3.jpg"/>
|
|
24
|
+
<img src="data:image/png;base64,24689654..."/>
|
|
25
|
+
"""
|
|
26
|
+
expected_html = f"""
|
|
27
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
28
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
29
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
30
|
+
<img src="data:image/png;base64,24689654..."/>
|
|
31
|
+
"""
|
|
32
|
+
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
33
|
+
result = mixin.extract_html_images(url=url, html=html)
|
|
34
|
+
result_soup = BeautifulSoup(result, "html.parser")
|
|
35
|
+
assert expected_soup == result_soup
|
|
36
|
+
assert mocked_download_response.call_count == 2
|
|
37
|
+
urls_to_download = [
|
|
38
|
+
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
39
|
+
]
|
|
40
|
+
assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_extract_images_allow_list(mocker: MockerFixture):
|
|
44
|
+
mixin = HtmlMixin(
|
|
45
|
+
extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
|
|
46
|
+
)
|
|
47
|
+
mock_download_response = b"DOWNLOADED"
|
|
48
|
+
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
49
|
+
mocked_download_response = mocker.patch(
|
|
50
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
51
|
+
return_value=mock_download_response,
|
|
52
|
+
)
|
|
53
|
+
url = "http://mywebsite.com/path/to/page"
|
|
54
|
+
html = """
|
|
55
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
56
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
57
|
+
<img src="http://allowedwebsite1.com/img2.jpg"/>
|
|
58
|
+
<img src="http://allowedwebsite2.com/img2.jpg"/>
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
expected_html = f"""
|
|
62
|
+
<img src="http://mywebsite.com/img1.jpg"/>
|
|
63
|
+
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
64
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
65
|
+
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
66
|
+
"""
|
|
67
|
+
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
68
|
+
result = mixin.extract_html_images(url=url, html=html)
|
|
69
|
+
result_soup = BeautifulSoup(result, "html.parser")
|
|
70
|
+
assert expected_soup == result_soup
|
|
71
|
+
assert mocked_download_response.call_count == 2
|
|
72
|
+
urls_to_download = [
|
|
73
|
+
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
74
|
+
]
|
|
75
|
+
assert urls_to_download == [
|
|
76
|
+
"http://allowedwebsite1.com/img2.jpg",
|
|
77
|
+
"http://allowedwebsite2.com/img2.jpg",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_extract_embedded_docs(mocker: MockerFixture):
|
|
82
|
+
mixin = HtmlMixin(extract_files=True)
|
|
83
|
+
mock_download_response = b"DOWNLOADED"
|
|
84
|
+
mocked_download_response = mocker.patch(
|
|
85
|
+
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
86
|
+
return_value=mock_download_response,
|
|
87
|
+
)
|
|
88
|
+
mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
|
|
89
|
+
url = "http://mywebsite.com/path/to/page"
|
|
90
|
+
html = """
|
|
91
|
+
<a href="http://mywebsite.com/file.pdf"/>
|
|
92
|
+
<a href="http://notmywebsite.com/file.pdf"/>
|
|
93
|
+
<a href="http://mywebsite.com/another/link"/>
|
|
94
|
+
<a href="another/link/2"/>
|
|
95
|
+
<a href="file.doc"/>
|
|
96
|
+
"""
|
|
97
|
+
file_data = FileData(
|
|
98
|
+
source_identifiers=SourceIdentifiers(
|
|
99
|
+
fullpath="file.txt",
|
|
100
|
+
filename="file.txt",
|
|
101
|
+
),
|
|
102
|
+
connector_type="my_connector",
|
|
103
|
+
identifier="mock_file_data",
|
|
104
|
+
)
|
|
105
|
+
results = mixin.extract_embedded_files(
|
|
106
|
+
url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
|
|
107
|
+
)
|
|
108
|
+
assert len(results) == 2
|
|
109
|
+
downloaded_urls = [r["file_data"].metadata.url for r in results]
|
|
110
|
+
assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
|
|
111
|
+
assert mocked_download_response.call_count == 2
|
|
112
|
+
assert mocked_write_content.call_count == 2
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from pytest_mock import MockerFixture
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
|
|
7
|
+
DatabricksVolumeDeltaTableStager,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.fixture
|
|
12
|
+
def stager():
|
|
13
|
+
return DatabricksVolumeDeltaTableStager()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.mark.parametrize(
|
|
17
|
+
("output_path", "called_output_path"),
|
|
18
|
+
[
|
|
19
|
+
(
|
|
20
|
+
Path("/fake/path/output"),
|
|
21
|
+
Path("/fake/path/output.json"),
|
|
22
|
+
),
|
|
23
|
+
(
|
|
24
|
+
Path("/fake/path/output.ndjson"),
|
|
25
|
+
Path("/fake/path/output.json"),
|
|
26
|
+
),
|
|
27
|
+
],
|
|
28
|
+
)
|
|
29
|
+
def test_write_output(
|
|
30
|
+
mocker: MockerFixture,
|
|
31
|
+
stager: DatabricksVolumeDeltaTableStager,
|
|
32
|
+
output_path: Path,
|
|
33
|
+
called_output_path: Path,
|
|
34
|
+
):
|
|
35
|
+
data = [{"key1": "value1", "key2": "value2"}]
|
|
36
|
+
|
|
37
|
+
mock_get_data = mocker.patch(
|
|
38
|
+
"unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
|
|
39
|
+
return_value=None,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
stager.write_output(output_path, data)
|
|
43
|
+
|
|
44
|
+
mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)
|
|
@@ -14,7 +14,7 @@ def generate_embedder_config_params() -> dict:
|
|
|
14
14
|
}
|
|
15
15
|
if random.random() < 0.5:
|
|
16
16
|
params["embedder_model_name"] = fake.word()
|
|
17
|
-
params["batch_size"] = fake.random_int()
|
|
17
|
+
params["batch_size"] = fake.random_int(max=100)
|
|
18
18
|
params["truncation"] = fake.boolean()
|
|
19
19
|
params["max_retries"] = fake.random_int()
|
|
20
20
|
params["timeout_in_seconds"] = fake.random_int()
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.4.
|
|
1
|
+
__version__ = "0.4.3" # pragma: no cover
|
|
@@ -33,6 +33,11 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
33
33
|
**self.embedder_model_kwargs,
|
|
34
34
|
)
|
|
35
35
|
|
|
36
|
+
def get_encoder_kwargs(self) -> dict:
|
|
37
|
+
encoder_kwargs = self.encode_kwargs or {}
|
|
38
|
+
encoder_kwargs["batch_size"] = self.batch_size
|
|
39
|
+
return encoder_kwargs
|
|
40
|
+
|
|
36
41
|
|
|
37
42
|
@dataclass
|
|
38
43
|
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
@@ -43,7 +48,7 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
43
48
|
|
|
44
49
|
def _embed_documents(self, texts: list[str]) -> list[list[float]]:
|
|
45
50
|
client = self.config.get_client()
|
|
46
|
-
embeddings = client.encode(texts, **self.config.
|
|
51
|
+
embeddings = client.encode(texts, **self.config.get_encoder_kwargs())
|
|
47
52
|
return embeddings.tolist()
|
|
48
53
|
|
|
49
54
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
|
-
from pydantic import BaseModel
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class EmbeddingConfig(BaseModel):
|
|
10
|
-
|
|
11
|
+
batch_size: Optional[int] = Field(
|
|
12
|
+
default=32, description="Optional batch size for embedding requests."
|
|
13
|
+
)
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
@dataclass
|
|
@@ -53,9 +56,9 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
|
|
|
53
56
|
is properly configured: e.g., embed a single a element"""
|
|
54
57
|
|
|
55
58
|
@property
|
|
56
|
-
def
|
|
59
|
+
def dimension(self):
|
|
57
60
|
exemplary_embedding = self.get_exemplary_embedding()
|
|
58
|
-
return
|
|
61
|
+
return len(exemplary_embedding)
|
|
59
62
|
|
|
60
63
|
def get_exemplary_embedding(self) -> list[float]:
|
|
61
64
|
return self.embed_query(query="Q")
|
|
@@ -91,9 +94,9 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
|
|
|
91
94
|
is properly configured: e.g., embed a single a element"""
|
|
92
95
|
|
|
93
96
|
@property
|
|
94
|
-
async def
|
|
97
|
+
async def dimension(self):
|
|
95
98
|
exemplary_embedding = await self.get_exemplary_embedding()
|
|
96
|
-
return
|
|
99
|
+
return len(exemplary_embedding)
|
|
97
100
|
|
|
98
101
|
async def get_exemplary_embedding(self) -> list[float]:
|
|
99
102
|
return await self.embed_query(query="Q")
|
|
@@ -10,10 +10,10 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
10
10
|
BaseEmbeddingEncoder,
|
|
11
11
|
EmbeddingConfig,
|
|
12
12
|
)
|
|
13
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
13
14
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
15
|
|
|
15
16
|
USER_AGENT = "@mixedbread-ai/unstructured"
|
|
16
|
-
BATCH_SIZE = 128
|
|
17
17
|
TIMEOUT = 60
|
|
18
18
|
MAX_RETRIES = 3
|
|
19
19
|
ENCODING_FORMAT = "float"
|
|
@@ -109,13 +109,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
109
109
|
Returns:
|
|
110
110
|
list[list[float]]: List of embeddings.
|
|
111
111
|
"""
|
|
112
|
-
batch_size = BATCH_SIZE
|
|
113
|
-
batch_itr = range(0, len(texts), batch_size)
|
|
114
112
|
|
|
115
113
|
responses = []
|
|
116
114
|
client = self.config.get_client()
|
|
117
|
-
for
|
|
118
|
-
batch = texts[i : i + batch_size]
|
|
115
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
119
116
|
response = client.embeddings(
|
|
120
117
|
model=self.config.embedder_model_name,
|
|
121
118
|
normalized=True,
|
|
@@ -186,13 +183,9 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
|
186
183
|
Returns:
|
|
187
184
|
list[list[float]]: List of embeddings.
|
|
188
185
|
"""
|
|
189
|
-
batch_size = BATCH_SIZE
|
|
190
|
-
batch_itr = range(0, len(texts), batch_size)
|
|
191
|
-
|
|
192
186
|
client = self.config.get_async_client()
|
|
193
187
|
tasks = []
|
|
194
|
-
for
|
|
195
|
-
batch = texts[i : i + batch_size]
|
|
188
|
+
for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
|
|
196
189
|
tasks.append(
|
|
197
190
|
client.embeddings(
|
|
198
191
|
model=self.config.embedder_model_name,
|