unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (59)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_kafka.py +116 -16
  4. test/integration/connectors/test_pinecone.py +161 -0
  5. test/integration/connectors/test_s3.py +23 -0
  6. test/unit/v2/__init__.py +0 -0
  7. test/unit/v2/chunkers/__init__.py +0 -0
  8. test/unit/v2/chunkers/test_chunkers.py +49 -0
  9. test/unit/v2/connectors/__init__.py +0 -0
  10. test/unit/v2/embedders/__init__.py +0 -0
  11. test/unit/v2/embedders/test_bedrock.py +36 -0
  12. test/unit/v2/embedders/test_huggingface.py +48 -0
  13. test/unit/v2/embedders/test_mixedbread.py +37 -0
  14. test/unit/v2/embedders/test_octoai.py +35 -0
  15. test/unit/v2/embedders/test_openai.py +35 -0
  16. test/unit/v2/embedders/test_togetherai.py +37 -0
  17. test/unit/v2/embedders/test_vertexai.py +37 -0
  18. test/unit/v2/embedders/test_voyageai.py +38 -0
  19. test/unit/v2/partitioners/__init__.py +0 -0
  20. test/unit/v2/partitioners/test_partitioner.py +63 -0
  21. test/unit/v2/utils/__init__.py +0 -0
  22. test/unit/v2/utils/data_generator.py +32 -0
  23. unstructured_ingest/__version__.py +1 -1
  24. unstructured_ingest/cli/cmds/__init__.py +2 -2
  25. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  26. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  27. unstructured_ingest/runner/writers/__init__.py +2 -2
  28. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  29. unstructured_ingest/v2/constants.py +2 -0
  30. unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
  31. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  32. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  33. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  34. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  35. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  36. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  37. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  38. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  39. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  40. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  41. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  42. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
  43. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  44. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  45. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  46. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  47. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  48. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  49. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  50. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
  51. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
  52. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  53. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  54. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  55. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  56. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  57. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  58. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  59. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
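
The most visible change in this release is the rename of the Azure Cognitive Search connector to Azure AI Search (items 25, 26, 28, 33, and 52 in the list above). Going only by the new integration test shipped in this diff, a v2 destination upload through the renamed connector might be wired up roughly as sketched below; the endpoint, index name, API key, and file paths are illustrative placeholders, not values taken from the package.

from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    CONNECTOR_TYPE,
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
    AzureAISearchUploader,
    AzureAISearchUploaderConfig,
    AzureAISearchUploadStager,
    AzureAISearchUploadStagerConfig,
)

# Placeholder inputs: a JSON file of partitioned elements and a directory to stage into.
elements_file = Path("example-output/elements.json")
file_data = FileData(
    source_identifiers=SourceIdentifiers(fullpath=elements_file.name, filename=elements_file.name),
    connector_type=CONNECTOR_TYPE,
    identifier="example-file",
)

# Stage the elements into the shape the index expects, then push them up.
stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
staged_file = stager.run(
    elements_filepath=elements_file,
    file_data=file_data,
    output_dir=Path("staged-output"),
    output_filename=elements_file.name,
)
uploader = AzureAISearchUploader(
    connection_config=AzureAISearchConnectionConfig(
        access_config=AzureAISearchAccessConfig(key="<search-api-key>"),  # placeholder credential
        endpoint="https://<service-name>.search.windows.net",  # placeholder endpoint
        index="<index-name>",  # placeholder index
    ),
    upload_config=AzureAISearchUploaderConfig(),
)
uploader.precheck()  # verify the index is reachable before writing
uploader.run(path=staged_file, file_data=file_data)

The same stage-then-upload pattern (stager.run followed by uploader.precheck and uploader.run) also appears in the new AstraDB, Kafka, and Pinecone destination tests below.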
@@ -0,0 +1,109 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from astrapy import Collection
+ from astrapy import DataAPIClient as AstraDBClient
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.astradb import (
+     CONNECTOR_TYPE,
+     AstraDBAccessConfig,
+     AstraDBConnectionConfig,
+     AstraDBUploader,
+     AstraDBUploaderConfig,
+     AstraDBUploadStager,
+ )
+
+
+ @dataclass(frozen=True)
+ class EnvData:
+     api_endpoint: str
+     token: str
+
+
+ def get_env_data() -> EnvData:
+     return EnvData(
+         api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+         token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+     )
+
+
+ @pytest.fixture
+ def collection(upload_file: Path) -> Collection:
+     random_id = str(uuid4())[:8]
+     collection_name = f"utic_test_{random_id}"
+     with upload_file.open("r") as upload_fp:
+         upload_data = json.load(upload_fp)
+     first_content = upload_data[0]
+     embeddings = first_content["embeddings"]
+     embedding_dimension = len(embeddings)
+     my_client = AstraDBClient()
+     env_data = get_env_data()
+     astra_db = my_client.get_database(
+         api_endpoint=env_data.api_endpoint,
+         token=env_data.token,
+     )
+     collection = astra_db.create_collection(collection_name, dimension=embedding_dimension)
+     try:
+         yield collection
+     finally:
+         astra_db.drop_collection(collection)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+ async def test_astradb_destination(
+     upload_file: Path,
+     collection: Collection,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AstraDBUploadStager()
+     env_data = get_env_data()
+     uploader = AstraDBUploader(
+         connection_config=AstraDBConnectionConfig(
+             access_config=AstraDBAccessConfig(
+                 api_endpoint=env_data.api_endpoint, token=env_data.token
+             ),
+         ),
+         upload_config=AstraDBUploaderConfig(collection_name=collection.name),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"much came back from collection: {current_count}"
+     )
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"much came back from collection: {current_count}"
+     )
@@ -0,0 +1,233 @@
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from azure.core.credentials import AzureKeyCredential
+ from azure.search.documents import SearchClient
+ from azure.search.documents.indexes import SearchIndexClient
+ from azure.search.documents.indexes.models import (
+     ComplexField,
+     CorsOptions,
+     HnswAlgorithmConfiguration,
+     HnswParameters,
+     SearchField,
+     SearchFieldDataType,
+     SearchIndex,
+     SimpleField,
+     VectorSearch,
+     VectorSearchAlgorithmMetric,
+     VectorSearchProfile,
+ )
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
+     CONNECTOR_TYPE,
+     RECORD_ID_LABEL,
+     AzureAISearchAccessConfig,
+     AzureAISearchConnectionConfig,
+     AzureAISearchUploader,
+     AzureAISearchUploaderConfig,
+     AzureAISearchUploadStager,
+     AzureAISearchUploadStagerConfig,
+ )
+
+ repo_path = Path(__file__).parent.resolve()
+
+ API_KEY = "AZURE_SEARCH_API_KEY"
+ ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
+
+
+ def get_api_key() -> str:
+     key = os.environ[API_KEY]
+     return key
+
+
+ def get_fields() -> list:
+     data_source_fields = [
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(name="version", type=SearchFieldDataType.String),
+         SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="permissions_data", type=SearchFieldDataType.String),
+         SimpleField(name="record_locator", type=SearchFieldDataType.String),
+     ]
+     coordinates_fields = [
+         SimpleField(name="system", type=SearchFieldDataType.String),
+         SimpleField(name="layout_width", type=SearchFieldDataType.Double),
+         SimpleField(name="layout_height", type=SearchFieldDataType.Double),
+         SimpleField(name="points", type=SearchFieldDataType.String),
+     ]
+     metadata_fields = [
+         SimpleField(name="orig_elements", type=SearchFieldDataType.String),
+         SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
+         SimpleField(name="parent_id", type=SearchFieldDataType.String),
+         SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
+         SimpleField(name="filetype", type=SearchFieldDataType.String),
+         SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
+         SimpleField(name="file_directory", type=SearchFieldDataType.String),
+         SimpleField(name="filename", type=SearchFieldDataType.String),
+         ComplexField(name="data_source", fields=data_source_fields),
+         ComplexField(name="coordinates", fields=coordinates_fields),
+         SimpleField(
+             name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="page_number", type=SearchFieldDataType.String),
+         SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
+         SimpleField(name="page_name", type=SearchFieldDataType.String),
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(
+             name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="subject", type=SearchFieldDataType.String),
+         SimpleField(name="section", type=SearchFieldDataType.String),
+         SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
+         SimpleField(
+             name="emphasized_text_contents",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(
+             name="emphasized_text_tags",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(name="text_as_html", type=SearchFieldDataType.String),
+         SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
+         SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
+     ]
+     fields = [
+         SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+         SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
+         SimpleField(name="element_id", type=SearchFieldDataType.String),
+         SimpleField(name="text", type=SearchFieldDataType.String),
+         SimpleField(name="type", type=SearchFieldDataType.String),
+         ComplexField(name="metadata", fields=metadata_fields),
+         SearchField(
+             name="embeddings",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+             vector_search_dimensions=384,
+             vector_search_profile_name="embeddings-config-profile",
+         ),
+     ]
+     return fields
+
+
+ def get_vector_search() -> VectorSearch:
+     return VectorSearch(
+         algorithms=[
+             HnswAlgorithmConfiguration(
+                 name="hnsw-config",
+                 parameters=HnswParameters(
+                     metric=VectorSearchAlgorithmMetric.COSINE,
+                 ),
+             )
+         ],
+         profiles=[
+             VectorSearchProfile(
+                 name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
+             )
+         ],
+     )
+
+
+ def get_search_index_client() -> SearchIndexClient:
+     api_key = get_api_key()
+     return SearchIndexClient(ENDPOINT, AzureKeyCredential(api_key))
+
+
+ @pytest.fixture
+ def index() -> str:
+     random_id = str(uuid4())[:8]
+     index_name = f"utic-test-{random_id}"
+     client = get_search_index_client()
+     index = SearchIndex(
+         name=index_name,
+         fields=get_fields(),
+         vector_search=get_vector_search(),
+         cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
+     )
+     print(f"creating index: {index_name}")
+     client.create_index(index=index)
+     try:
+         yield index_name
+     finally:
+         print(f"deleting index: {index_name}")
+         client.delete_index(index)
+
+
+ def validate_count(
+     search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
+ ) -> None:
+     index_count = search_client.get_document_count()
+     if index_count == expected_count:
+         return
+     tries = 0
+     while tries < retries:
+         time.sleep(interval)
+         index_count = search_client.get_document_count()
+         if index_count == expected_count:
+             break
+         tries += 1
+     assert index_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"much came back from index: {index_count}"
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("AZURE_SEARCH_API_KEY")
+ async def test_azure_ai_search_destination(
+     upload_file: Path,
+     index: str,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
+
+     uploader = AzureAISearchUploader(
+         connection_config=AzureAISearchConnectionConfig(
+             access_config=AzureAISearchAccessConfig(key=get_api_key()),
+             endpoint=ENDPOINT,
+             index=index,
+         ),
+         upload_config=AzureAISearchUploaderConfig(),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     search_client: SearchClient = uploader.connection_config.get_search_client()
+     validate_count(search_client=search_client, expected_count=expected_count)
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     validate_count(search_client=search_client, expected_count=expected_count)
@@ -1,11 +1,13 @@
- import socket
+ import json
  import tempfile
  from pathlib import Path

  import pytest
- from confluent_kafka import Producer
+ from confluent_kafka import Consumer, KafkaError, KafkaException, Producer
+ from confluent_kafka.admin import AdminClient, NewTopic

  from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
      SOURCE_TAG,
      env_setup_path,
  )
@@ -14,6 +16,8 @@ from test.integration.connectors.utils.validation import (
      ValidationConfigs,
      source_connector_validation,
  )
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.processes.connectors.kafka.local import (
      CONNECTOR_TYPE,
      LocalKafkaConnectionConfig,
@@ -21,6 +25,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.local import (
      LocalKafkaDownloaderConfig,
      LocalKafkaIndexer,
      LocalKafkaIndexerConfig,
+     LocalKafkaUploader,
+     LocalKafkaUploaderConfig,
  )

  SEED_MESSAGES = 10
@@ -28,20 +34,33 @@ TOPIC = "fake-topic"


  @pytest.fixture
- def kafka_seed_topic() -> str:
-     with docker_compose_context(docker_compose_path=env_setup_path / "kafka"):
-         conf = {
-             "bootstrap.servers": "localhost:29092",
-             "client.id": socket.gethostname(),
-             "message.max.bytes": 10485760,
-         }
-         producer = Producer(conf)
-         for i in range(SEED_MESSAGES):
-             message = f"This is some text for message {i}"
-             producer.produce(topic=TOPIC, value=message)
-         producer.flush(timeout=10)
-         print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
-         yield TOPIC
+ def docker_compose_ctx():
+     with docker_compose_context(docker_compose_path=env_setup_path / "kafka") as ctx:
+         yield ctx
+
+
+ @pytest.fixture
+ def kafka_seed_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     producer = Producer(conf)
+     for i in range(SEED_MESSAGES):
+         message = f"This is some text for message {i}"
+         producer.produce(topic=TOPIC, value=message)
+     producer.flush(timeout=10)
+     print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+     return TOPIC
+
+
+ @pytest.fixture
+ def kafka_upload_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     admin_client = AdminClient(conf)
+     admin_client.create_topics([NewTopic(TOPIC, 1, 1)])
+     return TOPIC


  @pytest.mark.asyncio
@@ -58,6 +77,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
      downloader = LocalKafkaDownloader(
          connection_config=connection_config, download_config=download_config
      )
+     indexer.precheck()
      await source_connector_validation(
          indexer=indexer,
          downloader=downloader,
@@ -65,3 +85,83 @@ async def test_kafka_source_local(kafka_seed_topic: str):
              test_id="kafka", expected_num_files=5, validate_downloaded_files=True
          ),
      )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ def test_kafka_source_local_precheck_fail():
+     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+     indexer = LocalKafkaIndexer(
+         connection_config=connection_config,
+         index_config=LocalKafkaIndexerConfig(topic=TOPIC, num_messages_to_consume=5),
+     )
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ def get_all_messages(topic: str, max_empty_messages: int = 5) -> list[dict]:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+         "group.id": "default_group_id",
+         "enable.auto.commit": "false",
+         "auto.offset.reset": "earliest",
+     }
+     consumer = Consumer(conf)
+     consumer.subscribe([topic])
+     messages = []
+     try:
+         empty_count = 0
+         while empty_count < max_empty_messages:
+             msg = consumer.poll(timeout=1)
+             if msg is None:
+                 empty_count += 1
+                 continue
+             if msg.error():
+                 if msg.error().code() == KafkaError._PARTITION_EOF:
+                     break
+                 else:
+                     raise KafkaException(msg.error())
+             try:
+                 message = json.loads(msg.value().decode("utf8"))
+                 messages.append(message)
+             finally:
+                 consumer.commit(asynchronous=False)
+     finally:
+         print("closing consumer")
+         consumer.close()
+     return messages
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ async def test_kafka_destination_local(upload_file: Path, kafka_upload_topic: str):
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     uploader.precheck()
+     if uploader.is_async():
+         await uploader.run_async(path=upload_file, file_data=file_data)
+     else:
+         uploader.run(path=upload_file, file_data=file_data)
+     all_messages = get_all_messages(topic=kafka_upload_topic)
+     with upload_file.open("r") as upload_fs:
+         content_to_upload = json.load(upload_fs)
+     assert len(all_messages) == len(content_to_upload), (
+         f"expected number of messages ({len(content_to_upload)}) doesn't "
+         f"match how many messages read off of kafka topic {kafka_upload_topic}: {len(all_messages)}"
+     )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_kafka_destination_local_precheck_fail():
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
@@ -0,0 +1,161 @@
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from pinecone import Pinecone, ServerlessSpec
+ from pinecone.core.openapi.shared.exceptions import NotFoundException
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connectors.pinecone import (
+     CONNECTOR_TYPE,
+     PineconeAccessConfig,
+     PineconeConnectionConfig,
+     PineconeUploader,
+     PineconeUploaderConfig,
+     PineconeUploadStager,
+     PineconeUploadStagerConfig,
+ )
+
+ API_KEY = "PINECONE_API_KEY"
+
+
+ def get_api_key() -> str:
+     api_key = os.getenv(API_KEY, None)
+     assert api_key
+     return api_key
+
+
+ def wait_for_delete(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+     start = time.time()
+     while True and time.time() - start < timeout:
+         try:
+             description = client.describe_index(name=index_name)
+             logger.info(f"current index status: {description}")
+         except NotFoundException:
+             return
+         time.sleep(interval)
+
+     raise TimeoutError("time out waiting for index to delete")
+
+
+ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+     def is_ready_status():
+         description = client.describe_index(name=index_name)
+         status = description["status"]
+         return status["ready"]
+
+     start = time.time()
+     is_ready = is_ready_status()
+     while not is_ready and time.time() - start < timeout:
+         time.sleep(interval)
+         is_ready = is_ready_status()
+     if not is_ready:
+         raise TimeoutError("time out waiting for index to be ready")
+
+
+ @pytest.fixture
+ def pinecone_index() -> str:
+     pinecone = Pinecone(api_key=get_api_key())
+     random_id = str(uuid4()).split("-")[0]
+     index_name = f"ingest-test-{random_id}"
+     assert len(index_name) < 45
+     logger.info(f"Creating index: {index_name}")
+     try:
+         pinecone.create_index(
+             name=index_name,
+             dimension=384,
+             metric="cosine",
+             spec=ServerlessSpec(
+                 cloud="aws",
+                 region="us-east-1",
+             ),
+             deletion_protection="disabled",
+         )
+         wait_for_ready(client=pinecone, index_name=index_name)
+         yield index_name
+     except Exception as e:
+         logger.error(f"failed to create index {index_name}: {e}")
+     finally:
+         try:
+             logger.info(f"deleting index: {index_name}")
+             pinecone.delete_index(name=index_name)
+             wait_for_delete(client=pinecone, index_name=index_name)
+         except NotFoundException:
+             return
+
+
+ def validate_pinecone_index(
+     index_name: str, expected_num_of_vectors: int, retries=30, interval=1
+ ) -> None:
+     # Because there's a delay for the index to catch up to the recent writes, add in a retry
+     pinecone = Pinecone(api_key=get_api_key())
+     index = pinecone.Index(name=index_name)
+     vector_count = -1
+     for i in range(retries):
+         index_stats = index.describe_index_stats()
+         vector_count = index_stats["total_vector_count"]
+         if vector_count == expected_num_of_vectors:
+             logger.info(f"expected {expected_num_of_vectors} == vector count {vector_count}")
+             break
+         logger.info(
+             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
+         )
+         time.sleep(interval)
+     assert vector_count == expected_num_of_vectors
+
+
+ @requires_env(API_KEY)
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="pinecone_mock_id",
+     )
+     connection_config = PineconeConnectionConfig(
+         index_name=pinecone_index,
+         access_config=PineconeAccessConfig(api_key=get_api_key()),
+     )
+     stager_config = PineconeUploadStagerConfig()
+     stager = PineconeUploadStager(upload_stager_config=stager_config)
+     new_upload_file = stager.run(
+         elements_filepath=upload_file,
+         output_dir=temp_dir,
+         output_filename=upload_file.name,
+         file_data=file_data,
+     )
+
+     upload_config = PineconeUploaderConfig()
+     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+     uploader.precheck()
+
+     if uploader.is_async():
+         await uploader.run_async(path=new_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=new_upload_file, file_data=file_data)
+     with new_upload_file.open() as f:
+         staged_content = json.load(f)
+     expected_num_of_vectors = len(staged_content)
+     logger.info("validating first upload")
+     validate_pinecone_index(
+         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+     )
+
+     # Rerun uploader and make sure no duplicates exist
+     if uploader.is_async():
+         await uploader.run_async(path=new_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=new_upload_file, file_data=file_data)
+     logger.info("validating second upload")
+     validate_pinecone_index(
+         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+     )