unstructured-ingest 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

@@ -1,21 +1,97 @@
 import tempfile
+from contextlib import contextmanager
 from pathlib import Path
 
+import faker
 import pandas as pd
 import pytest
 from psycopg2 import connect
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
     PostgresConnectionConfig,
+    PostgresDownloader,
+    PostgresDownloaderConfig,
+    PostgresIndexer,
+    PostgresIndexerConfig,
     PostgresUploader,
     PostgresUploadStager,
 )
 
+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def postgres_download_setup() -> None:
+    with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
+        connection = connect(
+            user="unstructured",
+            password="test",
+            dbname="test_db",
+            host="localhost",
+            port=5433,
+        )
+        with connection.cursor() as cursor:
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) VALUES "
+                    f"('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+            connection.commit()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_postgres_source():
+    connect_params = {
+        "host": "localhost",
+        "port": 5433,
+        "database": "test_db",
+        "user": "unstructured",
+        "password": "test",
+    }
+    with postgres_download_setup():
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = PostgresConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                username=connect_params["user"],
+                access_config=PostgresAccessConfig(password=connect_params["password"]),
+            )
+            indexer = PostgresIndexer(
+                connection_config=connection_config,
+                index_config=PostgresIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = PostgresDownloader(
+                connection_config=connection_config,
+                download_config=PostgresDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="postgres",
+                    expected_num_files=40,
+                ),
+            )
+
 
 def validate_destination(
     connect_params: dict,
@@ -50,7 +126,9 @@ async def test_postgres_destination(upload_file: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with docker_compose_context(docker_compose_path=env_setup_path / "sql"):
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
+    ):
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = PostgresUploadStager()
             stager_params = {
@@ -3,39 +3,99 @@ import tempfile
 from contextlib import contextmanager
 from pathlib import Path
 
+import faker
 import pandas as pd
 import pytest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
+    SQLiteDownloader,
+    SQLiteDownloaderConfig,
+    SQLiteIndexer,
+    SQLiteIndexerConfig,
     SQLiteUploader,
     SQLiteUploadStager,
 )
 
+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def sqlite_download_setup() -> Path:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = Path(tmpdir) / "mock_database.db"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
+        assert db_init_path.exists()
+        assert db_init_path.is_file()
+        with sqlite3.connect(database=db_path) as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            with db_init_path.open("r") as f:
+                query = f.read()
+            cursor.executescript(query)
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) "
+                    f"VALUES ('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+
+            sqlite_connection.commit()
+            cursor.close()
+        yield db_path
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_sqlite_source():
+    with sqlite_download_setup() as db_path:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = SQLiteConnectionConfig(database_path=db_path)
+            indexer = SQLiteIndexer(
+                connection_config=connection_config,
+                index_config=SQLiteIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = SQLiteDownloader(
+                connection_config=connection_config,
+                download_config=SQLiteDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="sqlite",
+                    expected_num_files=40,
+                ),
+            )
+
 
 @contextmanager
-def sqlite_setup() -> Path:
+def sqlite_upload_setup() -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite-schema.sql"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
         assert db_init_path.exists()
         assert db_init_path.is_file()
-        connection = None
-        try:
-            connection = sqlite3.connect(database=db_path)
+        with sqlite3.connect(database=db_path) as sqlite_connection:
             with db_init_path.open("r") as f:
                 query = f.read()
-            cursor = connection.cursor()
+            cursor = sqlite_connection.cursor()
             cursor.executescript(query)
-            yield db_path
-        finally:
-            if connection:
-                connection.close()
+        yield db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):
@@ -62,7 +122,7 @@ async def test_sqlite_destination(upload_file: Path):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with sqlite_setup() as db_path:
+    with sqlite_upload_setup() as db_path:
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = SQLiteUploadStager()
             stager_params = {
@@ -3,5 +3,5 @@ from pathlib import Path
 SOURCE_TAG = "source"
 DESTINATION_TAG = "destination"
 
-env_setup_path = Path(__file__).parents[4] / "test_e2e" / "env_setup"
+env_setup_path = Path(__file__).parents[1] / "env_setup"
 expected_results_path = Path(__file__).parents[1] / "expected_results"
@@ -180,8 +180,13 @@ async def source_connector_validation(
             resp = await downloader.run_async(file_data=file_data)
         else:
             resp = downloader.run(file_data=file_data)
-        postdownload_file_data = replace(resp["file_data"])
-        all_postdownload_file_data.append(postdownload_file_data)
+        if isinstance(resp, list):
+            for r in resp:
+                postdownload_file_data = replace(r["file_data"])
+                all_postdownload_file_data.append(postdownload_file_data)
+        else:
+            postdownload_file_data = replace(resp["file_data"])
+            all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         run_all_validations(
             configs=configs,
@@ -1 +1 @@
-__version__ = "0.1.0" # pragma: no cover
+__version__ = "0.1.1" # pragma: no cover
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import random
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -63,6 +64,7 @@ class FileConfig(BaseModel):
 
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    sample_n_files: Optional[int] = None
 
 
 class FsspecAccessConfig(AccessConfig):
@@ -128,8 +130,23 @@ class FsspecIndexer(Indexer):
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
+
+        if self.index_config.sample_n_files:
+            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)
+
         return filtered_files
 
+    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
+        if len(files) <= n:
+            logger.warning(
+                f"number of files to be sampled={n} is not smaller than the number"
+                f" of files found ({len(files)}). Returning all of the files as the"
+                " sample."
+            )
+            return files
+
+        return random.sample(files, n)
+
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()
 
@@ -26,7 +26,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from kdbai_client import Session, Table
+    from kdbai_client import Database, Session, Table
 
 CONNECTOR_TYPE = "kdbai"
 
@@ -99,6 +99,9 @@ class KdbaiUploadStager(UploadStager):
 
 
 class KdbaiUploaderConfig(UploaderConfig):
+    database_name: str = Field(
+        default="default", description="The name of the KDBAI database to write into."
+    )
     table_name: str = Field(description="The name of the KDBAI table to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")
 
@@ -111,24 +114,29 @@ class KdbaiUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            self.get_table()
+            self.get_database()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def get_table(self) -> "Table":
+    def get_database(self) -> "Database":
         session: Session = self.connection_config.get_session()
-        table = session.table(self.upload_config.table_name)
+        db = session.database(self.upload_config.database_name)
+        return db
+
+    def get_table(self) -> "Table":
+        db = self.get_database()
+        table = db.table(self.upload_config.table_name)
         return table
 
     def upsert_batch(self, batch: pd.DataFrame):
         table = self.get_table()
-        table.insert(data=batch)
+        table.insert(batch)
 
     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
-            f"db in table {self.upload_config.table_name}"
+            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
         for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
@@ -7,12 +7,17 @@ import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
@@ -57,6 +62,57 @@ class PostgresConnectionConfig(SQLConnectionConfig):
     )
 
 
+class PostgresIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class PostgresIndexer(SQLIndexer):
+    connection_config: PostgresConnectionConfig
+    index_config: PostgresIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class PostgresDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class PostgresDownloader(SQLDownloader):
+    connection_config: PostgresConnectionConfig
+    download_config: PostgresDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class PostgresUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
@@ -1,24 +1,34 @@
+import hashlib
 import json
+import sys
 import uuid
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Union
+from time import time
+from typing import Any, Generator, Union
 
 import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 
@@ -88,6 +98,125 @@ class SQLConnectionConfig(ConnectionConfig, ABC):
         pass
 
 
+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    @abstractmethod
+    def _get_doc_ids(self) -> list[str]:
+        pass
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
+
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -5,14 +5,19 @@ from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import pandas as pd
-from pydantic import Field, Secret
+from pydantic import Field, Secret, model_validator
 
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
@@ -37,7 +42,14 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
     database_path: Path = Field(
         description="Path to the .db file.",
     )
-    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self
 
     def get_connection(self) -> "SqliteConnection":
         from sqlite3 import connect
@@ -45,6 +57,57 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
         return connect(database=self.database_path)
 
 
+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: pydantic>=2.7
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: tqdm
-Requires-Dist: pandas
 Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -44,8 +44,8 @@ Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -87,12 +87,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -116,15 +116,15 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -156,16 +156,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
@@ -5,15 +5,16 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
-test/integration/connectors/test_postgres.py,sha256=9uaqlUmLpVF09cwKSw7Yldq2kjU00WBedbEIgyJG5Cw,3998
 test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
-test/integration/connectors/test_sqlite.py,sha256=NnLdyt3FfM1A53tXPJbgIcsy-iEgYY8OZYOfliFqifM,3507
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
+test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
 test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/utils/constants.py,sha256=OjxLmmzCbDNqH5tK0jWFxDgIkM973cr3SmFIRk7aySc,222
+test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
 test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
-test/integration/connectors/utils/validation.py,sha256=VNvyutfnWbnesavL_V5SjM2H3LoOHnkW7Paq8RO4WbM,8199
+test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -42,7 +43,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
 test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
 test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=J87Ao0q5WoHKbDEbH6O10GOGaMO3yEUCBOxCqbm715I,42
+unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -361,7 +362,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-H
 unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
-unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
+unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
@@ -384,18 +385,18 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
-unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=usLzU2NA5D_a1juhja4jyJP_CzW4h-5rZ22bWVwvZGQ,10853
+unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
-unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=hqNuGYR_9o5LmfVDXnm3jBF5Pk-s7R66d0epF2uBYuM,4083
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=8bDUgyDurQelOabNnSG6ejWWsnLGWf-A-lWrpwYDGQM,5140
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=K-Lquxxqa1m5fk9by-5sasq561TRFAeV_SZ1Hc_b9Hk,3426
-unstructured_ingest-0.1.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.1.0.dist-info/METADATA,sha256=mNOS5HjbygWcTZ5eFlxoPpvt6dVAjkYniNHpk6tLvQw,7181
-unstructured_ingest-0.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.1.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.1.0.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
+unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
+unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.1.1.dist-info/RECORD,,