unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; review the details below.

Files changed (243):
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,456 @@
1
+ import hashlib
2
+ import json
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from time import time
9
+ from typing import TYPE_CHECKING, Any, Generator, Union
10
+
11
+ from dateutil import parser
12
+ from pydantic import BaseModel, Field, Secret
13
+
14
+ from unstructured_ingest.data_types.file_data import (
15
+ BatchFileData,
16
+ BatchItem,
17
+ FileData,
18
+ FileDataSourceMetadata,
19
+ SourceIdentifiers,
20
+ )
21
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
22
+ from unstructured_ingest.interfaces import (
23
+ AccessConfig,
24
+ ConnectionConfig,
25
+ Downloader,
26
+ DownloaderConfig,
27
+ DownloadResponse,
28
+ Indexer,
29
+ IndexerConfig,
30
+ Uploader,
31
+ UploaderConfig,
32
+ UploadStager,
33
+ UploadStagerConfig,
34
+ download_responses,
35
+ )
36
+ from unstructured_ingest.logger import logger
37
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
38
+ from unstructured_ingest.utils.data_prep import (
39
+ get_data_df,
40
+ get_enhanced_element_id,
41
+ get_json_data,
42
+ split_dataframe,
43
+ write_data,
44
+ )
45
+
46
+ if TYPE_CHECKING:
47
+ from pandas import DataFrame
48
+
49
# Metadata columns holding date/time values: the stager converts them to epoch
# floats and SQLUploader.prepare_data parses them back into datetime objects.
_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
50
+
51
+
52
class SqlAdditionalMetadata(BaseModel):
    """Per-batch context identifying the source table and its unique-id column."""

    # Table the batch of ids was read from.
    table_name: str
    # Column used as the per-row unique identifier.
    id_column: str
55
+
56
+
57
class SqlBatchFileData(BatchFileData):
    """Batch file data enriched with the SQL table/id-column context."""

    additional_metadata: SqlAdditionalMetadata
59
+
60
+
61
def parse_date_string(date_value: Union[str, int]) -> datetime:
    """Best-effort conversion of *date_value* into a datetime.

    Tries, in order: a numeric epoch timestamp (ints are treated as epoch
    milliseconds, numeric strings as epoch seconds), an ISO-8601 string, and
    finally dateutil's flexible parser.
    """
    # Numeric path first: ints are epoch milliseconds, numeric strings seconds.
    try:
        if isinstance(date_value, int):
            seconds = float(date_value) / 1000
        else:
            seconds = float(date_value)
        return datetime.fromtimestamp(seconds)
    except Exception as e:
        logger.debug(f"date {date_value} string not a timestamp: {e}")

    # Strings that are not timestamps: try strict ISO-8601 before falling back.
    if isinstance(date_value, str):
        try:
            return datetime.fromisoformat(date_value)
        except Exception:
            pass
    # Last resort: dateutil handles a wide range of human-readable formats.
    return parser.parse(date_value)
74
+
75
+
76
class SQLAccessConfig(AccessConfig):
    """Base access config for SQL connectors; no secrets required by default."""

    pass
78
+
79
+
80
class SQLConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection config for SQL connectors.

    Concrete subclasses supply driver-specific connection and cursor context
    managers; everything else in this module talks to the database only
    through these two hooks.
    """

    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)

    @abstractmethod
    @contextmanager
    def get_connection(self) -> Generator[Any, None, None]:
        """Yield a live DB-API connection; implementations own open/commit/close."""
        pass

    @abstractmethod
    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        """Yield a cursor on a live connection; implementations own cleanup."""
        pass
92
+
93
+
94
class SQLIndexerConfig(IndexerConfig):
    """Settings controlling which table is indexed and how ids are batched."""

    table_name: str
    id_column: str
    # Number of row ids grouped into each emitted batch.
    batch_size: int = 100
98
+
99
+
100
class SQLIndexer(Indexer, ABC):
    """Lists every id in the configured table and emits them in batches."""

    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    def _get_doc_ids(self) -> list[str]:
        """Return all values of the id column, sorted so batching is stable."""
        with self.get_cursor() as cursor:
            cursor.execute(
                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
            )
            results = cursor.fetchall()
            ids = sorted([result[0] for result in results])
            return ids

    def precheck(self) -> None:
        """Validate connectivity with a trivial query.

        Raises:
            SourceConnectionError: if the query fails for any reason.
        """
        try:
            with self.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def run(self, **kwargs: Any) -> Generator[SqlBatchFileData, None, None]:
        """Yield one SqlBatchFileData per batch_size-sized chunk of table ids."""
        ids = self._get_doc_ids()
        id_batches: list[frozenset[str]] = [
            frozenset(
                ids[
                    i * self.index_config.batch_size : (i + 1)  # noqa
                    * self.index_config.batch_size
                ]
            )
            # Ceiling division so a final partial batch is not dropped.
            for i in range(
                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
            )
        ]

        for batch in id_batches:
            batch_items = [BatchItem(identifier=str(b)) for b in batch]
            # NOTE(review): frozenset iteration order is arbitrary, so the ids
            # shown in display_name are not necessarily the batch's extremes.
            display_name = (
                f"{self.index_config.table_name}-{self.index_config.id_column}"
                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
            )
            yield SqlBatchFileData(
                connector_type=self.connector_type,
                metadata=FileDataSourceMetadata(
                    date_processed=str(time()),
                ),
                additional_metadata=SqlAdditionalMetadata(
                    table_name=self.index_config.table_name, id_column=self.index_config.id_column
                ),
                batch_items=batch_items,
                display_name=display_name,
            )
158
+
159
+
160
class SQLDownloaderConfig(DownloaderConfig):
    """Download settings for SQL sources."""

    # Columns to select when querying; empty means all columns.
    fields: list[str] = field(default_factory=list)
162
+
163
+
164
class SQLDownloader(Downloader, ABC):
    """Fetches the rows for a batch of ids and writes each row out as a CSV."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    @abstractmethod
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Return (rows, column_names) for the ids listed in *file_data*."""
        pass

    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list["DataFrame"]:
        """Convert query results into one single-row DataFrame per record."""
        import pandas as pd

        data = [dict(zip(columns, row)) for row in rows]
        df = pd.DataFrame(data)
        # Split into per-record frames so each becomes its own downloaded file.
        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
        return dfs

    def get_data(self, file_data: SqlBatchFileData) -> list["DataFrame"]:
        """Query the database and return one DataFrame per fetched record."""
        rows, columns = self.query_db(file_data=file_data)
        return self.sql_to_df(rows=rows, columns=columns)

    def get_identifier(self, table_name: str, record_id: str) -> str:
        """Build a filename-safe record id.

        When a field subset is configured, a short hash of it is appended so the
        same record downloaded with different field selections gets distinct files.
        """
        f = f"{table_name}-{record_id}"
        if self.download_config.fields:
            f = "{}-{}".format(
                f,
                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
            )
        return f

    def generate_download_response(
        self, result: "DataFrame", file_data: SqlBatchFileData
    ) -> DownloadResponse:
        """Persist a single-record DataFrame as CSV and wrap it in a download response."""
        id_column = file_data.additional_metadata.id_column
        table_name = file_data.additional_metadata.table_name
        record_id = result.iloc[0][id_column]
        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
        filename = f"{filename_id}.csv"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        result.to_csv(download_path, index=False)
        file_data.source_identifiers = SourceIdentifiers(
            filename=filename,
            fullpath=filename,
        )
        # Narrow the batch file data down to a single FileData keyed by this record.
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        return super().generate_download_response(
            file_data=cast_file_data, download_path=download_path
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download every record of the batch; returns one response per record."""
        sql_filedata = SqlBatchFileData.cast(file_data=file_data)
        data_dfs = self.get_data(file_data=sql_filedata)
        download_responses = []
        for df in data_dfs:
            download_responses.append(
                self.generate_download_response(result=df, file_data=sql_filedata)
            )
        return download_responses
231
+
232
+
233
class SQLUploadStagerConfig(UploadStagerConfig):
    """No stager-specific options; present for interface completeness."""

    pass
235
+
236
+
237
@dataclass
class SQLUploadStager(UploadStager):
    """Flattens partitioned elements into table-ready row dicts."""

    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Flatten nested metadata/data_source/coordinates into one flat row dict."""
        data = element_dict.copy()
        metadata: dict[str, Any] = data.pop("metadata", {})
        data_source = metadata.pop("data_source", {})
        coordinates = metadata.pop("coordinates", {})

        data.update(metadata)
        data.update(data_source)
        data.update(coordinates)

        # Deterministic element id derived from the element and its source record.
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)

        # Tag each row with the source record id so re-runs can replace prior rows.
        data[RECORD_ID_LABEL] = file_data.identifier
        return data

    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
        """Coerce column values into SQL-friendly scalar types (mutates *df*)."""
        # Date columns -> epoch floats.
        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
            df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())
        # Structured values -> JSON strings; non-list/dict values become None.
        for column in filter(
            lambda x: x in df.columns,
            ("permissions_data", "record_locator", "points", "links"),
        ):
            df[column] = df[column].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
            )
        # Mixed-type scalars -> strings.
        for column in filter(
            lambda x: x in df.columns,
            ("version", "page_number", "regex_metadata"),
        ):
            df[column] = df[column].apply(str)
        return df

    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        """Write the conformed rows to *output_path* and return it."""
        write_data(path=output_path, data=data)
        return output_path

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read elements, conform them, and write the staged output file."""
        import pandas as pd

        elements_contents = get_json_data(path=elements_filepath)

        df = pd.DataFrame(
            data=[
                self.conform_dict(element_dict=element_dict, file_data=file_data)
                for element_dict in elements_contents
            ]
        )
        df = self.conform_dataframe(df=df)

        # Keep the input's extension (e.g. .json/.ndjson) on the staged output.
        output_filename_suffix = Path(elements_filepath).suffix
        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

        final_output_path = self.write_output(
            output_path=output_path, data=df.to_dict(orient="records")
        )
        return final_output_path
305
+
306
+
307
class SQLUploaderConfig(UploaderConfig):
    """Settings for writing staged elements into a destination table."""

    batch_size: int = Field(default=50, description="Number of records per batch")
    table_name: str = Field(default="elements", description="which table to upload contents to")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
314
+
315
+
316
@dataclass
class SQLUploader(Uploader):
    """Uploads staged element rows into a destination SQL table.

    On each run it deletes rows left by previous runs of the same record (when
    the table exposes the record-id column), conforms the dataframe to the
    table's schema, then inserts with parameterized ``executemany`` batches.
    """

    upload_config: SQLUploaderConfig
    connection_config: SQLConnectionConfig
    # Parameter placeholder for the driver's paramstyle; "?" (qmark) by default,
    # subclasses override for drivers that use e.g. "%s".
    values_delimiter: str = "?"
    # Lazy cache of the destination table's column names (see get_table_columns).
    _columns: Union[list[str], None] = field(init=False, default=None)

    def precheck(self) -> None:
        """Validate connectivity with a trivial query.

        Raises:
            DestinationConnectionError: if the query fails for any reason.
        """
        try:
            with self.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert date-column values back into datetimes; pass others through."""
        import pandas as pd

        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if column_name in _DATE_COLUMNS:
                    # NaN sneaks in via pandas for missing dates; store NULL instead.
                    if value is None or pd.isna(value):
                        parsed.append(None)
                    else:
                        parsed.append(parse_date_string(value))
                else:
                    parsed.append(value)
            output.append(tuple(parsed))
        return output

    def _fit_to_schema(
        self, df: "DataFrame", add_missing_columns: bool = True, case_sensitive: bool = True
    ) -> "DataFrame":
        """Drop columns absent from the table and (optionally) add null-filled
        columns for table fields the dataframe lacks."""
        import pandas as pd

        table_columns = self.get_table_columns()
        columns = set(df.columns if case_sensitive else df.columns.str.lower())
        schema_fields = set(
            table_columns if case_sensitive else {col.lower() for col in table_columns}
        )
        columns_to_drop = columns - schema_fields
        missing_columns = schema_fields - columns

        if columns_to_drop:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(columns_to_drop)}"
            )
        if missing_columns and add_missing_columns:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(missing_columns)} "
            )

        df = df.drop(columns=columns_to_drop)

        if add_missing_columns:
            for column in missing_columns:
                df[column] = pd.Series()
        return df

    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Replace any prior rows for this record, then batch-insert *df*."""
        import numpy as np

        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        df = self._fit_to_schema(df=df)
        # Databases expect NULL, not NaN.
        df.replace({np.nan: None}, inplace=True)

        columns = list(df.columns)
        # Identifiers can't be bound as parameters; values are parameterized below.
        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
            table_name=self.upload_config.table_name,
            columns=",".join(columns),
            values=",".join([self.values_delimiter for _ in columns]),
        )
        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.get_cursor() as cursor:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                logger.debug(f"running query: {stmt}")
                cursor.executemany(stmt, values)

    def get_table_columns(self) -> list[str]:
        """Return (and cache) the destination table's column names."""
        if self._columns is None:
            with self.get_cursor() as cursor:
                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
                self._columns = [desc[0] for desc in cursor.description]
        return self._columns

    def can_delete(self) -> bool:
        """True when the table exposes the configured record-id column."""
        return self.upload_config.record_id_key in self.get_table_columns()

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete all rows previously uploaded for this record id."""
        logger.debug(
            f"deleting any content with data "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from table {self.upload_config.table_name}"
        )
        stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
        with self.get_cursor() as cursor:
            cursor.execute(stmt, [file_data.identifier])
            rowcount = cursor.rowcount
            if rowcount > 0:
                logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded row dicts."""
        import pandas as pd

        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load staged rows from *path* and upload them."""
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
@@ -0,0 +1,179 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator
6
+
7
+ from pydantic import Field, Secret, model_validator
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.error import ValueError
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.processes.connector_registry import (
13
+ DestinationRegistryEntry,
14
+ SourceRegistryEntry,
15
+ )
16
+ from unstructured_ingest.processes.connectors.sql.sql import (
17
+ _DATE_COLUMNS,
18
+ SQLAccessConfig,
19
+ SqlBatchFileData,
20
+ SQLConnectionConfig,
21
+ SQLDownloader,
22
+ SQLDownloaderConfig,
23
+ SQLIndexer,
24
+ SQLIndexerConfig,
25
+ SQLUploader,
26
+ SQLUploaderConfig,
27
+ SQLUploadStager,
28
+ SQLUploadStagerConfig,
29
+ parse_date_string,
30
+ )
31
+ from unstructured_ingest.utils.dep_check import requires_dependencies
32
+
33
+ if TYPE_CHECKING:
34
+ from sqlite3 import Connection as SqliteConnection
35
+ from sqlite3 import Cursor as SqliteCursor
36
+
37
+
38
# Registry key identifying this connector throughout the ingest pipeline.
CONNECTOR_TYPE = "sqlite"
39
+
40
+
41
class SQLiteAccessConfig(SQLAccessConfig):
    """SQLite is a local file database, so no credentials are required."""

    pass
43
+
44
+
45
class SQLiteConnectionConfig(SQLConnectionConfig):
    """Connection settings for a local SQLite database file."""

    access_config: Secret[SQLiteAccessConfig] = Field(
        default=SQLiteAccessConfig(), validate_default=True
    )
    database_path: Path = Field(
        description="Path to the .db file.",
    )

    @model_validator(mode="after")
    def check_database_path(self) -> "SQLiteConnectionConfig":
        """Fail fast at config time if the database file is missing or not a file."""
        if not self.database_path.exists():
            raise ValueError(f"{self.database_path} does not exist")
        if not self.database_path.is_file():
            raise ValueError(f"{self.database_path} is not a valid file")
        return self

    @contextmanager
    def get_connection(self) -> Generator["SqliteConnection", None, None]:
        """Yield a sqlite3 connection; commit on success, roll back on error.

        Previously the commit lived in ``finally``, which persisted partial
        writes even when the managed block raised. Now the transaction is
        committed only on a clean exit and rolled back otherwise; the
        connection is always closed.
        """
        from sqlite3 import connect

        connection = connect(database=self.database_path)
        try:
            yield connection
            connection.commit()
        except Exception:
            connection.rollback()
            raise
        finally:
            connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
        """Yield a cursor from a fresh connection, closing the cursor afterwards."""
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
80
+
81
+
82
class SQLiteIndexerConfig(SQLIndexerConfig):
    """No SQLite-specific indexing options beyond the generic SQL ones."""

    pass
84
+
85
+
86
@dataclass
class SQLiteIndexer(SQLIndexer):
    # All indexing behavior is inherited from the generic SQL indexer; this
    # subclass only pins the connector type label for the registry.
    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig
    connector_type: str = CONNECTOR_TYPE
91
+
92
+
93
class SQLiteDownloaderConfig(SQLDownloaderConfig):
    """No SQLite-specific download options beyond the generic SQL ones."""

    pass
95
+
96
+
97
@dataclass
class SQLiteDownloader(SQLDownloader):
    """Fetch rows for a batch of indexed ids from the SQLite source table."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # sqlite3 uses "?" as its positional parameter placeholder
    values_delimiter: str = "?"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Run a single parameterized SELECT covering every id in the batch.

        Returns the fetched rows and the corresponding column names.
        """
        meta = file_data.additional_metadata
        table_name = meta.table_name
        id_column = meta.id_column
        ids = [item.identifier for item in file_data.batch_items]
        # Restrict the projection to the configured fields when provided.
        if self.download_config.fields:
            fields = ",".join(self.download_config.fields)
        else:
            fields = "*"
        placeholders = ",".join(self.values_delimiter for _ in ids)
        query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN ({placeholders})"
        with self.connection_config.get_connection() as sqlite_connection:
            cursor = sqlite_connection.cursor()
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            fetched = cursor.fetchall()
            column_names = [col[0] for col in cursor.description]
            return fetched, column_names
118
+
119
+
120
class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
    """No SQLite-specific staging options beyond the generic SQL ones."""

    pass
122
+
123
+
124
class SQLiteUploadStager(SQLUploadStager):
    # Staging logic lives entirely in the generic SQL stager; this subclass
    # only narrows the config type.
    upload_stager_config: SQLiteUploadStagerConfig
126
+
127
+
128
class SQLiteUploaderConfig(SQLUploaderConfig):
    """No SQLite-specific upload options beyond the generic SQL ones."""

    pass
130
+
131
+
132
@dataclass
class SQLiteUploader(SQLUploader):
    """Write staged element rows into a SQLite destination table."""

    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
    connection_config: SQLiteConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["pandas"])
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the generic SQL uploader once pandas is available."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pandas"])
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Coerce row values into types sqlite3 can bind.

        Lists/dicts are serialized to JSON strings; values in known date
        columns are parsed to datetimes, with None/NaN normalized to NULL.
        """
        import pandas as pd

        prepared_rows = []
        for raw_row in data:
            prepared = []
            for column_name, cell in zip(columns, raw_row):
                if isinstance(cell, (list, dict)):
                    cell = json.dumps(cell)
                if column_name not in _DATE_COLUMNS:
                    prepared.append(cell)
                elif cell is None or pd.isna(cell):
                    prepared.append(None)
                else:
                    prepared.append(parse_date_string(cell))
            prepared_rows.append(tuple(prepared))
        return prepared_rows
163
+
164
+
165
# Registers SQLite as a source: the indexer enumerates row ids, the downloader
# then fetches those rows in batches.
sqlite_source_entry = SourceRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    indexer_config=SQLiteIndexerConfig,
    indexer=SQLiteIndexer,
    downloader_config=SQLiteDownloaderConfig,
    downloader=SQLiteDownloader,
)
172
+
173
# Registers SQLite as a destination: the stager shapes element dicts into
# rows, the uploader writes them to the target table.
sqlite_destination_entry = DestinationRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    uploader=SQLiteUploader,
    uploader_config=SQLiteUploaderConfig,
    upload_stager=SQLiteUploadStager,
    upload_stager_config=SQLiteUploadStagerConfig,
)