unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.7.2" # pragma: no cover
+__version__ = "1.0.2" # pragma: no cover
unstructured_ingest/cli/README.md

@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key points for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code un [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshall the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
unstructured_ingest/embed/mixedbreadai.py

@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):

@dataclass
class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
    config: MixedbreadAIEmbeddingConfig

    async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py

@@ -8,7 +8,7 @@ from pydantic import BaseModel
from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import BaseProcess
from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data


class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
        writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)

        conformed_elements = [
            self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py

@@ -7,7 +7,7 @@ from pydantic import BaseModel

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data


class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
        return False

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
        self.run_data(data=data, file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
        await self.run_data_async(data=data, file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
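Both `run` and `run_async` now route file reads through `get_json_data` before handing the element dicts to the data methods. As a rough stand-in only (the real helper lives in `unstructured_ingest.utils.data_prep` and may also handle NDJSON), it behaves roughly like:

```python
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    """Hypothetical stand-in: load a serialized list of element dicts from disk."""
    with path.open() as f:
        return json.load(f)
```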
unstructured_ingest/main.py (changed, with no content differences)
unstructured_ingest/pipeline/interfaces.py

@@ -119,7 +119,7 @@ class PipelineStep(ABC):
        iterable = iterable or []
        if iterable:
            logger.info(
-                f"calling {self.__class__.__name__}
+                f"calling {self.__class__.__name__} with {len(iterable)} docs",  # type: ignore
            )
        else:
            logger.info(f"calling {self.__class__.__name__} with no inputs")
unstructured_ingest/pipeline/pipeline.py

@@ -220,7 +220,7 @@ class Pipeline:

    def _run(self):
        logger.info(
-            f"running local pipeline: {self} with configs:
+            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
        )
        if self.context.mp_supported:
            manager = mp.Manager()
unstructured_ingest/processes/chunker.py

@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
        default="https://api.unstructuredapp.io/general/v0/general",
        description="If chunking via api, use the following host.",
    )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
    chunk_api_key: Optional[SecretStr] = Field(
        default=None, description="API Key for chunking endpoint."

@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
            api_key=self.config.chunk_api_key.get_secret_value(),
            filename=elements_filepath,
            api_parameters=self.config.to_chunking_kwargs(),
+            timeout_ms=self.config.chunk_api_timeout_ms,
        )

        elements = assign_and_map_hash_ids(elements=elements)
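For callers, the new field is just another knob on `ChunkerConfig`. A hedged usage sketch, assuming the remaining fields keep their defaults:

```python
from unstructured_ingest.processes.chunker import ChunkerConfig

# Opt into API-based chunking and bound each chunking API call at 60 seconds.
# Field names come from the diff above; any other required fields are assumed to default.
chunker_config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",  # coerced to SecretStr by pydantic
    chunk_api_timeout_ms=60_000,
)
```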
unstructured_ingest/processes/connectors/airtable.py

@@ -3,7 +3,6 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional
from uuid import NAMESPACE_DNS, uuid5

-import pandas
from pydantic import BaseModel, Field, Secret, field_validator

from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers

@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
        row_dict.update(table_row["fields"])
        return row_dict

+    @requires_dependencies(["pandas"], extras="airtable")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
        table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
        ).sort_index(axis=1)
        download_path = self.get_download_path(file_data=file_data)
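The Airtable change moves pandas from a module-level import to a lazy import guarded by `@requires_dependencies`, so the base install no longer needs pandas. A simplified sketch of that pattern follows; the decorator below is an illustrative stand-in, not the package's implementation:

```python
import functools
import importlib.util


def requires_dependencies(deps: list[str], extras: str):
    """Illustrative stand-in: fail with an install hint if a lazy dependency is missing."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing dependencies {missing}; install with "
                    f"`pip install unstructured-ingest[{extras}]`"
                )
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_sorted_frame(rows: list[dict]):
    import pandas as pd  # imported lazily so the base install does not need pandas

    return pd.DataFrame.from_dict(data=rows).sort_index(axis=1)
```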
unstructured_ingest/processes/connectors/astradb.py

@@ -1,5 +1,7 @@
+import asyncio
import csv
import hashlib
+import os
import re
from dataclasses import dataclass, field
from pathlib import Path

@@ -8,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional

from pydantic import BaseModel, Field, Secret

-from unstructured_ingest import __name__ as integration_name
from unstructured_ingest.__version__ import __version__ as integration_version
from unstructured_ingest.data_types.file_data import (
    BatchFileData,

@@ -43,7 +44,7 @@ from unstructured_ingest.processes.connector_registry import (
)
from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes

@@ -83,10 +84,8 @@ class AstraDBConnectionConfig(ConnectionConfig):

        # Create a client object to interact with the Astra DB
        # caller_name/version for Astra DB tracking
-            caller_version=integration_version,
-        )
+        user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+        return AstraDBClient(callers=[(user_agent, integration_version)])


def get_astra_db(

@@ -141,7 +140,7 @@ async def get_async_astra_collection(
    )

    # Get async collection from AsyncDatabase
-    async_astra_db_collection =
+    async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
    return async_astra_db_collection


@@ -360,13 +359,22 @@ class AstraDBUploader(Uploader):
    upload_config: AstraDBUploaderConfig
    connector_type: str = CONNECTOR_TYPE

+    def is_async(self) -> bool:
+        return True
+
    def init(self, **kwargs: Any) -> None:
        self.create_destination(**kwargs)

+    @requires_dependencies(["astrapy"], extras="astradb")
    def precheck(self) -> None:
        try:
            if self.upload_config.collection_name:
+                collection = get_astra_collection(
+                    connection_config=self.connection_config,
+                    collection_name=self.upload_config.collection_name,
+                    keyspace=self.upload_config.keyspace,
+                )
+                collection.options()
            else:
                # check for db connection only if collection name is not provided
                get_astra_db(

@@ -377,17 +385,7 @@ class AstraDBUploader(Uploader):
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["astrapy"], extras="astradb")
-    def get_collection(self, collection_name: Optional[str] = None) -> "AstraDBCollection":
-        return get_astra_collection(
-            connection_config=self.connection_config,
-            collection_name=collection_name or self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace,
-        )
-
    def _collection_exists(self, collection_name: str):
-        from astrapy.exceptions import CollectionNotFoundException
-
        collection = get_astra_collection(
            connection_config=self.connection_config,
            collection_name=collection_name,

@@ -397,8 +395,10 @@ class AstraDBUploader(Uploader):
        try:
            collection.options()
            return True
-        except
+        except RuntimeError as e:
+            if "not found" in str(e):
+                return False
+            raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
        except Exception as e:
            logger.error(f"failed to check if astra collection exists : {e}")
            raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")

@@ -422,6 +422,8 @@ class AstraDBUploader(Uploader):
        self.upload_config.collection_name = collection_name

        if not self._collection_exists(collection_name):
+            from astrapy.info import CollectionDefinition
+
            astra_db = get_astra_db(
                connection_config=self.connection_config, keyspace=self.upload_config.keyspace
            )

@@ -429,44 +431,56 @@ class AstraDBUploader(Uploader):
                f"creating default astra collection '{collection_name}' with dimension "
                f"{vector_length} and metric {similarity_metric}"
            )
-                dimension=vector_length
+            definition = (
+                CollectionDefinition.builder()
+                .set_vector_dimension(dimension=vector_length)
+                .set_vector_metric(similarity_metric)
+                .build()
            )
+            (astra_db.create_collection(collection_name, definition=definition),)
            return True
        logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
        return False

-    def delete_by_record_id(self, collection: "
+    async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
        logger.debug(
            f"deleting records from collection {collection.name} "
            f"with {self.upload_config.record_id_key} "
            f"set to {file_data.identifier}"
        )
        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
-        delete_resp = collection.delete_many(filter=delete_filter)
+        delete_resp = await collection.delete_many(filter=delete_filter)
        logger.debug(
            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
        )

-    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+    async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        astra_db_batch_size = self.upload_config.batch_size
+        async_astra_collection = await get_async_astra_collection(
+            connection_config=self.connection_config,
+            collection_name=self.upload_config.collection_name,
+            keyspace=self.upload_config.keyspace,
+        )

-        self.delete_by_record_id(collection=
+        await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
+        await asyncio.gather(
+            *[
+                async_astra_collection.insert_many(chunk)
+                for chunk in batch_generator(data, astra_db_batch_size)
+            ]
+        )

-    def run(self,
-        self.run_data(data=data, file_data=file_data, **kwargs)
+    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_json_data(path=path)
+        await self.run_data(data=data, file_data=file_data)

+    def run(self, **kwargs: Any) -> Any:
+        raise NotImplementedError("Use astradb run_async instead")


astra_db_source_entry = SourceRegistryEntry(
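The rewritten AstraDB uploader is now fully async: it deletes any previous records for the file, then inserts batches concurrently with `asyncio.gather`. A self-contained sketch of that fan-out pattern, with a fake `insert_many` standing in for the real collection call:

```python
import asyncio
from typing import Any, Generator


def batch_generator(items: list[Any], batch_size: int) -> Generator[list[Any], None, None]:
    """Yield consecutive slices of at most batch_size items."""
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]


async def insert_many(batch: list[dict]) -> int:
    await asyncio.sleep(0)  # placeholder for the network call
    return len(batch)


async def upload(data: list[dict], batch_size: int = 20) -> int:
    # Fire one insert per batch and wait for all of them to finish.
    results = await asyncio.gather(
        *[insert_many(chunk) for chunk in batch_generator(data, batch_size)]
    )
    return sum(results)


print(asyncio.run(upload([{"id": i} for i in range(45)])))  # 45
```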
unstructured_ingest/processes/connectors/databricks/volumes_aws.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
    )
    profile: Optional[str] = None
    token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(

@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    )
    azure_environment: Optional[str] = Field(
        default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
    )
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py

@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py

@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
        logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
        cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
        logger.debug(
-            f"migrating content from {catalog_path} to "
-            f"table {self.upload_config.table_name}"
+            f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
        )
        data = get_json_data(path=path)
        columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py

@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df, file_data=file_data)

+    @requires_dependencies(["pandas"], extras="delta-table")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
from unstructured_ingest.utils.dep_check import requires_dependencies

_COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
    ) -> Path:
        import pandas as pd

-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
        output_filename_suffix = Path(elements_filepath).suffix
        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py

@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):

    @contextmanager
    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor


class DuckDBUploadStagerConfig(UploadStagerConfig):

@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df)
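The `get_cursor` refactor in both DuckDB and MotherDuck collapses two nested `with` blocks into a single statement; the context managers still exit in reverse order. A minimal illustration with generic context managers:

```python
from contextlib import contextmanager


@contextmanager
def managed(name: str):
    print(f"open {name}")
    try:
        yield name
    finally:
        print(f"close {name}")


# Equivalent to nesting `with managed("client"):` around `with managed("cursor"):`.
with managed("client") as client, managed("cursor") as cursor:
    print(f"using {client} / {cursor}")
# prints: open client, open cursor, using client / cursor, close cursor, close client
```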
unstructured_ingest/processes/connectors/duckdb/motherduck.py

@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):

    @contextmanager
    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor


class MotherDuckUploadStagerConfig(UploadStagerConfig):

@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path)
        self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py

@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):

        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
        metadata: dict[str, str] = {}
-        with
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
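The S3 indexer change uses the Python 3.10+ parenthesized `with` syntax to combine `contextlib.suppress(AttributeError)` with the client context manager, so a client without metadata support simply leaves the metadata dict empty. A small sketch of the same shape, with an illustrative dummy client:

```python
import contextlib


class DummyClient:
    """Illustrative stand-in for a filesystem client with no metadata() method."""

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    DummyClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # raises AttributeError, suppressed
print(metadata)  # {} because the lookup was skipped rather than crashing
```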
unstructured_ingest/processes/connectors/gitlab.py

@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
            )
            raise ValueError("Generated invalid download path.")
unstructured_ingest/processes/connectors/google_drive.py

@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
        recursive: bool = False,
        previous_path: Optional[str] = None,
    ) -> list[dict]:
-
        fields_input = "nextPageToken, files({})".format(",".join(self.fields))
        q = f"'{object_id}' in parents"
        # Filter by extension but still include any directories

@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
        if not self.is_dir(root_info):
            data = [self.map_file_data(root_info)]
        else:
-
            file_contents = self.get_paginated_results(
                files_client=files_client,
                object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py

@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple

-import pandas as pd
from pydantic import Field, Secret

from unstructured_ingest.data_types.file_data import FileData

@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
from unstructured_ingest.utils.dep_check import requires_dependencies

if TYPE_CHECKING:
+    from pandas import DataFrame
    from pyarrow import Table as ArrowTable
    from pyiceberg.catalog.rest import RestCatalog
    from pyiceberg.table import Table, Transaction

@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
            return UserAuthError(e)
        if 400 <= response_code < 500:
            logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return UserError(e)
        if response_code > 500:
            logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return ProviderError(e)
        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)

@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
        return self.upload_config.record_id_key in self.get_table_columns()

    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
        import pyarrow as pa

        # Iceberg will automatically fill missing columns with nulls

@@ -277,16 +275,20 @@ class IbmWatsonxUploader(SQLUploader):
        except Exception as e:
            raise ProviderError(f"Failed to upload data to table: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        data_table = self._df_to_arrow_table(df)

        with self.get_table() as table:
            self.upload_data_table(table, data_table, file_data)

+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
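The watsonx.data uploader keeps converting the pandas DataFrame to a pyarrow Table (`_df_to_arrow_table`) before writing to Iceberg; only the type hints and lazy pandas imports changed here. For orientation, the core conversion step looks roughly like this (a sketch, not the connector's code):

```python
import pandas as pd
import pyarrow as pa

# Two toy element records standing in for staged upload data.
df = pd.DataFrame([{"id": "a", "text": "hello"}, {"id": "b", "text": "world"}])

# Convert to an Arrow table; the pandas index is dropped since it is not a data column.
table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)
```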
unstructured_ingest/processes/connectors/kdbai.py

@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
        df = pd.DataFrame(data=data)
        self.process_dataframe(df=df)

+    @requires_dependencies(["pandas"], extras="kdbai")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        data = get_data_df(path=path)
        self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py

@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
        download_path = self.get_download_path(file_data)
        if download_path is None:
            logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
            )
            raise ValueError("Generated invalid download path.")