unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.2"  # pragma: no cover
+__version__ = "1.0.1"  # pragma: no cover
unstructured_ingest/cli/README.md
@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key point for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshal the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because Click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
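The nesting described in the README above (sources as subcommands of the ingest Click group, destinations as subcommands of each source) can be sketched with plain Click. The group, command, and option names below are illustrative stand-ins, not the package's actual registration code:

```python
# Illustrative sketch only: the real CLI generates its source and destination
# commands from a connector registry rather than declaring them by hand.
import click


@click.group()
def ingest():
    """Parent ingest group; each source connector hangs off of this."""


@ingest.group()
@click.option("--input-path", required=True)
@click.pass_context
def local(ctx, input_path):
    """Example source command; its subcommands act as destinations."""
    ctx.obj = {"input_path": input_path}


@local.command()
@click.option("--remote-url", required=True)
@click.pass_context
def s3(ctx, remote_url):
    """Example destination command, invoked as: ingest local ... s3 ..."""
    click.echo(f"read from {ctx.obj['input_path']}, write to {remote_url}")


if __name__ == "__main__":
    ingest()
```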
unstructured_ingest/embed/mixedbreadai.py
@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
 @dataclass
 class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
     config: MixedbreadAIEmbeddingConfig
 
     async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py
@@ -8,7 +8,7 @@ from pydantic import BaseModel
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseProcess
 from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
 
 
 class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)
 
         conformed_elements = [
            self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py
@@ -7,7 +7,7 @@ from pydantic import BaseModel
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data
 
 
 class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         return False
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         await self.run_data_async(data=data, file_data=file_data, **kwargs)
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
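The stager and uploader hunks above (and several connector hunks below) switch to `get_json_data` from `unstructured_ingest.utils.data_prep`. A hedged sketch of what a helper with that name plausibly does, reading either a JSON array or newline-delimited JSON into a list of element dicts; this is an assumption, not the library's actual implementation:

```python
# Hedged approximation of a get_json_data-style helper; the real one lives in
# unstructured_ingest.utils.data_prep and may handle more cases.
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    if path.suffix == ".ndjson":
        # newline-delimited JSON: one element dict per line
        with path.open() as f:
            return [json.loads(line) for line in f if line.strip()]
    # plain JSON: the whole file is a list of element dicts
    with path.open() as f:
        return json.load(f)
```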
unstructured_ingest/main.py
File without changes
|
@@ -119,7 +119,7 @@ class PipelineStep(ABC):
|
|
|
119
119
|
iterable = iterable or []
|
|
120
120
|
if iterable:
|
|
121
121
|
logger.info(
|
|
122
|
-
f"calling {self.__class__.__name__}
|
|
122
|
+
f"calling {self.__class__.__name__} with {len(iterable)} docs", # type: ignore
|
|
123
123
|
)
|
|
124
124
|
else:
|
|
125
125
|
logger.info(f"calling {self.__class__.__name__} with no inputs")
|
|
@@ -220,7 +220,7 @@ class Pipeline:
|
|
|
220
220
|
|
|
221
221
|
def _run(self):
|
|
222
222
|
logger.info(
|
|
223
|
-
f"running local pipeline: {self} with configs:
|
|
223
|
+
f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
|
|
224
224
|
)
|
|
225
225
|
if self.context.mp_supported:
|
|
226
226
|
manager = mp.Manager()
|
|
unstructured_ingest/processes/chunker.py
@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
         default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
     chunk_api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for chunking endpoint."
@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
                 api_key=self.config.chunk_api_key.get_secret_value(),
                 filename=elements_filepath,
                 api_parameters=self.config.to_chunking_kwargs(),
+                timeout_ms=self.config.chunk_api_timeout_ms,
             )
 
             elements = assign_and_map_hash_ids(elements=elements)
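The new `chunk_api_timeout_ms` option is passed through to the chunking API call as `timeout_ms`. A hedged usage sketch, assuming `ChunkerConfig` is constructed directly and using only fields visible in this diff:

```python
# Sketch: enable API-based chunking with the new timeout field.
# Other ChunkerConfig fields (strategy, endpoint, etc.) are omitted here.
from unstructured_ingest.processes.chunker import ChunkerConfig

config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",      # SecretStr field; a plain str is accepted
    chunk_api_timeout_ms=60_000,     # give up on the chunking API call after 60 s
)
```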
unstructured_ingest/processes/connectors/airtable.py
@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-import pandas
 from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
+    @requires_dependencies(["pandas"], extras="airtable")
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
             data=[self._table_row_to_dict(table_row=row) for row in table_contents]
         ).sort_index(axis=1)
         download_path = self.get_download_path(file_data=file_data)
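The pattern introduced here, a `requires_dependencies` decorator plus an import inside the method body, recurs throughout this release: pandas becomes an optional, extra-specific dependency that is only imported when a connector actually runs. A generic sketch of the idea (not the library's own decorator, which also maps the missing module to a pip extra message):

```python
# Generic sketch of the lazy-import pattern used above; stand-in decorator only.
import functools
import importlib


def requires_dependencies(modules: list[str], extras: str):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for module in modules:
                try:
                    importlib.import_module(module)
                except ImportError as err:
                    raise ImportError(
                        f"{module} is required; install it via the '{extras}' extra"
                    ) from err
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_frame(rows: list[dict]):
    import pandas as pd  # imported only after the dependency check passes

    return pd.DataFrame(rows).sort_index(axis=1)
```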
unstructured_ingest/processes/connectors/astradb.py
@@ -43,7 +43,7 @@ from unstructured_ingest.processes.connector_registry import (
 )
 from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 
@@ -465,7 +465,7 @@ class AstraDBUploader(Uploader):
                 collection.insert_many(chunk)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
 
unstructured_ingest/processes/connectors/databricks/volumes_aws.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
 class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
     )
     profile: Optional[str] = None
     token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
 class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     azure_workspace_resource_id: Optional[str] = Field(
@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     )
     azure_environment: Optional[str] = Field(
         default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
         examples=["Public", "UsGov", "China", "Germany"],
     )
 
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
 class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py
@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
             logger.debug(
-                f"migrating content from {catalog_path} to "
-                f"table {self.upload_config.table_name}"
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
             )
             data = get_json_data(path=path)
             columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py
@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py
@@ -4,7 +4,7 @@ from typing import Any
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 _COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd
 
-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py
@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
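The DuckDB and MotherDuck `get_cursor` changes above and below collapse a nested `with` into a single statement with two context managers; entry and exit order are unchanged, and the cursor still closes before the client. A tiny standalone illustration with stand-in context managers:

```python
# Stand-in context managers; not DuckDB objects.
from contextlib import contextmanager


@contextmanager
def get_client():
    try:
        yield "client"
    finally:
        print("client closed")


@contextmanager
def get_cursor(client):
    try:
        yield f"cursor of {client}"
    finally:
        print("cursor closed")


# combined form, as in the refactor: cursor exits first, then the client
with get_client() as client, get_cursor(client) as cursor:
    print("using", cursor)
```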
unstructured_ingest/processes/connectors/duckdb/motherduck.py
@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py
@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):
 
         version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
-        with
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
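The S3 indexer change wraps the metadata lookup in a parenthesized multi-manager `with` so that an `AttributeError` from the fsspec client simply leaves `metadata` as the empty dict initialized just above (parenthesized context managers need Python 3.10+). A minimal illustration with a stand-in client:

```python
# Minimal illustration of the suppress-and-continue pattern; FakeClient stands
# in for the fsspec client, which may not implement .metadata().
import contextlib


class FakeClient:
    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    FakeClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # raises AttributeError

print(metadata)  # {} -- the error was swallowed, indexing continues
```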
unstructured_ingest/processes/connectors/gitlab.py
@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/google_drive.py
@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
         recursive: bool = False,
         previous_path: Optional[str] = None,
     ) -> list[dict]:
-
         fields_input = "nextPageToken, files({})".format(",".join(self.fields))
         q = f"'{object_id}' in parents"
         # Filter by extension but still include any directories
@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
         if not self.is_dir(root_info):
             data = [self.map_file_data(root_info)]
         else:
-
             file_contents = self.get_paginated_results(
                 files_client=files_client,
                 object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.data_types.file_data import FileData
@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
     from pyarrow import Table as ArrowTable
     from pyiceberg.catalog.rest import RestCatalog
     from pyiceberg.table import Table, Transaction
@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
             return UserAuthError(e)
         if 400 <= response_code < 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return UserError(e)
         if response_code > 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return ProviderError(e)
         logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
         return self.upload_config.record_id_key in self.get_table_columns()
 
     @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
         import pyarrow as pa
 
         # Iceberg will automatically fill missing columns with nulls
@@ -277,16 +275,20 @@ class IbmWatsonxUploader(SQLUploader):
         except Exception as e:
             raise ProviderError(f"Failed to upload data to table: {e}")
 
-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         data_table = self._df_to_arrow_table(df)
 
         with self.get_table() as table:
             self.upload_data_table(table, data_table, file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path=path)
         self.upload_dataframe(df=df, file_data=file_data)
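`_df_to_arrow_table` now accepts a lazily-typed `"DataFrame"` and returns a pyarrow table. The conversion presumably goes through pyarrow's standard pandas bridge; a hedged sketch (the connector's real method also reconciles columns with the Iceberg table schema):

```python
# Hedged sketch of a DataFrame -> pyarrow Table conversion, not the
# connector's actual implementation.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame([{"element_id": "abc", "text": "hello"}])
table: pa.Table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)
```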
unstructured_ingest/processes/connectors/kdbai.py
@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data_df(path=path)
         self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py
@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/redisdb.py
@@ -143,36 +143,40 @@ class RedisUploader(Uploader):
         await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])
 
     async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
-        async with
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            for element in batch:
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+                if redis_stack:
+                    pipe.json().set(key_with_prefix, "$", element)
+                else:
+                    pipe.set(key_with_prefix, json.dumps(element))
+            await pipe.execute()
 
     @requires_dependencies(["redis"], extras="redis")
     async def _check_redis_stack(self, element: dict) -> bool:
         from redis import exceptions as redis_exceptions
 
         redis_stack = True
-        async with
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+            try:
+                # Redis with stack extension supports JSON type
+                await pipe.json().set(key_with_prefix, "$", element).execute()
+            except redis_exceptions.ResponseError as e:
+                message = str(e)
+                if "unknown command `JSON.SET`" in message:
+                    # if this error occurs, Redis server doesn't support JSON type,
+                    # so save as string type instead
+                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
+                    redis_stack = False
+                else:
+                    raise e
         return redis_stack
 
 
unstructured_ingest/processes/connectors/salesforce.py
@@ -81,7 +81,7 @@ class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
     private_key_path: Optional[Path] = Field(
         default=None,
-        description="Path to the private key file.
+        description="Path to the private key file. Key file is usually named server.key.",
     )
     private_key: Optional[str] = Field(default=None, description="Contents of the private key")
 
unstructured_ingest/processes/connectors/slack.py
@@ -166,8 +166,7 @@ class SlackDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py
@@ -2,6 +2,7 @@ import json
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
@@ -128,6 +129,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @contextmanager
     def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
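The `run` overrides added to this and the following SQL uploaders (Postgres, SingleStore, Snowflake) only delegate to `super().run(...)`; the point is to attach the pandas `requires_dependencies` check to each connector's entry point. A generic sketch of why re-decorating an inherited method requires an override; the decorator and classes here are stand-ins, not the library's SQLUploader hierarchy:

```python
# Stand-in decorator and classes illustrating the delegate-to-super pattern.
import functools
import importlib


def requires_pandas(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        importlib.import_module("pandas")  # raises ImportError if missing
        return fn(*args, **kwargs)

    return wrapper


class SQLUploader:
    def run(self, path: str) -> None:
        print(f"uploading {path}")


class PostgresUploader(SQLUploader):
    @requires_pandas
    def run(self, path: str) -> None:
        # the override adds only the dependency check; behavior is inherited
        super().run(path)


PostgresUploader().run("elements.json")
```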
unstructured_ingest/processes/connectors/sql/postgres.py
@@ -1,9 +1,11 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -144,6 +146,10 @@ class PostgresUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"
 
+    @requires_dependencies(["pandas"], extras="postgres")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
 
 postgres_source_entry = SourceRegistryEntry(
     connection_config=PostgresConnectionConfig,
unstructured_ingest/processes/connectors/sql/singlestore.py
@@ -1,10 +1,12 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -65,12 +67,11 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
-        with self.get_connection() as connection:
-                cursor.close()
+        with self.get_connection() as connection, connection.cursor() as cursor:
+            try:
+                yield cursor
+            finally:
+                cursor.close()
 
 
 class SingleStoreIndexerConfig(SQLIndexerConfig):
@@ -131,6 +132,10 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="singlestore")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
unstructured_ingest/processes/connectors/sql/snowflake.py
@@ -1,6 +1,7 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
@@ -173,6 +174,10 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
+    @requires_dependencies(["pandas"], extras="snowflake")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]