unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -36,9 +36,9 @@ from unstructured_ingest.interfaces import (
|
|
|
36
36
|
from unstructured_ingest.logger import logger
|
|
37
37
|
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
38
38
|
from unstructured_ingest.utils.data_prep import (
|
|
39
|
-
get_data,
|
|
40
39
|
get_data_df,
|
|
41
40
|
get_enhanced_element_id,
|
|
41
|
+
get_json_data,
|
|
42
42
|
split_dataframe,
|
|
43
43
|
write_data,
|
|
44
44
|
)
|
|
@@ -122,8 +122,7 @@ class SQLIndexer(Indexer, ABC):
|
|
|
122
122
|
id_batches: list[frozenset[str]] = [
|
|
123
123
|
frozenset(
|
|
124
124
|
ids[
|
|
125
|
-
i
|
|
126
|
-
* self.index_config.batch_size : (i + 1) # noqa
|
|
125
|
+
i * self.index_config.batch_size : (i + 1) # noqa
|
|
127
126
|
* self.index_config.batch_size
|
|
128
127
|
]
|
|
129
128
|
)
|
|
@@ -272,7 +271,7 @@ class SQLUploadStager(UploadStager):
|
|
|
272
271
|
) -> Path:
|
|
273
272
|
import pandas as pd
|
|
274
273
|
|
|
275
|
-
elements_contents = get_data(path=elements_filepath)
|
|
274
|
+
elements_contents = get_json_data(path=elements_filepath)
|
|
276
275
|
|
|
277
276
|
df = pd.DataFrame(
|
|
278
277
|
data=[
|
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Generator
|
|
|
6
6
|
|
|
7
7
|
from pydantic import Field, Secret, model_validator
|
|
8
8
|
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
10
|
from unstructured_ingest.logger import logger
|
|
10
11
|
from unstructured_ingest.processes.connector_registry import (
|
|
11
12
|
DestinationRegistryEntry,
|
|
@@ -133,6 +134,10 @@ class SQLiteUploader(SQLUploader):
|
|
|
133
134
|
connection_config: SQLiteConnectionConfig
|
|
134
135
|
connector_type: str = CONNECTOR_TYPE
|
|
135
136
|
|
|
137
|
+
@requires_dependencies(["pandas"])
|
|
138
|
+
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
139
|
+
super().run(path=path, file_data=file_data, **kwargs)
|
|
140
|
+
|
|
136
141
|
@requires_dependencies(["pandas"])
|
|
137
142
|
def prepare_data(
|
|
138
143
|
self, columns: list[str], data: tuple[tuple[Any, ...], ...]
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from contextlib import contextmanager
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import TYPE_CHECKING, Any, Optional
|
|
4
5
|
|
|
5
6
|
from pydantic import Field, Secret
|
|
@@ -68,9 +69,8 @@ class VastdbConnectionConfig(SQLConnectionConfig):
|
|
|
68
69
|
|
|
69
70
|
@contextmanager
|
|
70
71
|
def get_cursor(self) -> "VastdbTransaction":
|
|
71
|
-
with self.get_connection() as connection:
|
|
72
|
-
|
|
73
|
-
yield transaction
|
|
72
|
+
with self.get_connection() as connection, connection.transaction() as transaction:
|
|
73
|
+
yield transaction
|
|
74
74
|
|
|
75
75
|
@contextmanager
|
|
76
76
|
def get_table(self, table_name: str) -> "VastdbTable":
|
|
@@ -190,6 +190,10 @@ class VastdbUploader(SQLUploader):
|
|
|
190
190
|
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
191
191
|
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
192
192
|
|
|
193
|
+
@requires_dependencies(["pandas"], extras="vastdb")
|
|
194
|
+
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
195
|
+
super().run(path=path, file_data=file_data, **kwargs)
|
|
196
|
+
|
|
193
197
|
@requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
|
|
194
198
|
def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
|
|
195
199
|
import numpy as np
|
|
@@ -108,7 +108,6 @@ class VectaraUploaderConfig(UploaderConfig):
|
|
|
108
108
|
|
|
109
109
|
@dataclass
|
|
110
110
|
class VectaraUploader(Uploader):
|
|
111
|
-
|
|
112
111
|
connector_type: str = CONNECTOR_TYPE
|
|
113
112
|
upload_config: VectaraUploaderConfig
|
|
114
113
|
connection_config: VectaraConnectionConfig
|
|
@@ -336,7 +335,6 @@ class VectaraUploader(Uploader):
|
|
|
336
335
|
file_data: FileData,
|
|
337
336
|
**kwargs: Any,
|
|
338
337
|
) -> None:
|
|
339
|
-
|
|
340
338
|
logger.info(f"inserting / updating {len(data)} documents to Vectara ")
|
|
341
339
|
await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
|
|
342
340
|
|
|
@@ -53,7 +53,6 @@ class ZendeskConnectionConfig(ConnectionConfig):
|
|
|
53
53
|
access_config: Secret[ZendeskAccessConfig]
|
|
54
54
|
|
|
55
55
|
def get_client(self) -> ZendeskClient:
|
|
56
|
-
|
|
57
56
|
access_config = self.access_config.get_secret_value()
|
|
58
57
|
|
|
59
58
|
return ZendeskClient(
|
|
@@ -206,7 +205,6 @@ class ZendeskDownloader(Downloader):
|
|
|
206
205
|
await f.write(comment.as_text())
|
|
207
206
|
|
|
208
207
|
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
209
|
-
|
|
210
208
|
zendesk_filedata = ZendeskFileData.cast(file_data=file_data)
|
|
211
209
|
|
|
212
210
|
item_type = zendesk_filedata.additional_metadata.item_type
|
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
|
6
6
|
from pydantic import BaseModel, Field, SecretStr
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.interfaces.process import BaseProcess
|
|
9
|
-
from unstructured_ingest.utils.data_prep import get_data
|
|
9
|
+
from unstructured_ingest.utils.data_prep import get_json_data
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
12
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
@@ -192,7 +192,7 @@ class Embedder(BaseProcess, ABC):
|
|
|
192
192
|
def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
|
|
193
193
|
# TODO update base embedder classes to support async
|
|
194
194
|
embedder = self.config.get_embedder()
|
|
195
|
-
elements = get_data(path=elements_filepath)
|
|
195
|
+
elements = get_json_data(path=elements_filepath)
|
|
196
196
|
if not elements:
|
|
197
197
|
return []
|
|
198
198
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
@@ -13,7 +13,7 @@ from unstructured_ingest.logger import logger
|
|
|
13
13
|
class FiltererConfig(BaseModel):
|
|
14
14
|
file_glob: Optional[list[str]] = Field(
|
|
15
15
|
default=None,
|
|
16
|
-
description="file globs to limit which data_types of
|
|
16
|
+
description="file globs to limit which data_types of files are accepted",
|
|
17
17
|
examples=["*.pdf", "*.html"],
|
|
18
18
|
)
|
|
19
19
|
max_file_size: Optional[int] = Field(
|
|
@@ -68,6 +68,9 @@ class PartitionerConfig(BaseModel):
|
|
|
68
68
|
description="Use a remote API to partition the files."
|
|
69
69
|
" Otherwise, use the function from partition.auto",
|
|
70
70
|
)
|
|
71
|
+
api_timeout_ms: Optional[int] = Field(
|
|
72
|
+
default=None, description="Timeout in milliseconds for all api call during partitioning."
|
|
73
|
+
)
|
|
71
74
|
api_key: Optional[SecretStr] = Field(
|
|
72
75
|
default=None, description="API Key for partition endpoint."
|
|
73
76
|
)
|
|
@@ -188,6 +191,7 @@ class Partitioner(BaseProcess, ABC):
|
|
|
188
191
|
api_key=self.config.api_key.get_secret_value(),
|
|
189
192
|
filename=filename,
|
|
190
193
|
api_parameters=self.config.to_partition_kwargs(),
|
|
194
|
+
timeout_ms=self.config.api_timeout_ms,
|
|
191
195
|
)
|
|
192
196
|
|
|
193
197
|
# Append the data source metadata the auto partition does for you
|
|
@@ -4,7 +4,7 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from unstructured_ingest.data_types.file_data import FileData
|
|
6
6
|
from unstructured_ingest.interfaces import UploadStager, UploadStagerConfig
|
|
7
|
-
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
7
|
+
from unstructured_ingest.utils.data_prep import get_json_data, write_data
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class BlobStoreUploadStagerConfig(UploadStagerConfig):
|
|
@@ -27,6 +27,6 @@ class BlobStoreUploadStager(UploadStager):
|
|
|
27
27
|
) -> Path:
|
|
28
28
|
output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
|
|
29
29
|
# Always save as json
|
|
30
|
-
data = get_data(elements_filepath)
|
|
30
|
+
data = get_json_data(elements_filepath)
|
|
31
31
|
write_data(path=output_file.with_suffix(".json"), data=data)
|
|
32
32
|
return output_file.with_suffix(".json")
|
|
@@ -80,7 +80,11 @@ def wrap_error(e: Exception) -> Exception:
|
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
async def call_api_async(
|
|
83
|
-
server_url: Optional[str],
|
|
83
|
+
server_url: Optional[str],
|
|
84
|
+
api_key: Optional[str],
|
|
85
|
+
filename: Path,
|
|
86
|
+
api_parameters: dict,
|
|
87
|
+
timeout_ms: Optional[int] = None,
|
|
84
88
|
) -> list[dict]:
|
|
85
89
|
"""Call the Unstructured API using unstructured-client.
|
|
86
90
|
|
|
@@ -94,13 +98,10 @@ async def call_api_async(
|
|
|
94
98
|
"""
|
|
95
99
|
from unstructured_client import UnstructuredClient
|
|
96
100
|
|
|
97
|
-
client = UnstructuredClient(
|
|
98
|
-
server_url=server_url,
|
|
99
|
-
api_key_auth=api_key,
|
|
100
|
-
)
|
|
101
|
+
client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
|
|
101
102
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
102
103
|
try:
|
|
103
|
-
res = await client.general.partition_async(request=partition_request)
|
|
104
|
+
res = await client.general.partition_async(request=partition_request, timeout_ms=timeout_ms)
|
|
104
105
|
except Exception as e:
|
|
105
106
|
raise wrap_error(e)
|
|
106
107
|
|
|
@@ -108,7 +109,11 @@ async def call_api_async(
|
|
|
108
109
|
|
|
109
110
|
|
|
110
111
|
def call_api(
|
|
111
|
-
server_url: Optional[str],
|
|
112
|
+
server_url: Optional[str],
|
|
113
|
+
api_key: Optional[str],
|
|
114
|
+
filename: Path,
|
|
115
|
+
api_parameters: dict,
|
|
116
|
+
timeout_ms: Optional[int] = None,
|
|
112
117
|
) -> list[dict]:
|
|
113
118
|
"""Call the Unstructured API using unstructured-client.
|
|
114
119
|
|
|
@@ -128,7 +133,7 @@ def call_api(
|
|
|
128
133
|
)
|
|
129
134
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
130
135
|
try:
|
|
131
|
-
res = client.general.partition(request=partition_request)
|
|
136
|
+
res = client.general.partition(request=partition_request, timeout_ms=timeout_ms)
|
|
132
137
|
except Exception as e:
|
|
133
138
|
raise wrap_error(e)
|
|
134
139
|
|
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
import json
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar,
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
|
|
6
6
|
from uuid import NAMESPACE_DNS, uuid5
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.data_types.file_data import FileData
|
|
@@ -171,15 +171,13 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
|
|
|
171
171
|
raise IOError("Unsupported file type: {path}")
|
|
172
172
|
|
|
173
173
|
|
|
174
|
-
def get_data(path: Union[Path, str]) -> list[dict]:
|
|
175
|
-
if isinstance(path, str):
|
|
176
|
-
path = Path(path)
|
|
177
|
-
try:
|
|
178
|
-
return get_data_by_suffix(path=path)
|
|
179
|
-
except Exception as e:
|
|
180
|
-
logger.warning(f"failed to read {path} by extension: {e}")
|
|
181
|
-
# Fall back
|
|
174
|
+
def get_json_data(path: Path) -> list[dict]:
|
|
182
175
|
with path.open() as f:
|
|
176
|
+
# Attempt by prefix
|
|
177
|
+
if path.suffix == ".json":
|
|
178
|
+
return json.load(f)
|
|
179
|
+
elif path.suffix == ".ndjson":
|
|
180
|
+
return ndjson.load(f)
|
|
183
181
|
try:
|
|
184
182
|
return json.load(f)
|
|
185
183
|
except Exception as e:
|
|
@@ -188,29 +186,7 @@ def get_data(path: Union[Path, str]) -> list[dict]:
|
|
|
188
186
|
return ndjson.load(f)
|
|
189
187
|
except Exception as e:
|
|
190
188
|
logger.warning(f"failed to read {path} as ndjson: {e}")
|
|
191
|
-
|
|
192
|
-
import pandas as pd
|
|
193
|
-
|
|
194
|
-
try:
|
|
195
|
-
df = pd.read_csv(path)
|
|
196
|
-
return df.to_dict(orient="records")
|
|
197
|
-
except Exception as e:
|
|
198
|
-
logger.warning(f"failed to read {path} as csv: {e}")
|
|
199
|
-
try:
|
|
200
|
-
df = pd.read_parquet(path)
|
|
201
|
-
return df.to_dict(orient="records")
|
|
202
|
-
except Exception as e:
|
|
203
|
-
logger.warning(f"failed to read {path} as parquet: {e}")
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
def get_json_data(path: Path) -> list[dict]:
|
|
207
|
-
with path.open() as f:
|
|
208
|
-
if path.suffix == ".json":
|
|
209
|
-
return json.load(f)
|
|
210
|
-
elif path.suffix == ".ndjson":
|
|
211
|
-
return ndjson.load(f)
|
|
212
|
-
else:
|
|
213
|
-
raise ValueError(f"Unsupported file type: {path}")
|
|
189
|
+
raise ValueError(f"Unsupported json file: {path}")
|
|
214
190
|
|
|
215
191
|
|
|
216
192
|
@requires_dependencies(["pandas"])
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unstructured_ingest
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Local ETL data pipeline to get data RAG ready
|
|
5
|
+
Author-email: Unstructured Technologies <devops@unstructuredai.io>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
License-File: LICENSE.md
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Education
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: <3.13,>=3.9
|
|
21
|
+
Requires-Dist: click
|
|
22
|
+
Requires-Dist: dataclasses-json
|
|
23
|
+
Requires-Dist: opentelemetry-sdk
|
|
24
|
+
Requires-Dist: pydantic>=2.7
|
|
25
|
+
Requires-Dist: python-dateutil
|
|
26
|
+
Requires-Dist: tqdm
|
|
27
|
+
Provides-Extra: airtable
|
|
28
|
+
Requires-Dist: pandas; extra == 'airtable'
|
|
29
|
+
Requires-Dist: pyairtable; extra == 'airtable'
|
|
30
|
+
Provides-Extra: astradb
|
|
31
|
+
Requires-Dist: astrapy; extra == 'astradb'
|
|
32
|
+
Provides-Extra: azure
|
|
33
|
+
Requires-Dist: adlfs; extra == 'azure'
|
|
34
|
+
Requires-Dist: fsspec; extra == 'azure'
|
|
35
|
+
Provides-Extra: azure-ai-search
|
|
36
|
+
Requires-Dist: azure-search-documents; extra == 'azure-ai-search'
|
|
37
|
+
Provides-Extra: bedrock
|
|
38
|
+
Requires-Dist: aioboto3; extra == 'bedrock'
|
|
39
|
+
Requires-Dist: boto3; extra == 'bedrock'
|
|
40
|
+
Provides-Extra: biomed
|
|
41
|
+
Requires-Dist: bs4; extra == 'biomed'
|
|
42
|
+
Requires-Dist: requests; extra == 'biomed'
|
|
43
|
+
Provides-Extra: box
|
|
44
|
+
Requires-Dist: boxfs; extra == 'box'
|
|
45
|
+
Requires-Dist: fsspec; extra == 'box'
|
|
46
|
+
Provides-Extra: chroma
|
|
47
|
+
Requires-Dist: chromadb; extra == 'chroma'
|
|
48
|
+
Provides-Extra: clarifai
|
|
49
|
+
Requires-Dist: clarifai; extra == 'clarifai'
|
|
50
|
+
Provides-Extra: confluence
|
|
51
|
+
Requires-Dist: atlassian-python-api; extra == 'confluence'
|
|
52
|
+
Requires-Dist: requests; extra == 'confluence'
|
|
53
|
+
Provides-Extra: couchbase
|
|
54
|
+
Requires-Dist: couchbase; extra == 'couchbase'
|
|
55
|
+
Provides-Extra: databricks-delta-tables
|
|
56
|
+
Requires-Dist: databricks-sql-connector; extra == 'databricks-delta-tables'
|
|
57
|
+
Requires-Dist: pandas; extra == 'databricks-delta-tables'
|
|
58
|
+
Provides-Extra: databricks-volumes
|
|
59
|
+
Requires-Dist: databricks-sdk; extra == 'databricks-volumes'
|
|
60
|
+
Provides-Extra: delta-table
|
|
61
|
+
Requires-Dist: boto3; extra == 'delta-table'
|
|
62
|
+
Requires-Dist: deltalake; extra == 'delta-table'
|
|
63
|
+
Requires-Dist: pandas; extra == 'delta-table'
|
|
64
|
+
Provides-Extra: discord
|
|
65
|
+
Requires-Dist: discord-py; extra == 'discord'
|
|
66
|
+
Provides-Extra: doc
|
|
67
|
+
Requires-Dist: unstructured[doc]; extra == 'doc'
|
|
68
|
+
Provides-Extra: docx
|
|
69
|
+
Requires-Dist: unstructured[docx]; extra == 'docx'
|
|
70
|
+
Provides-Extra: dropbox
|
|
71
|
+
Requires-Dist: dropboxdrivefs; extra == 'dropbox'
|
|
72
|
+
Requires-Dist: fsspec; extra == 'dropbox'
|
|
73
|
+
Provides-Extra: duckdb
|
|
74
|
+
Requires-Dist: duckdb; extra == 'duckdb'
|
|
75
|
+
Requires-Dist: pandas; extra == 'duckdb'
|
|
76
|
+
Provides-Extra: elasticsearch
|
|
77
|
+
Requires-Dist: elasticsearch[async]; extra == 'elasticsearch'
|
|
78
|
+
Provides-Extra: epub
|
|
79
|
+
Requires-Dist: unstructured[epub]; extra == 'epub'
|
|
80
|
+
Provides-Extra: gcs
|
|
81
|
+
Requires-Dist: bs4; extra == 'gcs'
|
|
82
|
+
Requires-Dist: fsspec; extra == 'gcs'
|
|
83
|
+
Requires-Dist: gcsfs; extra == 'gcs'
|
|
84
|
+
Provides-Extra: github
|
|
85
|
+
Requires-Dist: pygithub>1.58.0; extra == 'github'
|
|
86
|
+
Requires-Dist: requests; extra == 'github'
|
|
87
|
+
Provides-Extra: gitlab
|
|
88
|
+
Requires-Dist: python-gitlab; extra == 'gitlab'
|
|
89
|
+
Provides-Extra: google-drive
|
|
90
|
+
Requires-Dist: google-api-python-client; extra == 'google-drive'
|
|
91
|
+
Provides-Extra: hubspot
|
|
92
|
+
Requires-Dist: hubspot-api-client; extra == 'hubspot'
|
|
93
|
+
Requires-Dist: urllib3; extra == 'hubspot'
|
|
94
|
+
Provides-Extra: huggingface
|
|
95
|
+
Requires-Dist: sentence-transformers; extra == 'huggingface'
|
|
96
|
+
Provides-Extra: ibm-watsonx-s3
|
|
97
|
+
Requires-Dist: httpx; extra == 'ibm-watsonx-s3'
|
|
98
|
+
Requires-Dist: pandas; extra == 'ibm-watsonx-s3'
|
|
99
|
+
Requires-Dist: pyarrow; extra == 'ibm-watsonx-s3'
|
|
100
|
+
Requires-Dist: pyiceberg; extra == 'ibm-watsonx-s3'
|
|
101
|
+
Requires-Dist: tenacity; extra == 'ibm-watsonx-s3'
|
|
102
|
+
Provides-Extra: image
|
|
103
|
+
Requires-Dist: unstructured[image]; extra == 'image'
|
|
104
|
+
Provides-Extra: jira
|
|
105
|
+
Requires-Dist: atlassian-python-api; extra == 'jira'
|
|
106
|
+
Provides-Extra: kafka
|
|
107
|
+
Requires-Dist: confluent-kafka; extra == 'kafka'
|
|
108
|
+
Provides-Extra: kdbai
|
|
109
|
+
Requires-Dist: kdbai-client>=1.4.0; extra == 'kdbai'
|
|
110
|
+
Requires-Dist: pandas; extra == 'kdbai'
|
|
111
|
+
Provides-Extra: lancedb
|
|
112
|
+
Requires-Dist: lancedb; extra == 'lancedb'
|
|
113
|
+
Provides-Extra: md
|
|
114
|
+
Requires-Dist: unstructured[md]; extra == 'md'
|
|
115
|
+
Provides-Extra: milvus
|
|
116
|
+
Requires-Dist: pymilvus; extra == 'milvus'
|
|
117
|
+
Provides-Extra: mixedbreadai
|
|
118
|
+
Requires-Dist: mixedbread-ai; extra == 'mixedbreadai'
|
|
119
|
+
Provides-Extra: mongodb
|
|
120
|
+
Requires-Dist: pymongo; extra == 'mongodb'
|
|
121
|
+
Provides-Extra: msg
|
|
122
|
+
Requires-Dist: unstructured[msg]; extra == 'msg'
|
|
123
|
+
Provides-Extra: neo4j
|
|
124
|
+
Requires-Dist: cymple; extra == 'neo4j'
|
|
125
|
+
Requires-Dist: neo4j-rust-ext; extra == 'neo4j'
|
|
126
|
+
Requires-Dist: networkx; extra == 'neo4j'
|
|
127
|
+
Provides-Extra: notion
|
|
128
|
+
Requires-Dist: backoff; extra == 'notion'
|
|
129
|
+
Requires-Dist: htmlbuilder; extra == 'notion'
|
|
130
|
+
Requires-Dist: httpx; extra == 'notion'
|
|
131
|
+
Requires-Dist: notion-client; extra == 'notion'
|
|
132
|
+
Provides-Extra: octoai
|
|
133
|
+
Requires-Dist: openai; extra == 'octoai'
|
|
134
|
+
Requires-Dist: tiktoken; extra == 'octoai'
|
|
135
|
+
Provides-Extra: odt
|
|
136
|
+
Requires-Dist: unstructured[odt]; extra == 'odt'
|
|
137
|
+
Provides-Extra: onedrive
|
|
138
|
+
Requires-Dist: msal; extra == 'onedrive'
|
|
139
|
+
Requires-Dist: office365-rest-python-client; extra == 'onedrive'
|
|
140
|
+
Requires-Dist: requests; extra == 'onedrive'
|
|
141
|
+
Provides-Extra: openai
|
|
142
|
+
Requires-Dist: openai; extra == 'openai'
|
|
143
|
+
Requires-Dist: tiktoken; extra == 'openai'
|
|
144
|
+
Provides-Extra: opensearch
|
|
145
|
+
Requires-Dist: opensearch-py; extra == 'opensearch'
|
|
146
|
+
Provides-Extra: org
|
|
147
|
+
Requires-Dist: unstructured[org]; extra == 'org'
|
|
148
|
+
Provides-Extra: outlook
|
|
149
|
+
Requires-Dist: msal; extra == 'outlook'
|
|
150
|
+
Requires-Dist: office365-rest-python-client; extra == 'outlook'
|
|
151
|
+
Provides-Extra: pdf
|
|
152
|
+
Requires-Dist: unstructured[pdf]; extra == 'pdf'
|
|
153
|
+
Provides-Extra: pinecone
|
|
154
|
+
Requires-Dist: pinecone; extra == 'pinecone'
|
|
155
|
+
Provides-Extra: postgres
|
|
156
|
+
Requires-Dist: pandas; extra == 'postgres'
|
|
157
|
+
Requires-Dist: psycopg2-binary; extra == 'postgres'
|
|
158
|
+
Provides-Extra: ppt
|
|
159
|
+
Requires-Dist: unstructured[ppt]; extra == 'ppt'
|
|
160
|
+
Provides-Extra: pptx
|
|
161
|
+
Requires-Dist: unstructured[pptx]; extra == 'pptx'
|
|
162
|
+
Provides-Extra: qdrant
|
|
163
|
+
Requires-Dist: qdrant-client; extra == 'qdrant'
|
|
164
|
+
Provides-Extra: reddit
|
|
165
|
+
Requires-Dist: praw; extra == 'reddit'
|
|
166
|
+
Provides-Extra: redis
|
|
167
|
+
Requires-Dist: redis; extra == 'redis'
|
|
168
|
+
Provides-Extra: remote
|
|
169
|
+
Requires-Dist: unstructured-client>=0.30.0; extra == 'remote'
|
|
170
|
+
Provides-Extra: rst
|
|
171
|
+
Requires-Dist: unstructured[rst]; extra == 'rst'
|
|
172
|
+
Provides-Extra: rtf
|
|
173
|
+
Requires-Dist: unstructured[rtf]; extra == 'rtf'
|
|
174
|
+
Provides-Extra: s3
|
|
175
|
+
Requires-Dist: fsspec; extra == 's3'
|
|
176
|
+
Requires-Dist: s3fs; extra == 's3'
|
|
177
|
+
Provides-Extra: salesforce
|
|
178
|
+
Requires-Dist: simple-salesforce; extra == 'salesforce'
|
|
179
|
+
Provides-Extra: sftp
|
|
180
|
+
Requires-Dist: fsspec; extra == 'sftp'
|
|
181
|
+
Requires-Dist: paramiko; extra == 'sftp'
|
|
182
|
+
Provides-Extra: sharepoint
|
|
183
|
+
Requires-Dist: msal; extra == 'sharepoint'
|
|
184
|
+
Requires-Dist: office365-rest-python-client; extra == 'sharepoint'
|
|
185
|
+
Requires-Dist: requests; extra == 'sharepoint'
|
|
186
|
+
Provides-Extra: singlestore
|
|
187
|
+
Requires-Dist: pandas; extra == 'singlestore'
|
|
188
|
+
Requires-Dist: singlestoredb; extra == 'singlestore'
|
|
189
|
+
Provides-Extra: slack
|
|
190
|
+
Requires-Dist: slack-sdk[optional]; extra == 'slack'
|
|
191
|
+
Provides-Extra: snowflake
|
|
192
|
+
Requires-Dist: pandas; extra == 'snowflake'
|
|
193
|
+
Requires-Dist: psycopg2-binary; extra == 'snowflake'
|
|
194
|
+
Requires-Dist: snowflake-connector-python; extra == 'snowflake'
|
|
195
|
+
Provides-Extra: togetherai
|
|
196
|
+
Requires-Dist: together; extra == 'togetherai'
|
|
197
|
+
Provides-Extra: tsv
|
|
198
|
+
Requires-Dist: unstructured[tsv]; extra == 'tsv'
|
|
199
|
+
Provides-Extra: vastdb
|
|
200
|
+
Requires-Dist: ibis; extra == 'vastdb'
|
|
201
|
+
Requires-Dist: pandas; extra == 'vastdb'
|
|
202
|
+
Requires-Dist: pyarrow; extra == 'vastdb'
|
|
203
|
+
Requires-Dist: vastdb; extra == 'vastdb'
|
|
204
|
+
Provides-Extra: vectara
|
|
205
|
+
Requires-Dist: aiofiles; extra == 'vectara'
|
|
206
|
+
Requires-Dist: httpx; extra == 'vectara'
|
|
207
|
+
Requires-Dist: requests; extra == 'vectara'
|
|
208
|
+
Provides-Extra: vertexai
|
|
209
|
+
Requires-Dist: vertexai; extra == 'vertexai'
|
|
210
|
+
Provides-Extra: voyageai
|
|
211
|
+
Requires-Dist: voyageai; extra == 'voyageai'
|
|
212
|
+
Provides-Extra: weaviate
|
|
213
|
+
Requires-Dist: weaviate-client; extra == 'weaviate'
|
|
214
|
+
Provides-Extra: wikipedia
|
|
215
|
+
Requires-Dist: wikipedia; extra == 'wikipedia'
|
|
216
|
+
Provides-Extra: xlsx
|
|
217
|
+
Requires-Dist: unstructured[xlsx]; extra == 'xlsx'
|
|
218
|
+
Provides-Extra: zendesk
|
|
219
|
+
Requires-Dist: aiofiles; extra == 'zendesk'
|
|
220
|
+
Requires-Dist: bs4; extra == 'zendesk'
|
|
221
|
+
Requires-Dist: httpx; extra == 'zendesk'
|
|
222
|
+
Description-Content-Type: text/markdown
|
|
223
|
+
|
|
224
|
+
# Unstructured Ingest
|
|
225
|
+
|
|
226
|
+
For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
|