unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.7.1"  # pragma: no cover
+__version__ = "1.0.1"  # pragma: no cover
unstructured_ingest/cli/README.md
@@ -0,0 +1,28 @@
+# Ingest CLI
+This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+## Design Reference
+[cli.py](cli.py) is the main entrypoint to run the cli itself. The key point for this is the interaction between all
+source and destination connectors.
+
+To manually run the cli:
+```shell
+PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+```
+
+The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+### Source Commands
+All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+different connectors with shared and unique parameters.
+
+### Destination Commands
+All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
+generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+### Configs
+The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+are needed to marshall the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+into dataclasses that have nested fields (such as access configs).
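The `extract_config` behavior described in the new README can be illustrated with a small sketch. The dataclasses and the simplified helper below are hypothetical stand-ins, not the actual code in `cli/utils.py`; they only show the idea of rebuilding nested configs (such as access configs) from the flat dictionary Click produces.

```python
from dataclasses import dataclass, fields, is_dataclass
from typing import Any, Type, TypeVar

T = TypeVar("T")


@dataclass
class AccessConfig:  # hypothetical example config
    api_key: str


@dataclass
class ConnectorConfig:  # hypothetical example config with a nested field
    remote_url: str
    access_config: AccessConfig


def extract_config(flat: dict[str, Any], config_cls: Type[T]) -> T:
    """Recursively pull the fields a dataclass needs out of a flat dict of CLI options."""
    kwargs = {}
    for f in fields(config_cls):
        if is_dataclass(f.type):
            # Nested dataclass fields get rebuilt from the same flat dict.
            kwargs[f.name] = extract_config(flat, f.type)
        elif f.name in flat:
            kwargs[f.name] = flat[f.name]
    return config_cls(**kwargs)


# Click hands back one flat dict; the helper rebuilds the nested config objects.
cli_options = {"remote_url": "s3://bucket/path", "api_key": "example-key"}
config = extract_config(cli_options, ConnectorConfig)
```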
unstructured_ingest/embed/mixedbreadai.py
CHANGED
@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
 @dataclass
 class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
     config: MixedbreadAIEmbeddingConfig
 
     async def get_exemplary_embedding(self) -> list[float]:
unstructured_ingest/interfaces/upload_stager.py
CHANGED
@@ -8,7 +8,7 @@ from pydantic import BaseModel
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseProcess
 from unstructured_ingest.utils import ndjson
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
 
 
 class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        elements_contents =
+        elements_contents = get_json_data(path=input_file)
 
         conformed_elements = [
             self.conform_dict(element_dict=element, file_data=file_data)
unstructured_ingest/interfaces/uploader.py
CHANGED
@@ -7,7 +7,7 @@ from pydantic import BaseModel
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import BaseConnector, BaseProcess
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_json_data
 
 
 class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         return False
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         await self.run_data_async(data=data, file_data=file_data, **kwargs)
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
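Several interfaces and connectors in this release (upload stager, uploader, AstraDB, DuckDB) replace inline file reads with a shared `get_json_data` helper from `utils/data_prep.py`. The sketch below is only an assumption of what such a helper might look like, returning a `list[dict]` of serialized elements from either a JSON array or an NDJSON file; the actual implementation in the package may differ.

```python
import json
from pathlib import Path


def get_json_data(path: Path) -> list[dict]:
    """Hypothetical sketch: read serialized elements from a JSON array or NDJSON file."""
    if path.suffix == ".ndjson":
        with path.open() as f:
            # One JSON object per line; skip blank lines.
            return [json.loads(line) for line in f if line.strip()]
    with path.open() as f:
        data = json.load(f)
    return data if isinstance(data, list) else [data]
```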
unstructured_ingest/logger.py
CHANGED
@@ -1,99 +1,8 @@
-import ast
-import json
 import logging
-import typing as t
 
 logger = logging.getLogger("unstructured_ingest")
 
 
-def default_is_data_sensitive(k: str, v: t.Any) -> bool:
-    sensitive_fields = [
-        "account_name",
-        "client_id",
-    ]
-    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
-    return (
-        v
-        and any([s in k.lower() for s in sensitive_triggers])  # noqa: C419
-        or k.lower() in sensitive_fields
-    )
-
-
-def hide_sensitive_fields(
-    data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
-) -> dict:
-    """
-    Will recursively look through every k, v pair in this dict and any nested ones and run
-    is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
-    any string value can be parsed as valid json and process that dict as well and replace
-    the original string with the json.dumps() version of the redacted dict.
-    """
-    new_data = data.copy()
-    for k, v in new_data.items():
-        if is_sensitive_fn(k, v):
-            new_data[k] = "*******"
-        if isinstance(v, dict):
-            new_data[k] = hide_sensitive_fields(v)
-        if isinstance(v, str):
-            # Need to take into account strings generated via json.dumps() or simply printing a dict
-            try:
-                json_data = json.loads(v)
-                if isinstance(json_data, dict):
-                    updated_data = hide_sensitive_fields(json_data)
-                    new_data[k] = json.dumps(updated_data)
-            except json.JSONDecodeError:
-                pass
-
-    return new_data
-
-
-def redact_jsons(s: str) -> str:
-    """
-    Takes in a generic string and pulls out all valid json content. Leverages
-    hide_sensitive_fields() to redact any sensitive information and replaces the
-    original json with the new redacted format. There can be any number of valid
-    jsons in a generic string and this will work. Having extra '{' without a
-    closing '}' will cause this to break though. i.e '{ text, {"a": 3}'.
-
-    """
-    chars = list(s)
-    if "{" not in chars:
-        return s
-    i = 0
-    jsons = []
-    i = 0
-    while i < len(chars):
-        char = chars[i]
-        if char == "{":
-            stack = [char]
-            current = [char]
-            while len(stack) != 0 and i < len(chars):
-                i += 1
-                char = chars[i]
-                current.append(char)
-                if char == "{":
-                    stack.append(char)
-                if char == "}":
-                    stack.pop(-1)
-            jsons.append("".join(current))
-            continue
-        i += 1
-    for j in jsons:
-        try:
-            formatted_j = json.dumps(json.loads(j))
-        except json.JSONDecodeError:
-            formatted_j = json.dumps(ast.literal_eval(j))
-        hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
-        s = s.replace(j, hidden_j)
-    return s
-
-
-class SensitiveFormatter(logging.Formatter):
-    def format(self, record):
-        s = super().format(record=record)
-        return redact_jsons(s)
-
-
 def remove_root_handlers(logger: logging.Logger) -> None:
     # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
@@ -106,7 +15,7 @@ def remove_root_handlers(logger: logging.Logger) -> None:
 def ingest_log_streaming_init(level: int) -> None:
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
-    formatter =
+    formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
     handler.setFormatter(formatter)
 
     # Only want to add the handler once
@@ -122,7 +31,7 @@ def make_default_logger(level: int) -> logging.Logger:
     logger = logging.getLogger("unstructured_ingest")
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
-    formatter =
+    formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(level)
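With the redaction helpers and `SensitiveFormatter` removed, both logger setup paths fall back to a plain `logging.Formatter` with the format string shown in the diff. The following is a minimal sketch of the resulting wiring; the real functions also guard against adding duplicate handlers, and the return value here is an assumption.

```python
import logging


def make_default_logger(level: int) -> logging.Logger:
    """Minimal sketch: plain formatter, no JSON-redaction layer."""
    logger = logging.getLogger("unstructured_ingest")
    handler = logging.StreamHandler()
    handler.name = "ingest_log_handler"
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger  # assumed; the packaged function may handle this differently


log = make_default_logger(logging.INFO)
log.info("pipeline started")
```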
unstructured_ingest/main.py
CHANGED
File without changes
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -119,7 +119,7 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"calling {self.__class__.__name__}
+                f"calling {self.__class__.__name__} with {len(iterable)} docs",  # type: ignore
             )
         else:
             logger.info(f"calling {self.__class__.__name__} with no inputs")
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -220,7 +220,7 @@ class Pipeline:
 
     def _run(self):
         logger.info(
-            f"running local pipeline: {self} with configs:
+            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
         )
         if self.context.mp_supported:
             manager = mp.Manager()
unstructured_ingest/processes/chunker.py
CHANGED
@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
         default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
+    chunk_api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during chunking."
+    )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
     chunk_api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for chunking endpoint."
@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
             api_key=self.config.chunk_api_key.get_secret_value(),
             filename=elements_filepath,
             api_parameters=self.config.to_chunking_kwargs(),
+            timeout_ms=self.config.chunk_api_timeout_ms,
         )
 
         elements = assign_and_map_hash_ids(elements=elements)
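A hedged usage sketch for the new `chunk_api_timeout_ms` field: the field names come straight from the diff above, while the import path and the assumption that the remaining `ChunkerConfig` fields have defaults are untested here.

```python
from unstructured_ingest.processes.chunker import ChunkerConfig

# Chunk via the hosted API and cap every chunking API call at 30 seconds;
# leaving chunk_api_timeout_ms as None keeps the previous behavior (no explicit timeout).
chunker_config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",
    chunk_api_timeout_ms=30_000,
)
```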
unstructured_ingest/processes/connectors/airtable.py
CHANGED
@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-import pandas
 from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
+    @requires_dependencies(["pandas"], extras="airtable")
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        import pandas as pd
+
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
-        df =
+        df = pd.DataFrame.from_dict(
             data=[self._table_row_to_dict(table_row=row) for row in table_contents]
         ).sort_index(axis=1)
         download_path = self.get_download_path(file_data=file_data)
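The Airtable change above is one instance of a pattern applied across several connectors in this release (Delta Table, DuckDB, MotherDuck, KDB.AI, IBM watsonx.data): the module-level `import pandas` is dropped and the import happens inside the decorated method. The decorator below is a hypothetical stand-in for `requires_dependencies`, not the implementation in `utils/dep_check.py`; it only illustrates how a missing optional dependency can be turned into an actionable error while keeping the base install pandas-free.

```python
import functools
import importlib.util
from typing import Callable


def requires_dependencies(deps: list[str], extras: str) -> Callable:
    """Hypothetical stand-in: fail with an install hint if a dependency is absent."""

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing dependencies {missing}; install with "
                    f'pip install "unstructured-ingest[{extras}]"'
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="airtable")
def rows_to_csv(rows: list[dict], path: str) -> None:
    import pandas as pd  # imported lazily, only when the function actually runs

    pd.DataFrame(rows).sort_index(axis=1).to_csv(path, index=False)
```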
unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json
@@ -0,0 +1,23 @@
+{
+  "properties": [
+    {
+      "dataType": [
+        "text"
+      ],
+      "indexFilterable": true,
+      "indexSearchable": true,
+      "name": "record_id",
+      "tokenization": "word"
+    },
+    {
+      "dataType": [
+        "text"
+      ],
+      "indexFilterable": true,
+      "indexSearchable": true,
+      "name": "text",
+      "tokenization": "word"
+    }
+  ],
+  "vectorizer": "none"
+}
unstructured_ingest/processes/connectors/astradb.py
CHANGED
@@ -43,7 +43,7 @@ from unstructured_ingest.processes.connector_registry import (
 )
 from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-from unstructured_ingest.utils.data_prep import batch_generator,
+from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 
@@ -465,7 +465,7 @@ class AstraDBUploader(Uploader):
             collection.insert_many(chunk)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        data =
+        data = get_json_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
 
 
unstructured_ingest/processes/connectors/databricks/volumes_aws.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
 class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint",
     )
     profile: Optional[str] = None
     token: Optional[str] = Field(
unstructured_ingest/processes/connectors/databricks/volumes_azure.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
 class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     azure_workspace_resource_id: Optional[str] = Field(
@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     )
     azure_environment: Optional[str] = Field(
         default=None,
-        description="The Azure environment type for a
+        description="The Azure environment type for a specific set of API endpoints",
         examples=["Public", "UsGov", "China", "Germany"],
     )
 
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py
CHANGED
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
 class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
-        description="The Databricks account ID for the Databricks
+        description="The Databricks account ID for the Databricks accounts endpoint.",
     )
     profile: Optional[str] = None
     google_credentials: Optional[str] = None
unstructured_ingest/processes/connectors/databricks/volumes_table.py
CHANGED
@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
             logger.debug(
-                f"migrating content from {catalog_path} to "
-                f"table {self.upload_config.table_name}"
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
             )
             data = get_json_data(path=path)
             columns = data[0].keys()
unstructured_ingest/processes/connectors/delta_table.py
CHANGED
@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)
unstructured_ingest/processes/connectors/duckdb/base.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Any
 
 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 _COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd
 
-        elements_contents =
+        elements_contents = get_json_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
unstructured_ingest/processes/connectors/duckdb/duckdb.py
CHANGED
@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
-        with self.get_client() as client:
-
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
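The DuckDB and MotherDuck `get_cursor` methods now open the client and its cursor in a single `with` statement inside the `@contextmanager` generator (the same change appears in the MotherDuck connector below). The standalone sketch here reproduces the shape of that pattern directly against the `duckdb` package, assuming, as recent duckdb releases do, that both the connection and the object returned by `cursor()` support the context-manager protocol; it is not the connector's actual config class.

```python
from contextlib import contextmanager
from typing import Generator

import duckdb


@contextmanager
def get_cursor(database: str = ":memory:") -> Generator[duckdb.DuckDBPyConnection, None, None]:
    # One with-statement chains both context managers: the cursor closes first, then the client.
    with duckdb.connect(database) as client, client.cursor() as cursor:
        yield cursor


with get_cursor() as cur:
    cur.execute("SELECT 42")
    print(cur.fetchone())
```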
unstructured_ingest/processes/connectors/duckdb/motherduck.py
CHANGED
@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):
 
     @contextmanager
     def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
-        with self.get_client() as client:
-
-            yield cursor
+        with self.get_client() as client, client.cursor() as cursor:
+            yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path)
         self.upload_dataframe(df=df)
unstructured_ingest/processes/connectors/fsspec/s3.py
CHANGED
@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):
 
         version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
-        with
-
-
+        with (
+            contextlib.suppress(AttributeError),
+            self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+        ):
+            metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
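The S3 indexer now combines `contextlib.suppress(AttributeError)` with the client context manager in one parenthesized `with` statement (a Python 3.10+ form), so a client that cannot provide metadata simply leaves the dict empty. A self-contained sketch with a dummy client standing in for the fsspec one:

```python
import contextlib


class DummyClient:
    """Stand-in for the fsspec client; it has no metadata() method, so AttributeError is raised."""

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False


metadata: dict[str, str] = {}
with (
    contextlib.suppress(AttributeError),
    DummyClient() as client,
):
    metadata = client.metadata(path="s3://bucket/key")  # AttributeError is swallowed

print(metadata)  # -> {} when the client cannot provide metadata
```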
unstructured_ingest/processes/connectors/gitlab.py
CHANGED
@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 
unstructured_ingest/processes/connectors/google_drive.py
CHANGED
@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
         recursive: bool = False,
         previous_path: Optional[str] = None,
     ) -> list[dict]:
-
         fields_input = "nextPageToken, files({})".format(",".join(self.fields))
         q = f"'{object_id}' in parents"
         # Filter by extension but still include any directories
@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
         if not self.is_dir(root_info):
             data = [self.map_file_data(root_info)]
         else:
-
             file_contents = self.get_paginated_results(
                 files_client=files_client,
                 object_id=object_id,
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
CHANGED
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.data_types.file_data import FileData
@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
     from pyarrow import Table as ArrowTable
     from pyiceberg.catalog.rest import RestCatalog
     from pyiceberg.table import Table, Transaction
@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
             return UserAuthError(e)
         if 400 <= response_code < 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return UserError(e)
         if response_code > 500:
             logger.error(
-                f"Request to {url}
-                f"in IBM watsonx.data connector, status code {response_code}"
+                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
             )
             return ProviderError(e)
         logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
         return self.upload_config.record_id_key in self.get_table_columns()
 
     @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
-    def _df_to_arrow_table(self, df:
+    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
         import pyarrow as pa
 
         # Iceberg will automatically fill missing columns with nulls
@@ -277,16 +275,20 @@
         except Exception as e:
             raise ProviderError(f"Failed to upload data to table: {e}")
 
-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         data_table = self._df_to_arrow_table(df)
 
         with self.get_table() as table:
             self.upload_data_table(table, data_table, file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)
 
+    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         df = get_data_df(path=path)
         self.upload_dataframe(df=df, file_data=file_data)
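The IBM watsonx.data connector keeps pandas out of module import time by importing `DataFrame` only under `TYPE_CHECKING`, quoting it in annotations, and importing `pandas` lazily inside the decorated methods. A minimal, self-contained sketch of that pattern (function names here are illustrative, not the connector's):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; pandas is not required at import time.
    from pandas import DataFrame


def summarize(df: "DataFrame") -> int:
    # The quoted annotation keeps the module importable without pandas installed.
    return len(df.index)


def load_and_summarize(records: list[dict]) -> int:
    import pandas as pd  # lazy import, only paid when the function actually runs

    return summarize(pd.DataFrame(records))
```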
unstructured_ingest/processes/connectors/kdbai.py
CHANGED
@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data_df(path=path)
         self.process_dataframe(df=data)
unstructured_ingest/processes/connectors/outlook.py
CHANGED
@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missing from FileData."
             )
             raise ValueError("Generated invalid download path.")
 