unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest may be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
`unstructured_ingest/v2/processes/connectors/duckdb/base.py`

```diff
@@ -1,5 +1,3 @@
-import json
-import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -7,6 +5,7 @@ from typing import Any
 import pandas as pd
 
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 _COLUMNS = (
     "id",
@@ -56,6 +55,22 @@ _COLUMNS = (
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
 
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
+
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+        # remove extraneous, not supported columns
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
+        return data
+
     def run(
         self,
         elements_filepath: Path,
@@ -64,29 +79,14 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-
+        output = [
+            self.conform_dict(element_dict=element_dict, file_data=file_data)
+            for element_dict in elements_contents
+        ]
+        df = pd.DataFrame(data=output)
 
         for column in filter(
             lambda x: x in df.columns,
@@ -94,6 +94,6 @@ class BaseDuckDBUploadStager(UploadStager):
         ):
             df[column] = df[column].apply(str)
 
-
-
+        data = df.to_dict(orient="records")
+        self.write_output(output_path=output_path, data=data)
         return output_path
```
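The change above replaces the per-run `uuid.uuid4()` identifiers with deterministic ids and factors the per-element transformation into `conform_dict`, while file reading/writing moves into the base `UploadStager` helpers (`get_data`, `get_output_path`, `write_output`). A rough standalone sketch of that conform step, using plain dicts, a made-up `_COLUMNS` subset, and a `stable_id` argument standing in for `get_enhanced_element_id`:

```python
# Illustrative sketch, not the package's code: flatten nested metadata onto the
# element record, assign a stable id, and keep only columns the table supports.
from typing import Any

_COLUMNS = ("id", "text", "type", "filename")  # illustrative subset


def conform_element(element: dict[str, Any], stable_id: str) -> dict[str, Any]:
    data = element.copy()
    metadata: dict[str, Any] = data.pop("metadata", {})
    data_source = metadata.pop("data_source", {})
    coordinates = metadata.pop("coordinates", {})

    # flatten the nested metadata blocks onto the top-level record
    data.update(metadata)
    data.update(data_source)
    data.update(coordinates)

    # deterministic id, so re-staging the same document yields the same rows
    data["id"] = stable_id

    # drop anything the destination table has no column for
    return {k: v for k, v in data.items() if k in _COLUMNS}


element = {"type": "Title", "text": "Hello", "metadata": {"filename": "a.pdf"}}
print(conform_element(element, stable_id="a.pdf-0"))
```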
`unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py`

```diff
@@ -1,11 +1,13 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -55,6 +57,20 @@ class DuckDBConnectionConfig(ConnectionConfig):
         "through the `database` argument"
     )
 
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["DuckDBConnection", None, None]:
+        import duckdb
+
+        with duckdb.connect(self.database) as client:
+            yield client
+
+    @contextmanager
+    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
+
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
     pass
@@ -79,34 +95,27 @@ class DuckDBUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-
-
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-
-
-        return self._make_duckdb_connection
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-
-    def _make_duckdb_connection(self) -> "DuckDBConnection":
-        import duckdb
-
-        return duckdb.connect(self.connection_config.database)
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 duckdb_destination_entry = DestinationRegistryEntry(
```
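The uploader no longer builds raw connections itself: `DuckDBConnectionConfig` now exposes `get_client()`/`get_cursor()` context managers, and `precheck()` just runs `SELECT 1;` through them. A minimal sketch of the same pattern against an in-memory DuckDB database (requires the `duckdb` package; the module-level functions here are standalone stand-ins for the config methods, not the package's API):

```python
from contextlib import contextmanager
from typing import Generator

import duckdb


@contextmanager
def get_client(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    # the connection is closed automatically when the with-block exits
    with duckdb.connect(database) as client:
        yield client


@contextmanager
def get_cursor(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    with get_client(database) as client:
        with client.cursor() as cursor:
            yield cursor


# precheck-style connectivity test, mirroring the new precheck() body
with get_cursor(":memory:") as cursor:
    cursor.execute("SELECT 1;")
    print(cursor.fetchall())  # [(1,)]
```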
`unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py`

```diff
@@ -1,12 +1,14 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -27,13 +29,12 @@ CONNECTOR_TYPE = "motherduck"
 
 
 class MotherDuckAccessConfig(AccessConfig):
-    md_token:
+    md_token: str = Field(default=None, description="MotherDuck token")
 
 
 class MotherDuckConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    database:
-        default=None,
+    database: str = Field(
         description="Database name. Name of the MotherDuck database.",
     )
     db_schema: Optional[str] = Field(
@@ -48,17 +49,26 @@ class MotherDuckConnectionConfig(ConnectionConfig):
         default=MotherDuckAccessConfig(), validate_default=True
     )
 
-
-
-
-
-
-
-
-
-        "
-
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
+        import duckdb
+
+        access_config = self.access_config.get_secret_value()
+        with duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        ) as conn:
+            conn.sql(f"USE {self.database}")
+            yield conn
+
+    @contextmanager
+    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -84,44 +94,27 @@ class MotherDuckUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-
-
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
        except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-
-
-        return self._make_motherduck_connection
-
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_motherduck_connection(self) -> "MotherDuckConnection":
-        import duckdb
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-
-        conn = duckdb.connect(
-            f"md:?motherduck_token={access_config.md_token}",
-            config={
-                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
-            },
-        )
-
-        conn.sql(f"USE {self.connection_config.database}")
-
-        return conn
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 motherduck_destination_entry = DestinationRegistryEntry(
```
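MotherDuck gets the same connection refactor, plus the two uploader entry points shared with DuckDB: `run_data()` accepts already-staged dicts, while `run()` loads a staged file with `get_data_df` before inserting `BY NAME`, so the DataFrame's columns are matched to the target table by name rather than by position. A small sketch of that insert path, with placeholder table and column names, runnable against in-memory DuckDB (requires `duckdb` and `pandas`):

```python
import duckdb
import pandas as pd

# records as a stager would produce them: flat dicts, one per element
records = [
    {"id": "a.pdf-0", "text": "Hello", "type": "Title"},
    {"id": "a.pdf-1", "text": "World", "type": "NarrativeText"},
]
df = pd.DataFrame(data=records)

with duckdb.connect(":memory:") as conn:
    conn.execute("CREATE TABLE main.elements (id VARCHAR, text VARCHAR, type VARCHAR)")
    # BY NAME matches DataFrame columns to table columns by name;
    # DuckDB's replacement scan resolves `df` from the surrounding Python scope
    conn.query("INSERT INTO main.elements BY NAME SELECT * FROM df")
    print(conn.query("SELECT count(*) FROM main.elements").fetchall())  # [(2,)]
```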
`unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py`

```diff
@@ -1,7 +1,5 @@
 import collections
 import hashlib
-import json
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -16,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -49,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"
 
 
+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -175,36 +187,21 @@ class ElasticsearchIndexer(Indexer):
 
         return {hit["_id"] for hit in hits}
 
-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata=
-
-
-
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
@@ -238,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, index_name: str, file_data:
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
     ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -258,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        cast_file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -286,11 +280,12 @@ class ElasticsearchDownloader(Downloader):
 
         return AsyncElasticsearch, async_scan
 
-    async def run_async(self, file_data:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()
 
-        index_name: str =
-        ids: list[str] =
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]
 
         scan_query = {
             "_source": self.download_config.fields,
@@ -308,7 +303,7 @@ class ElasticsearchDownloader(Downloader):
         ):
             download_responses.append(
                 self.generate_download_response(
-                    result=result, index_name=index_name, file_data=
+                    result=result, index_name=index_name, file_data=elasticsearch_filedata
                 )
             )
         return download_responses
@@ -324,7 +319,8 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
 class ElasticsearchUploadStager(UploadStager):
     upload_stager_config: ElasticsearchUploadStagerConfig
 
-    def conform_dict(self,
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
         resp = {
             "_index": self.upload_stager_config.index_name,
             "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
@@ -340,29 +336,6 @@ class ElasticsearchUploadStager(UploadStager):
             resp["_source"]["metadata"] = flatten_dict(data["metadata"], separator="-")
         return resp
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / output_filename
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 class ElasticsearchUploaderConfig(UploaderConfig):
     index_name: str = Field(
@@ -427,16 +400,14 @@ class ElasticsearchUploader(Uploader):
             raise WriteError(f"failed to delete records: {failures}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
-    def
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:  # noqa: E501
         from elasticsearch.helpers.errors import BulkIndexError
 
         parallel_bulk = self.load_parallel_bulk()
-        with path.open("r") as file:
-            elements_dict = json.load(file)
         upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
 
         logger.info(
-            f"writing {len(
+            f"writing {len(data)} elements via document batches to destination "
             f"index named {self.upload_config.index_name} at {upload_destination} with "
             f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
             f"{self.upload_config.num_threads} (number of) threads"
@@ -451,7 +422,7 @@ class ElasticsearchUploader(Uploader):
             f"This may cause issues when uploading."
         )
         for batch in generator_batching_wbytes(
-
+            data, batch_size_limit_bytes=self.upload_config.batch_size_bytes
         ):
             try:
                 iterator = parallel_bulk(
```
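The Elasticsearch indexer now yields typed `ElasticsearchBatchFileData` records built with `batch_generator(ids, batch_size)` instead of hand-rolled slicing, and the downloader reads the ids back from `batch_items`. An illustrative stand-in for the chunking behaviour those call sites rely on (not the package's `batch_generator` implementation):

```python
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batch_generator(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # yield fixed-size chunks; the final chunk may be shorter
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


doc_ids = [f"doc-{i}" for i in range(7)]
for batch in batch_generator(doc_ids, batch_size=3):
    # each batch would become one ElasticsearchBatchFileData with
    # batch_items=[BatchItem(identifier=b) for b in batch]
    print(batch)
```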
`unstructured_ingest/v2/processes/connectors/fsspec/azure.py`

```diff
@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
 
+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"
 
 
@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class AzureIndexer(FsspecIndexer):
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
```