unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -1,5 +1,3 @@
-import json
-import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -7,6 +5,7 @@ from typing import Any
 import pandas as pd
 
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 _COLUMNS = (
     "id",
@@ -56,6 +55,22 @@ _COLUMNS = (
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
 
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
+
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+        # remove extraneous, not supported columns
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
+        return data
+
     def run(
         self,
         elements_filepath: Path,
@@ -64,29 +79,14 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        df = pd.DataFrame.from_dict(output)
+        output = [
+            self.conform_dict(element_dict=element_dict, file_data=file_data)
+            for element_dict in elements_contents
+        ]
+        df = pd.DataFrame(data=output)
 
         for column in filter(
             lambda x: x in df.columns,
@@ -94,6 +94,6 @@ class BaseDuckDBUploadStager(UploadStager):
         ):
            df[column] = df[column].apply(str)
 
-        with output_path.open("w") as output_file:
-            df.to_json(output_file, orient="records", lines=True)
+        data = df.to_dict(orient="records")
+        self.write_output(output_path=output_path, data=data)
        return output_path
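
The stager refactor pulls the per-element conversion out of run() into a conform_dict() hook, swaps the random uuid4 id for a deterministic get_enhanced_element_id, and routes file I/O through the base stager's get_data/get_output_path/write_output helpers. A standalone sketch of the flattening step, for intuition only (the _COLUMNS subset and the sample element below are illustrative, not the package's actual values):

from typing import Any

# Illustrative subset; the real tuple is the module-level _COLUMNS shown above.
_COLUMNS = ("id", "text", "type", "url")


def conform_element(element_dict: dict, element_id: str) -> dict[str, Any]:
    """Flatten nested metadata/data_source/coordinates into top-level columns."""
    data = element_dict.copy()
    metadata: dict[str, Any] = data.pop("metadata", {})
    data_source = metadata.pop("data_source", {})
    coordinates = metadata.pop("coordinates", {})
    data.update(metadata)
    data.update(data_source)
    data.update(coordinates)
    # In the package the id comes from get_enhanced_element_id(element_dict=data, file_data=...).
    data["id"] = element_id
    return {k: v for k, v in data.items() if k in _COLUMNS}


row = conform_element(
    {"type": "NarrativeText", "text": "hello", "metadata": {"url": "s3://bucket/doc.pdf", "languages": ["eng"]}},
    element_id="doc-0",
)
print(row)  # {'type': 'NarrativeText', 'text': 'hello', 'url': 's3://bucket/doc.pdf', 'id': 'doc-0'}
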
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -1,11 +1,13 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -55,6 +57,20 @@ class DuckDBConnectionConfig(ConnectionConfig):
                "through the `database` argument"
            )
 
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["DuckDBConnection", None, None]:
+        import duckdb
+
+        with duckdb.connect(self.database) as client:
+            yield client
+
+    @contextmanager
+    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
+
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
     pass
@@ -79,34 +95,27 @@ class DuckDBUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @property
-    def connection(self) -> Callable[[], "DuckDBConnection"]:
-        return self._make_duckdb_connection
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_duckdb_connection(self) -> "DuckDBConnection":
-        import duckdb
-
-        return duckdb.connect(self.connection_config.database)
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
            )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 duckdb_destination_entry = DestinationRegistryEntry(
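
Connection handling moves off the uploader and onto the connection config as context managers, so callers no longer open and close clients or cursors by hand. A rough usage sketch, assuming the duckdb extra is installed (the database path and query are placeholders, and any other config fields are left at their defaults):

from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import DuckDBConnectionConfig

# Path is a placeholder; any *.db / *.duckdb path accepted by duckdb.connect() works.
connection_config = DuckDBConnectionConfig(database="/tmp/elements.duckdb")

# Cursor lifecycle is scoped by the config, mirroring the new precheck() above.
with connection_config.get_cursor() as cursor:
    cursor.execute("SELECT 1;")

# The client context manager exposes the underlying duckdb connection directly.
with connection_config.get_client() as conn:
    print(conn.query("SELECT 42 AS answer").to_df())
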
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -1,12 +1,14 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -27,13 +29,12 @@ CONNECTOR_TYPE = "motherduck"
 
 
 class MotherDuckAccessConfig(AccessConfig):
-    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+    md_token: str = Field(default=None, description="MotherDuck token")
 
 
 class MotherDuckConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    database: Optional[str] = Field(
-        default=None,
+    database: str = Field(
         description="Database name. Name of the MotherDuck database.",
     )
     db_schema: Optional[str] = Field(
@@ -48,17 +49,26 @@ class MotherDuckConnectionConfig(ConnectionConfig):
         default=MotherDuckAccessConfig(), validate_default=True
     )
 
-    def __post_init__(self):
-        if self.database is None:
-            raise ValueError(
-                "A MotherDuck connection requires a database (string) to be passed "
-                "through the `database` argument"
-            )
-        if self.access_config.md_token is None:
-            raise ValueError(
-                "A MotherDuck connection requires a md_token (MotherDuck token) to be passed "
-                "using MotherDuckAccessConfig through the `access_config` argument"
-            )
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
+        import duckdb
+
+        access_config = self.access_config.get_secret_value()
+        with duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        ) as conn:
+            conn.sql(f"USE {self.database}")
+            yield conn
+
+    @contextmanager
+    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -84,44 +94,27 @@ class MotherDuckUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @property
-    def connection(self) -> Callable[[], "MotherDuckConnection"]:
-        return self._make_motherduck_connection
-
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_motherduck_connection(self) -> "MotherDuckConnection":
-        import duckdb
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-        access_config = self.connection_config.access_config.get_secret_value()
-        conn = duckdb.connect(
-            f"md:?motherduck_token={access_config.md_token}",
-            config={
-                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
-            },
-        )
-
-        conn.sql(f"USE {self.connection_config.database}")
-
-        return conn
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
            )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 motherduck_destination_entry = DestinationRegistryEntry(
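
Both DuckDB-family uploaders now share the same surface: upload_dataframe() issues the INSERT, run_data() accepts records already in memory, and run() first loads a staged file through get_data_df(). A minimal skeleton of that split; the class below is an illustrative stand-in, not part of the package:

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import pandas as pd


@dataclass
class DataFrameUploader:
    """Illustrative uploader skeleton mirroring the run()/run_data() split above."""

    def upload_dataframe(self, df: pd.DataFrame) -> None:
        # Real connectors issue an INSERT ... BY NAME SELECT * FROM df here.
        print(f"would upload {len(df)} rows with columns {list(df.columns)}")

    def run_data(self, data: list[dict], file_data: Any, **kwargs: Any) -> None:
        self.upload_dataframe(df=pd.DataFrame(data=data))

    def run(self, path: Path, file_data: Any, **kwargs: Any) -> None:
        # The package routes this through get_data_df(); a plain JSON-lines read is shown for illustration.
        self.upload_dataframe(df=pd.read_json(path, orient="records", lines=True))


DataFrameUploader().run_data(data=[{"id": "1", "text": "hi"}], file_data=None)
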
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -1,7 +1,5 @@
 import collections
 import hashlib
-import json
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -16,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -49,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"
 
 
+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -175,36 +187,21 @@ class ElasticsearchIndexer(Indexer):
 
         return {hit["_id"] for hit in hits}
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-        id_batches: list[frozenset[str]] = [
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata={
-                    "ids": list(batch),
-                    "index_name": self.index_config.index_name,
-                },
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
            )
 
 
@@ -238,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, index_name: str, file_data: FileData
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
     ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -258,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-        return DownloadResponse(
-            file_data=FileData(
-                identifier=filename_id,
-                connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
-                metadata=FileDataSourceMetadata(
-                    version=str(result["_version"]) if "_version" in result else None,
-                    date_processed=str(time()),
-                    record_locator={
-                        "hosts": self.connection_config.hosts,
-                        "index_name": index_name,
-                        "document_id": record_id,
-                    },
-                ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        cast_file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -286,11 +280,12 @@ class ElasticsearchDownloader(Downloader):
 
         return AsyncElasticsearch, async_scan
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()
 
-        index_name: str = file_data.additional_metadata["index_name"]
-        ids: list[str] = file_data.additional_metadata["ids"]
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]
 
         scan_query = {
             "_source": self.download_config.fields,
@@ -308,7 +303,7 @@ class ElasticsearchDownloader(Downloader):
         ):
             download_responses.append(
                 self.generate_download_response(
-                    result=result, index_name=index_name, file_data=file_data
+                    result=result, index_name=index_name, file_data=elasticsearch_filedata
                )
            )
         return download_responses
@@ -324,7 +319,8 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
 class ElasticsearchUploadStager(UploadStager):
     upload_stager_config: ElasticsearchUploadStagerConfig
 
-    def conform_dict(self, data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
         resp = {
             "_index": self.upload_stager_config.index_name,
             "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
@@ -340,29 +336,6 @@ class ElasticsearchUploadStager(UploadStager):
             resp["_source"]["metadata"] = flatten_dict(data["metadata"], separator="-")
         return resp
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / output_filename
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 class ElasticsearchUploaderConfig(UploaderConfig):
     index_name: str = Field(
@@ -427,16 +400,14 @@ class ElasticsearchUploader(Uploader):
             raise WriteError(f"failed to delete records: {failures}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:  # noqa: E501
         from elasticsearch.helpers.errors import BulkIndexError
 
         parallel_bulk = self.load_parallel_bulk()
-        with path.open("r") as file:
-            elements_dict = json.load(file)
         upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
 
         logger.info(
-            f"writing {len(elements_dict)} elements via document batches to destination "
+            f"writing {len(data)} elements via document batches to destination "
             f"index named {self.upload_config.index_name} at {upload_destination} with "
             f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
             f"{self.upload_config.num_threads} (number of) threads"
@@ -451,7 +422,7 @@ class ElasticsearchUploader(Uploader):
                 f"This may cause issues when uploading."
            )
         for batch in generator_batching_wbytes(
-            elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
+            data, batch_size_limit_bytes=self.upload_config.batch_size_bytes
        ):
            try:
                iterator = parallel_bulk(
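
On the source side, the hand-rolled frozenset slicing is replaced by batch_generator from unstructured_ingest.utils.data_prep, and each batch is emitted as a typed ElasticsearchBatchFileData whose batch_items carry the document ids. A rough equivalent of the chunking behavior, for intuition only (the real helper lives in data_prep.py and may differ in detail):

from typing import Generator, Iterable, TypeVar

T = TypeVar("T")


def chunked(items: Iterable[T], batch_size: int) -> Generator[list[T], None, None]:
    """Yield fixed-size chunks; the final chunk may be smaller."""
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


ids = [f"doc-{i}" for i in range(7)]
print([len(b) for b in chunked(ids, batch_size=3)])  # [3, 3, 1]
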
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
 
+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"
 
 
@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class AzureIndexer(FsspecIndexer):
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
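
In the fsspec connectors, the per-method @requires_dependencies overrides on indexer, downloader, and uploader are dropped; the extras check now sits once on a get_client context manager that simply delegates to the base class. A schematic of that pattern with simplified stand-in classes (only requires_dependencies and the decorator stacking come from the package; everything else here is illustrative):

from contextlib import contextmanager
from typing import Generator

from unstructured_ingest.utils.dep_check import requires_dependencies


class BaseFsspecConfig:
    """Simplified stand-in for FsspecConnectionConfig."""

    @contextmanager
    def get_client(self, protocol: str) -> Generator[object, None, None]:
        import fsspec  # imported lazily; the subclass decorator verifies extras first

        yield fsspec.filesystem(protocol)


class AzureLikeConfig(BaseFsspecConfig):
    # One dependency check guards every code path that needs a client,
    # replacing the repeated @requires_dependencies overrides removed above.
    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator[object, None, None]:
        with super().get_client(protocol=protocol) as client:
            yield client


# Usage (needs the azure extra installed):
# with AzureLikeConfig().get_client(protocol="az") as client:
#     client.ls("some-container")
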