unstructured-ingest 0.5.21__py3-none-any.whl → 0.5.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest was flagged as potentially problematic by the registry.

@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from functools import lru_cache
 from pathlib import Path
 from typing import Generator
 from uuid import uuid4
@@ -25,24 +26,29 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
 )
 
 
-def validate_upload(response: dict, expected_data: dict):
+def validate_upload(document: dict, expected_data: dict):
+    logger.info(f"validating document: {document}")
     element_id = expected_data["element_id"]
     expected_text = expected_data["text"]
     filename = expected_data["metadata"]["filename"]
     filetype = expected_data["metadata"]["filetype"]
     page_number = expected_data["metadata"]["page_number"]
 
-    response = response["search_results"][0]
-
-    assert response is not None
-    assert response["text"] == expected_text
-    assert response["part_metadata"]["element_id"] == element_id
-    assert response["part_metadata"]["filename"] == filename
-    assert response["part_metadata"]["filetype"] == filetype
-    assert response["part_metadata"]["page_number"] == page_number
+    assert document is not None
+    speech_parts = document["parts"]
+    assert speech_parts
+    first_part = speech_parts[0]
+    assert first_part["text"] == expected_text
+    part_metadata = first_part["metadata"]
+    assert part_metadata
+    assert part_metadata["element_id"] == element_id
+    assert part_metadata["filename"] == filename
+    assert part_metadata["filetype"] == filetype
+    assert part_metadata["page_number"] == page_number
 
 
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+@lru_cache()
 def _get_jwt_token():
     """Connect to the server and get a JWT token."""
     customer_id = os.environ["VECTARA_CUSTOMER_ID"]
@@ -65,23 +71,12 @@ def _get_jwt_token():
     return response_json.get("access_token")
 
 
-def query_data(corpus_key: str, element_id: str) -> dict:
+def list_documents(corpus_key: str) -> list[str]:
 
-    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"
 
     # the query below requires the corpus to have filter attributes for element_id
 
-    data = json.dumps(
-        {
-            "query": "string",
-            "search": {
-                "metadata_filter": f"part.element_id = '{element_id}'",
-                "lexical_interpolation": 1,
-                "limit": 10,
-            },
-        }
-    )
-
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
         "X-source": "unstructured",
     }
 
-    response = requests.post(url, headers=headers, data=data)
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     response_json = response.json()
+    documents = response_json.get("documents", [])
+    return documents
+
 
-    return response_json
+def fetch_document(corpus_key: str, documents_id: str) -> dict:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()
 
 
 def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
     response.raise_for_status()
 
 
-def list_corpora() -> list:
-    url = "https://api.vectara.io/v2/corpora?limit=100"
+def get_metadata(corpus_key: str):
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
     }
     response = requests.get(url, headers=headers)
     response.raise_for_status()
-    response_json = response.json()
-    if response_json.get("corpora"):
-        return [item["key"] for item in response_json.get("corpora")]
-    else:
-        return []
+    return response.json()
 
 
 def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
-    def is_ready_status():
-        corpora_list = list_corpora()
-        return corpus_key in corpora_list
-
     start = time.time()
-    is_ready = is_ready_status()
-    while not is_ready and time.time() - start < timeout:
-        time.sleep(interval)
-        is_ready = is_ready_status()
-    if not is_ready:
-        raise TimeoutError("time out waiting for corpus to be ready")
+    while time.time() - start < timeout:
+        try:
+            get_metadata(corpus_key)
+            return
+        except requests.HTTPError:
+            time.sleep(interval)
+    raise TimeoutError("time out waiting for corpus to be ready")
 
 
 def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
     start = time.time()
     while time.time() - start < timeout:
-        corpora_list = list_corpora()
-        if corpus_key not in corpora_list:
+        try:
+            get_metadata(corpus_key)
+            time.sleep(interval)
+        except requests.HTTPError:
             return
-        time.sleep(interval)
-
     raise TimeoutError("time out waiting for corpus to delete")
 
 
@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
     wait_for_delete(corpus_key=corpus_key)
 
 
+def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
+    start = time.time()
+    while time.time() - start < timeout:
+        all_document_meta = list_documents(corpus_key)
+        if not all_document_meta:
+            time.sleep(interval)
+            continue
+        else:
+            return all_document_meta
+    raise TimeoutError("time out waiting for document to be ready")
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
-    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
 ):
     corpus_key = corpora_util
     connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
         identifier="mock-file-data",
     )
 
-    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager_config = VectaraUploadStagerConfig()
     stager = VectaraUploadStager(upload_stager_config=stager_config)
     new_upload_file = stager.run(
         elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
         elements = json.load(upload_fp)
         first_element = elements[0]
 
-    for i in range(retries):
-        response = query_data(corpus_key, first_element["element_id"])
-        if not response["search_results"]:
-            time.sleep(interval)
-        else:
-            break
-
-    validate_upload(response=response, expected_data=first_element)
+    all_document_meta = wait_for_doc_meta(corpus_key)
+    assert len(all_document_meta) == 1
+    document_meta = all_document_meta[0]
+    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
+    validate_upload(document=document, expected_data=first_element)
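
The test change above swaps the old filtered-query retry loop for a simpler flow: poll the corpus document listing until the upload shows up, then fetch the document by id and validate its parts. Below is a minimal sketch of that poll-then-fetch pattern, using the same v2 endpoints that appear in the diff; the corpus key, bearer token, and timeout values are placeholders.

    # Sketch only: mirrors the poll-then-fetch flow used by the updated test.
    # The endpoints match the ones in the diff; corpus_key/jwt_token are placeholders.
    import time

    import requests

    API = "https://api.vectara.io/v2"


    def wait_for_first_document(corpus_key: str, jwt_token: str, timeout: float = 60.0) -> dict:
        headers = {"Accept": "application/json", "Authorization": f"Bearer {jwt_token}"}
        deadline = time.time() + timeout
        while time.time() < deadline:
            listing = requests.get(f"{API}/corpora/{corpus_key}/documents", headers=headers)
            listing.raise_for_status()
            documents = listing.json().get("documents", [])
            if documents:
                # Fetch the full document; its "parts" carry the per-element metadata
                doc_id = documents[0]["id"]
                detail = requests.get(
                    f"{API}/corpora/{corpus_key}/documents/{doc_id}", headers=headers
                )
                detail.raise_for_status()
                return detail.json()
            time.sleep(1)
        raise TimeoutError("document never appeared in the corpus listing")
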
@@ -1 +1 @@
-__version__ = "0.5.21"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
@@ -1,6 +1,13 @@
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -1,116 +1,13 @@
-import json
-from pathlib import Path
-from typing import Any, Optional
-from uuid import NAMESPACE_DNS, uuid5
-
-from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
-
-from unstructured_ingest.v2.logger import logger
-
-
-class SourceIdentifiers(BaseModel):
-    filename: str
-    fullpath: str
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-class FileDataSourceMetadata(BaseModel):
-    url: Optional[str] = None
-    version: Optional[str] = None
-    record_locator: Optional[dict[str, Any]] = None
-    date_created: Optional[str] = None
-    date_modified: Optional[str] = None
-    date_processed: Optional[str] = None
-    permissions_data: Optional[list[dict[str, Any]]] = None
-    filesize_bytes: Optional[int] = None
-
-
-class FileData(BaseModel):
-    identifier: str
-    connector_type: str
-    source_identifiers: SourceIdentifiers
-    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = Field(default_factory=dict)
-    reprocess: bool = False
-    local_download_path: Optional[str] = None
-    display_name: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = cls.model_validate(file_data_dict)
-        return file_data
-
-    @classmethod
-    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
-        file_data_dict = file_data.model_dump()
-        return cls.model_validate(file_data_dict, **kwargs)
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.model_dump(), f, indent=2)
-
-
-class BatchItem(BaseModel):
-    identifier: str
-    version: Optional[str] = None
-
-
-class BatchFileData(FileData):
-    identifier: str = Field(init=False)
-    batch_items: list[BatchItem]
-    source_identifiers: Optional[SourceIdentifiers] = None
-
-    @field_validator("batch_items")
-    @classmethod
-    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
-        if not v:
-            raise ValueError("batch items cannot be empty")
-        all_identifiers = [item.identifier for item in v]
-        if len(all_identifiers) != len(set(all_identifiers)):
-            raise ValueError(f"duplicate identifiers: {all_identifiers}")
-        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
-        return sorted_batch_items
-
-    @model_validator(mode="before")
-    @classmethod
-    def populate_identifier(cls, data: Any) -> Any:
-        if isinstance(data, dict) and "identifier" not in data:
-            batch_items = data["batch_items"]
-            identifier_data = json.dumps(
-                {item.identifier: item.version for item in batch_items}, sort_keys=True
-            )
-            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
-        return data
-
-
-def file_data_from_file(path: str) -> FileData:
-    try:
-        return BatchFileData.from_file(path=path)
-    except ValidationError:
-        logger.debug(f"{path} not detected as batch file data")
-
-    return FileData.from_file(path=path)
-
-
-def file_data_from_dict(data: dict) -> FileData:
-    try:
-        return BatchFileData.model_validate(data)
-    except ValidationError:
-        logger.debug(f"{data} not valid for batch file data")
-
-    return FileData.model_validate(data)
+"""
+COMPATABILITY NOTICE:
+This file has moved to the v2/types/ module.
+The following line exists for backward compatibility.
+"""
+
+from unstructured_ingest.v2.types.file_data import *  # noqa - star imports are bad, but this is for maximal backward compatability
+
+# Eventually this file should go away. Let's start warning users now:
+logger.warning(  # noqa - using logger from the star import
+    "Importing file_data.py through interfaces is deprecated. "
+    "Please use unstructured_ingest.v2.types.file_data instead!"
+)
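
The FileData models now live in unstructured_ingest.v2.types.file_data, and the old interfaces module keeps working through the star-import shim above, at the cost of a deprecation warning at import time. A small illustration of what callers can expect from the re-export; the import paths come from the diff, and the identity assertion is simply the expected behavior of a re-exported class.

    # Preferred import path after this release (per the diff above).
    from unstructured_ingest.v2.types.file_data import FileData

    # Still works for now via the star-import shim, but logs a deprecation warning.
    from unstructured_ingest.v2.interfaces.file_data import FileData as LegacyFileData

    # Both names are expected to refer to the same class object.
    assert FileData is LegacyFileData
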
@@ -1,13 +1,13 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -22,6 +22,9 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 
 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +33,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix
 
 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path
 
 
 @dataclass
@@ -41,6 +46,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None
 
     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +90,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
-            catalog_path = self.get_output_path(file_data=file_data)
-            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
-            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
-            logger.debug(
-                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-            with path.open() as f:
-                data = json.load(f)
-            columns = data[0].keys()
-            column_str = ", ".join(columns)
-            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
-            cursor.execute(sql_statment)
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
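
The rewritten Databricks uploader loads the staged records into a dataframe, fits it to the destination table's schema by dropping unknown columns and adding null-filled ones, and only then stages the JSON and runs the INSERT. Below is a standalone pandas sketch of that schema-fitting step, with a hypothetical table schema standing in for the columns returned by get_table_columns.

    # Standalone sketch of the schema-fitting step (_fit_to_schema above).
    import pandas as pd

    table_columns = {"id", "text", "page_number"}  # hypothetical destination schema

    df = pd.DataFrame([{"id": "a1", "text": "hello", "embeddings": [0.1, 0.2]}])

    extra = set(df.columns) - table_columns    # e.g. {"embeddings"} -> dropped
    missing = table_columns - set(df.columns)  # e.g. {"page_number"} -> added as nulls

    df = df.drop(columns=extra)
    for column in missing:
        df[column] = pd.Series()               # null-filled column, as in the connector

    print(sorted(df.columns))                  # ['id', 'page_number', 'text']
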
@@ -6,7 +6,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
-from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic import BaseModel, Field, Secret, SecretStr, field_validator
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -98,6 +98,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
     ca_certs: Optional[Path] = None
     access_config: Secret[ElasticsearchAccessConfig]
 
+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
-from pydantic import BaseModel, Field, Secret
+from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -78,6 +78,12 @@ class OpenSearchConnectionConfig(ConnectionConfig):
 
     access_config: Secret[OpenSearchAccessConfig]
 
+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
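
Both the Elasticsearch and OpenSearch connection configs (this hunk and the one above) gain a mode="before" validator so a single host passed as a string is wrapped in a list before normal validation runs. Here is a self-contained pydantic sketch of the same pattern; HostsConfig is a stand-in model, not the connector's actual class.

    # Self-contained sketch of the "hosts" coercion added to both configs.
    from typing import Optional

    from pydantic import BaseModel, field_validator


    class HostsConfig(BaseModel):
        hosts: Optional[list[str]] = None

        @field_validator("hosts", mode="before")
        @classmethod
        def to_list(cls, value):
            # Runs before normal validation, so a bare string is accepted and wrapped.
            if isinstance(value, str):
                return [value]
            return value


    print(HostsConfig(hosts="http://localhost:9200").hosts)  # ['http://localhost:9200']
    print(HostsConfig(hosts=["http://a:9200", "http://b:9200"]).hosts)
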
@@ -33,6 +33,9 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
 
 CONNECTOR_TYPE = "s3"
 
+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
+CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
+
 if TYPE_CHECKING:
     from s3fs import S3FileSystem
 
@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
        if isinstance(e, PermissionError):
            return UserAuthError(e)
        if isinstance(e, FileNotFoundError):
-            return UserError(e)
+            return UserError(f"File not found: {e}")
        if cause := getattr(e, "__cause__", None):
            error_response = cause.response
            error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
+        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
+        if issue_characters:
+            logger.warning(
+                f"File path {path} contains characters "
+                f"that can cause issues with S3: {issue_characters}"
+            )
         return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
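
The S3 indexer now warns when an object key contains characters that AWS recommends avoiding. The check itself is a simple membership scan over the key, sketched here with hypothetical paths.

    # Sketch of the key-character check added to S3Indexer, using the same list
    # of characters that AWS recommends avoiding in object keys.
    CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]


    def problem_characters(path: str) -> list[str]:
        return [char for char in CHARACTERS_TO_AVOID if char in path]


    print(problem_characters("bucket/reports/2024#final.pdf"))  # ['#']
    print(problem_characters("bucket/reports/2024-final.pdf"))  # []
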
@@ -251,8 +251,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         write_data(path=output_path, data=data)
+        return output_path
 
     def run(
         self,
@@ -278,8 +279,10 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
-        return output_path
+        final_output_path = self.write_output(
+            output_path=output_path, data=df.to_dict(orient="records")
+        )
+        return final_output_path
 
 
 class SQLUploaderConfig(UploaderConfig):
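
write_output on the SQL upload stager now returns the path it wrote, and run() passes that value through, so subclasses that change the file suffix (as the Databricks volume stager above does) report the real output location. A minimal sketch of that contract with a hypothetical stager follows; the write call is stubbed out as a comment.

    # Sketch of the new write_output contract: the stager returns the path it
    # actually wrote, so run() can report a changed suffix back to the caller.
    from pathlib import Path


    class JsonOnlyStager:
        def write_output(self, output_path: Path, data: list[dict]) -> Path:
            final_output_path = output_path.with_suffix(".json")
            # write_data(path=final_output_path, data=data, indent=None) would go here
            return final_output_path


    print(JsonOnlyStager().write_output(Path("elements.ndjson"), [{"a": 1}]))  # elements.json
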
File without changes
@@ -0,0 +1,116 @@
+import json
+from pathlib import Path
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
+
+from unstructured_ingest.v2.logger import logger
+
+
+class SourceIdentifiers(BaseModel):
+    filename: str
+    fullpath: str
+    rel_path: Optional[str] = None
+
+    @property
+    def filename_stem(self) -> str:
+        return Path(self.filename).stem
+
+    @property
+    def relative_path(self) -> str:
+        return self.rel_path or self.fullpath
+
+
+class FileDataSourceMetadata(BaseModel):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
+    filesize_bytes: Optional[int] = None
+
+
+class FileData(BaseModel):
+    identifier: str
+    connector_type: str
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
+    reprocess: bool = False
+    local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "FileData":
+        path = Path(path).resolve()
+        if not path.exists() or not path.is_file():
+            raise ValueError(f"file path not valid: {path}")
+        with open(str(path.resolve()), "rb") as f:
+            file_data_dict = json.load(f)
+        file_data = cls.model_validate(file_data_dict)
+        return file_data
+
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
+    def to_file(self, path: str) -> None:
+        path = Path(path).resolve()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(path.resolve()), "w") as f:
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not detected as batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.21
+Version: 0.5.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: click
 Requires-Dist: dataclasses_json
-Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: opentelemetry-sdk
 Requires-Dist: tqdm
+Requires-Dist: pydantic>=2.7
 Requires-Dist: numpy
 Requires-Dist: pandas
 Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
 Requires-Dist: numpy; extra == "astradb"
 Requires-Dist: pandas; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: numpy; extra == "azure"
 Requires-Dist: pandas; extra == "azure"
 Provides-Extra: azure-ai-search
@@ -112,13 +112,13 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Requires-Dist: numpy; extra == "azure-ai-search"
 Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Requires-Dist: numpy; extra == "box"
 Requires-Dist: pandas; extra == "box"
 Provides-Extra: chroma
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
 Requires-Dist: numpy; extra == "discord"
 Requires-Dist: pandas; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: numpy; extra == "dropbox"
 Requires-Dist: pandas; extra == "dropbox"
 Provides-Extra: duckdb
@@ -162,13 +162,13 @@ Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
@@ -180,15 +180,15 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
 Requires-Dist: numpy; extra == "google-drive"
 Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: httpx; extra == "ibm-watsonx-s3"
-Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
@@ -217,21 +217,21 @@ Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
 Provides-Extra: opensearch
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
 Requires-Dist: numpy; extra == "slack"
 Requires-Dist: pandas; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: numpy; extra == "snowflake"
 Requires-Dist: pandas; extra == "snowflake"
 Provides-Extra: wikipedia
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
-Requires-Dist: bs4; extra == "zendesk"
-Requires-Dist: httpx; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
+Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
@@ -334,8 +334,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Requires-Dist: numpy; extra == "embed-huggingface"
 Requires-Dist: pandas; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: numpy; extra == "embed-octoai"
 Requires-Dist: pandas; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
@@ -351,13 +351,13 @@ Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Requires-Dist: numpy; extra == "embed-mixedbreadai"
 Requires-Dist: pandas; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
 Requires-Dist: pandas; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: numpy; extra == "bedrock"
 Requires-Dist: pandas; extra == "bedrock"
 Provides-Extra: togetherai
@@ -24,7 +24,7 @@ test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfv
 test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
-test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
+test/integration/connectors/test_vectara.py,sha256=thM9vIWn7vcH1xjQK3owuEJMr65Z7L4j7NICsMpsMv8,9290
 test/integration/connectors/test_zendesk.py,sha256=nMBVNlEQr1uvmI1fzUC1bmoa2doXnYp5n4bMJS2FN-o,3727
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=b5BrQJjlBZoPiM_J1cJDbJABGvcwaDFb_Bvwb0AHN10,43
+unstructured_ingest/__version__.py,sha256=A9I2h_N6BTgmKRhQ1HbPOAJuwdOFgMb_aDmK1czvHyc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -398,10 +398,10 @@ unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdj
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
 unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
-unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
+unstructured_ingest/v2/interfaces/__init__.py,sha256=Jn5qtWOnmBZzsb2PoQYN3Xj5xHa9thSVc0BEoIN0Pw0,1059
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
-unstructured_ingest/v2/interfaces/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
+unstructured_ingest/v2/interfaces/file_data.py,sha256=DQYzXr8yjlm6VkGuwQLGJ1sia4Gr0d__POAFLrow1PE,525
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
 unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
@@ -462,21 +462,21 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6q
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=FZhjrMYBr_je6mWYp7MUUvyKR9YwGD2HiNljeT7U5ws,5044
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=0kEtIVQSD6RhLAqpc-0BNFQazS7lnsnWalaN3Mdn97g,6805
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
 unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=RW-Cw94Hs3ZsN8Kb4ciSh_N-Qkp0cqkw_xkJbt8CDNU,4656
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
-unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=KmlQCA7LXppxhL9e27LBBqNT999nUcc39qe2IkZsUJ8,18988
+unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=tzOV0eNMyVHMXE5nedp6u0yyWC0Gn_blklg2ZdoOa4c,6956
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
 unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=MtD41jZQXB-fqNzW3Whqq6ydQYDUK6Jub7sSPvgLErw,7130
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
@@ -568,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=G28VUR0zaMVmQtbdZG6TRpkWFHvXJqFrr7SBuyM-fME,15608
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=CbysCnBBHtmYkqXiaoZSazI1ombNltrsqFrY-gQzm4U,15683
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -581,9 +581,11 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.21.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.21.dist-info/METADATA,sha256=c1bUHvgG6X9QOiAD669sVHAFkGfI2tBTRBM-eRJBLiU,14999
-unstructured_ingest-0.5.21.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.21.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.21.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.21.dist-info/RECORD,,
+unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
+unstructured_ingest-0.5.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.25.dist-info/METADATA,sha256=Z_PvUmam-C56UwoY92VhbvUd-fubXBHevjSMHKVgPx4,14999
+unstructured_ingest-0.5.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.25.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.25.dist-info/RECORD,,