unstructured-ingest 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (27)
  1. test/integration/connectors/sql/test_vastdb.py +34 -0
  2. test/integration/connectors/test_google_drive.py +257 -0
  3. test/unit/v2/connectors/motherduck/__init__.py +0 -0
  4. test/unit/v2/connectors/motherduck/test_base.py +74 -0
  5. unstructured_ingest/__version__.py +1 -1
  6. unstructured_ingest/embed/bedrock.py +13 -6
  7. unstructured_ingest/embed/huggingface.py +11 -4
  8. unstructured_ingest/embed/interfaces.py +2 -21
  9. unstructured_ingest/embed/mixedbreadai.py +13 -4
  10. unstructured_ingest/embed/octoai.py +13 -6
  11. unstructured_ingest/embed/openai.py +13 -6
  12. unstructured_ingest/embed/togetherai.py +13 -4
  13. unstructured_ingest/embed/vertexai.py +13 -6
  14. unstructured_ingest/embed/voyageai.py +13 -4
  15. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -0
  16. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -4
  17. unstructured_ingest/v2/processes/connectors/google_drive.py +144 -13
  18. unstructured_ingest/v2/processes/connectors/pinecone.py +1 -0
  19. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +53 -3
  20. unstructured_ingest/v2/processes/connectors/sql/sql.py +3 -47
  21. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +4 -12
  22. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA +18 -18
  23. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/RECORD +27 -23
  24. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/LICENSE.md +0 -0
  25. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/WHEEL +0 -0
  26. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/entry_points.txt +0 -0
  27. {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/vertexai.py

@@ -9,6 +9,7 @@ from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -75,9 +76,12 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self._embed_documents(elements=[query])[0]
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = self._embed_documents([e.get("text", "") for e in elements])
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],
@@ -110,9 +114,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embedding[0]
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = await self._embed_documents([e.get("text", "") for e in elements])
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],
unstructured_ingest/embed/voyageai.py

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -107,8 +108,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return embeddings
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = self._embed_documents([e.get("text", "") for e in elements])
-        return self._add_embeddings_to_elements(elements, embeddings)
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
@@ -135,8 +140,12 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embeddings
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = await self._embed_documents([e.get("text", "") for e in elements])
-        return self._add_embeddings_to_elements(elements, embeddings)
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     async def embed_query(self, query: str) -> list[float]:
         embedding = await self._embed_documents(elements=[query])
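All of the embedding encoders touched in this release (bedrock, huggingface, mixedbreadai, octoai, openai, togetherai, vertexai, voyageai) pick up the same change: elements with no text are no longer sent to the provider as empty strings; they pass through untouched, and only elements with text get a vector written under the shared EMBEDDINGS_KEY constant. A minimal sketch of that behavior, assuming EMBEDDINGS_KEY is "embeddings" and using a stand-in embedder in place of any encoder's `_embed_documents`:

```python
# Illustrative sketch only; `fake_embed` stands in for a real provider call.
EMBEDDINGS_KEY = "embeddings"  # assumed value of the shared constant


def fake_embed(texts: list[str]) -> list[list[float]]:
    # Pretend embedding: one-dimensional vector per text.
    return [[float(len(t))] for t in texts]


def embed_documents(elements: list[dict]) -> list[dict]:
    elements = elements.copy()
    # Only elements that actually have text are embedded.
    elements_with_text = [e for e in elements if e.get("text")]
    embeddings = fake_embed([e["text"] for e in elements_with_text])
    for element, embedding in zip(elements_with_text, embeddings):
        element[EMBEDDINGS_KEY] = embedding
    return elements


print(embed_documents([{"text": "hello"}, {"text": ""}, {"type": "PageBreak"}]))
# The empty-text and text-less elements come back unchanged, with no embeddings key.
```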
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -81,6 +81,8 @@ class BaseDuckDBUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         elements_contents = get_data(path=elements_filepath)
+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
         output = [
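The DuckDB stager now takes the staged file's extension from the input elements file rather than the caller-supplied output name, so the staged output keeps the input's format (e.g. .json vs .ndjson). A small illustration with made-up paths:

```python
from pathlib import Path


# Sketch of the filename handling above: keep the caller's stem,
# but reuse the input file's suffix.
def staged_name(elements_filepath: str, output_filename: str) -> str:
    suffix = Path(elements_filepath).suffix
    return f"{Path(output_filename).stem}{suffix}"


print(staged_name("partitioned/doc.ndjson", "abc123.json"))  # -> abc123.ndjson
```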
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -61,7 +61,7 @@ class MotherDuckConnectionConfig(ConnectionConfig):
                 "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
             },
         ) as conn:
-            conn.sql(f"USE {self.database}")
+            conn.sql(f'USE "{self.database}"')
             yield conn
 
     @contextmanager
@@ -102,11 +102,12 @@ class MotherDuckUploader(Uploader):
 
     def upload_dataframe(self, df: pd.DataFrame) -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
+        database = self.connection_config.database
+        db_schema = self.connection_config.db_schema
+        table = self.connection_config.table
 
         with self.connection_config.get_client() as conn:
-            conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
-            )
+            conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         df = pd.DataFrame(data=data)
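Both MotherDuck statements now wrap the database, schema, and table names in double quotes, which keeps USE and INSERT working for identifiers that contain spaces, dashes, mixed case, or reserved words. A sketch of the SQL this produces (the names below are invented):

```python
# Sketch of the quoted-identifier change; not the connector itself.
def insert_stmt(database: str, db_schema: str, table: str) -> str:
    # Double-quoted identifiers; note that names containing a double quote
    # would still need escaping, which this sketch does not handle.
    return f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df'


print(insert_stmt("my db", "main", "elements"))
# INSERT INTO "my db"."main"."elements" BY NAME SELECT * FROM df
```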
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -132,12 +132,141 @@ class GoogleDriveIndexer(Indexer):
             ]
         )
 
+    @staticmethod
+    def verify_drive_api_enabled(client) -> None:
+        from googleapiclient.errors import HttpError
+
+        """
+        Makes a lightweight API call to verify that the Drive API is enabled.
+        If the API is not enabled, an HttpError should be raised.
+        """
+        try:
+            # A very minimal call: list 1 file from the drive.
+            client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
+        except HttpError as e:
+            error_content = e.content.decode() if hasattr(e, "content") else ""
+            lower_error = error_content.lower()
+            if "drive api" in lower_error and (
+                "not enabled" in lower_error or "not been used" in lower_error
+            ):
+                raise SourceConnectionError(
+                    "Google Drive API is not enabled for your project. \
+                    Please enable it in the Google Cloud Console."
+                )
+            else:
+                raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
+
+    @staticmethod
+    def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
+        """
+        Count non-folder files recursively under the given folder.
+        If `extensions` is provided, only count files
+        whose `fileExtension` matches one of the values.
+        """
+        count = 0
+        stack = [folder_id]
+        while stack:
+            current_folder = stack.pop()
+            # Always list all items under the current folder.
+            query = f"'{current_folder}' in parents"
+            page_token = None
+            while True:
+                response = files_client.list(
+                    spaces="drive",
+                    q=query,
+                    fields="nextPageToken, files(id, mimeType, fileExtension)",
+                    pageToken=page_token,
+                    pageSize=1000,
+                ).execute()
+                for item in response.get("files", []):
+                    if item.get("mimeType") == "application/vnd.google-apps.folder":
+                        # Always traverse sub-folders regardless of extension filter.
+                        stack.append(item["id"])
+                    else:
+                        if extensions:
+                            # Use a case-insensitive comparison for the file extension.
+                            file_ext = (item.get("fileExtension") or "").lower()
+                            valid_exts = [e.lower() for e in extensions]
+                            if file_ext in valid_exts:
+                                count += 1
+                        else:
+                            count += 1
+                page_token = response.get("nextPageToken")
+                if not page_token:
+                    break
+        return count
+
     def precheck(self) -> None:
+        """
+        Enhanced precheck that verifies not only connectivity
+        but also that the provided drive_id is valid and accessible.
+        """
         try:
-            self.connection_config.get_client()
+            with self.connection_config.get_client() as client:
+                # First, verify that the Drive API is enabled.
+                self.verify_drive_api_enabled(client)
+
+                # Try to retrieve metadata for the drive id.
+                # This will catch errors such as an invalid drive id or insufficient permissions.
+                root_info = self.get_root_info(
+                    files_client=client, object_id=self.connection_config.drive_id
+                )
+                logger.info(
+                    f"Successfully retrieved drive root info: "
+                    f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
+                )
+
+                # If the target is a folder, perform file count check.
+                if self.is_dir(root_info):
+                    if self.index_config.recursive:
+                        file_count = self.count_files_recursively(
+                            client,
+                            self.connection_config.drive_id,
+                            extensions=self.index_config.extensions,
+                        )
+                        if file_count == 0:
+                            logger.warning(
+                                "Empty folder: no files found recursively in the folder. \
+                                Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found recursively in the folder. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info(f"Found {file_count} files recursively in the folder.")
+                    else:
+                        # Non-recursive: check for at least one immediate non-folder child.
+                        response = client.list(
+                            spaces="drive",
+                            fields="files(id)",
+                            pageSize=1,
+                            q=f"'{self.connection_config.drive_id}' in parents",
+                        ).execute()
+                        if not response.get("files"):
+                            logger.warning(
+                                "Empty folder: no files found at the folder's root level. "
+                                "Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found at the folder's root level. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info("Found files at the folder's root level.")
+                else:
+                    # If the target is a file, precheck passes.
+                    logger.info("Drive ID corresponds to a file. Precheck passed.")
+
         except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
+            logger.error(
+                "Failed to validate Google Drive connection during precheck", exc_info=True
+            )
+            raise SourceConnectionError(f"Precheck failed: {e}")
 
     @staticmethod
     def is_dir(record: dict) -> bool:
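The new precheck starts by probing the Drive API with a one-file listing before touching anything else. A hedged sketch of that probe against the raw google-api-python-client, on the assumption that the connector's `client` wraps the Drive v3 `files` resource; the credentials path and scope here are placeholders:

```python
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Placeholder credentials setup.
creds = service_account.Credentials.from_service_account_file(
    "service_account.json",  # hypothetical path
    scopes=["https://www.googleapis.com/auth/drive.readonly"],
)
service = build("drive", "v3", credentials=creds)

try:
    # One-file listing: a cheap call that fails fast if the Drive API is disabled
    # for the project or the credentials lack access.
    service.files().list(spaces="drive", pageSize=1, fields="files(id)").execute()
except HttpError as err:
    print(f"Drive API check failed with HTTP status {err.resp.status}")
```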
@@ -310,20 +439,22 @@ class GoogleDriveDownloader(Downloader):
         from googleapiclient.http import MediaIoBaseDownload
 
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
-        mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
+        mime_type = file_data.additional_metadata["mimeType"]
+        if not mime_type:
+            raise TypeError(
+                f"File not supported. Name: {file_data.source_identifiers.filename} "
+                f"ID: {record_id} "
+                f"MimeType: {mime_type}"
+            )
         with self.connection_config.get_client() as client:
-            if mime_type.startswith("application/vnd.google-apps"):
+            if (
+                mime_type.startswith("application/vnd.google-apps")
+                and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
+            ):
                 export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
-                    self.meta.get("mimeType"),  # type: ignore
+                    mime_type,  # type: ignore
                 )
-                if not export_mime:
-                    raise TypeError(
-                        f"File not supported. Name: {file_data.source_identifiers.filename} "
-                        f"ID: {record_id} "
-                        f"MimeType: {mime_type}"
-                    )
-
                 request = client.export_media(
                     fileId=record_id,
                     mimeType=export_mime,
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -81,6 +81,7 @@ ALLOWED_FIELDS = (
     "link_urls",
     "link_texts",
     "text_as_html",
+    "entities",
 )
 
 
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -1,6 +1,7 @@
+import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import numpy as np
 import pandas as pd
@@ -15,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
     SQLAccessConfig,
     SqlBatchFileData,
     SQLConnectionConfig,
@@ -26,6 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLUploaderConfig,
     SQLUploadStager,
     SQLUploadStagerConfig,
+    parse_date_string,
 )
 
 if TYPE_CHECKING:
@@ -34,6 +37,17 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "snowflake"
 
+_ARRAY_COLUMNS = (
+    "embeddings",
+    "languages",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+)
+
 
 class SnowflakeAccessConfig(SQLAccessConfig):
     password: Optional[str] = Field(default=None, description="DB password")
@@ -160,6 +174,42 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if column_name in _DATE_COLUMNS:
+                    if value is None or pd.isna(value):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                elif column_name in _ARRAY_COLUMNS:
+                    if not isinstance(value, list) and (
+                        value is None or pd.isna(value)
+                    ):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(json.dumps(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def _parse_values(self, columns: list[str]) -> str:
+        return ",".join(
+            [
+                (
+                    f"PARSE_JSON({self.values_delimiter})"
+                    if col in _ARRAY_COLUMNS
+                    else self.values_delimiter
+                )
+                for col in columns
+            ]
+        )
+
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
@@ -173,10 +223,10 @@ class SnowflakeUploader(SQLUploader):
         self._fit_to_schema(df=df)
 
         columns = list(df.columns)
-        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+        stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
             table_name=self.upload_config.table_name,
             columns=",".join(columns),
-            values=",".join([self.values_delimiter for _ in columns]),
+            values=self._parse_values(columns),
         )
         logger.info(
             f"writing a total of {len(df)} elements via"
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -38,48 +38,6 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
-_COLUMNS = (
-    "id",
-    "element_id",
-    "text",
-    "embeddings",
-    "type",
-    "system",
-    "layout_width",
-    "layout_height",
-    "points",
-    "url",
-    "version",
-    "date_created",
-    "date_modified",
-    "date_processed",
-    "permissions_data",
-    "record_locator",
-    "category_depth",
-    "parent_id",
-    "attached_filename",
-    "filetype",
-    "last_modified",
-    "file_directory",
-    "filename",
-    "languages",
-    "page_number",
-    "links",
-    "page_name",
-    "link_urls",
-    "link_texts",
-    "sent_from",
-    "sent_to",
-    "subject",
-    "section",
-    "header_footer_type",
-    "emphasized_text_contents",
-    "emphasized_text_tags",
-    "text_as_html",
-    "regex_metadata",
-    "detection_class_prob",
-)
-
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
 
 
@@ -270,10 +228,8 @@ class SQLUploadStager(UploadStager):
 
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
 
-        # remove extraneous, not supported columns
-        element = {k: v for k, v in data.items() if k in _COLUMNS}
-        element[RECORD_ID_LABEL] = file_data.identifier
-        return element
+        data[RECORD_ID_LABEL] = file_data.identifier
+        return data
 
     def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
@@ -375,7 +331,7 @@ class SQLUploader(Uploader):
         missing_columns = schema_fields - columns
 
         if columns_to_drop:
-            logger.warning(
+            logger.info(
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
            )
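With the hard-coded _COLUMNS allow-list gone, the SQL stagers keep every element field and rely on the uploader's schema-fitting step to drop anything the target table doesn't have, now logged at info rather than warning. A rough sketch of that kind of schema fitting under stated assumptions; this is not the library's actual `_fit_to_schema`:

```python
import pandas as pd


def fit_to_schema(df: pd.DataFrame, schema_fields: set[str]) -> pd.DataFrame:
    # Keep only the columns the target table knows about and report the rest.
    columns_to_drop = set(df.columns) - schema_fields
    if columns_to_drop:
        print(f"dropping columns not in table schema: {sorted(columns_to_drop)}")
    return df.drop(columns=list(columns_to_drop))


df = pd.DataFrame([{"id": "1", "text": "hello", "extra_field": "ignored"}])
print(fit_to_schema(df, schema_fields={"id", "text"}))
```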
unstructured_ingest/v2/processes/connectors/sql/vastdb.py

@@ -19,7 +19,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
-    _COLUMNS,
     SQLAccessConfig,
     SqlBatchFileData,
     SQLConnectionConfig,
@@ -149,13 +148,11 @@ class VastdbUploadStagerConfig(SQLUploadStagerConfig):
         default=None,
         description="Map of column names to rename, ex: {'old_name': 'new_name'}",
     )
-    additional_columns: Optional[list[str]] = Field(
-        default_factory=list, description="Additional columns to include in the upload"
-    )
 
 
+@dataclass
 class VastdbUploadStager(SQLUploadStager):
-    upload_stager_config: VastdbUploadStagerConfig
+    upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
@@ -168,13 +165,8 @@ class VastdbUploadStager(SQLUploadStager):
         data.update(coordinates)
 
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
-
-        # remove extraneous, not supported columns
-        # but also allow for additional columns
-        approved_columns = set(_COLUMNS).union(self.upload_stager_config.additional_columns)
-        element = {k: v for k, v in data.items() if k in approved_columns}
-        element[RECORD_ID_LABEL] = file_data.identifier
-        return element
+        data[RECORD_ID_LABEL] = file_data.identifier
+        return data
 
     def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         df = super().conform_dataframe(df=df)
{unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.5.0
+Version: 0.5.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,37 +23,37 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: pandas
-Requires-Dist: dataclasses-json
 Requires-Dist: pydantic>=2.7
-Requires-Dist: click
-Requires-Dist: tqdm
+Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -92,9 +92,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -126,10 +126,10 @@ Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: neo4j; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
@@ -137,8 +137,8 @@ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -174,8 +174,8 @@ Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -192,11 +192,11 @@ Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
 Requires-Dist: vastdb; extra == "vastdb"
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"