unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of unstructured-ingest might be problematic.
Files changed (86)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -0,0 +1,310 @@
+import hashlib
+import json
+import sys
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field, replace
+from datetime import date, datetime
+from pathlib import Path
+from time import time
+from typing import Any, Generator, Union
+
+import pandas as pd
+from dateutil import parser
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+
+_COLUMNS = (
+    "id",
+    "element_id",
+    "text",
+    "embeddings",
+    "type",
+    "system",
+    "layout_width",
+    "layout_height",
+    "points",
+    "url",
+    "version",
+    "date_created",
+    "date_modified",
+    "date_processed",
+    "permissions_data",
+    "record_locator",
+    "category_depth",
+    "parent_id",
+    "attached_filename",
+    "filetype",
+    "last_modified",
+    "file_directory",
+    "filename",
+    "languages",
+    "page_number",
+    "links",
+    "page_name",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "subject",
+    "section",
+    "header_footer_type",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+    "text_as_html",
+    "regex_metadata",
+    "detection_class_prob",
+)
+
+_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+def parse_date_string(date_value: Union[str, int]) -> date:
+    try:
+        timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
+        return datetime.fromtimestamp(timestamp)
+    except Exception as e:
+        logger.debug(f"date {date_value} string not a timestamp: {e}")
+    return parser.parse(date_value)
+
+
+class SQLAccessConfig(AccessConfig):
+    pass
+
+
+class SQLConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
+
+    @abstractmethod
+    def get_connection(self) -> Any:
+        pass
+
+
+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    @abstractmethod
+    def _get_doc_ids(self) -> list[str]:
+        pass
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
+
+class SQLUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class SQLUploadStager(UploadStager):
+    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents: list[dict] = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        output = []
+        for data in elements_contents:
+            metadata: dict[str, Any] = data.pop("metadata", {})
+            data_source = metadata.pop("data_source", {})
+            coordinates = metadata.pop("coordinates", {})
+
+            data.update(metadata)
+            data.update(data_source)
+            data.update(coordinates)
+
+            data["id"] = str(uuid.uuid4())
+
+            # remove extraneous, not supported columns
+            data = {k: v for k, v in data.items() if k in _COLUMNS}
+
+            output.append(data)
+
+        df = pd.DataFrame.from_dict(output)
+        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
+            df[column] = df[column].apply(parse_date_string)
+        for column in filter(
+            lambda x: x in df.columns,
+            ("permissions_data", "record_locator", "points", "links"),
+        ):
+            df[column] = df[column].apply(
+                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
+            )
+        for column in filter(
+            lambda x: x in df.columns,
+            ("version", "page_number", "regex_metadata"),
+        ):
+            df[column] = df[column].apply(str)
+
+        with output_path.open("w") as output_file:
+            df.to_json(output_file, orient="records", lines=True)
+        return output_path
+
+
+class SQLUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of records per batch")
+    table_name: str = Field(default="elements", description="which table to upload contents to")
+
+
+@dataclass
+class SQLUploader(Uploader):
+    upload_config: SQLUploaderConfig
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @abstractmethod
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        pass
+
+    @abstractmethod
+    def upload_contents(self, path: Path) -> None:
+        pass
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
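For readers skimming the new base module: the indexer's batching amounts to a ceiling division over `batch_size`. A minimal standalone sketch of that logic, with hypothetical `ids` and `batch_size` values (not part of the package):

```python
# Standalone illustration of the batching performed in SQLIndexer.run
# (hypothetical values; the real ids come from the indexed table).
ids = [str(i) for i in range(7)]
batch_size = 3

num_batches = (len(ids) + batch_size - 1) // batch_size  # ceiling division -> 3
id_batches = [
    frozenset(ids[i * batch_size : (i + 1) * batch_size]) for i in range(num_batches)
]

# Two full batches of three ids plus one remainder batch of one id.
assert sorted(len(batch) for batch in id_batches) == [1, 3, 3]
```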
unstructured_ingest/v2/processes/connectors/sql/sqlite.py
@@ -0,0 +1,172 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+    parse_date_string,
+)
+
+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+CONNECTOR_TYPE = "sqlite"
+
+
+class SQLiteAccessConfig(SQLAccessConfig):
+    pass
+
+
+class SQLiteConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[SQLiteAccessConfig] = Field(
+        default=SQLiteAccessConfig(), validate_default=True
+    )
+    database_path: Path = Field(
+        description="Path to the .db file.",
+    )
+
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self
+
+    def get_connection(self) -> "SqliteConnection":
+        from sqlite3 import connect
+
+        return connect(database=self.database_path)
+
+
+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
+class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class SQLiteUploadStager(SQLUploadStager):
+    upload_stager_config: SQLiteUploadStagerConfig
+
+
+class SQLiteUploaderConfig(SQLUploaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteUploader(SQLUploader):
+    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
+    connection_config: SQLiteConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if isinstance(value, (list, dict)):
+                    value = json.dumps(value)
+                if column_name in _DATE_COLUMNS:
+                    if value is None:
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = tuple(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
+            VALUES({','.join(['?' for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_connection() as conn:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                conn.executemany(stmt, values)
+                conn.commit()
+
+
+sqlite_destination_entry = DestinationRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    uploader=SQLiteUploader,
+    uploader_config=SQLiteUploaderConfig,
+    upload_stager=SQLiteUploadStager,
+    upload_stager_config=SQLiteUploadStagerConfig,
+)
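A minimal destination-side sketch of how these classes fit together, assuming a pre-created SQLite database whose `elements` table matches the columns written by the stager; the file names below are hypothetical:

```python
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    SQLiteConnectionConfig,
    SQLiteUploader,
    SQLiteUploaderConfig,
)

# Hypothetical paths: an existing .db file and the JSON-lines output of the upload stager.
connection_config = SQLiteConnectionConfig(database_path=Path("elements.db"))
uploader = SQLiteUploader(
    connection_config=connection_config,
    upload_config=SQLiteUploaderConfig(table_name="elements", batch_size=50),
)
uploader.precheck()  # runs "SELECT 1;" to validate the connection
uploader.upload_contents(path=Path("staged-elements.json"))
```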
unstructured_ingest/v2/processes/embedder.py
@@ -22,6 +22,7 @@ class EmbedderConfig(BaseModel):
             "voyageai",
             "octoai",
             "mixedbread-ai",
+            "togetherai",
         ]
     ] = Field(default=None, description="Type of the embedding class to be used.")
     embedding_api_key: Optional[SecretStr] = Field(
@@ -107,6 +108,16 @@ class EmbedderConfig(BaseModel):
             config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
         )
 
+    def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.togetherai import (
+            TogetherAIEmbeddingConfig,
+            TogetherAIEmbeddingEncoder,
+        )
+
+        return TogetherAIEmbeddingEncoder(
+            config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )
+
     def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
         if self.embedding_api_key:
@@ -133,6 +144,8 @@ class EmbedderConfig(BaseModel):
             return self.get_voyageai_embedder(embedding_kwargs=kwargs)
         if self.embedding_provider == "mixedbread-ai":
            return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "togetherai":
+            return self.get_togetherai_embedder(embedding_kwargs=kwargs)
 
         raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
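A short sketch of how the new provider is selected through `EmbedderConfig`; the API key value is a placeholder:

```python
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="togetherai",
    embedding_api_key="together-api-key",  # placeholder; read from the environment in practice
)
encoder = config.get_embedder()  # dispatches to get_togetherai_embedder()
```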
unstructured_ingest/v2/processes/partitioner.py
@@ -55,7 +55,7 @@ class PartitionerConfig(BaseModel):
         "fields if they exist and drop all other fields. ",
     )
     partition_endpoint: Optional[str] = Field(
-        default="https://api.unstructured.io/general/v0/general",
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If partitioning via api, use the following host.",
     )
     partition_by_api: bool = Field(
@@ -153,6 +153,7 @@ class Partitioner(BaseProcess, ABC):
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
+        metadata = metadata or {}
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
 
         elements = await call_api(
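The partitioner changes are a new default API host and a guard that normalizes a missing metadata dict before logging. A small sketch of the config side, assuming the remaining `PartitionerConfig` fields keep their defaults:

```python
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

config = PartitionerConfig(partition_by_api=True)
# The default endpoint now points at api.unstructuredapp.io rather than api.unstructured.io.
assert config.partition_endpoint == "https://api.unstructuredapp.io/general/v0/general"
```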
{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.0.25
+Version: 0.1.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: python-dateutil
+Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
-Requires-Dist: tqdm
 Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -41,11 +41,11 @@ Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -88,11 +88,11 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -116,15 +116,15 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -156,13 +156,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -170,6 +170,8 @@ Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk; extra == "slack"
+Provides-Extra: togetherai
+Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vectara
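Most of the METADATA churn is dependency reordering; the substantive changes are the version bump, the `kdbai-client>=1.4.0` pin, and the new `togetherai` extra (installable as `unstructured-ingest[togetherai]`, which pulls in the `together` SDK). A quick way to confirm the extra on an installed 0.1.1 wheel:

```python
from importlib.metadata import metadata

# Inspect the installed distribution's METADATA for the new optional extra.
md = metadata("unstructured-ingest")
extras = md.get_all("Provides-Extra") or []
assert "togetherai" in extras
```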