unstructured-ingest 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



Files changed (34)
  1. test/integration/connectors/conftest.py +13 -0
  2. test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
  3. test/integration/connectors/sql/__init__.py +0 -0
  4. test/integration/connectors/{test_postgres.py → sql/test_postgres.py} +76 -2
  5. test/integration/connectors/sql/test_snowflake.py +205 -0
  6. test/integration/connectors/{test_sqlite.py → sql/test_sqlite.py} +68 -12
  7. test/integration/connectors/test_delta_table.py +138 -0
  8. test/integration/connectors/utils/constants.py +1 -1
  9. test/integration/connectors/utils/docker.py +78 -0
  10. test/integration/connectors/utils/validation.py +100 -4
  11. unstructured_ingest/__version__.py +1 -1
  12. unstructured_ingest/v2/cli/utils/click.py +32 -1
  13. unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
  14. unstructured_ingest/v2/interfaces/indexer.py +4 -1
  15. unstructured_ingest/v2/pipeline/pipeline.py +10 -2
  16. unstructured_ingest/v2/pipeline/steps/index.py +18 -1
  17. unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
  19. unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
  20. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  21. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  22. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  23. unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
  24. unstructured_ingest/v2/processes/connectors/sql/postgres.py +77 -25
  25. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
  26. unstructured_ingest/v2/processes/connectors/sql/sql.py +163 -6
  27. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +86 -24
  28. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +16 -14
  29. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +33 -27
  30. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
  31. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
  32. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
  33. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
  34. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -1,24 +1,36 @@
+import hashlib
 import json
+import sys
 import uuid
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from contextlib import contextmanager
+from dataclasses import dataclass, field, replace
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Union
+from time import time
+from typing import Any, Generator, Union
 
+import numpy as np
 import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 
@@ -84,9 +96,137 @@ class SQLConnectionConfig(ConnectionConfig, ABC):
     access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
 
     @abstractmethod
-    def get_connection(self) -> Any:
+    @contextmanager
+    def get_connection(self) -> Generator[Any, None, None]:
         pass
 
+    @abstractmethod
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        pass
+
+
+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+    def precheck(self) -> None:
+        try:
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path, index=False)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
 
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
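
The new `SQLIndexer.run` splits the id column into fixed-size batches (ceil division) and derives a non-negative identifier for each batch by offsetting Python's `hash` of the frozenset; `SQLDownloader` later turns each batch back into one CSV per row. A minimal sketch of just the batching and identifier arithmetic, using made-up ids and a made-up batch size rather than values from the package:

```python
import sys

# Hypothetical inputs for illustration only.
ids = [str(n) for n in range(1, 11)]  # ten row ids
batch_size = 4

# Ceil-division batching, as in SQLIndexer.run: 10 ids with batch_size 4 -> 3 batches.
num_batches = (len(ids) + batch_size - 1) // batch_size
id_batches = [frozenset(ids[i * batch_size : (i + 1) * batch_size]) for i in range(num_batches)]

for batch in id_batches:
    # hash() can be negative; shifting by sys.maxsize + 1 keeps the identifier non-negative.
    identifier = str(hash(batch) + sys.maxsize + 1)
    print(identifier, sorted(batch))
```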
@@ -156,6 +296,7 @@ class SQLUploaderConfig(UploaderConfig):
 class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
+    values_delimiter: str = "?"
 
     def precheck(self) -> None:
         try:
@@ -173,9 +314,25 @@ class SQLUploader(Uploader):
     ) -> list[tuple[Any, ...]]:
         pass
 
-    @abstractmethod
     def upload_contents(self, path: Path) -> None:
-        pass
+        df = pd.read_json(path, orient="records", lines=True)
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = list(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                # for val in values:
+                #     try:
+                #         cursor.execute(stmt, val)
+                #     except Exception as e:
+                #         print(f"Error: {e}")
+                #         print(f"failed to write {len(columns)}, {len(val)}: {stmt} -> {val}")
+                cursor.executemany(stmt, values)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         self.upload_contents(path=path)
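
`upload_contents` is no longer abstract: the base class now builds a parameterized INSERT from the staged columns and executes it in `batch_size` chunks, with `values_delimiter` supplying the placeholder style. A rough illustration of the statement that f-string produces, with a hypothetical table and column names standing in for the real config:

```python
# Hypothetical values; in the connector they come from SQLUploaderConfig and the staged file.
table_name = "elements"
columns = ["id", "text", "embeddings"]
values_delimiter = "?"  # SQLUploader default; subclasses can override the placeholder

stmt = (
    f"INSERT INTO {table_name} ({','.join(columns)}) "
    f"VALUES({','.join([values_delimiter for _ in columns])})"
)
print(stmt)  # INSERT INTO elements (id,text,embeddings) VALUES(?,?,?)
```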
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -1,18 +1,25 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator
 
-import numpy as np
-import pandas as pd
-from pydantic import Field, Secret
+from pydantic import Field, Secret, model_validator
 
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
@@ -22,6 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 
 if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
+    from sqlite3 import Cursor as SqliteCursor
 
 CONNECTOR_TYPE = "sqlite"
 
@@ -37,12 +45,75 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
     database_path: Path = Field(
         description="Path to the .db file.",
     )
-    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
-    def get_connection(self) -> "SqliteConnection":
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self
+
+    @contextmanager
+    def get_connection(self) -> Generator["SqliteConnection", None, None]:
         from sqlite3 import connect
 
-        return connect(database=self.database_path)
+        connection = connect(database=self.database_path)
+        try:
+            yield connection
+        finally:
+            connection.commit()
+            connection.close()
+
+    @contextmanager
+    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
+        with self.get_connection() as connection:
+            cursor = connection.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
+
+
+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
 
 
 class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
@@ -82,23 +153,14 @@ class SQLiteUploader(SQLUploader):
         output.append(tuple(parsed))
         return output
 
-    def upload_contents(self, path: Path) -> None:
-        df = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
-        df.replace({np.nan: None}, inplace=True)
-
-        columns = tuple(df.columns)
-        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
-            VALUES({','.join(['?' for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
-            with self.connection_config.get_connection() as conn:
-                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                conn.executemany(stmt, values)
-                conn.commit()
 
+sqlite_source_entry = SourceRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    indexer_config=SQLiteIndexerConfig,
+    indexer=SQLIndexer,
+    downloader_config=SQLiteDownloaderConfig,
+    downloader=SQLiteDownloader,
+)
 
 sqlite_destination_entry = DestinationRegistryEntry(
     connection_config=SQLiteConnectionConfig,
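
With these additions, SQLite is registered as a source as well as a destination. A hedged sketch of how the new pieces might be driven directly, assuming a local `example.db` containing an `elements` table with an `id` column (the database, table, and column names are illustrative, not taken from the package):

```python
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    SQLiteConnectionConfig,
    SQLiteDownloader,
    SQLiteDownloaderConfig,
    SQLiteIndexer,
    SQLiteIndexerConfig,
)

# Illustrative configuration; the .db file must already exist (the model validator checks this).
connection_config = SQLiteConnectionConfig(database_path=Path("example.db"))
indexer = SQLiteIndexer(
    connection_config=connection_config,
    index_config=SQLiteIndexerConfig(table_name="elements", id_column="id", batch_size=50),
)
downloader = SQLiteDownloader(
    connection_config=connection_config,
    download_config=SQLiteDownloaderConfig(),
)

# Each FileData from the indexer describes one batch of row ids;
# the downloader writes one CSV per row and returns the download responses.
for file_data in indexer.run():
    responses = downloader.run(file_data=file_data)
```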
{unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.0
+Version: 0.2.0
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -24,11 +24,11 @@ Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: pydantic>=2.7
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
 Requires-Dist: pandas
-Requires-Dist: python-dateutil
-Requires-Dist: dataclasses-json
 Requires-Dist: click
+Requires-Dist: dataclasses-json
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -44,8 +44,8 @@ Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -91,8 +91,8 @@ Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -115,16 +115,16 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -156,20 +156,22 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
-Requires-Dist: slack-sdk; extra == "slack"
+Requires-Dist: slack-sdk[optional]; extra == "slack"
+Provides-Extra: snowflake
+Requires-Dist: snowflake; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
{unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD

@@ -4,16 +4,20 @@ test/integration/utils.py,sha256=CWqzEGw6TA_ZoP9hRUkW64TWYssooBbufcTRmbJvod8,401
 test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
-test/integration/connectors/test_postgres.py,sha256=9uaqlUmLpVF09cwKSw7Yldq2kjU00WBedbEIgyJG5Cw,3998
+test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
+test/integration/connectors/test_delta_table.py,sha256=4_KPyQJpd6DmyIjjtXWPMw6NNf7xULRkxmqfbvmZ80g,5018
 test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
-test/integration/connectors/test_sqlite.py,sha256=NnLdyt3FfM1A53tXPJbgIcsy-iEgYY8OZYOfliFqifM,3507
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
+test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/integration/connectors/sql/test_postgres.py,sha256=gDBuNyvWmpVPmDrSSYC99z3t17B_a196P1MwIAOp5Dk,6584
+test/integration/connectors/sql/test_snowflake.py,sha256=XXU2-2z_k8jHWP684v2IuaGOlV3cmPpg3RxkwMp08v8,6998
+test/integration/connectors/sql/test_sqlite.py,sha256=51QrFufAq-XxNjHAkmPWxdJUkGdIRRIGKeRT09A5pkA,5704
 test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/utils/constants.py,sha256=OjxLmmzCbDNqH5tK0jWFxDgIkM973cr3SmFIRk7aySc,222
+test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
+test/integration/connectors/utils/docker.py,sha256=-wknXRVlzr3BVPdEhCyJgsdNjO9aSb2xjb-mQ306j7Q,2256
 test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
-test/integration/connectors/utils/validation.py,sha256=VNvyutfnWbnesavL_V5SjM2H3LoOHnkW7Paq8RO4WbM,8199
+test/integration/connectors/utils/validation.py,sha256=gnflehoYbFkSBJdXQV-7HwcrlL_Cuqni2ri1YmArjT0,12019
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -42,7 +46,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
 test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
 test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=J87Ao0q5WoHKbDEbH6O10GOGaMO3yEUCBOxCqbm715I,42
+unstructured_ingest/__version__.py,sha256=BPrBFKCFfY7EcVqYVDVJGmj1rrsGlJa3283pycTFA3o,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -320,13 +324,13 @@ unstructured_ingest/v2/cli/base/dest.py,sha256=zDjqek7anr0JQ2ptEl8KIAsUXuCuHRnBQ
 unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
 unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/v2/cli/utils/click.py,sha256=HCEcdHf8Lck0zcx3kidKjLbHDHXIBxPRL2MGgtKtDlg,6967
-unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
+unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
+unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
 unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
 unstructured_ingest/v2/interfaces/file_data.py,sha256=ieJK-hqHCEOmoYNGoFbCHziSaZyMtRS9VpSoYbwoKCE,1944
-unstructured_ingest/v2/interfaces/indexer.py,sha256=Bd1S-gTLsxhJBLEh1lYm_gXqwQLaEZMoqPq9yGxtN_E,713
+unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
 unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=ZFkDxcwKn-6EPrTbdBEgOkz1kGAq4gUtze98KP48KG4,1146
@@ -334,13 +338,13 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=JmZDl1blJa5rS61YHCae3Hfet84
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=x6hanD7Cj7Wd5MBUvb33UwXQMZxubzwlAiYyBCMukuc,15693
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=7Yg8_xwlSX6lA-oPGlTcn6KXZ9kc51zsoJxME5TiUlw,15956
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=rYVcHSXeQSzWszg6VmtYlNc66Gsx-22Ti0BxPyQaJak,3135
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=lzvOl5SoUK6OCCVVeG4CzdPIGj6eKKCGdciNo_0RMNk,8173
 unstructured_ingest/v2/pipeline/steps/embed.py,sha256=-YFvmchdsonWiSXxaD7PJfuUUtMLklaQM_8kZCQxCdM,3113
 unstructured_ingest/v2/pipeline/steps/filter.py,sha256=q7bNieaFMprqoF8Mx7w-ZN6jyA5peiGeTGyPtvcV-uw,1199
-unstructured_ingest/v2/pipeline/steps/index.py,sha256=nfDo-wt5sooKtMHKG7sI42m1L44uw-pxErDlDB1engw,2678
+unstructured_ingest/v2/pipeline/steps/index.py,sha256=YUUf1sYZRZSrRgapca3Sfzk1sNPJ05yyTQ5wKlyDjEo,3543
 unstructured_ingest/v2/pipeline/steps/partition.py,sha256=9MQViptxK3ALKco8uE4gK9PpEoGq5JjzyU14C_18blU,3193
 unstructured_ingest/v2/pipeline/steps/stage.py,sha256=cphKgHScLz2rNLZRI5Olsb6dAH-MKGu3p6MYS1BEzkA,2246
 unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=CFSy4tGp6BAvF0oIwWFN8v4zFzh5pRDeESjEn5iP9hE,1756
@@ -352,16 +356,16 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=2Lhztd730soVC2TOqrn_ba7CGZna8AHHpqJY2ZUYVxE,7776
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=glyowqb93_NNreQXoRLbF0PvzMc6Ptv0ARfl3xfSH4E,4967
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=a7L4N7A2-SzthS6-42FKWymQRW1ydr0cGvDdI2QE--I,5377
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
 unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
-unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
+unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=ZZfdNTw1W0ISQGWCtM1JuIME26FYzuPBOqRKql0wlLg,7013
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
-unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
+unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
@@ -372,10 +376,11 @@ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
 unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
+unstructured_ingest/v2/processes/connectors/slack.py,sha256=b9IanzUApUexiJzuNg7PR3tujOoeG8dhM0L0v4MDuPw,9256
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
 unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
-unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=db4PxE1LiKWUq0b9THABFRChArAfHps89pZBglqEg3c,6521
+unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=IBCGt6BQ7vULkPI3jTJZ52emwYg7QeyLZXjOFz9SO3E,6549
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
@@ -384,18 +389,19 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
-unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=usLzU2NA5D_a1juhja4jyJP_CzW4h-5rZ22bWVwvZGQ,10853
+unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
-unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
-unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=hqNuGYR_9o5LmfVDXnm3jBF5Pk-s7R66d0epF2uBYuM,4083
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=8bDUgyDurQelOabNnSG6ejWWsnLGWf-A-lWrpwYDGQM,5140
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=K-Lquxxqa1m5fk9by-5sasq561TRFAeV_SZ1Hc_b9Hk,3426
-unstructured_ingest-0.1.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.1.0.dist-info/METADATA,sha256=mNOS5HjbygWcTZ5eFlxoPpvt6dVAjkYniNHpk6tLvQw,7181
-unstructured_ingest-0.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.1.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.1.0.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=hdGD-V4U3RgnVoJV5S3exKVUfzCLLY7wTwKWvVaihJs,1098
+unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=WUqyjzjmuVvLKCMKnhFhYNRAAQs_cFh0DkSXAJEERyU,5548
+unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=AcW2TxEalYj6c8fhrOWB78JlaB-1hApmdDzCUhQlzW4,5513
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=XdMJRgQvcR4Lo2Udl1y8-ZkJw6nVrcXTL-gTsaAHAJw,11196
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
+unstructured_ingest-0.2.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.2.0.dist-info/METADATA,sha256=F8s5t23zy5zdxICEj6BseR0teRWCQc7IjB_xtlZUkaM,7271
+unstructured_ingest-0.2.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.2.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.2.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.2.0.dist-info/RECORD,,