unstructured-ingest 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -0,0 +1,34 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ from _pytest.fixtures import TopRequest
5
+
6
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
7
+ from test.integration.connectors.utils.validation.destination import (
8
+ StagerValidationConfigs,
9
+ stager_validation,
10
+ )
11
+ from unstructured_ingest.v2.processes.connectors.sql.vastdb import (
12
+ CONNECTOR_TYPE,
13
+ VastdbUploadStager,
14
+ VastdbUploadStagerConfig,
15
+ )
16
+
17
+
18
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
19
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
20
+ def test_vast_stager(
21
+ request: TopRequest,
22
+ upload_file_str: str,
23
+ tmp_path: Path,
24
+ ):
25
+ upload_file: Path = request.getfixturevalue(upload_file_str)
26
+ stager = VastdbUploadStager(
27
+ upload_stager_config=VastdbUploadStagerConfig(rename_columns_map={"page_number": "page"})
28
+ )
29
+ stager_validation(
30
+ configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
31
+ input_file=upload_file,
32
+ stager=stager,
33
+ tmp_dir=tmp_path,
34
+ )
@@ -0,0 +1,116 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from test.integration.connectors.utils.constants import (
6
+ SOURCE_TAG,
7
+ UNCATEGORIZED_TAG,
8
+ )
9
+ from test.integration.connectors.utils.validation.source import (
10
+ SourceValidationConfigs,
11
+ get_all_file_data,
12
+ run_all_validations,
13
+ update_fixtures,
14
+ )
15
+ from test.integration.utils import requires_env
16
+ from unstructured_ingest.v2.interfaces import Downloader, Indexer
17
+ from unstructured_ingest.v2.processes.connectors.google_drive import (
18
+ CONNECTOR_TYPE,
19
+ GoogleDriveAccessConfig,
20
+ GoogleDriveConnectionConfig,
21
+ GoogleDriveDownloader,
22
+ GoogleDriveDownloaderConfig,
23
+ GoogleDriveIndexer,
24
+ GoogleDriveIndexerConfig,
25
+ )
26
+
27
+
28
+ @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
29
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
30
+ def test_google_drive_source(temp_dir):
31
+ # Retrieve environment variables
32
+ service_account_key = os.environ["GOOGLE_DRIVE_SERVICE_KEY"]
33
+
34
+ # Create connection and indexer configurations
35
+ access_config = GoogleDriveAccessConfig(service_account_key=service_account_key)
36
+ connection_config = GoogleDriveConnectionConfig(
37
+ drive_id="1XidSOO76VpZ4m0i3gJN2m1X0Obol3UAi",
38
+ access_config=access_config,
39
+ )
40
+ index_config = GoogleDriveIndexerConfig(recursive=True)
41
+
42
+ download_config = GoogleDriveDownloaderConfig(download_dir=temp_dir)
43
+
44
+ # Instantiate indexer and downloader
45
+ indexer = GoogleDriveIndexer(
46
+ connection_config=connection_config,
47
+ index_config=index_config,
48
+ )
49
+ downloader = GoogleDriveDownloader(
50
+ connection_config=connection_config,
51
+ download_config=download_config,
52
+ )
53
+
54
+ # Run the source connector validation
55
+ source_connector_validation(
56
+ indexer=indexer,
57
+ downloader=downloader,
58
+ configs=SourceValidationConfigs(
59
+ test_id="google_drive_source",
60
+ expected_num_files=1,
61
+ validate_downloaded_files=True,
62
+ ),
63
+ )
64
+
65
+
66
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
67
+ def source_connector_validation(
68
+ indexer: Indexer,
69
+ downloader: Downloader,
70
+ configs: SourceValidationConfigs,
71
+ overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
72
+ ) -> None:
73
+ # Run common validations on the process of running a source connector, supporting dynamic
74
+ # validators that get passed in along with comparisons on the saved expected values.
75
+ # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the
76
+ # expected values with what gets generated by this test.
77
+ all_predownload_file_data = []
78
+ all_postdownload_file_data = []
79
+ indexer.precheck()
80
+ download_dir = downloader.download_config.download_dir
81
+ test_output_dir = configs.test_output_dir()
82
+
83
+ for file_data in indexer.run():
84
+ assert file_data
85
+ predownload_file_data = file_data.model_copy(deep=True)
86
+ all_predownload_file_data.append(predownload_file_data)
87
+ resp = downloader.run(file_data=file_data)
88
+ if isinstance(resp, list):
89
+ for r in resp:
90
+ postdownload_file_data = r["file_data"].model_copy(deep=True)
91
+ all_postdownload_file_data.append(postdownload_file_data)
92
+ else:
93
+ postdownload_file_data = resp["file_data"].model_copy(deep=True)
94
+ all_postdownload_file_data.append(postdownload_file_data)
95
+
96
+ if not overwrite_fixtures:
97
+ print("Running validation")
98
+ run_all_validations(
99
+ configs=configs,
100
+ predownload_file_data=all_predownload_file_data,
101
+ postdownload_file_data=all_postdownload_file_data,
102
+ download_dir=download_dir,
103
+ test_output_dir=test_output_dir,
104
+ )
105
+ else:
106
+ print("Running fixtures update")
107
+ update_fixtures(
108
+ output_dir=test_output_dir,
109
+ download_dir=download_dir,
110
+ all_file_data=get_all_file_data(
111
+ all_predownload_file_data=all_predownload_file_data,
112
+ all_postdownload_file_data=all_postdownload_file_data,
113
+ ),
114
+ save_downloads=configs.validate_downloaded_files,
115
+ save_filedata=configs.validate_file_data,
116
+ )
File without changes
@@ -0,0 +1,74 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ from pytest_mock import MockerFixture
5
+
6
+ from unstructured_ingest.v2.interfaces import FileData
7
+ from unstructured_ingest.v2.interfaces.file_data import SourceIdentifiers
8
+ from unstructured_ingest.v2.interfaces.upload_stager import UploadStagerConfig
9
+ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
10
+
11
+
12
+ @pytest.fixture
13
+ def mock_instance() -> BaseDuckDBUploadStager:
14
+ return BaseDuckDBUploadStager(UploadStagerConfig())
15
+
16
+
17
+ @pytest.mark.parametrize(
18
+ ("input_filepath", "output_filename", "expected"),
19
+ [
20
+ (
21
+ "/path/to/input_file.ndjson",
22
+ "output_file.ndjson",
23
+ "output_file.ndjson",
24
+ ),
25
+ ("input_file.txt", "output_file.json", "output_file.txt"),
26
+ ("/path/to/input_file.json", "output_file", "output_file.json"),
27
+ ],
28
+ )
29
+ def test_run_output_filename_suffix(
30
+ mocker: MockerFixture,
31
+ mock_instance: BaseDuckDBUploadStager,
32
+ input_filepath: str,
33
+ output_filename: str,
34
+ expected: str,
35
+ ):
36
+ output_dir = Path("/tmp/test/output_dir")
37
+
38
+ # Mocks
39
+ mock_get_data = mocker.patch(
40
+ "unstructured_ingest.v2.processes.connectors.duckdb.base.get_data",
41
+ return_value=[{"key": "value"}, {"key": "value2"}],
42
+ )
43
+ mock_conform_dict = mocker.patch.object(
44
+ BaseDuckDBUploadStager,
45
+ "conform_dict",
46
+ side_effect=lambda element_dict, file_data: element_dict,
47
+ )
48
+ mock_get_output_path = mocker.patch.object(
49
+ BaseDuckDBUploadStager, "get_output_path", return_value=output_dir / expected
50
+ )
51
+ mock_write_output = mocker.patch(
52
+ "unstructured_ingest.v2.processes.connectors.duckdb.base.write_data", return_value=None
53
+ )
54
+
55
+ # Act
56
+ result = mock_instance.run(
57
+ elements_filepath=Path(input_filepath),
58
+ file_data=FileData(
59
+ identifier="test",
60
+ connector_type="test",
61
+ source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
62
+ ),
63
+ output_dir=output_dir,
64
+ output_filename=output_filename,
65
+ )
66
+
67
+ # Assert
68
+ mock_get_data.assert_called_once_with(path=Path(input_filepath))
69
+ assert mock_conform_dict.call_count == 2
70
+ mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
71
+ mock_write_output.assert_called_once_with(
72
+ path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
73
+ )
74
+ assert result.name == expected
@@ -1 +1 @@
1
- __version__ = "0.5.0" # pragma: no cover
1
+ __version__ = "0.5.1" # pragma: no cover
@@ -81,6 +81,8 @@ class BaseDuckDBUploadStager(UploadStager):
81
81
  **kwargs: Any,
82
82
  ) -> Path:
83
83
  elements_contents = get_data(path=elements_filepath)
84
+ output_filename_suffix = Path(elements_filepath).suffix
85
+ output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
84
86
  output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
85
87
 
86
88
  output = [
@@ -61,7 +61,7 @@ class MotherDuckConnectionConfig(ConnectionConfig):
61
61
  "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
62
62
  },
63
63
  ) as conn:
64
- conn.sql(f"USE {self.database}")
64
+ conn.sql(f'USE "{self.database}"')
65
65
  yield conn
66
66
 
67
67
  @contextmanager
@@ -102,11 +102,12 @@ class MotherDuckUploader(Uploader):
102
102
 
103
103
  def upload_dataframe(self, df: pd.DataFrame) -> None:
104
104
  logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
105
+ database = self.connection_config.database
106
+ db_schema = self.connection_config.db_schema
107
+ table = self.connection_config.table
105
108
 
106
109
  with self.connection_config.get_client() as conn:
107
- conn.query(
108
- f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df" # noqa: E501
109
- )
110
+ conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
110
111
 
111
112
  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
112
113
  df = pd.DataFrame(data=data)
@@ -310,20 +310,22 @@ class GoogleDriveDownloader(Downloader):
310
310
  from googleapiclient.http import MediaIoBaseDownload
311
311
 
312
312
  logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
313
- mime_type = file_data.additional_metadata["mimeType"]
314
313
  record_id = file_data.identifier
314
+ mime_type = file_data.additional_metadata["mimeType"]
315
+ if not mime_type:
316
+ raise TypeError(
317
+ f"File not supported. Name: {file_data.source_identifiers.filename} "
318
+ f"ID: {record_id} "
319
+ f"MimeType: {mime_type}"
320
+ )
315
321
  with self.connection_config.get_client() as client:
316
- if mime_type.startswith("application/vnd.google-apps"):
322
+ if (
323
+ mime_type.startswith("application/vnd.google-apps")
324
+ and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
325
+ ):
317
326
  export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
318
- self.meta.get("mimeType"), # type: ignore
327
+ mime_type, # type: ignore
319
328
  )
320
- if not export_mime:
321
- raise TypeError(
322
- f"File not supported. Name: {file_data.source_identifiers.filename} "
323
- f"ID: {record_id} "
324
- f"MimeType: {mime_type}"
325
- )
326
-
327
329
  request = client.export_media(
328
330
  fileId=record_id,
329
331
  mimeType=export_mime,
@@ -81,6 +81,7 @@ ALLOWED_FIELDS = (
81
81
  "link_urls",
82
82
  "link_texts",
83
83
  "text_as_html",
84
+ "entities",
84
85
  )
85
86
 
86
87
 
@@ -38,48 +38,6 @@ from unstructured_ingest.v2.interfaces import (
38
38
  from unstructured_ingest.v2.logger import logger
39
39
  from unstructured_ingest.v2.utils import get_enhanced_element_id
40
40
 
41
- _COLUMNS = (
42
- "id",
43
- "element_id",
44
- "text",
45
- "embeddings",
46
- "type",
47
- "system",
48
- "layout_width",
49
- "layout_height",
50
- "points",
51
- "url",
52
- "version",
53
- "date_created",
54
- "date_modified",
55
- "date_processed",
56
- "permissions_data",
57
- "record_locator",
58
- "category_depth",
59
- "parent_id",
60
- "attached_filename",
61
- "filetype",
62
- "last_modified",
63
- "file_directory",
64
- "filename",
65
- "languages",
66
- "page_number",
67
- "links",
68
- "page_name",
69
- "link_urls",
70
- "link_texts",
71
- "sent_from",
72
- "sent_to",
73
- "subject",
74
- "section",
75
- "header_footer_type",
76
- "emphasized_text_contents",
77
- "emphasized_text_tags",
78
- "text_as_html",
79
- "regex_metadata",
80
- "detection_class_prob",
81
- )
82
-
83
41
  _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
84
42
 
85
43
 
@@ -270,10 +228,8 @@ class SQLUploadStager(UploadStager):
270
228
 
271
229
  data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
272
230
 
273
- # remove extraneous, not supported columns
274
- element = {k: v for k, v in data.items() if k in _COLUMNS}
275
- element[RECORD_ID_LABEL] = file_data.identifier
276
- return element
231
+ data[RECORD_ID_LABEL] = file_data.identifier
232
+ return data
277
233
 
278
234
  def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
279
235
  for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
@@ -375,7 +331,7 @@ class SQLUploader(Uploader):
375
331
  missing_columns = schema_fields - columns
376
332
 
377
333
  if columns_to_drop:
378
- logger.warning(
334
+ logger.info(
379
335
  "Following columns will be dropped to match the table's schema: "
380
336
  f"{', '.join(columns_to_drop)}"
381
337
  )
@@ -19,7 +19,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
19
19
  SourceRegistryEntry,
20
20
  )
21
21
  from unstructured_ingest.v2.processes.connectors.sql.sql import (
22
- _COLUMNS,
23
22
  SQLAccessConfig,
24
23
  SqlBatchFileData,
25
24
  SQLConnectionConfig,
@@ -149,13 +148,11 @@ class VastdbUploadStagerConfig(SQLUploadStagerConfig):
149
148
  default=None,
150
149
  description="Map of column names to rename, ex: {'old_name': 'new_name'}",
151
150
  )
152
- additional_columns: Optional[list[str]] = Field(
153
- default_factory=list, description="Additional columns to include in the upload"
154
- )
155
151
 
156
152
 
153
+ @dataclass
157
154
  class VastdbUploadStager(SQLUploadStager):
158
- upload_stager_config: VastdbUploadStagerConfig
155
+ upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)
159
156
 
160
157
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
161
158
  data = element_dict.copy()
@@ -168,13 +165,8 @@ class VastdbUploadStager(SQLUploadStager):
168
165
  data.update(coordinates)
169
166
 
170
167
  data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
171
-
172
- # remove extraneous, not supported columns
173
- # but also allow for additional columns
174
- approved_columns = set(_COLUMNS).union(self.upload_stager_config.additional_columns)
175
- element = {k: v for k, v in data.items() if k in approved_columns}
176
- element[RECORD_ID_LABEL] = file_data.identifier
177
- return element
168
+ data[RECORD_ID_LABEL] = file_data.identifier
169
+ return data
178
170
 
179
171
  def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
180
172
  df = super().conform_dataframe(df=df)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: pandas
26
25
  Requires-Dist: dataclasses-json
27
- Requires-Dist: pydantic>=2.7
28
26
  Requires-Dist: click
29
- Requires-Dist: tqdm
30
- Requires-Dist: python-dateutil
31
27
  Requires-Dist: opentelemetry-sdk
28
+ Requires-Dist: pydantic>=2.7
29
+ Requires-Dist: python-dateutil
30
+ Requires-Dist: pandas
31
+ Requires-Dist: tqdm
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
@@ -52,8 +52,8 @@ Requires-Dist: chromadb; extra == "chroma"
52
52
  Provides-Extra: clarifai
53
53
  Requires-Dist: clarifai; extra == "clarifai"
54
54
  Provides-Extra: confluence
55
- Requires-Dist: requests; extra == "confluence"
56
55
  Requires-Dist: atlassian-python-api; extra == "confluence"
56
+ Requires-Dist: requests; extra == "confluence"
57
57
  Provides-Extra: couchbase
58
58
  Requires-Dist: couchbase; extra == "couchbase"
59
59
  Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
63
63
  Provides-Extra: databricks-volumes
64
64
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
65
65
  Provides-Extra: delta-table
66
- Requires-Dist: deltalake; extra == "delta-table"
67
66
  Requires-Dist: boto3; extra == "delta-table"
67
+ Requires-Dist: deltalake; extra == "delta-table"
68
68
  Provides-Extra: discord
69
69
  Requires-Dist: discord.py; extra == "discord"
70
70
  Provides-Extra: doc
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
83
83
  Provides-Extra: embed-mixedbreadai
84
84
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
85
85
  Provides-Extra: embed-octoai
86
- Requires-Dist: tiktoken; extra == "embed-octoai"
87
86
  Requires-Dist: openai; extra == "embed-octoai"
87
+ Requires-Dist: tiktoken; extra == "embed-octoai"
88
88
  Provides-Extra: embed-vertexai
89
89
  Requires-Dist: vertexai; extra == "embed-vertexai"
90
90
  Provides-Extra: embed-voyageai
@@ -96,8 +96,8 @@ Requires-Dist: gcsfs; extra == "gcs"
96
96
  Requires-Dist: bs4; extra == "gcs"
97
97
  Requires-Dist: fsspec; extra == "gcs"
98
98
  Provides-Extra: github
99
- Requires-Dist: pygithub>1.58.0; extra == "github"
100
99
  Requires-Dist: requests; extra == "github"
100
+ Requires-Dist: pygithub>1.58.0; extra == "github"
101
101
  Provides-Extra: gitlab
102
102
  Requires-Dist: python-gitlab; extra == "gitlab"
103
103
  Provides-Extra: google-drive
@@ -122,23 +122,23 @@ Requires-Dist: pymongo; extra == "mongodb"
122
122
  Provides-Extra: msg
123
123
  Requires-Dist: unstructured[msg]; extra == "msg"
124
124
  Provides-Extra: neo4j
125
- Requires-Dist: networkx; extra == "neo4j"
126
- Requires-Dist: cymple; extra == "neo4j"
127
125
  Requires-Dist: neo4j; extra == "neo4j"
126
+ Requires-Dist: cymple; extra == "neo4j"
127
+ Requires-Dist: networkx; extra == "neo4j"
128
128
  Provides-Extra: notion
129
- Requires-Dist: httpx; extra == "notion"
130
129
  Requires-Dist: backoff; extra == "notion"
131
- Requires-Dist: notion-client; extra == "notion"
132
130
  Requires-Dist: htmlBuilder; extra == "notion"
131
+ Requires-Dist: httpx; extra == "notion"
132
+ Requires-Dist: notion-client; extra == "notion"
133
133
  Provides-Extra: odt
134
134
  Requires-Dist: unstructured[odt]; extra == "odt"
135
135
  Provides-Extra: onedrive
136
136
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
- Requires-Dist: bs4; extra == "onedrive"
138
137
  Requires-Dist: msal; extra == "onedrive"
138
+ Requires-Dist: bs4; extra == "onedrive"
139
139
  Provides-Extra: openai
140
- Requires-Dist: tiktoken; extra == "openai"
141
140
  Requires-Dist: openai; extra == "openai"
141
+ Requires-Dist: tiktoken; extra == "openai"
142
142
  Provides-Extra: opensearch
143
143
  Requires-Dist: opensearch-py; extra == "opensearch"
144
144
  Provides-Extra: org
@@ -169,8 +169,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
169
169
  Provides-Extra: rtf
170
170
  Requires-Dist: unstructured[rtf]; extra == "rtf"
171
171
  Provides-Extra: s3
172
- Requires-Dist: s3fs; extra == "s3"
173
172
  Requires-Dist: fsspec; extra == "s3"
173
+ Requires-Dist: s3fs; extra == "s3"
174
174
  Provides-Extra: salesforce
175
175
  Requires-Dist: simple-salesforce; extra == "salesforce"
176
176
  Provides-Extra: sftp
@@ -184,16 +184,16 @@ Requires-Dist: singlestoredb; extra == "singlestore"
184
184
  Provides-Extra: slack
185
185
  Requires-Dist: slack-sdk[optional]; extra == "slack"
186
186
  Provides-Extra: snowflake
187
- Requires-Dist: psycopg2-binary; extra == "snowflake"
188
187
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
188
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
189
189
  Provides-Extra: togetherai
190
190
  Requires-Dist: together; extra == "togetherai"
191
191
  Provides-Extra: tsv
192
192
  Requires-Dist: unstructured[tsv]; extra == "tsv"
193
193
  Provides-Extra: vastdb
194
+ Requires-Dist: pyarrow; extra == "vastdb"
194
195
  Requires-Dist: vastdb; extra == "vastdb"
195
196
  Requires-Dist: ibis; extra == "vastdb"
196
- Requires-Dist: pyarrow; extra == "vastdb"
197
197
  Provides-Extra: vectara
198
198
  Requires-Dist: httpx; extra == "vectara"
199
199
  Requires-Dist: requests; extra == "vectara"
@@ -10,6 +10,7 @@ test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNp
10
10
  test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
11
11
  test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
12
12
  test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
13
+ test/integration/connectors/test_google_drive.py,sha256=0zJZ4UJOq4TkfU-bkc556_abV7q6zVS9ZgIvW9qcTU4,4204
13
14
  test/integration/connectors/test_lancedb.py,sha256=8MBxK_CUtOt87-4B7svDDK82NFII5psceo5cNN8HJMs,9228
14
15
  test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8DgGzlpyevsFu2w,7173
15
16
  test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
@@ -40,6 +41,7 @@ test/integration/connectors/sql/test_postgres.py,sha256=bGDyzLRpgrXO7nl0U8nF2zSN
40
41
  test/integration/connectors/sql/test_singlestore.py,sha256=XeU2s4Kt_3tGyaDYYKTgYjdOyb8j2dnz4TgSMwFUjWs,6153
41
42
  test/integration/connectors/sql/test_snowflake.py,sha256=LEwsRDoC6-rRiwYsqeo5B9Eo6RYygLLGAUsrtrgI9pM,7494
42
43
  test/integration/connectors/sql/test_sqlite.py,sha256=MHvhFRx1y_LTgfS-aPz-Zn9yOGsm-TF_s0t1seBzV1k,5956
44
+ test/integration/connectors/sql/test_vastdb.py,sha256=66T-o_y7NaDKGmKFkT778AB-nanlLv9KdtgUGPOdnLs,1069
43
45
  test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
46
  test/integration/connectors/utils/constants.py,sha256=JhTk6YNw7JVpkk-Pl8zn2YYkExeL1oE9VBWm_kMYGfo,369
45
47
  test/integration/connectors/utils/docker.py,sha256=4g1STiSbYN5qcmDTXyPxVJgwx97O6wk7n-DJ-zgzgag,4971
@@ -87,6 +89,8 @@ test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
87
89
  test/unit/v2/connectors/test_confluence.py,sha256=bXrn_kRb4IQdqkk4rc-P2gJAtPba7n7pNplQgfbqZDY,1047
88
90
  test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
89
91
  test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
92
+ test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
+ test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
90
94
  test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
95
  test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
92
96
  test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -103,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
103
107
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
104
108
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
105
109
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
106
- unstructured_ingest/__version__.py,sha256=A09Ks7MDqP-QtYP9TIQMxydOZeCTtu9i7xoq5wdy4As,42
110
+ unstructured_ingest/__version__.py,sha256=LXdgOM6QWErpDu1oCqJrypfmAkBaXzRxVPcjHL8yPrI,42
107
111
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
108
112
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
109
113
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -428,7 +432,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVm
428
432
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
429
433
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
430
434
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
431
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=5k7pdAzJGXSdyPCzW9vu2OaAjGVTo2JevDyGaXM1Hvk,13370
435
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=tSbyibwm9RQyXD-HJGZa1Y9lBSCXaEFnvxpf6bHwBSE,13394
432
436
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
433
437
  unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
434
438
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
@@ -436,7 +440,7 @@ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNN
436
440
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
437
441
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
438
442
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
439
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
443
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=U5gSa8S08JvCwmAhE8aV0yxGTIFnUlKVsQDybE8Fqb8,10746
440
444
  unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
441
445
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
442
446
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=f0F7KioXgucVc3tVASTa67ynlTa4s9_FKGPHop6Xm0A,4563
@@ -453,9 +457,9 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8
453
457
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
454
458
  unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=2KNLwDZJDhsMAUGCzktEIn4Lvb0nxLWabBOPJbgyoEE,5010
455
459
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
456
- unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=0YBdOpTX5mbRLhP00lRHSMpl2-LfuRpqB1XPMJMxn04,2647
460
+ unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=IHaY1mWuidt6GDEJhB1c_orwmjeyXuRCVJ88djYDciM,2793
457
461
  unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=oUHHaLpO2pWW2Lu4Mc-XFjrA0ze97205WQ_xP95ua4M,4296
458
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=mU5x6SnbFgRsVicNGh4y4gtR6ek7eQFinI0dQQmzMds,4481
462
+ unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=OsRy-rcrP4_KSustpxlEKoZ_FmJNFMyMmIfFk6WJ3UY,4559
459
463
  unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
460
464
  unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
461
465
  unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
@@ -555,17 +559,17 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
555
559
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
556
560
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
557
561
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=QE-WBqrPVjCgcxR5EdVD9iTHBjgDSSSQgWYvq5N61qU,7746
558
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=O2XBu_E2WqNia9OUTdhTWkYo0xhoMMm6ZuanTz-0V9s,16192
562
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
559
563
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
560
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=4DckpVAXpmMTcoKrWiJbnFQQlcrwMA-GMaDsAYchTUs,9992
564
+ unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
561
565
  unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
562
566
  unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
563
567
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
564
568
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
565
569
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
566
- unstructured_ingest-0.5.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
567
- unstructured_ingest-0.5.0.dist-info/METADATA,sha256=dyxZ7tfjq1tkZPJgaK6ZanQwB6pteSIznmfUhAgnT64,8051
568
- unstructured_ingest-0.5.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
569
- unstructured_ingest-0.5.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
570
- unstructured_ingest-0.5.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
571
- unstructured_ingest-0.5.0.dist-info/RECORD,,
570
+ unstructured_ingest-0.5.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
571
+ unstructured_ingest-0.5.1.dist-info/METADATA,sha256=4fo4K5ac0RNRlWGGyNumZ5gXJf-0PwknZWjS6HvAD6w,8051
572
+ unstructured_ingest-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
573
+ unstructured_ingest-0.5.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
574
+ unstructured_ingest-0.5.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
575
+ unstructured_ingest-0.5.1.dist-info/RECORD,,