unstructured-ingest 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.6.0" # pragma: no cover
+ __version__ = "0.6.2" # pragma: no cover
unstructured_ingest/v2/processes/chunker.py
@@ -6,6 +6,7 @@ from typing import Any, Optional
  from pydantic import BaseModel, Field, SecretStr

  from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+ from unstructured_ingest.utils.data_prep import get_json_data
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces.process import BaseProcess
  from unstructured_ingest.v2.logger import logger
@@ -92,9 +93,11 @@ class Chunker(BaseProcess, ABC):
      @requires_dependencies(dependencies=["unstructured"])
      def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
          from unstructured.chunking import dispatch
-         from unstructured.staging.base import elements_from_json
+         from unstructured.staging.base import elements_from_dicts

-         elements = elements_from_json(filename=str(elements_filepath))
+         element_dicts = get_json_data(elements_filepath)
+
+         elements = elements_from_dicts(element_dicts=element_dicts)
          if not elements:
              return [e.to_dict() for e in elements]
          local_chunking_strategies = ("basic", "by_title")
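
Note: the chunker change above replaces file-based loading (elements_from_json) with reading the element dicts directly and rehydrating them. A minimal standalone sketch of the new loading path, assuming an elements JSON file produced by an earlier partition step (the path below is illustrative):

    from pathlib import Path

    from unstructured.staging.base import elements_from_dicts

    from unstructured_ingest.utils.data_prep import get_json_data

    # Read the raw element dicts with the helper the diff introduces, then
    # rebuild Element objects from those dicts instead of re-parsing the file.
    element_dicts = get_json_data(Path("work-dir/partitioned/example.json"))
    elements = elements_from_dicts(element_dicts=element_dicts)
    print(f"loaded {len(elements)} elements")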
unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql (new file)
@@ -0,0 +1,10 @@
+ CREATE TABLE elements (
+     id STRING NOT NULL PRIMARY KEY,
+     record_id STRING NOT NULL,
+     element_id STRING NOT NULL,
+     text STRING,
+     embeddings ARRAY<FLOAT>,
+     type STRING,
+     metadata VARIANT
+ );
+
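
The uploader later in this diff creates the destination table from this bundled schema, substituting the configured table name into the first line before executing the statement. A rough sketch of that substitution, with an illustrative schema path and table name:

    from pathlib import Path

    # Illustrative values; the real code resolves the asset relative to the
    # connectors package and takes the table name from the uploader config.
    schema_path = Path("assets/databricks_delta_table_schema.sql")
    table_name = "my_elements_table"

    data_lines = schema_path.read_text().splitlines()
    # The first line is "CREATE TABLE elements (", so replacing "elements" there
    # renames the table while leaving the column definitions untouched.
    data_lines[0] = data_lines[0].replace("elements", table_name)
    create_statement = "".join(line.strip() for line in data_lines)
    print(create_statement)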
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -5,7 +5,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional
  from uuid import NAMESPACE_DNS, uuid5

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.errors import (
@@ -61,12 +61,14 @@ class DatabricksVolumesAccessConfig(AccessConfig):


  class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+     access_config: Secret[DatabricksVolumesAccessConfig]
      host: Optional[str] = Field(
          default=None,
          description="The Databricks host URL for either the "
          "Databricks workspace endpoint or the "
          "Databricks accounts endpoint.",
      )
+     user_agent: str = "unstructuredio_oss"

      def wrap_error(self, e: Exception) -> Exception:
          from databricks.sdk.errors.base import DatabricksError
@@ -94,11 +96,14 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
      @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
      def get_client(self) -> "WorkspaceClient":
          from databricks.sdk import WorkspaceClient
+         from databricks.sdk.core import Config

-         return WorkspaceClient(
+         config = Config(
              host=self.host,
              **self.access_config.get_secret_value().model_dump(),
-         )
+         ).with_user_agent_extra("PyDatabricksSdk", self.user_agent)
+
+         return WorkspaceClient(config=config)


  class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
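
The connection change above builds the client from a databricks.sdk Config so an extra user-agent component can be attached. A minimal sketch of the same pattern outside the connector, with placeholder credentials:

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.core import Config

    # Placeholder host and token for illustration only; real values come from
    # the connector's access config.
    config = Config(
        host="https://example-workspace.cloud.databricks.com",
        token="dapi-example-token",
    ).with_user_agent_extra("PyDatabricksSdk", "unstructuredio_oss")

    client = WorkspaceClient(config=config)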
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -1,14 +1,20 @@
+ import json
  import os
- import tempfile
  from contextlib import contextmanager
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field

- from unstructured_ingest.utils.data_prep import get_data_df, write_data
- from unstructured_ingest.v2.interfaces import Uploader, UploaderConfig
+ from unstructured_ingest.utils.data_prep import get_json_data, write_data
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+ from unstructured_ingest.v2.interfaces import (
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
@@ -16,28 +22,50 @@ from unstructured_ingest.v2.processes.connector_registry import (
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
  from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
      DatabricksDeltaTablesConnectionConfig,
-     DatabricksDeltaTablesUploadStager,
      DatabricksDeltaTablesUploadStagerConfig,
  )
  from unstructured_ingest.v2.types.file_data import FileData
+ from unstructured_ingest.v2.utils import get_enhanced_element_id

  CONNECTOR_TYPE = "databricks_volume_delta_tables"

  if TYPE_CHECKING:
-     from pandas import DataFrame
+     pass


  class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
      database: str = Field(description="Database name", default="default")
-     table_name: str = Field(description="Table name")
+     table_name: Optional[str] = Field(description="Table name", default=None)
+
+
+ class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
+     pass


  @dataclass
- class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-     def write_output(self, output_path: Path, data: list[dict]) -> Path:
+ class DatabricksVolumeDeltaTableStager(UploadStager):
+     upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
+         default_factory=DatabricksVolumeDeltaTableStagerConfig
+     )
+
+     def run(
+         self,
+         elements_filepath: Path,
+         output_dir: Path,
+         output_filename: str,
+         file_data: FileData,
+         **kwargs: Any,
+     ) -> Path:
          # To avoid new line issues when migrating from volumes into delta tables, omit indenting
          # and always write it as a json file
+         output_dir.mkdir(exist_ok=True, parents=True)
+         output_path = output_dir / output_filename
          final_output_path = output_path.with_suffix(".json")
+         data = get_json_data(path=elements_filepath)
+         for element in data:
+             element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
+             element[RECORD_ID_LABEL] = file_data.identifier
+             element["metadata"] = json.dumps(element.get("metadata", {}))
          write_data(path=final_output_path, data=data, indent=None)
          return final_output_path

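The new stager enriches every element before writing: an id derived from the element and its source record, the record identifier under RECORD_ID_LABEL, and the metadata serialized to a JSON string so it can later be parsed back into the table's VARIANT column. A simplified sketch of that per-element transform, with plain stand-ins for FileData and the real id helper:

    import json

    # Stand-ins for values the real code takes from FileData and from constants;
    # RECORD_ID_LABEL corresponds to the record_id column in the bundled schema.
    record_id = "example-record-id"
    RECORD_ID_LABEL = "record_id"

    element = {
        "element_id": "abc123",
        "text": "Hello world",
        "type": "NarrativeText",
        "metadata": {"filename": "example.pdf", "page_number": 1},
    }

    # The real code calls get_enhanced_element_id(element_dict=..., file_data=...);
    # a simple concatenation stands in for that deterministic id here.
    element["id"] = f"{element['element_id']}-{record_id}"
    element[RECORD_ID_LABEL] = record_id
    element["metadata"] = json.dumps(element.get("metadata", {}))
    print(element)
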
@@ -49,6 +77,29 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
      connector_type: str = CONNECTOR_TYPE
      _columns: Optional[dict[str, str]] = None

+     def init(self, **kwargs: Any) -> None:
+         self.create_destination(**kwargs)
+
+     def create_destination(
+         self, destination_name: str = "unstructuredautocreated", **kwargs: Any
+     ) -> bool:
+         table_name = self.upload_config.table_name or destination_name
+         self.upload_config.table_name = table_name
+         connectors_dir = Path(__file__).parents[1]
+         collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
+         with self.get_cursor() as cursor:
+             cursor.execute("SHOW TABLES")
+             table_names = [r[1] for r in cursor.fetchall()]
+             if table_name in table_names:
+                 return False
+             with collection_config_file.open() as schema_file:
+                 data_lines = schema_file.readlines()
+             data_lines[0] = data_lines[0].replace("elements", table_name)
+             destination_schema = "".join([line.strip() for line in data_lines])
+             logger.info(f"creating table {table_name} for user")
+             cursor.execute(destination_schema)
+             return True
+
      def precheck(self) -> None:
          with self.connection_config.get_cursor() as cursor:
              cursor.execute("SHOW CATALOGS")
@@ -68,14 +119,6 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                          self.upload_config.database, ", ".join(databases)
                      )
                  )
-             cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
-             table_names = [r[1] for r in cursor.fetchall()]
-             if self.upload_config.table_name not in table_names:
-                 raise ValueError(
-                     "Table {} not found in {}".format(
-                         self.upload_config.table_name, ", ".join(table_names)
-                     )
-                 )

      def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
          filename = Path(file_data.source_identifiers.filename)
@@ -98,51 +141,42 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                  self._columns = {desc[0]: desc[1] for desc in cursor.description}
          return self._columns

-     def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
-         import pandas as pd
-
-         table_columns = self.get_table_columns()
-         columns = set(df.columns)
-         schema_fields = set(table_columns.keys())
-         columns_to_drop = columns - schema_fields
-         missing_columns = schema_fields - columns
-
-         if columns_to_drop:
-             logger.info(
-                 "Following columns will be dropped to match the table's schema: "
-                 f"{', '.join(columns_to_drop)}"
-             )
-         if missing_columns and add_missing_columns:
-             logger.info(
-                 "Following null filled columns will be added to match the table's schema:"
-                 f" {', '.join(missing_columns)} "
+     def can_delete(self) -> bool:
+         existing_columns = self.get_table_columns()
+         return RECORD_ID_LABEL in existing_columns
+
+     def delete_previous_content(self, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata "
+             f"{RECORD_ID_LABEL}={file_data.identifier} "
+             f"from delta table: {self.upload_config.table_name}"
+         )
+         with self.get_cursor() as cursor:
+             cursor.execute(
+                 f"DELETE FROM {self.upload_config.table_name} WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'"  # noqa: E501
              )
-
-         df = df.drop(columns=columns_to_drop)
-
-         if add_missing_columns:
-             for column in missing_columns:
-                 df[column] = pd.Series()
-         return df
+             results = cursor.fetchall()
+             deleted_rows = results[0][0]
+             logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         with tempfile.TemporaryDirectory() as temp_dir:
-             df = get_data_df()
-             df = self._fit_to_schema(df=df)
-             temp_path = Path(temp_dir) / path.name
-             df.to_json(temp_path, orient="records", lines=False)
-             with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
-                 catalog_path = self.get_output_path(file_data=file_data)
-                 logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
-                 cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
-                 logger.debug(
-                     f"migrating content from {catalog_path} to "
-                     f"table {self.upload_config.table_name}"
-                 )
-                 columns = list(df.columns)
-                 column_str = ", ".join(columns)
-                 sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
-                 cursor.execute(sql_statment)
+         if self.can_delete():
+             self.delete_previous_content(file_data=file_data)
+         with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
+             catalog_path = self.get_output_path(file_data=file_data)
+             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+             logger.debug(
+                 f"migrating content from {catalog_path} to "
+                 f"table {self.upload_config.table_name}"
+             )
+             data = get_json_data(path=path)
+             columns = data[0].keys()
+             select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
+             column_str = ", ".join(columns)
+             select_column_str = ", ".join(select_columns)
+             sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`"  # noqa: E501
+             cursor.execute(sql_statment)


  databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
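
Taken together, the new upload path is: delete any rows left by a previous run of the same record (when the table has a record_id column), PUT the staged JSON file into the volume, then INSERT it into the delta table while converting the stringified metadata back to VARIANT with PARSE_JSON. A condensed sketch of the SQL sequence the uploader issues, with illustrative table, path, and record values:

    # Illustrative names; the real values come from the uploader config and FileData.
    table_name = "my_elements_table"
    catalog_path = "/Volumes/main/default/ingest/example.json"
    local_path = "/tmp/staged/example.json"
    record_id = "example-record-id"
    columns = ["id", "record_id", "element_id", "text", "embeddings", "type", "metadata"]

    select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
    statements = [
        f"DELETE FROM {table_name} WHERE record_id = '{record_id}'",
        f"PUT '{local_path}' INTO '{catalog_path}' OVERWRITE",
        f"INSERT INTO `{table_name}` ({', '.join(columns)}) "
        f"SELECT {', '.join(select_columns)} FROM json.`{catalog_path}`",
    ]
    for statement in statements:
        print(statement)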
unstructured_ingest-0.6.0.dist-info/METADATA → unstructured_ingest-0.6.2.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: unstructured-ingest
- Version: 0.6.0
+ Version: 0.6.2
  Summary: A library that prepares raw documents for downstream ML tasks.
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
  Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9.0,<3.14
  Description-Content-Type: text/markdown
  License-File: LICENSE.md
- Requires-Dist: opentelemetry-sdk
+ Requires-Dist: click
+ Requires-Dist: python-dateutil
  Requires-Dist: dataclasses_json
  Requires-Dist: pydantic>=2.7
- Requires-Dist: python-dateutil
- Requires-Dist: click
  Requires-Dist: tqdm
+ Requires-Dist: opentelemetry-sdk
  Requires-Dist: pandas
  Requires-Dist: numpy
  Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
  Requires-Dist: pandas; extra == "astradb"
  Requires-Dist: numpy; extra == "astradb"
  Provides-Extra: azure
- Requires-Dist: adlfs; extra == "azure"
  Requires-Dist: fsspec; extra == "azure"
+ Requires-Dist: adlfs; extra == "azure"
  Requires-Dist: pandas; extra == "azure"
  Requires-Dist: numpy; extra == "azure"
  Provides-Extra: azure-ai-search
@@ -117,8 +117,8 @@ Requires-Dist: bs4; extra == "biomed"
  Requires-Dist: pandas; extra == "biomed"
  Requires-Dist: numpy; extra == "biomed"
  Provides-Extra: box
- Requires-Dist: boxfs; extra == "box"
  Requires-Dist: fsspec; extra == "box"
+ Requires-Dist: boxfs; extra == "box"
  Requires-Dist: pandas; extra == "box"
  Requires-Dist: numpy; extra == "box"
  Provides-Extra: chroma
@@ -130,8 +130,8 @@ Requires-Dist: clarifai; extra == "clarifai"
  Requires-Dist: pandas; extra == "clarifai"
  Requires-Dist: numpy; extra == "clarifai"
  Provides-Extra: confluence
- Requires-Dist: requests; extra == "confluence"
  Requires-Dist: atlassian-python-api; extra == "confluence"
+ Requires-Dist: requests; extra == "confluence"
  Requires-Dist: pandas; extra == "confluence"
  Requires-Dist: numpy; extra == "confluence"
  Provides-Extra: couchbase
@@ -161,14 +161,14 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
  Requires-Dist: pandas; extra == "elasticsearch"
  Requires-Dist: numpy; extra == "elasticsearch"
  Provides-Extra: gcs
- Requires-Dist: gcsfs; extra == "gcs"
  Requires-Dist: fsspec; extra == "gcs"
  Requires-Dist: bs4; extra == "gcs"
+ Requires-Dist: gcsfs; extra == "gcs"
  Requires-Dist: pandas; extra == "gcs"
  Requires-Dist: numpy; extra == "gcs"
  Provides-Extra: github
- Requires-Dist: pygithub>1.58.0; extra == "github"
  Requires-Dist: requests; extra == "github"
+ Requires-Dist: pygithub>1.58.0; extra == "github"
  Requires-Dist: pandas; extra == "github"
  Requires-Dist: numpy; extra == "github"
  Provides-Extra: gitlab
@@ -185,10 +185,10 @@ Requires-Dist: urllib3; extra == "hubspot"
  Requires-Dist: pandas; extra == "hubspot"
  Requires-Dist: numpy; extra == "hubspot"
  Provides-Extra: ibm-watsonx-s3
- Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+ Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
  Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
  Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
- Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+ Requires-Dist: httpx; extra == "ibm-watsonx-s3"
  Requires-Dist: pandas; extra == "ibm-watsonx-s3"
  Requires-Dist: numpy; extra == "ibm-watsonx-s3"
  Provides-Extra: jira
@@ -216,21 +216,21 @@ Requires-Dist: pymongo; extra == "mongodb"
  Requires-Dist: pandas; extra == "mongodb"
  Requires-Dist: numpy; extra == "mongodb"
  Provides-Extra: neo4j
- Requires-Dist: networkx; extra == "neo4j"
- Requires-Dist: cymple; extra == "neo4j"
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+ Requires-Dist: cymple; extra == "neo4j"
+ Requires-Dist: networkx; extra == "neo4j"
  Requires-Dist: pandas; extra == "neo4j"
  Requires-Dist: numpy; extra == "neo4j"
  Provides-Extra: notion
  Requires-Dist: backoff; extra == "notion"
- Requires-Dist: httpx; extra == "notion"
  Requires-Dist: notion-client; extra == "notion"
+ Requires-Dist: httpx; extra == "notion"
  Requires-Dist: htmlBuilder; extra == "notion"
  Requires-Dist: pandas; extra == "notion"
  Requires-Dist: numpy; extra == "notion"
  Provides-Extra: onedrive
- Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+ Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: bs4; extra == "onedrive"
  Requires-Dist: pandas; extra == "onedrive"
  Requires-Dist: numpy; extra == "onedrive"
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
  Requires-Dist: pandas; extra == "opensearch"
  Requires-Dist: numpy; extra == "opensearch"
  Provides-Extra: outlook
- Requires-Dist: msal; extra == "outlook"
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+ Requires-Dist: msal; extra == "outlook"
  Requires-Dist: pandas; extra == "outlook"
  Requires-Dist: numpy; extra == "outlook"
  Provides-Extra: pinecone
@@ -269,8 +269,8 @@ Requires-Dist: s3fs; extra == "s3"
  Requires-Dist: pandas; extra == "s3"
  Requires-Dist: numpy; extra == "s3"
  Provides-Extra: sharepoint
- Requires-Dist: msal; extra == "sharepoint"
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+ Requires-Dist: msal; extra == "sharepoint"
  Requires-Dist: pandas; extra == "sharepoint"
  Requires-Dist: numpy; extra == "sharepoint"
  Provides-Extra: salesforce
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
  Requires-Dist: pandas; extra == "salesforce"
  Requires-Dist: numpy; extra == "salesforce"
  Provides-Extra: sftp
- Requires-Dist: fsspec; extra == "sftp"
  Requires-Dist: paramiko; extra == "sftp"
+ Requires-Dist: fsspec; extra == "sftp"
  Requires-Dist: pandas; extra == "sftp"
  Requires-Dist: numpy; extra == "sftp"
  Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
  Requires-Dist: pandas; extra == "slack"
  Requires-Dist: numpy; extra == "slack"
  Provides-Extra: snowflake
- Requires-Dist: psycopg2-binary; extra == "snowflake"
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
  Requires-Dist: pandas; extra == "snowflake"
  Requires-Dist: numpy; extra == "snowflake"
  Provides-Extra: wikipedia
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
  Requires-Dist: pandas; extra == "singlestore"
  Requires-Dist: numpy; extra == "singlestore"
  Provides-Extra: vectara
- Requires-Dist: httpx; extra == "vectara"
  Requires-Dist: requests; extra == "vectara"
+ Requires-Dist: httpx; extra == "vectara"
  Requires-Dist: aiofiles; extra == "vectara"
  Requires-Dist: pandas; extra == "vectara"
  Requires-Dist: numpy; extra == "vectara"
  Provides-Extra: vastdb
- Requires-Dist: ibis; extra == "vastdb"
  Requires-Dist: pyarrow; extra == "vastdb"
+ Requires-Dist: ibis; extra == "vastdb"
  Requires-Dist: vastdb; extra == "vastdb"
  Requires-Dist: pandas; extra == "vastdb"
  Requires-Dist: numpy; extra == "vastdb"
  Provides-Extra: zendesk
  Requires-Dist: httpx; extra == "zendesk"
- Requires-Dist: aiofiles; extra == "zendesk"
  Requires-Dist: bs4; extra == "zendesk"
+ Requires-Dist: aiofiles; extra == "zendesk"
  Requires-Dist: pandas; extra == "zendesk"
  Requires-Dist: numpy; extra == "zendesk"
  Provides-Extra: embed-huggingface
@@ -356,8 +356,8 @@ Requires-Dist: tiktoken; extra == "openai"
  Requires-Dist: pandas; extra == "openai"
  Requires-Dist: numpy; extra == "openai"
  Provides-Extra: bedrock
- Requires-Dist: aioboto3; extra == "bedrock"
  Requires-Dist: boto3; extra == "bedrock"
+ Requires-Dist: aioboto3; extra == "bedrock"
  Requires-Dist: pandas; extra == "bedrock"
  Requires-Dist: numpy; extra == "bedrock"
  Provides-Extra: togetherai
unstructured_ingest-0.6.0.dist-info/RECORD → unstructured_ingest-0.6.2.dist-info/RECORD
@@ -91,8 +91,6 @@ test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCr
  test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
  test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
- test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
  test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=WKpDKvEGalh8LYRqN9xA7CfMPOPHo_VcZbnCXdkVjho,14513
  test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -113,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
- unstructured_ingest/__version__.py,sha256=i3sYvJ7iKJXkLBzcGwrQbcRp0S3NVhrYt6MAMPjU-ss,42
+ unstructured_ingest/__version__.py,sha256=UDy7drjkPUljex5sEiDR3ZALQNnlcrCXwJShdKZ37Ek,42
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -421,7 +419,7 @@ unstructured_ingest/v2/pipeline/steps/stage.py,sha256=_0BN2i273y_fZyvSUPOOeXv4kL
  unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=I9TyqMCUSxlf2kdPADjeH4TrUTSe0FMTlARp9QD6TsE,1763
  unstructured_ingest/v2/pipeline/steps/upload.py,sha256=6x8SUdnydR76K6cR3nUVupOACIx-XsRV3vXRlebolqg,1996
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
- unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
+ unstructured_ingest/v2/processes/chunker.py,sha256=O5FN8KWym79H0dtKZvW7ABgn4bwKtaeUO8meGdjM2Yo,5609
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
  unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8gJwIS-imgloE-UOc,7887
  unstructured_ingest/v2/processes/filter.py,sha256=E1MLxk-XeCm3mZIuM49lJToVcSgOivmTFIZApqOEFs8,2150
@@ -454,14 +452,15 @@ unstructured_ingest/v2/processes/connectors/slack.py,sha256=vbBVCYEd741-n2v6eAXL
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=KUqgZ6D2KUOrW596ms-EekvQYDh-fXqBTa7KG-leXoo,12301
  unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
  unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=8fg11-32If4iQGZTT9MEl1DOWZ5s3Qgj1OOzMVaHldU,7749
+ unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=EghKdkt4nGacGxulSpjhToHOl5BRLbb3xNZpJzpWNX8,8002
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=5BArD1FkLC6wRJC0LxjXxQmYfmtF7r9Zrd8CtaGgWls,6855
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=LiSb66039idaRtMnTuHjR5ZqvdmmIu3ByUgFQ1a3iZQ,8264
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
  unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=VCoQ3h289BO4A2kJKZXUVB0QOcaQif-HeRgg-xXzn10,2976
  unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=DM4pygQAnP-dtuFEFAVeBfGt0pzrfkltteCai0GKnG0,4439
@@ -582,9 +581,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
  unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=_I3OMdpUElQdIwVs7W9ORU1kncNaZ_nr6lbxeKE8uaU,1014
  unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
- unstructured_ingest-0.6.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
- unstructured_ingest-0.6.0.dist-info/METADATA,sha256=NWCm69UvgcHT7J8owHmQCYkXmdfkfyCJNcW6slNzANA,14998
- unstructured_ingest-0.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- unstructured_ingest-0.6.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
- unstructured_ingest-0.6.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
- unstructured_ingest-0.6.0.dist-info/RECORD,,
+ unstructured_ingest-0.6.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+ unstructured_ingest-0.6.2.dist-info/METADATA,sha256=yUMpJD0UXDhUG1cIIpHkjn-VU2AScEaA12wLmISmG-A,14998
+ unstructured_ingest-0.6.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ unstructured_ingest-0.6.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+ unstructured_ingest-0.6.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+ unstructured_ingest-0.6.2.dist-info/RECORD,,
test/unit/v2/connectors/databricks/test_volumes_table.py (deleted)
@@ -1,44 +0,0 @@
- from pathlib import Path
-
- import pytest
- from pytest_mock import MockerFixture
-
- from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
-     DatabricksVolumeDeltaTableStager,
- )
-
-
- @pytest.fixture
- def stager():
-     return DatabricksVolumeDeltaTableStager()
-
-
- @pytest.mark.parametrize(
-     ("output_path", "called_output_path"),
-     [
-         (
-             Path("/fake/path/output"),
-             Path("/fake/path/output.json"),
-         ),
-         (
-             Path("/fake/path/output.ndjson"),
-             Path("/fake/path/output.json"),
-         ),
-     ],
- )
- def test_write_output(
-     mocker: MockerFixture,
-     stager: DatabricksVolumeDeltaTableStager,
-     output_path: Path,
-     called_output_path: Path,
- ):
-     data = [{"key1": "value1", "key2": "value2"}]
-
-     mock_get_data = mocker.patch(
-         "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
-         return_value=None,
-     )
-
-     stager.write_output(output_path, data)
-
-     mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)