unstructured-ingest 0.6.1__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

@@ -0,0 +1,49 @@
+ import os
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
+ from test.integration.connectors.utils.validation.source import (
+     SourceValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.connectors.github import (
+     CONNECTOR_TYPE,
+     GithubAccessConfig,
+     GithubConnectionConfig,
+     GithubDownloader,
+     GithubDownloaderConfig,
+     GithubIndexer,
+     GithubIndexerConfig,
+ )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
+ @pytest.mark.asyncio
+ @requires_env("GH_READ_ONLY_ACCESS_TOKEN")
+ async def test_github_source(temp_dir):
+     access_token = os.environ["GH_READ_ONLY_ACCESS_TOKEN"]
+     connection_config = GithubConnectionConfig(
+         access_config=GithubAccessConfig(access_token=access_token),
+         url="dcneiner/Downloadify",
+     )
+
+     indexer = GithubIndexer(
+         connection_config=connection_config,
+         index_config=GithubIndexerConfig(file_glob=["*.txt", "*.html"]),
+     )
+
+     downloader = GithubDownloader(
+         connection_config=connection_config,
+         download_config=GithubDownloaderConfig(download_dir=temp_dir),
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=SourceValidationConfigs(
+             test_id="github", expected_num_files=2, validate_downloaded_files=True
+         ),
+     )
@@ -1 +1 @@
- __version__ = "0.6.1" # pragma: no cover
+ __version__ = "0.6.4" # pragma: no cover
@@ -31,6 +31,8 @@ from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
  from .delta_table import delta_table_destination_entry
  from .discord import CONNECTOR_TYPE as DISCORD_CONNECTOR_TYPE
  from .discord import discord_source_entry
+ from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE
+ from .github import github_source_entry
  from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
  from .gitlab import gitlab_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -124,3 +126,4 @@ add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destina
  add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)

  add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
+ add_source_entry(source_type=GITHUB_CONNECTOR_TYPE, entry=github_source_entry)
@@ -0,0 +1,10 @@
+ CREATE TABLE elements (
+     id STRING NOT NULL PRIMARY KEY,
+     record_id STRING NOT NULL,
+     element_id STRING NOT NULL,
+     text STRING,
+     embeddings ARRAY<FLOAT>,
+     type STRING,
+     metadata VARIANT
+ );
+
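The new SQL asset above defines the schema that the Databricks volume delta-table uploader can now create on demand. As a rough, hypothetical sketch (all values below are invented for illustration, not taken from the package), a staged row matching that schema would look like this, mirroring what the reworked stager later in this diff does: it adds a deterministic id, stamps the originating record id, and serializes metadata to a JSON string so it can be loaded into the VARIANT column.

import json

# Hypothetical partitioned element, trimmed to the columns in the table above.
element = {
    "element_id": "abc123",  # placeholder element id
    "text": "Example paragraph text.",
    "embeddings": [0.12, -0.03, 0.44],
    "type": "NarrativeText",
    "metadata": {"filename": "example.txt", "languages": ["eng"]},
}

row = {
    "id": "enhanced-element-id",  # the connector derives this via get_enhanced_element_id()
    "record_id": "file-data-identifier",  # the connector uses file_data.identifier
    **element,
    "metadata": json.dumps(element["metadata"]),  # VARIANT column is loaded from a JSON string
}
print(row)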
@@ -5,7 +5,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional
  from uuid import NAMESPACE_DNS, uuid5

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.errors import (
@@ -61,6 +61,7 @@ class DatabricksVolumesAccessConfig(AccessConfig):


  class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+     access_config: Secret[DatabricksVolumesAccessConfig]
      host: Optional[str] = Field(
          default=None,
          description="The Databricks host URL for either the "
@@ -94,12 +95,17 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
      @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
      def get_client(self) -> "WorkspaceClient":
          from databricks.sdk import WorkspaceClient
+         from databricks.sdk.core import Config

-         return WorkspaceClient(
+         config = Config(
              host=self.host,
              **self.access_config.get_secret_value().model_dump(),
+         ).with_user_agent_extra(
+             "PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
          )

+         return WorkspaceClient(config=config)
+

  class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
      recursive: bool = False
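Both Databricks client paths in this release now derive their user-agent suffix from the UNSTRUCTURED_USER_AGENT environment variable, falling back to "unstructuredio_oss": the volumes connector above via Config.with_user_agent_extra, and the SQL delta-tables connector later in this diff via the _user_agent_entry connect kwarg. A minimal sketch of overriding it, assuming the databricks-volumes extra (databricks-sdk) is installed; the host and token below are placeholders, not values from the package.

import os

# Hypothetical override so downstream traffic is tagged with your own agent string.
os.environ["UNSTRUCTURED_USER_AGENT"] = "my_downstream_app"

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import Config

config = Config(
    host="https://example.cloud.databricks.com",  # placeholder workspace URL
    token="dapi-placeholder",  # placeholder personal access token
).with_user_agent_extra(
    "PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
)
client = WorkspaceClient(config=config)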
@@ -1,14 +1,20 @@
+ import json
  import os
- import tempfile
  from contextlib import contextmanager
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field

- from unstructured_ingest.utils.data_prep import get_data_df, write_data
- from unstructured_ingest.v2.interfaces import Uploader, UploaderConfig
+ from unstructured_ingest.utils.data_prep import get_json_data, write_data
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+ from unstructured_ingest.v2.interfaces import (
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
@@ -16,28 +22,50 @@ from unstructured_ingest.v2.processes.connector_registry import (
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
  from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
      DatabricksDeltaTablesConnectionConfig,
-     DatabricksDeltaTablesUploadStager,
      DatabricksDeltaTablesUploadStagerConfig,
  )
  from unstructured_ingest.v2.types.file_data import FileData
+ from unstructured_ingest.v2.utils import get_enhanced_element_id

  CONNECTOR_TYPE = "databricks_volume_delta_tables"

  if TYPE_CHECKING:
-     from pandas import DataFrame
+     pass


  class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
      database: str = Field(description="Database name", default="default")
-     table_name: str = Field(description="Table name")
+     table_name: Optional[str] = Field(description="Table name", default=None)
+
+
+ class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
+     pass


  @dataclass
- class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-     def write_output(self, output_path: Path, data: list[dict]) -> Path:
+ class DatabricksVolumeDeltaTableStager(UploadStager):
+     upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
+         default_factory=DatabricksVolumeDeltaTableStagerConfig
+     )
+
+     def run(
+         self,
+         elements_filepath: Path,
+         output_dir: Path,
+         output_filename: str,
+         file_data: FileData,
+         **kwargs: Any,
+     ) -> Path:
          # To avoid new line issues when migrating from volumes into delta tables, omit indenting
          # and always write it as a json file
+         output_dir.mkdir(exist_ok=True, parents=True)
+         output_path = output_dir / output_filename
          final_output_path = output_path.with_suffix(".json")
+         data = get_json_data(path=elements_filepath)
+         for element in data:
+             element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
+             element[RECORD_ID_LABEL] = file_data.identifier
+             element["metadata"] = json.dumps(element.get("metadata", {}))
          write_data(path=final_output_path, data=data, indent=None)
          return final_output_path

@@ -49,6 +77,29 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
      connector_type: str = CONNECTOR_TYPE
      _columns: Optional[dict[str, str]] = None

+     def init(self, **kwargs: Any) -> None:
+         self.create_destination(**kwargs)
+
+     def create_destination(
+         self, destination_name: str = "unstructuredautocreated", **kwargs: Any
+     ) -> bool:
+         table_name = self.upload_config.table_name or destination_name
+         self.upload_config.table_name = table_name
+         connectors_dir = Path(__file__).parents[1]
+         collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
+         with self.get_cursor() as cursor:
+             cursor.execute("SHOW TABLES")
+             table_names = [r[1] for r in cursor.fetchall()]
+             if table_name in table_names:
+                 return False
+             with collection_config_file.open() as schema_file:
+                 data_lines = schema_file.readlines()
+             data_lines[0] = data_lines[0].replace("elements", table_name)
+             destination_schema = "".join([line.strip() for line in data_lines])
+             logger.info(f"creating table {table_name} for user")
+             cursor.execute(destination_schema)
+             return True
+
      def precheck(self) -> None:
          with self.connection_config.get_cursor() as cursor:
              cursor.execute("SHOW CATALOGS")
@@ -68,14 +119,6 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                          self.upload_config.database, ", ".join(databases)
                      )
                  )
-             cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
-             table_names = [r[1] for r in cursor.fetchall()]
-             if self.upload_config.table_name not in table_names:
-                 raise ValueError(
-                     "Table {} not found in {}".format(
-                         self.upload_config.table_name, ", ".join(table_names)
-                     )
-                 )

      def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
          filename = Path(file_data.source_identifiers.filename)
@@ -98,51 +141,42 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
              self._columns = {desc[0]: desc[1] for desc in cursor.description}
          return self._columns

-     def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
-         import pandas as pd
-
-         table_columns = self.get_table_columns()
-         columns = set(df.columns)
-         schema_fields = set(table_columns.keys())
-         columns_to_drop = columns - schema_fields
-         missing_columns = schema_fields - columns
-
-         if columns_to_drop:
-             logger.info(
-                 "Following columns will be dropped to match the table's schema: "
-                 f"{', '.join(columns_to_drop)}"
-             )
-         if missing_columns and add_missing_columns:
-             logger.info(
-                 "Following null filled columns will be added to match the table's schema:"
-                 f" {', '.join(missing_columns)} "
+     def can_delete(self) -> bool:
+         existing_columns = self.get_table_columns()
+         return RECORD_ID_LABEL in existing_columns
+
+     def delete_previous_content(self, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata "
+             f"{RECORD_ID_LABEL}={file_data.identifier} "
+             f"from delta table: {self.upload_config.table_name}"
+         )
+         with self.get_cursor() as cursor:
+             cursor.execute(
+                 f"DELETE FROM {self.upload_config.table_name} WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'" # noqa: E501
              )
-
-         df = df.drop(columns=columns_to_drop)
-
-         if add_missing_columns:
-             for column in missing_columns:
-                 df[column] = pd.Series()
-         return df
+             results = cursor.fetchall()
+             deleted_rows = results[0][0]
+             logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         with tempfile.TemporaryDirectory() as temp_dir:
-             df = get_data_df()
-             df = self._fit_to_schema(df=df)
-             temp_path = Path(temp_dir) / path.name
-             df.to_json(temp_path, orient="records", lines=False)
-             with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
-                 catalog_path = self.get_output_path(file_data=file_data)
-                 logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
-                 cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
-                 logger.debug(
-                     f"migrating content from {catalog_path} to "
-                     f"table {self.upload_config.table_name}"
-                 )
-                 columns = list(df.columns)
-                 column_str = ", ".join(columns)
-                 sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`" # noqa: E501
-                 cursor.execute(sql_statment)
+         if self.can_delete():
+             self.delete_previous_content(file_data=file_data)
+         with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
+             catalog_path = self.get_output_path(file_data=file_data)
+             logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+             cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+             logger.debug(
+                 f"migrating content from {catalog_path} to "
+                 f"table {self.upload_config.table_name}"
+             )
+             data = get_json_data(path=path)
+             columns = data[0].keys()
+             select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
+             column_str = ", ".join(columns)
+             select_column_str = ", ".join(select_columns)
+             sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`" # noqa: E501
+             cursor.execute(sql_statment)


  databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
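With the rewrite above, the uploader no longer reshapes a pandas DataFrame to the table schema; it PUTs the staged JSON file into the volume path and issues an INSERT ... SELECT in which the serialized metadata column is wrapped in PARSE_JSON so it lands in the VARIANT column. A small illustrative sketch of the statement that logic assembles; the table name and catalog path below are hypothetical placeholders.

columns = ["id", "record_id", "element_id", "text", "embeddings", "type", "metadata"]
select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]

table_name = "my_elements_table"  # placeholder
catalog_path = "/Volumes/main/default/ingest/out.json"  # placeholder volume path

sql = (
    f"INSERT INTO `{table_name}` ({', '.join(columns)}) "
    f"SELECT {', '.join(select_columns)} FROM json.`{catalog_path}`"
)
print(sql)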
@@ -0,0 +1,221 @@
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from time import time
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+ from urllib.parse import urlparse
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from pydantic import Field, Secret, field_validator
+
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     Indexer,
+     IndexerConfig,
+     download_responses,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import (
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.types.file_data import (
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+
+ if TYPE_CHECKING:
+     from github import ContentFile, GitTreeElement, Repository
+     from github import Github as GithubClient
+     from github.GithubException import GithubException
+     from requests import HTTPError
+
+ CONNECTOR_TYPE = "github"
+
+
+ class GithubAccessConfig(AccessConfig):
+     access_token: str = Field(description="Github acess token")
+
+
+ class GithubConnectionConfig(ConnectionConfig):
+     access_config: Secret[GithubAccessConfig]
+     url: str = Field(description="Github url or repository owner/name pair")
+
+     @field_validator("url", mode="after")
+     def conform_url(cls, value: str):
+         parsed_url = urlparse(value)
+         return parsed_url.path
+
+     def get_full_url(self):
+         return f"https://github.com/{self.url}"
+
+     @requires_dependencies(["github"], extras="github")
+     def get_client(self) -> "GithubClient":
+         from github import Github as GithubClient
+
+         return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)
+
+     def get_repo(self) -> "Repository":
+         client = self.get_client()
+         return client.get_repo(self.url)
+
+     def wrap_github_exception(self, e: "GithubException") -> Exception:
+         data = e.data
+         status_code = e.status
+         message = data.get("message")
+         if status_code == 401:
+             return UserAuthError(f"Unauthorized access to Github: {message}")
+         if 400 <= status_code < 500:
+             return UserError(message)
+         if status_code > 500:
+             return ProviderError(message)
+         logger.debug(f"unhandled github error: {e}")
+         return e
+
+     def wrap_http_error(self, e: "HTTPError") -> Exception:
+         status_code = e.response.status_code
+         if status_code == 401:
+             return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
+         if 400 <= status_code < 500:
+             return UserError(e.response.text)
+         if status_code > 500:
+             return ProviderError(e.response.text)
+         logger.debug(f"unhandled http error: {e}")
+         return e
+
+     @requires_dependencies(["requests"], extras="github")
+     def wrap_error(self, e: Exception) -> Exception:
+         from github.GithubException import GithubException
+         from requests import HTTPError
+
+         if isinstance(e, GithubException):
+             return self.wrap_github_exception(e=e)
+         if isinstance(e, HTTPError):
+             return self.wrap_http_error(e=e)
+         logger.debug(f"unhandled error: {e}")
+         return e
+
+
+ class GithubIndexerConfig(IndexerConfig):
+     branch: Optional[str] = Field(
+         description="Branch to index, use the default if one isn't provided", default=None
+     )
+     recursive: bool = Field(
+         description="Recursively index all files in the repository", default=True
+     )
+
+
+ @dataclass
+ class GithubIndexer(Indexer):
+     connection_config: GithubConnectionConfig
+     index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_repo()
+         except Exception as e:
+             raise self.connection_config.wrap_error(e=e)
+
+     def get_branch(self) -> str:
+         repo = self.connection_config.get_repo()
+         sha = self.index_config.branch or repo.default_branch
+         return sha
+
+     def list_files(self) -> list["GitTreeElement"]:
+         repo = self.connection_config.get_repo()
+         sha = self.index_config.branch or repo.default_branch
+         git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
+         file_elements = [
+             element for element in git_tree.tree if element.size is not None and element.size > 0
+         ]
+         return file_elements
+
+     def convert_element(self, element: "GitTreeElement") -> FileData:
+         full_path = (
+             f"{self.connection_config.get_full_url()}/blob/{self.get_branch()}/{element.path}"
+         )
+
+         return FileData(
+             identifier=str(uuid5(NAMESPACE_DNS, full_path)),
+             connector_type=self.connector_type,
+             display_name=full_path,
+             source_identifiers=SourceIdentifiers(
+                 filename=Path(element.path).name,
+                 fullpath=(Path(self.get_branch()) / element.path).as_posix(),
+                 rel_path=element.path,
+             ),
+             metadata=FileDataSourceMetadata(
+                 url=element.url,
+                 version=element.etag,
+                 record_locator={},
+                 date_modified=str(element.last_modified_datetime.timestamp()),
+                 date_processed=str(time()),
+                 filesize_bytes=element.size,
+                 permissions_data=[{"mode": element.mode}],
+             ),
+         )
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         for element in self.list_files():
+             yield self.convert_element(element=element)
+
+
+ class GithubDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class GithubDownloader(Downloader):
+     download_config: GithubDownloaderConfig
+     connection_config: GithubConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     @requires_dependencies(["github"], extras="github")
+     def get_file(self, file_data: FileData) -> "ContentFile":
+         from github.GithubException import UnknownObjectException
+
+         path = file_data.source_identifiers.relative_path
+         repo = self.connection_config.get_repo()
+
+         try:
+             content_file = repo.get_contents(path)
+         except UnknownObjectException as e:
+             logger.error(f"File doesn't exists {self.connection_config.url}/{path}: {e}")
+             raise UserError(f"File not found: {path}")
+         return content_file
+
+     @requires_dependencies(["requests"], extras="github")
+     def get_contents(self, content_file: "ContentFile") -> bytes:
+         import requests
+
+         if content_file.decoded_content:
+             return content_file.decoded_content
+         download_url = content_file.download_url
+         resp = requests.get(download_url)
+         try:
+             resp.raise_for_status()
+         except requests.HTTPError as e:
+             raise self.connection_config.wrap_error(e=e)
+         return resp.content
+
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         content_file = self.get_file(file_data)
+         contents = self.get_contents(content_file)
+         download_path = self.get_download_path(file_data)
+         with download_path.open("wb") as f:
+             f.write(contents)
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ github_source_entry = SourceRegistryEntry(
+     indexer=GithubIndexer,
+     indexer_config=GithubIndexerConfig,
+     downloader=GithubDownloader,
+     downloader_config=GithubDownloaderConfig,
+     connection_config=GithubConnectionConfig,
+ )
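The new github.py module above registers a full source connector (indexer plus downloader) for the v2 pipeline. A rough usage sketch, mirroring the integration test at the top of this diff; the token environment variable and repository below are placeholders, and the github extra plus network access are assumed.

import os

from unstructured_ingest.v2.processes.connectors.github import (
    GithubAccessConfig,
    GithubConnectionConfig,
    GithubIndexer,
    GithubIndexerConfig,
)

# Placeholders: supply a token you control and an owner/name pair you can read.
connection_config = GithubConnectionConfig(
    access_config=GithubAccessConfig(access_token=os.environ["GH_READ_ONLY_ACCESS_TOKEN"]),
    url="dcneiner/Downloadify",
)

indexer = GithubIndexer(
    connection_config=connection_config,
    index_config=GithubIndexerConfig(),  # defaults: default branch, recursive tree walk
)

# Each yielded FileData describes one blob in the repository tree; in a full
# pipeline a GithubDownloader would then fetch and write its contents.
for file_data in indexer.run():
    print(file_data.display_name, file_data.metadata.filesize_bytes)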
@@ -1,4 +1,5 @@
  import json
+ import os
  from contextlib import contextmanager
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -42,7 +43,6 @@ class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
      access_config: Secret[DatabricksDeltaTablesAccessConfig]
      server_hostname: str = Field(description="server hostname connection config value")
      http_path: str = Field(description="http path connection config value")
-     user_agent: str = "unstructuredio_oss"

      @requires_dependencies(["databricks"], extras="databricks-delta-tables")
      def get_credentials_provider(self) -> "oauth_service_principal":
@@ -86,7 +86,9 @@ class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
          from databricks.sql import connect

          connect_kwargs = connect_kwargs or {}
-         connect_kwargs["_user_agent_entry"] = self.user_agent
+         connect_kwargs["_user_agent_entry"] = os.getenv(
+             "UNSTRUCTURED_USER_AGENT", "unstructuredio_oss"
+         )
          connect_kwargs["server_hostname"] = connect_kwargs.get(
              "server_hostname", self.server_hostname
          )
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: unstructured-ingest
- Version: 0.6.1
+ Version: 0.6.4
  Summary: A library that prepares raw documents for downstream ML tasks.
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
  Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9.0,<3.14
  Description-Content-Type: text/markdown
  License-File: LICENSE.md
- Requires-Dist: click
- Requires-Dist: tqdm
+ Requires-Dist: pydantic>=2.7
  Requires-Dist: opentelemetry-sdk
- Requires-Dist: python-dateutil
+ Requires-Dist: tqdm
+ Requires-Dist: click
  Requires-Dist: dataclasses_json
- Requires-Dist: pydantic>=2.7
+ Requires-Dist: python-dateutil
  Requires-Dist: numpy
  Requires-Dist: pandas
  Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
  Requires-Dist: numpy; extra == "astradb"
  Requires-Dist: pandas; extra == "astradb"
  Provides-Extra: azure
- Requires-Dist: adlfs; extra == "azure"
  Requires-Dist: fsspec; extra == "azure"
+ Requires-Dist: adlfs; extra == "azure"
  Requires-Dist: numpy; extra == "azure"
  Requires-Dist: pandas; extra == "azure"
  Provides-Extra: azure-ai-search
@@ -139,8 +139,8 @@ Requires-Dist: couchbase; extra == "couchbase"
  Requires-Dist: numpy; extra == "couchbase"
  Requires-Dist: pandas; extra == "couchbase"
  Provides-Extra: delta-table
- Requires-Dist: boto3; extra == "delta-table"
  Requires-Dist: deltalake; extra == "delta-table"
+ Requires-Dist: boto3; extra == "delta-table"
  Requires-Dist: numpy; extra == "delta-table"
  Requires-Dist: pandas; extra == "delta-table"
  Provides-Extra: discord
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
  Requires-Dist: numpy; extra == "discord"
  Requires-Dist: pandas; extra == "discord"
  Provides-Extra: dropbox
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
  Requires-Dist: fsspec; extra == "dropbox"
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
  Requires-Dist: numpy; extra == "dropbox"
  Requires-Dist: pandas; extra == "dropbox"
  Provides-Extra: duckdb
@@ -161,9 +161,9 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
  Requires-Dist: numpy; extra == "elasticsearch"
  Requires-Dist: pandas; extra == "elasticsearch"
  Provides-Extra: gcs
+ Requires-Dist: fsspec; extra == "gcs"
  Requires-Dist: bs4; extra == "gcs"
  Requires-Dist: gcsfs; extra == "gcs"
- Requires-Dist: fsspec; extra == "gcs"
  Requires-Dist: numpy; extra == "gcs"
  Requires-Dist: pandas; extra == "gcs"
  Provides-Extra: github
@@ -185,9 +185,9 @@ Requires-Dist: urllib3; extra == "hubspot"
  Requires-Dist: numpy; extra == "hubspot"
  Requires-Dist: pandas; extra == "hubspot"
  Provides-Extra: ibm-watsonx-s3
- Requires-Dist: httpx; extra == "ibm-watsonx-s3"
- Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
  Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+ Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
+ Requires-Dist: httpx; extra == "ibm-watsonx-s3"
  Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
  Requires-Dist: numpy; extra == "ibm-watsonx-s3"
  Requires-Dist: pandas; extra == "ibm-watsonx-s3"
@@ -216,15 +216,15 @@ Requires-Dist: pymongo; extra == "mongodb"
  Requires-Dist: numpy; extra == "mongodb"
  Requires-Dist: pandas; extra == "mongodb"
  Provides-Extra: neo4j
+ Requires-Dist: neo4j-rust-ext; extra == "neo4j"
  Requires-Dist: networkx; extra == "neo4j"
  Requires-Dist: cymple; extra == "neo4j"
- Requires-Dist: neo4j-rust-ext; extra == "neo4j"
  Requires-Dist: numpy; extra == "neo4j"
  Requires-Dist: pandas; extra == "neo4j"
  Provides-Extra: notion
- Requires-Dist: httpx; extra == "notion"
- Requires-Dist: notion-client; extra == "notion"
  Requires-Dist: htmlBuilder; extra == "notion"
+ Requires-Dist: notion-client; extra == "notion"
+ Requires-Dist: httpx; extra == "notion"
  Requires-Dist: backoff; extra == "notion"
  Requires-Dist: numpy; extra == "notion"
  Requires-Dist: pandas; extra == "notion"
@@ -264,8 +264,8 @@ Requires-Dist: redis; extra == "redis"
  Requires-Dist: numpy; extra == "redis"
  Requires-Dist: pandas; extra == "redis"
  Provides-Extra: s3
- Requires-Dist: s3fs; extra == "s3"
  Requires-Dist: fsspec; extra == "s3"
+ Requires-Dist: s3fs; extra == "s3"
  Requires-Dist: numpy; extra == "s3"
  Requires-Dist: pandas; extra == "s3"
  Provides-Extra: sharepoint
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
  Requires-Dist: numpy; extra == "salesforce"
  Requires-Dist: pandas; extra == "salesforce"
  Provides-Extra: sftp
- Requires-Dist: paramiko; extra == "sftp"
  Requires-Dist: fsspec; extra == "sftp"
+ Requires-Dist: paramiko; extra == "sftp"
  Requires-Dist: numpy; extra == "sftp"
  Requires-Dist: pandas; extra == "sftp"
  Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
  Requires-Dist: numpy; extra == "slack"
  Requires-Dist: pandas; extra == "slack"
  Provides-Extra: snowflake
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
  Requires-Dist: psycopg2-binary; extra == "snowflake"
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
  Requires-Dist: numpy; extra == "snowflake"
  Requires-Dist: pandas; extra == "snowflake"
  Provides-Extra: wikipedia
@@ -313,8 +313,8 @@ Requires-Dist: numpy; extra == "singlestore"
  Requires-Dist: pandas; extra == "singlestore"
  Provides-Extra: vectara
  Requires-Dist: requests; extra == "vectara"
- Requires-Dist: aiofiles; extra == "vectara"
  Requires-Dist: httpx; extra == "vectara"
+ Requires-Dist: aiofiles; extra == "vectara"
  Requires-Dist: numpy; extra == "vectara"
  Requires-Dist: pandas; extra == "vectara"
  Provides-Extra: vastdb
@@ -324,9 +324,9 @@ Requires-Dist: pyarrow; extra == "vastdb"
  Requires-Dist: numpy; extra == "vastdb"
  Requires-Dist: pandas; extra == "vastdb"
  Provides-Extra: zendesk
+ Requires-Dist: bs4; extra == "zendesk"
  Requires-Dist: httpx; extra == "zendesk"
  Requires-Dist: aiofiles; extra == "zendesk"
- Requires-Dist: bs4; extra == "zendesk"
  Requires-Dist: numpy; extra == "zendesk"
  Requires-Dist: pandas; extra == "zendesk"
  Provides-Extra: embed-huggingface
@@ -356,8 +356,8 @@ Requires-Dist: tiktoken; extra == "openai"
  Requires-Dist: numpy; extra == "openai"
  Requires-Dist: pandas; extra == "openai"
  Provides-Extra: bedrock
- Requires-Dist: boto3; extra == "bedrock"
  Requires-Dist: aioboto3; extra == "bedrock"
+ Requires-Dist: boto3; extra == "bedrock"
  Requires-Dist: numpy; extra == "bedrock"
  Requires-Dist: pandas; extra == "bedrock"
  Provides-Extra: togetherai
@@ -11,6 +11,7 @@ test/integration/connectors/test_chroma.py,sha256=yn2p8U8yE9LaF-IEKiLp2XB4T4Vqo-
  test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
  test/integration/connectors/test_delta_table.py,sha256=r2OvLMRfJFfsyIHd1H44Kx6AgNnDjPHKofN7a01oqrY,6916
  test/integration/connectors/test_dropbox.py,sha256=jzpZ6wawLa4sC1BVoHWZJ3cHjL4DWWUEX5ee7bXUOOM,4945
+ test/integration/connectors/test_github.py,sha256=cRArMUMtAujsVtR2yp2GHZktX-cSPRSsYhlnnsRhhH8,1503
  test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
  test/integration/connectors/test_jira.py,sha256=IgF_cdknJ97W3OrNfHKp3YLmRyVwY6BI8jhZwVbrebc,2076
  test/integration/connectors/test_lancedb.py,sha256=AuO2qHDay5NHxQ2b1LzbsNNp6LnUDeB2_0dWlkgAuMo,9223
@@ -91,8 +92,6 @@ test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCr
  test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
  test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
- test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
  test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=WKpDKvEGalh8LYRqN9xA7CfMPOPHo_VcZbnCXdkVjho,14513
  test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -113,7 +112,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
- unstructured_ingest/__version__.py,sha256=vYkj5wI9darc7y1Fll8uAtxzlI0lqsa5gGerwBBkeIQ,42
+ unstructured_ingest/__version__.py,sha256=0sOJ1f0sRyjdtSL0LUqCE6m6T039ttMVXeIDsmdxcPw,42
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -427,7 +426,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
  unstructured_ingest/v2/processes/filter.py,sha256=E1MLxk-XeCm3mZIuM49lJToVcSgOivmTFIZApqOEFs8,2150
  unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
  unstructured_ingest/v2/processes/uncompress.py,sha256=O7q0pOiL6cDBvAa9NAy_vkc873tisjK2Hoq6Z-grRRo,2430
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=l4Xq4AuzRMTqUv5TU7cE1NbhGCka4SFJFZwG1FoVotE,6666
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=iyoZrTaEoBPBn9-tczFhLcwKQQoeFbESF3QkJkVimv4,6845
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=JesWeUv_tIA7b65sE2Z-ixMKuGLlgugZRMKE38ID3zg,8959
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=SwfUcdrCbMK_LDcHG5auCCuga_luPmZVvhjuAsRtKU0,18304
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=K4g-Dh7u7Z13rheNKhnnBXcO6TUfz8RPrrusDAsgkBk,11575
@@ -436,6 +435,7 @@ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=ERHYYutQsnS8eZN
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LARX4F_1Zd1LnUMSNdvIsbqLoZXZ9kl_vMZh-dRr4XA,12305
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=5T4hkXHGitGprpUb20206ODcBh5wchgHkUocji5l2rk,7286
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=yvLxTx0ZRIATqSYLm5d09u9a0ktZGJnGcnzapwePHK8,5301
+ unstructured_ingest/v2/processes/connectors/github.py,sha256=d-sh28MVe40vyaTf8b8NkspSrV9zlUHjdSSfSZjOcOA,7772
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=hsB5g-3tZqe7bVSqwA0nY9GY4J0cg4PBawf77GkpjZY,10039
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=mn1BoUXYw5j-q7jO-rzPPEv_rt1UV9LDN1M_PyREEps,19678
  unstructured_ingest/v2/processes/connectors/jira.py,sha256=_ivv0TqeVPlHG9YMihljzrpYP7OLQAa0D7nqLonaeM8,17149
@@ -454,14 +454,15 @@ unstructured_ingest/v2/processes/connectors/slack.py,sha256=vbBVCYEd741-n2v6eAXL
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=KUqgZ6D2KUOrW596ms-EekvQYDh-fXqBTa7KG-leXoo,12301
  unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
  unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=8fg11-32If4iQGZTT9MEl1DOWZ5s3Qgj1OOzMVaHldU,7749
+ unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=JktJXC9SYnKLetjsyGJWKXqg5Kml8WY9dcKyr5o_Yxs,8024
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=5BArD1FkLC6wRJC0LxjXxQmYfmtF7r9Zrd8CtaGgWls,6855
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=LiSb66039idaRtMnTuHjR5ZqvdmmIu3ByUgFQ1a3iZQ,8264
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
  unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=VCoQ3h289BO4A2kJKZXUVB0QOcaQif-HeRgg-xXzn10,2976
  unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=DM4pygQAnP-dtuFEFAVeBfGt0pzrfkltteCai0GKnG0,4439
@@ -563,7 +564,7 @@ unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ
  unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=hsOd2Gliyjzkb21Vv6RAiFf8NAysxd29K0AxBkkm844,5483
  unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
  unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=NSEZwJDHh_9kFc31LnG14iRtYF3meK2UfUlQfYnwYEQ,2059
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=JrQMlSavPgSD1ruy4xRHaQV5iGiG7oavtm7az0tVVZc,9054
+ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=hJ0yacutrgiCer9cJSfxcNgLlOgsozJ2yGhgy8vZAkk,9086
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=FOb08WCr0SdzylN88xDP51NdVD4ggDbjanJurwJUrgM,9374
@@ -582,9 +583,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
  unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=_I3OMdpUElQdIwVs7W9ORU1kncNaZ_nr6lbxeKE8uaU,1014
  unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
- unstructured_ingest-0.6.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
- unstructured_ingest-0.6.1.dist-info/METADATA,sha256=Babhsu1h1L0nvRFeImk9Jn-jPjnaW-jdz6mhB3jkmbI,14998
- unstructured_ingest-0.6.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- unstructured_ingest-0.6.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
- unstructured_ingest-0.6.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
- unstructured_ingest-0.6.1.dist-info/RECORD,,
+ unstructured_ingest-0.6.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+ unstructured_ingest-0.6.4.dist-info/METADATA,sha256=vsGBmdOat7mYz2HZ-RuKLYaSzpjgOaGwwn_b1qxNC7g,14998
+ unstructured_ingest-0.6.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ unstructured_ingest-0.6.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+ unstructured_ingest-0.6.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+ unstructured_ingest-0.6.4.dist-info/RECORD,,
@@ -1,44 +0,0 @@
1
- from pathlib import Path
2
-
3
- import pytest
4
- from pytest_mock import MockerFixture
5
-
6
- from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
7
- DatabricksVolumeDeltaTableStager,
8
- )
9
-
10
-
11
- @pytest.fixture
12
- def stager():
13
- return DatabricksVolumeDeltaTableStager()
14
-
15
-
16
- @pytest.mark.parametrize(
17
- ("output_path", "called_output_path"),
18
- [
19
- (
20
- Path("/fake/path/output"),
21
- Path("/fake/path/output.json"),
22
- ),
23
- (
24
- Path("/fake/path/output.ndjson"),
25
- Path("/fake/path/output.json"),
26
- ),
27
- ],
28
- )
29
- def test_write_output(
30
- mocker: MockerFixture,
31
- stager: DatabricksVolumeDeltaTableStager,
32
- output_path: Path,
33
- called_output_path: Path,
34
- ):
35
- data = [{"key1": "value1", "key2": "value2"}]
36
-
37
- mock_get_data = mocker.patch(
38
- "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
39
- return_value=None,
40
- )
41
-
42
- stager.write_output(output_path, data)
43
-
44
- mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)