unstructured-ingest 0.6.1__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_github.py +49 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +8 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +93 -59
- unstructured_ingest/v2/processes/connectors/github.py +221 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +4 -2
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/METADATA +20 -20
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/RECORD +14 -13
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +0 -44
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_github.py

@@ -0,0 +1,49 @@
+import os
+
+import pytest
+
+from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.processes.connectors.github import (
+    CONNECTOR_TYPE,
+    GithubAccessConfig,
+    GithubConnectionConfig,
+    GithubDownloader,
+    GithubDownloaderConfig,
+    GithubIndexer,
+    GithubIndexerConfig,
+)
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
+@pytest.mark.asyncio
+@requires_env("GH_READ_ONLY_ACCESS_TOKEN")
+async def test_github_source(temp_dir):
+    access_token = os.environ["GH_READ_ONLY_ACCESS_TOKEN"]
+    connection_config = GithubConnectionConfig(
+        access_config=GithubAccessConfig(access_token=access_token),
+        url="dcneiner/Downloadify",
+    )
+
+    indexer = GithubIndexer(
+        connection_config=connection_config,
+        index_config=GithubIndexerConfig(file_glob=["*.txt", "*.html"]),
+    )
+
+    downloader = GithubDownloader(
+        connection_config=connection_config,
+        download_config=GithubDownloaderConfig(download_dir=temp_dir),
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="github", expected_num_files=2, validate_downloaded_files=True
+        ),
+    )

unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.6.1"
+__version__ = "0.6.4"  # pragma: no cover

unstructured_ingest/v2/processes/connectors/__init__.py

@@ -31,6 +31,8 @@ from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .discord import CONNECTOR_TYPE as DISCORD_CONNECTOR_TYPE
 from .discord import discord_source_entry
+from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE
+from .github import github_source_entry
 from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
 from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -124,3 +126,4 @@ add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
 
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
+add_source_entry(source_type=GITHUB_CONNECTOR_TYPE, entry=github_source_entry)

unstructured_ingest/v2/processes/connectors/databricks/volumes.py

@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
@@ -61,6 +61,7 @@ class DatabricksVolumesAccessConfig(AccessConfig):
 
 
 class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[DatabricksVolumesAccessConfig]
     host: Optional[str] = Field(
         default=None,
         description="The Databricks host URL for either the "
@@ -94,12 +95,17 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
     def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient
+        from databricks.sdk.core import Config
 
-        return WorkspaceClient(
+        config = Config(
             host=self.host,
             **self.access_config.get_secret_value().model_dump(),
+        ).with_user_agent_extra(
+            "PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
         )
 
+        return WorkspaceClient(config=config)
+
 
 class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
     recursive: bool = False
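
Both Databricks code paths in this release resolve their extra user-agent string from the UNSTRUCTURED_USER_AGENT environment variable, falling back to "unstructuredio_oss" (the same pattern appears again in the databricks_delta_tables.py hunk further down). A minimal sketch of the override, using a hypothetical service name that is not part of the package:

import os

# Setting the variable before an ingest run changes the value the connector passes to
# Config(...).with_user_agent_extra("PyDatabricksSdk", <value>).
os.environ["UNSTRUCTURED_USER_AGENT"] = "my-ingest-service/1.2"  # hypothetical value

user_agent_extra = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
print(user_agent_extra)  # my-ingest-service/1.2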

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -1,14 +1,20 @@
+import json
 import os
-import tempfile
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.utils.data_prep import
-from unstructured_ingest.v2.
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.interfaces import (
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -16,28 +22,50 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
     DatabricksDeltaTablesConnectionConfig,
-    DatabricksDeltaTablesUploadStager,
     DatabricksDeltaTablesUploadStagerConfig,
 )
 from unstructured_ingest.v2.types.file_data import FileData
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
 
 if TYPE_CHECKING:
-    …
+    pass
 
 
 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
-    table_name: str = Field(description="Table name")
+    table_name: Optional[str] = Field(description="Table name", default=None)
+
+
+class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
+    pass
 
 
 @dataclass
-class DatabricksVolumeDeltaTableStager(
-    …
+class DatabricksVolumeDeltaTableStager(UploadStager):
+    upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
+        default_factory=DatabricksVolumeDeltaTableStagerConfig
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        file_data: FileData,
+        **kwargs: Any,
+    ) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
+        output_dir.mkdir(exist_ok=True, parents=True)
+        output_path = output_dir / output_filename
         final_output_path = output_path.with_suffix(".json")
+        data = get_json_data(path=elements_filepath)
+        for element in data:
+            element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
+            element[RECORD_ID_LABEL] = file_data.identifier
+            element["metadata"] = json.dumps(element.get("metadata", {}))
         write_data(path=final_output_path, data=data, indent=None)
         return final_output_path
 
@@ -49,6 +77,29 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     _columns: Optional[dict[str, str]] = None
 
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def create_destination(
+        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
+    ) -> bool:
+        table_name = self.upload_config.table_name or destination_name
+        self.upload_config.table_name = table_name
+        connectors_dir = Path(__file__).parents[1]
+        collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
+        with self.get_cursor() as cursor:
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if table_name in table_names:
+                return False
+            with collection_config_file.open() as schema_file:
+                data_lines = schema_file.readlines()
+            data_lines[0] = data_lines[0].replace("elements", table_name)
+            destination_schema = "".join([line.strip() for line in data_lines])
+            logger.info(f"creating table {table_name} for user")
+            cursor.execute(destination_schema)
+            return True
+
     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
             cursor.execute("SHOW CATALOGS")
@@ -68,14 +119,6 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
-            table_names = [r[1] for r in cursor.fetchall()]
-            if self.upload_config.table_name not in table_names:
-                raise ValueError(
-                    "Table {} not found in {}".format(
-                        self.upload_config.table_name, ", ".join(table_names)
-                    )
-                )
 
     def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
         filename = Path(file_data.source_identifiers.filename)
@@ -98,51 +141,42 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             self._columns = {desc[0]: desc[1] for desc in cursor.description}
         return self._columns
 
-    def
-    …
-                f"{
-            )
-        if missing_columns and add_missing_columns:
-            logger.info(
-                "Following null filled columns will be added to match the table's schema:"
-                f" {', '.join(missing_columns)} "
+    def can_delete(self) -> bool:
+        existing_columns = self.get_table_columns()
+        return RECORD_ID_LABEL in existing_columns
+
+    def delete_previous_content(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{RECORD_ID_LABEL}={file_data.identifier} "
+            f"from delta table: {self.upload_config.table_name}"
+        )
+        with self.get_cursor() as cursor:
+            cursor.execute(
+                f"DELETE FROM {self.upload_config.table_name} WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'"  # noqa: E501
             )
-    …
-        if add_missing_columns:
-            for column in missing_columns:
-                df[column] = pd.Series()
-        return df
+            results = cursor.fetchall()
+            deleted_rows = results[0][0]
+        logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        …
+        if self.can_delete():
+            self.delete_previous_content(file_data=file_data)
+        with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
+            catalog_path = self.get_output_path(file_data=file_data)
+            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+            logger.debug(
+                f"migrating content from {catalog_path} to "
+                f"table {self.upload_config.table_name}"
+            )
+            data = get_json_data(path=path)
+            columns = data[0].keys()
+            select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
+            column_str = ", ".join(columns)
+            select_column_str = ", ".join(select_columns)
+            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`"  # noqa: E501
+            cursor.execute(sql_statment)
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
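
The reworked stager above now rewrites every element before writing the staged .json file: it assigns a deterministic id, stamps the originating file's identifier under the RECORD_ID_LABEL column, and serializes metadata to a JSON string that the uploader later re-expands with PARSE_JSON(metadata) in its INSERT INTO ... SELECT statement. A rough illustration of that per-element transformation, with placeholder values standing in for get_enhanced_element_id and the RECORD_ID_LABEL constant:

import json

RECORD_ID_LABEL = "record_id"  # placeholder; the real value comes from unstructured_ingest.v2.constants

element = {"text": "Hello world", "metadata": {"filename": "example.txt"}}
element["id"] = "deterministic-element-id"   # produced by get_enhanced_element_id(...) in the package
element[RECORD_ID_LABEL] = "1234-abcd"       # FileData.identifier of the source document
element["metadata"] = json.dumps(element.get("metadata", {}))  # stored as a JSON string column

print(element)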

unstructured_ingest/v2/processes/connectors/github.py

@@ -0,0 +1,221 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from time import time
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    Indexer,
+    IndexerConfig,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
+if TYPE_CHECKING:
+    from github import ContentFile, GitTreeElement, Repository
+    from github import Github as GithubClient
+    from github.GithubException import GithubException
+    from requests import HTTPError
+
+CONNECTOR_TYPE = "github"
+
+
+class GithubAccessConfig(AccessConfig):
+    access_token: str = Field(description="Github acess token")
+
+
+class GithubConnectionConfig(ConnectionConfig):
+    access_config: Secret[GithubAccessConfig]
+    url: str = Field(description="Github url or repository owner/name pair")
+
+    @field_validator("url", mode="after")
+    def conform_url(cls, value: str):
+        parsed_url = urlparse(value)
+        return parsed_url.path
+
+    def get_full_url(self):
+        return f"https://github.com/{self.url}"
+
+    @requires_dependencies(["github"], extras="github")
+    def get_client(self) -> "GithubClient":
+        from github import Github as GithubClient
+
+        return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)
+
+    def get_repo(self) -> "Repository":
+        client = self.get_client()
+        return client.get_repo(self.url)
+
+    def wrap_github_exception(self, e: "GithubException") -> Exception:
+        data = e.data
+        status_code = e.status
+        message = data.get("message")
+        if status_code == 401:
+            return UserAuthError(f"Unauthorized access to Github: {message}")
+        if 400 <= status_code < 500:
+            return UserError(message)
+        if status_code > 500:
+            return ProviderError(message)
+        logger.debug(f"unhandled github error: {e}")
+        return e
+
+    def wrap_http_error(self, e: "HTTPError") -> Exception:
+        status_code = e.response.status_code
+        if status_code == 401:
+            return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
+        if 400 <= status_code < 500:
+            return UserError(e.response.text)
+        if status_code > 500:
+            return ProviderError(e.response.text)
+        logger.debug(f"unhandled http error: {e}")
+        return e
+
+    @requires_dependencies(["requests"], extras="github")
+    def wrap_error(self, e: Exception) -> Exception:
+        from github.GithubException import GithubException
+        from requests import HTTPError
+
+        if isinstance(e, GithubException):
+            return self.wrap_github_exception(e=e)
+        if isinstance(e, HTTPError):
+            return self.wrap_http_error(e=e)
+        logger.debug(f"unhandled error: {e}")
+        return e
+
+
+class GithubIndexerConfig(IndexerConfig):
+    branch: Optional[str] = Field(
+        description="Branch to index, use the default if one isn't provided", default=None
+    )
+    recursive: bool = Field(
+        description="Recursively index all files in the repository", default=True
+    )
+
+
+@dataclass
+class GithubIndexer(Indexer):
+    connection_config: GithubConnectionConfig
+    index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_repo()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+    def get_branch(self) -> str:
+        repo = self.connection_config.get_repo()
+        sha = self.index_config.branch or repo.default_branch
+        return sha
+
+    def list_files(self) -> list["GitTreeElement"]:
+        repo = self.connection_config.get_repo()
+        sha = self.index_config.branch or repo.default_branch
+        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
+        file_elements = [
+            element for element in git_tree.tree if element.size is not None and element.size > 0
+        ]
+        return file_elements
+
+    def convert_element(self, element: "GitTreeElement") -> FileData:
+        full_path = (
+            f"{self.connection_config.get_full_url()}/blob/{self.get_branch()}/{element.path}"
+        )
+
+        return FileData(
+            identifier=str(uuid5(NAMESPACE_DNS, full_path)),
+            connector_type=self.connector_type,
+            display_name=full_path,
+            source_identifiers=SourceIdentifiers(
+                filename=Path(element.path).name,
+                fullpath=(Path(self.get_branch()) / element.path).as_posix(),
+                rel_path=element.path,
+            ),
+            metadata=FileDataSourceMetadata(
+                url=element.url,
+                version=element.etag,
+                record_locator={},
+                date_modified=str(element.last_modified_datetime.timestamp()),
+                date_processed=str(time()),
+                filesize_bytes=element.size,
+                permissions_data=[{"mode": element.mode}],
+            ),
+        )
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for element in self.list_files():
+            yield self.convert_element(element=element)
+
+
+class GithubDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class GithubDownloader(Downloader):
+    download_config: GithubDownloaderConfig
+    connection_config: GithubConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["github"], extras="github")
+    def get_file(self, file_data: FileData) -> "ContentFile":
+        from github.GithubException import UnknownObjectException
+
+        path = file_data.source_identifiers.relative_path
+        repo = self.connection_config.get_repo()
+
+        try:
+            content_file = repo.get_contents(path)
+        except UnknownObjectException as e:
+            logger.error(f"File doesn't exists {self.connection_config.url}/{path}: {e}")
+            raise UserError(f"File not found: {path}")
+        return content_file
+
+    @requires_dependencies(["requests"], extras="github")
+    def get_contents(self, content_file: "ContentFile") -> bytes:
+        import requests
+
+        if content_file.decoded_content:
+            return content_file.decoded_content
+        download_url = content_file.download_url
+        resp = requests.get(download_url)
+        try:
+            resp.raise_for_status()
+        except requests.HTTPError as e:
+            raise self.connection_config.wrap_error(e=e)
+        return resp.content
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        content_file = self.get_file(file_data)
+        contents = self.get_contents(content_file)
+        download_path = self.get_download_path(file_data)
+        with download_path.open("wb") as f:
+            f.write(contents)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+github_source_entry = SourceRegistryEntry(
+    indexer=GithubIndexer,
+    indexer_config=GithubIndexerConfig,
+    downloader=GithubDownloader,
+    downloader_config=GithubDownloaderConfig,
+    connection_config=GithubConnectionConfig,
+)
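
The new connection config accepts either an "owner/name" pair or a full GitHub URL; its conform_url validator keeps only the path component of whatever is supplied, and get_full_url later prefixes it with https://github.com/. A small sketch of the standard-library behavior the validator relies on:

from urllib.parse import urlparse

# Only the path component of the parsed value is stored on the config.
print(urlparse("dcneiner/Downloadify").path)                     # dcneiner/Downloadify
print(urlparse("https://github.com/dcneiner/Downloadify").path)  # /dcneiner/Downloadify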

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -1,4 +1,5 @@
 import json
+import os
 from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -42,7 +43,6 @@ class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
     access_config: Secret[DatabricksDeltaTablesAccessConfig]
     server_hostname: str = Field(description="server hostname connection config value")
     http_path: str = Field(description="http path connection config value")
-    user_agent: str = "unstructuredio_oss"
 
     @requires_dependencies(["databricks"], extras="databricks-delta-tables")
     def get_credentials_provider(self) -> "oauth_service_principal":
@@ -86,7 +86,9 @@ class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
         from databricks.sql import connect
 
         connect_kwargs = connect_kwargs or {}
-        connect_kwargs["_user_agent_entry"] =
+        connect_kwargs["_user_agent_entry"] = os.getenv(
+            "UNSTRUCTURED_USER_AGENT", "unstructuredio_oss"
+        )
         connect_kwargs["server_hostname"] = connect_kwargs.get(
             "server_hostname", self.server_hostname
         )

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.6.1
+Version: 0.6.4
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
-Requires-Dist: tqdm
+Requires-Dist: pydantic>=2.7
 Requires-Dist: opentelemetry-sdk
-Requires-Dist:
+Requires-Dist: tqdm
+Requires-Dist: click
 Requires-Dist: dataclasses_json
-Requires-Dist:
+Requires-Dist: python-dateutil
 Requires-Dist: numpy
 Requires-Dist: pandas
 Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
 Requires-Dist: numpy; extra == "astradb"
 Requires-Dist: pandas; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: numpy; extra == "azure"
 Requires-Dist: pandas; extra == "azure"
 Provides-Extra: azure-ai-search
@@ -139,8 +139,8 @@ Requires-Dist: couchbase; extra == "couchbase"
 Requires-Dist: numpy; extra == "couchbase"
 Requires-Dist: pandas; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: numpy; extra == "delta-table"
 Requires-Dist: pandas; extra == "delta-table"
 Provides-Extra: discord
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
 Requires-Dist: numpy; extra == "discord"
 Requires-Dist: pandas; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: numpy; extra == "dropbox"
 Requires-Dist: pandas; extra == "dropbox"
 Provides-Extra: duckdb
@@ -161,9 +161,9 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
@@ -185,9 +185,9 @@ Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
-Requires-Dist: httpx; extra == "ibm-watsonx-s3"
-Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
@@ -216,15 +216,15 @@ Requires-Dist: pymongo; extra == "mongodb"
 Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
@@ -264,8 +264,8 @@ Requires-Dist: redis; extra == "redis"
 Requires-Dist: numpy; extra == "redis"
 Requires-Dist: pandas; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: numpy; extra == "s3"
 Requires-Dist: pandas; extra == "s3"
 Provides-Extra: sharepoint
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
 Requires-Dist: numpy; extra == "slack"
 Requires-Dist: pandas; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: psycopg2-binary; extra == "snowflake"
+Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: numpy; extra == "snowflake"
 Requires-Dist: pandas; extra == "snowflake"
 Provides-Extra: wikipedia
@@ -313,8 +313,8 @@ Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
 Requires-Dist: requests; extra == "vectara"
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
@@ -324,9 +324,9 @@ Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
-Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
@@ -356,8 +356,8 @@ Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
 Requires-Dist: pandas; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
+Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: numpy; extra == "bedrock"
 Requires-Dist: pandas; extra == "bedrock"
 Provides-Extra: togetherai

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/RECORD

@@ -11,6 +11,7 @@ test/integration/connectors/test_chroma.py,sha256=yn2p8U8yE9LaF-IEKiLp2XB4T4Vqo-
 test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
 test/integration/connectors/test_delta_table.py,sha256=r2OvLMRfJFfsyIHd1H44Kx6AgNnDjPHKofN7a01oqrY,6916
 test/integration/connectors/test_dropbox.py,sha256=jzpZ6wawLa4sC1BVoHWZJ3cHjL4DWWUEX5ee7bXUOOM,4945
+test/integration/connectors/test_github.py,sha256=cRArMUMtAujsVtR2yp2GHZktX-cSPRSsYhlnnsRhhH8,1503
 test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
 test/integration/connectors/test_jira.py,sha256=IgF_cdknJ97W3OrNfHKp3YLmRyVwY6BI8jhZwVbrebc,2076
 test/integration/connectors/test_lancedb.py,sha256=AuO2qHDay5NHxQ2b1LzbsNNp6LnUDeB2_0dWlkgAuMo,9223
@@ -91,8 +92,6 @@ test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCr
 test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
-test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
 test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=WKpDKvEGalh8LYRqN9xA7CfMPOPHo_VcZbnCXdkVjho,14513
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -113,7 +112,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=0sOJ1f0sRyjdtSL0LUqCE6m6T039ttMVXeIDsmdxcPw,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -427,7 +426,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
 unstructured_ingest/v2/processes/filter.py,sha256=E1MLxk-XeCm3mZIuM49lJToVcSgOivmTFIZApqOEFs8,2150
 unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
 unstructured_ingest/v2/processes/uncompress.py,sha256=O7q0pOiL6cDBvAa9NAy_vkc873tisjK2Hoq6Z-grRRo,2430
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=iyoZrTaEoBPBn9-tczFhLcwKQQoeFbESF3QkJkVimv4,6845
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=JesWeUv_tIA7b65sE2Z-ixMKuGLlgugZRMKE38ID3zg,8959
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=SwfUcdrCbMK_LDcHG5auCCuga_luPmZVvhjuAsRtKU0,18304
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=K4g-Dh7u7Z13rheNKhnnBXcO6TUfz8RPrrusDAsgkBk,11575
@@ -436,6 +435,7 @@ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=ERHYYutQsnS8eZN
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LARX4F_1Zd1LnUMSNdvIsbqLoZXZ9kl_vMZh-dRr4XA,12305
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=5T4hkXHGitGprpUb20206ODcBh5wchgHkUocji5l2rk,7286
 unstructured_ingest/v2/processes/connectors/discord.py,sha256=yvLxTx0ZRIATqSYLm5d09u9a0ktZGJnGcnzapwePHK8,5301
+unstructured_ingest/v2/processes/connectors/github.py,sha256=d-sh28MVe40vyaTf8b8NkspSrV9zlUHjdSSfSZjOcOA,7772
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=hsB5g-3tZqe7bVSqwA0nY9GY4J0cg4PBawf77GkpjZY,10039
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=mn1BoUXYw5j-q7jO-rzPPEv_rt1UV9LDN1M_PyREEps,19678
 unstructured_ingest/v2/processes/connectors/jira.py,sha256=_ivv0TqeVPlHG9YMihljzrpYP7OLQAa0D7nqLonaeM8,17149
@@ -454,14 +454,15 @@ unstructured_ingest/v2/processes/connectors/slack.py,sha256=vbBVCYEd741-n2v6eAXL
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=KUqgZ6D2KUOrW596ms-EekvQYDh-fXqBTa7KG-leXoo,12301
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
 unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
-unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=JktJXC9SYnKLetjsyGJWKXqg5Kml8WY9dcKyr5o_Yxs,8024
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=LiSb66039idaRtMnTuHjR5ZqvdmmIu3ByUgFQ1a3iZQ,8264
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=VCoQ3h289BO4A2kJKZXUVB0QOcaQif-HeRgg-xXzn10,2976
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=DM4pygQAnP-dtuFEFAVeBfGt0pzrfkltteCai0GKnG0,4439
@@ -563,7 +564,7 @@ unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ
 unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=hsOd2Gliyjzkb21Vv6RAiFf8NAysxd29K0AxBkkm844,5483
 unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
 unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=NSEZwJDHh_9kFc31LnG14iRtYF3meK2UfUlQfYnwYEQ,2059
-unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=hJ0yacutrgiCer9cJSfxcNgLlOgsozJ2yGhgy8vZAkk,9086
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=FOb08WCr0SdzylN88xDP51NdVD4ggDbjanJurwJUrgM,9374
@@ -582,9 +583,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=_I3OMdpUElQdIwVs7W9ORU1kncNaZ_nr6lbxeKE8uaU,1014
 unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
-unstructured_ingest-0.6.
-unstructured_ingest-0.6.
-unstructured_ingest-0.6.
-unstructured_ingest-0.6.
-unstructured_ingest-0.6.
-unstructured_ingest-0.6.
+unstructured_ingest-0.6.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.6.4.dist-info/METADATA,sha256=vsGBmdOat7mYz2HZ-RuKLYaSzpjgOaGwwn_b1qxNC7g,14998
+unstructured_ingest-0.6.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.6.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.6.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.6.4.dist-info/RECORD,,

test/unit/v2/connectors/databricks/__init__.py
File without changes

test/unit/v2/connectors/databricks/test_volumes_table.py

@@ -1,44 +0,0 @@
-from pathlib import Path
-
-import pytest
-from pytest_mock import MockerFixture
-
-from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
-    DatabricksVolumeDeltaTableStager,
-)
-
-
-@pytest.fixture
-def stager():
-    return DatabricksVolumeDeltaTableStager()
-
-
-@pytest.mark.parametrize(
-    ("output_path", "called_output_path"),
-    [
-        (
-            Path("/fake/path/output"),
-            Path("/fake/path/output.json"),
-        ),
-        (
-            Path("/fake/path/output.ndjson"),
-            Path("/fake/path/output.json"),
-        ),
-    ],
-)
-def test_write_output(
-    mocker: MockerFixture,
-    stager: DatabricksVolumeDeltaTableStager,
-    output_path: Path,
-    called_output_path: Path,
-):
-    data = [{"key1": "value1", "key2": "value2"}]
-
-    mock_get_data = mocker.patch(
-        "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
-        return_value=None,
-    )
-
-    stager.write_output(output_path, data)
-
-    mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/LICENSE.md
RENAMED
File without changes

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/WHEEL
RENAMED
File without changes

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/entry_points.txt
RENAMED
File without changes

{unstructured_ingest-0.6.1.dist-info → unstructured_ingest-0.6.4.dist-info}/top_level.txt
RENAMED
File without changes