unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/test_duckdb.py +82 -0
- test/integration/connectors/duckdb/test_motherduck.py +106 -0
- test/integration/connectors/test_kafka.py +109 -6
- test/integration/connectors/test_qdrant.py +55 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
- unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
- unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
- unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
- unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
- unstructured_ingest/v2/utils.py +9 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA +19 -17
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/RECORD +39 -31
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/kdbai.py
CHANGED
@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
@@ -81,7 +81,7 @@ class KdbaiUploadStager(UploadStager):
         for element in elements_contents:
             data.append(
                 {
-                    "id":
+                    "id": get_enhanced_element_id(element_dict=element, file_data=file_data),
                     "element_id": element.get("element_id"),
                     "document": element.pop("text", None),
                     "embeddings": element.get("embeddings"),

unstructured_ingest/v2/processes/connectors/pinecone.py
CHANGED
@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
@@ -149,7 +149,7 @@ class PineconeUploadStager(UploadStager):
         metadata[RECORD_ID_LABEL] = file_data.identifier
 
         return {
-            "id":
+            "id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
             "values": embeddings,
             "metadata": metadata,
         }

unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py
CHANGED
@@ -1,6 +1,5 @@
 import asyncio
 import json
-import uuid
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
@@ -22,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from qdrant_client import AsyncQdrantClient
@@ -64,10 +64,10 @@ class QdrantUploadStager(UploadStager, ABC):
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """Prepares dictionary in the format that Chroma requires"""
         return {
-            "id":
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "vector": data.pop("embeddings", {}),
             "payload": {
                 "text": data.pop("text", None),
@@ -91,7 +91,9 @@ class QdrantUploadStager(UploadStager, ABC):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
 
         with open(output_path, "w") as output_file:

unstructured_ingest/v2/processes/connectors/sql/__init__.py
CHANGED
@@ -8,7 +8,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
-from .singlestore import singlestore_destination_entry
+from .singlestore import singlestore_destination_entry, singlestore_source_entry
 from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
@@ -17,6 +17,7 @@ from .sqlite import sqlite_destination_entry, sqlite_source_entry
 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
+add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
 
 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)

unstructured_ingest/v2/processes/connectors/sql/singlestore.py
CHANGED
@@ -91,22 +91,20 @@ class SingleStoreDownloader(SQLDownloader):
     connection_config: SingleStoreConnectionConfig
     download_config: SingleStoreDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "%s"
 
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query =
-                fields
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+            query = (
+                f"SELECT {fields} FROM {table_name} WHERE {id_column} IN {self.values_delimiter}"
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {query}\nwith values: {(ids,)}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
@@ -154,7 +152,7 @@ class SingleStoreUploader(SQLUploader):
 singlestore_source_entry = SourceRegistryEntry(
     connection_config=SingleStoreConnectionConfig,
     indexer_config=SingleStoreIndexerConfig,
-    indexer=
+    indexer=SingleStoreIndexer,
     downloader_config=SingleStoreDownloaderConfig,
     downloader=SingleStoreDownloader,
 )

unstructured_ingest/v2/processes/connectors/sql/snowflake.py
CHANGED
@@ -7,22 +7,26 @@ import numpy as np
 import pandas as pd
 from pydantic import Field, Secret
 
+from unstructured_ingest.utils.data_prep import split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
-from unstructured_ingest.v2.processes.connectors.sql.
-
-
-
-
-
-
-
-
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
 )
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLAccessConfig, SQLConnectionConfig
 
 if TYPE_CHECKING:
     from snowflake.connector import SnowflakeConnection
@@ -59,6 +63,7 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     @contextmanager
+    # The actual snowflake module package name is: snowflake-connector-python
     @requires_dependencies(["snowflake"], extras="snowflake")
     def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
         # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
@@ -89,42 +94,67 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
             cursor.close()
 
 
-class SnowflakeIndexerConfig(
+class SnowflakeIndexerConfig(SQLIndexerConfig):
     pass
 
 
 @dataclass
-class SnowflakeIndexer(
+class SnowflakeIndexer(SQLIndexer):
     connection_config: SnowflakeConnectionConfig
     index_config: SnowflakeIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
 
-class SnowflakeDownloaderConfig(
+class SnowflakeDownloaderConfig(SQLDownloaderConfig):
     pass
 
 
 @dataclass
-class SnowflakeDownloader(
+class SnowflakeDownloader(SQLDownloader):
     connection_config: SnowflakeConnectionConfig
     download_config: SnowflakeDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "?"
 
-
-
+    # The actual snowflake module package name is: snowflake-connector-python
+    @requires_dependencies(["snowflake"], extras="snowflake")
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+
+        with self.connection_config.get_cursor() as cursor:
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})".format(
+                table_name=table_name,
+                id_column=id_column,
+                fields=(
+                    ",".join(self.download_config.fields) if self.download_config.fields else "*"
+                ),
+                values=",".join([self.values_delimiter for _ in ids]),
+            )
+            logger.debug(f"running query: {query}\nwith values: {ids}")
+            cursor.execute(query, ids)
+            rows = [
+                tuple(row.values()) if isinstance(row, dict) else row for row in cursor.fetchall()
+            ]
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
+class SnowflakeUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
 
-class SnowflakeUploadStager(
+class SnowflakeUploadStager(SQLUploadStager):
     upload_stager_config: SnowflakeUploadStagerConfig
 
 
-class SnowflakeUploaderConfig(
+class SnowflakeUploaderConfig(SQLUploaderConfig):
     pass
 
 
 @dataclass
-class SnowflakeUploader(
+class SnowflakeUploader(SQLUploader):
     upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
     connection_config: SnowflakeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
@@ -135,15 +165,23 @@ class SnowflakeUploader(PostgresUploader):
         df.replace({np.nan: None}, inplace=True)
 
         columns = list(df.columns)
-        stmt =
-
-
-
-        )
+        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=",".join(columns),
+            values=",".join([self.values_delimiter for _ in columns]),
+        )
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            f" with batch size {self.upload_config.batch_size}"
+        )
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             with self.connection_config.get_cursor() as cursor:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                 # TODO: executemany break on 'Binding data in type (list) is not supported'
                 for val in values:
+                    logger.debug(f"running query: {stmt}\nwith values: {val}")
                     cursor.execute(stmt, val)

unstructured_ingest/v2/processes/connectors/sql/sql.py
CHANGED
@@ -1,7 +1,6 @@
 import hashlib
 import json
 import sys
-import uuid
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
@@ -35,6 +34,7 @@ from unstructured_ingest.v2.interfaces import (
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 _COLUMNS = (
     "id",
@@ -251,7 +251,7 @@ class SQLUploadStager(UploadStager):
         element.update(data_source)
         element.update(coordinates)
 
-        element["id"] =
+        element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
 
         # remove extraneous, not supported columns
         element = {k: v for k, v in element.items() if k in _COLUMNS}
@@ -367,7 +367,11 @@ class SQLUploader(Uploader):
         self._fit_to_schema(df=df, columns=self.get_table_columns())
 
         columns = list(df.columns)
-        stmt =
+        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=",".join(columns),
+            values=",".join([self.values_delimiter for _ in columns]),
+        )
         logger.info(
             f"writing a total of {len(df)} elements via"
             f" document batches to destination"
@@ -384,6 +388,7 @@ class SQLUploader(Uploader):
                 # except Exception as e:
                 #     print(f"Error: {e}")
                 #     print(f"failed to write {len(columns)}, {len(val)}: {stmt} -> {val}")
+                logger.debug(f"running query: {stmt}")
                 cursor.executemany(stmt, values)
 
     def get_table_columns(self) -> list[str]:

unstructured_ingest/v2/processes/connectors/sql/sqlite.py
CHANGED
@@ -95,6 +95,7 @@ class SQLiteDownloader(SQLDownloader):
     connection_config: SQLConnectionConfig
     download_config: SQLDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "?"
 
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
         table_name = file_data.additional_metadata["table_name"]
@@ -103,14 +104,10 @@ class SQLiteDownloader(SQLDownloader):
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-
-
-
-
-                ids=",".join([str(i) for i in ids]),
-            )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            values = ",".join(self.values_delimiter for _ in ids)
+            query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})"
+            logger.debug(f"running query: {query}\nwith values: {ids}")
+            cursor.execute(query, ids)
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
@@ -157,7 +154,7 @@ class SQLiteUploader(SQLUploader):
 sqlite_source_entry = SourceRegistryEntry(
     connection_config=SQLiteConnectionConfig,
     indexer_config=SQLiteIndexerConfig,
-    indexer=
+    indexer=SQLiteIndexer,
     downloader_config=SQLiteDownloaderConfig,
     downloader=SQLiteDownloader,
 )

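For context, the change repeated across the SQLite, SingleStore and Snowflake downloaders above swaps string-interpolated ids for driver placeholders passed to cursor.execute. A minimal standalone sketch of the same pattern, assuming an in-memory sqlite3 database and a made-up elements table (not part of this package):

import sqlite3

# Made-up table and ids, for illustration only.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE elements (id INTEGER, text TEXT)")
conn.executemany("INSERT INTO elements VALUES (?, ?)", [(1, "a"), (2, "b"), (3, "c")])

ids = [1, 3]
# One "?" placeholder per id, mirroring values_delimiter in SQLiteDownloader.query_db.
placeholders = ",".join("?" for _ in ids)
query = f"SELECT * FROM elements WHERE id IN ({placeholders})"
cursor = conn.execute(query, ids)
rows = cursor.fetchall()
columns = [col[0] for col in cursor.description]
print(rows, columns)  # [(1, 'a'), (3, 'c')] ['id', 'text']
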
unstructured_ingest/v2/utils.py
CHANGED
@@ -3,10 +3,13 @@ from datetime import datetime
 from inspect import isclass
 from pathlib import Path
 from typing import Any
+from uuid import NAMESPACE_DNS, uuid5
 
 from pydantic import BaseModel
 from pydantic.types import _SecretBase
 
+from unstructured_ingest.v2.interfaces import FileData
+
 
 def is_secret(value: Any) -> bool:
     # Case Secret[int]
@@ -50,3 +53,9 @@ def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
 
     # Support json dumps kwargs such as sort_keys
     return json.dumps(model_dict, default=json_serial, **json_kwargs)
+
+
+def get_enhanced_element_id(element_dict: dict, file_data: FileData) -> str:
+    element_id = element_dict.get("element_id")
+    new_data = f"{element_id}{file_data.identifier}"
+    return str(uuid5(NAMESPACE_DNS, new_data))

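The new get_enhanced_element_id helper above derives a deterministic UUIDv5 from the element id concatenated with the source file identifier, which is consistent with the uuid imports dropped in the kdbai, pinecone, qdrant and SQL stagers. A small sketch of the derivation with made-up inputs:

from uuid import NAMESPACE_DNS, uuid5

# Made-up values; in the connectors these come from the element dict and FileData.identifier.
element = {"element_id": "abc123"}
file_identifier = "example-file-id"

# Same computation as get_enhanced_element_id in unstructured_ingest/v2/utils.py.
enhanced_id = str(uuid5(NAMESPACE_DNS, f"{element.get('element_id')}{file_identifier}"))
print(enhanced_id)  # identical across runs for the same element/file pair
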
{unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.6
+Version: 0.3.7
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,27 +22,27 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
+Requires-Dist: pandas
 Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
-Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist:
+Requires-Dist: dataclasses-json
 Requires-Dist: click
+Requires-Dist: python-dateutil
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -69,8 +69,10 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
+Provides-Extra: duckdb
+Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -78,8 +80,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -87,9 +89,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -98,8 +100,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -119,17 +121,17 @@ Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -158,8 +160,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -173,8 +175,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: psycopg2-binary; extra == "snowflake"
+Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv