unstructured-ingest 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/{test_postgres.py → sql/test_postgres.py} +80 -2
- test/integration/connectors/{test_sqlite.py → sql/test_sqlite.py} +72 -12
- test/integration/connectors/utils/constants.py +1 -1
- test/integration/connectors/utils/validation.py +7 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +56 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +132 -3
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +65 -2
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -16
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +17 -16
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/__init__.py
Added as a new, empty file (no content to diff).

test/integration/connectors/{test_postgres.py → sql/test_postgres.py}

@@ -1,21 +1,97 @@
 import tempfile
+from contextlib import contextmanager
 from pathlib import Path

+import faker
 import pandas as pd
 import pytest
 from psycopg2 import connect

-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
     PostgresConnectionConfig,
+    PostgresDownloader,
+    PostgresDownloaderConfig,
+    PostgresIndexer,
+    PostgresIndexerConfig,
     PostgresUploader,
     PostgresUploadStager,
 )

+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def postgres_download_setup() -> None:
+    with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
+        connection = connect(
+            user="unstructured",
+            password="test",
+            dbname="test_db",
+            host="localhost",
+            port=5433,
+        )
+        with connection.cursor() as cursor:
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) VALUES "
+                    f"('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+        connection.commit()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_postgres_source():
+    connect_params = {
+        "host": "localhost",
+        "port": 5433,
+        "database": "test_db",
+        "user": "unstructured",
+        "password": "test",
+    }
+    with postgres_download_setup():
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = PostgresConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                username=connect_params["user"],
+                access_config=PostgresAccessConfig(password=connect_params["password"]),
+            )
+            indexer = PostgresIndexer(
+                connection_config=connection_config,
+                index_config=PostgresIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = PostgresDownloader(
+                connection_config=connection_config,
+                download_config=PostgresDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="postgres",
+                    expected_num_files=40,
+                ),
+            )
+

 def validate_destination(
     connect_params: dict,

@@ -50,7 +126,9 @@ async def test_postgres_destination(upload_file: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with docker_compose_context(
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
+    ):
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = PostgresUploadStager()
             stager_params = {

test/integration/connectors/{test_sqlite.py → sql/test_sqlite.py}

@@ -3,39 +3,99 @@ import tempfile
 from contextlib import contextmanager
 from pathlib import Path

+import faker
 import pandas as pd
 import pytest

-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
+    SQLiteDownloader,
+    SQLiteDownloaderConfig,
+    SQLiteIndexer,
+    SQLiteIndexerConfig,
     SQLiteUploader,
     SQLiteUploadStager,
 )

+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def sqlite_download_setup() -> Path:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = Path(tmpdir) / "mock_database.db"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
+        assert db_init_path.exists()
+        assert db_init_path.is_file()
+        with sqlite3.connect(database=db_path) as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            with db_init_path.open("r") as f:
+                query = f.read()
+            cursor.executescript(query)
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) "
+                    f"VALUES ('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+
+            sqlite_connection.commit()
+            cursor.close()
+        yield db_path
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_sqlite_source():
+    with sqlite_download_setup() as db_path:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = SQLiteConnectionConfig(database_path=db_path)
+            indexer = SQLiteIndexer(
+                connection_config=connection_config,
+                index_config=SQLiteIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = SQLiteDownloader(
+                connection_config=connection_config,
+                download_config=SQLiteDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="sqlite",
+                    expected_num_files=40,
+                ),
+            )
+

 @contextmanager
-def
+def sqlite_upload_setup() -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite-schema.sql"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
         assert db_init_path.exists()
         assert db_init_path.is_file()
-
-        try:
-            connection = sqlite3.connect(database=db_path)
+        with sqlite3.connect(database=db_path) as sqlite_connection:
             with db_init_path.open("r") as f:
                 query = f.read()
-            cursor =
+            cursor = sqlite_connection.cursor()
             cursor.executescript(query)
-
-        finally:
-            if connection:
-                connection.close()
+        yield db_path


 def validate_destination(db_path: Path, expected_num_elements: int):

@@ -62,7 +122,7 @@ async def test_sqlite_destination(upload_file: Path):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with
+    with sqlite_upload_setup() as db_path:
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = SQLiteUploadStager()
             stager_params = {

test/integration/connectors/utils/constants.py

@@ -3,5 +3,5 @@ from pathlib import Path
 SOURCE_TAG = "source"
 DESTINATION_TAG = "destination"

-env_setup_path = Path(__file__).parents[
+env_setup_path = Path(__file__).parents[1] / "env_setup"
 expected_results_path = Path(__file__).parents[1] / "expected_results"

test/integration/connectors/utils/validation.py

@@ -180,8 +180,13 @@ async def source_connector_validation(
             resp = await downloader.run_async(file_data=file_data)
         else:
             resp = downloader.run(file_data=file_data)
-
-
+        if isinstance(resp, list):
+            for r in resp:
+                postdownload_file_data = replace(r["file_data"])
+                all_postdownload_file_data.append(postdownload_file_data)
+        else:
+            postdownload_file_data = replace(resp["file_data"])
+            all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         run_all_validations(
             configs=configs,

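Aside (not part of the diff): the new branch above exists because the SQL downloaders added in this release return a list of download responses (one per row), while other downloaders return a single response. A minimal, self-contained sketch of the same normalization, with a hypothetical normalize helper:

    def normalize(resp):
        # SQL downloaders return a list of responses; others return one response.
        return resp if isinstance(resp, list) else [resp]

    print(normalize({"file_data": "a"}))                         # [{'file_data': 'a'}]
    print(normalize([{"file_data": "a"}, {"file_data": "b"}]))   # both items kept
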
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.1.0"  # pragma: no cover
+__version__ = "0.1.1"  # pragma: no cover

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import random
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar

@@ -63,6 +64,7 @@ class FileConfig(BaseModel):

 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    sample_n_files: Optional[int] = None


 class FsspecAccessConfig(AccessConfig):

@@ -128,8 +130,23 @@ class FsspecIndexer(Indexer):
         filtered_files = [
            file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
+
+        if self.index_config.sample_n_files:
+            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)
+
         return filtered_files

+    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
+        if len(files) <= n:
+            logger.warning(
+                f"number of files to be sampled={n} is not smaller than the number"
+                f" of files found ({len(files)}). Returning all of the files as the"
+                " sample."
+            )
+            return files
+
+        return random.sample(files, n)
+
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()

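For reference (not part of the package): a standalone sketch of the sampling behavior the new sample_n_files option enables, using the same random.sample call as the connector; the file dictionaries are illustrative:

    import random

    def sample_n_files(files: list[dict], n: int) -> list[dict]:
        # If fewer files exist than requested, everything is returned
        # (the real indexer also logs a warning in that case).
        if len(files) <= n:
            return files
        return random.sample(files, n)

    files = [{"name": f"doc-{i}.pdf", "size": 1, "type": "file"} for i in range(10)]
    print(len(sample_n_files(files, 3)))   # 3 randomly chosen entries
    print(len(sample_n_files(files, 50)))  # 10, i.e. all files, since 50 > 10
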
unstructured_ingest/v2/processes/connectors/kdbai.py

@@ -26,7 +26,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )

 if TYPE_CHECKING:
-    from kdbai_client import Session, Table
+    from kdbai_client import Database, Session, Table

 CONNECTOR_TYPE = "kdbai"

@@ -99,6 +99,9 @@ class KdbaiUploadStager(UploadStager):


 class KdbaiUploaderConfig(UploaderConfig):
+    database_name: str = Field(
+        default="default", description="The name of the KDBAI database to write into."
+    )
     table_name: str = Field(description="The name of the KDBAI table to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")

@@ -111,24 +114,29 @@ class KdbaiUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.
+            self.get_database()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def
+    def get_database(self) -> "Database":
         session: Session = self.connection_config.get_session()
-
+        db = session.database(self.upload_config.database_name)
+        return db
+
+    def get_table(self) -> "Table":
+        db = self.get_database()
+        table = db.table(self.upload_config.table_name)
         return table

     def upsert_batch(self, batch: pd.DataFrame):
         table = self.get_table()
-        table.insert(
+        table.insert(batch)

     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
-            f"db in table {self.upload_config.table_name}"
+            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
         for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)

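For context (not part of the package): with kdbai-client>=1.4.0 the uploader resolves its table through an explicit database handle. A hedged sketch of that call sequence, limited to the calls visible in the diff (session.database, db.table, table.insert); the function and table names are illustrative only:

    # Sketch only: assumes `session` is the kdbai_client Session created by the
    # connection config, and that a table named "elements" exists in the database.
    def resolve_table(session, database_name: str = "default", table_name: str = "elements"):
        db = session.database(database_name)   # new in 0.1.1: pick the database first
        return db.table(table_name)            # then look up the table inside it

    # table = resolve_table(session)
    # table.insert(batch_df)                   # batches are inserted as pandas DataFrames
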
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -7,12 +7,17 @@ import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,

@@ -57,6 +62,57 @@ class PostgresConnectionConfig(SQLConnectionConfig):
     )


+class PostgresIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class PostgresIndexer(SQLIndexer):
+    connection_config: PostgresConnectionConfig
+    index_config: PostgresIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class PostgresDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class PostgresDownloader(SQLDownloader):
+    connection_config: PostgresConnectionConfig
+    download_config: PostgresDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class PostgresUploadStagerConfig(SQLUploadStagerConfig):
     pass

unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -1,24 +1,34 @@
+import hashlib
 import json
+import sys
 import uuid
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from datetime import date, datetime
 from pathlib import Path
-from
+from time import time
+from typing import Any, Generator, Union

 import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret

-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger

@@ -88,6 +98,125 @@ class SQLConnectionConfig(ConnectionConfig, ABC):
     pass


+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    @abstractmethod
+    def _get_doc_ids(self) -> list[str]:
+        pass
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
+
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass

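Aside (not part of the diff): the slicing in SQLIndexer.run above splits the id list into ceil(len(ids) / batch_size) frozenset batches, each of which becomes one FileData with doc_type="batch". A small worked example of the same arithmetic, with made-up ids:

    ids = list(range(12))        # pretend _get_doc_ids() returned 12 row ids
    batch_size = 5
    num_batches = (len(ids) + batch_size - 1) // batch_size   # ceil division -> 3
    batches = [frozenset(ids[i * batch_size:(i + 1) * batch_size]) for i in range(num_batches)]
    print([len(b) for b in batches])   # [5, 5, 2]
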
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -5,14 +5,19 @@ from typing import TYPE_CHECKING, Any

 import numpy as np
 import pandas as pd
-from pydantic import Field, Secret
+from pydantic import Field, Secret, model_validator

+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,

@@ -37,7 +42,14 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
     database_path: Path = Field(
         description="Path to the .db file.",
     )
-
+
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self

     def get_connection(self) -> "SqliteConnection":
         from sqlite3 import connect

@@ -45,6 +57,57 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
         return connect(database=self.database_path)


+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
     pass

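Aside (not part of the diff): with the model_validator above, constructing a SQLiteConnectionConfig against a path that does not point at an existing file now fails at configuration time rather than at query time. A minimal sketch, assuming a hypothetical missing path:

    from pathlib import Path

    from unstructured_ingest.v2.processes.connectors.sql.sqlite import SQLiteConnectionConfig

    bad_path = Path("/tmp/does-not-exist.db")   # hypothetical path
    try:
        SQLiteConnectionConfig(database_path=bad_path)
    except Exception as err:  # pydantic surfaces the ValueError as a validation error
        print(err)            # mentions "/tmp/does-not-exist.db does not exist"
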
{unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies

@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: pydantic>=2.7
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: tqdm
-Requires-Dist: pandas
 Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb

@@ -44,8 +44,8 @@ Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai

@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface

@@ -87,12 +87,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive

@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus

@@ -116,15 +116,15 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"

@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone

@@ -156,16 +156,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack

{unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD

@@ -5,15 +5,16 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
-test/integration/connectors/test_postgres.py,sha256=9uaqlUmLpVF09cwKSw7Yldq2kjU00WBedbEIgyJG5Cw,3998
 test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
-test/integration/connectors/test_sqlite.py,sha256=NnLdyt3FfM1A53tXPJbgIcsy-iEgYY8OZYOfliFqifM,3507
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
+test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
 test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/utils/constants.py,sha256=
+test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
 test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
-test/integration/connectors/utils/validation.py,sha256=
+test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903

@@ -42,7 +43,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
 test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
 test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479

@@ -361,7 +362,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-H
 unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
-unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=
+unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175

@@ -384,18 +385,18 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
-unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=
+unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
-unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=
-unstructured_ingest-0.1.
-unstructured_ingest-0.1.
-unstructured_ingest-0.1.
-unstructured_ingest-0.1.
-unstructured_ingest-0.1.
-unstructured_ingest-0.1.
+unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
+unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
+unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.1.1.dist-info/RECORD,,

The remaining dist-info files ({unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md, WHEEL, entry_points.txt, top_level.txt) were renamed for the new version; their contents are unchanged.