unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/conftest.py +13 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
- test/integration/connectors/sql/test_postgres.py +6 -10
- test/integration/connectors/sql/test_singlestore.py +156 -0
- test/integration/connectors/sql/test_snowflake.py +205 -0
- test/integration/connectors/sql/test_sqlite.py +6 -10
- test/integration/connectors/test_delta_table.py +138 -0
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/utils/docker.py +78 -0
- test/integration/connectors/utils/docker_compose.py +23 -8
- test/integration/connectors/utils/validation.py +93 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +32 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/interfaces/indexer.py +4 -1
- unstructured_ingest/v2/pipeline/pipeline.py +10 -2
- unstructured_ingest/v2/pipeline/steps/index.py +18 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -6
- unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +3 -1
- unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +15 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -56
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +51 -12
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
- unstructured_ingest/v2/unstructured_api.py +1 -1
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA +19 -17
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/RECORD +37 -31
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
- unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/snowflake.py (new file)
@@ -0,0 +1,162 @@
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.postgres import (
+    PostgresDownloader,
+    PostgresDownloaderConfig,
+    PostgresIndexer,
+    PostgresIndexerConfig,
+    PostgresUploader,
+    PostgresUploaderConfig,
+    PostgresUploadStager,
+    PostgresUploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import SQLAccessConfig, SQLConnectionConfig
+
+if TYPE_CHECKING:
+    from snowflake.connector import SnowflakeConnection
+    from snowflake.connector.cursor import SnowflakeCursor
+
+CONNECTOR_TYPE = "snowflake"
+
+
+class SnowflakeAccessConfig(SQLAccessConfig):
+    password: Optional[str] = Field(default=None, description="DB password")
+
+
+class SnowflakeConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[SnowflakeAccessConfig] = Field(
+        default=SnowflakeAccessConfig(), validate_default=True
+    )
+    account: str = Field(
+        default=None,
+        description="Your account identifier. The account identifier "
+        "does not include the snowflakecomputing.com suffix.",
+    )
+    user: Optional[str] = Field(default=None, description="DB username")
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=443, description="DB host connection port")
+    database: str = Field(
+        default=None,
+        description="Database name.",
+    )
+    db_schema: str = Field(default=None, description="Database schema.", alias="schema")
+    role: str = Field(
+        default=None,
+        description="Database role.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @contextmanager
+    @requires_dependencies(["snowflake"], extras="snowflake")
+    def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
+        # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
+        from snowflake.connector import connect
+
+        connect_kwargs = self.model_dump()
+        connect_kwargs["schema"] = connect_kwargs.pop("db_schema")
+        connect_kwargs.pop("access_configs", None)
+        connect_kwargs["password"] = self.access_config.get_secret_value().password
+        # https://peps.python.org/pep-0249/#paramstyle
+        connect_kwargs["paramstyle"] = "qmark"
+        connection = connect(**connect_kwargs)
+        try:
+            yield connection
+        finally:
+            connection.commit()
+            connection.close()
+
+    @contextmanager
+    def get_cursor(self) -> Generator["SnowflakeCursor", None, None]:
+        with self.get_connection() as connection:
+            cursor = connection.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
+
+
+class SnowflakeIndexerConfig(PostgresIndexerConfig):
+    pass
+
+
+@dataclass
+class SnowflakeIndexer(PostgresIndexer):
+    connection_config: SnowflakeConnectionConfig
+    index_config: SnowflakeIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class SnowflakeDownloaderConfig(PostgresDownloaderConfig):
+    pass
+
+
+@dataclass
+class SnowflakeDownloader(PostgresDownloader):
+    connection_config: SnowflakeConnectionConfig
+    download_config: SnowflakeDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class SnowflakeUploadStagerConfig(PostgresUploadStagerConfig):
+    pass
+
+
+class SnowflakeUploadStager(PostgresUploadStager):
+    upload_stager_config: SnowflakeUploadStagerConfig
+
+
+class SnowflakeUploaderConfig(PostgresUploaderConfig):
+    pass
+
+
+@dataclass
+class SnowflakeUploader(PostgresUploader):
+    upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
+    connection_config: SnowflakeConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "?"
+
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = list(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                # TODO: executemany break on 'Binding data in type (list) is not supported'
+                for val in values:
+                    cursor.execute(stmt, val)
+
+
+snowflake_source_entry = SourceRegistryEntry(
+    connection_config=SnowflakeConnectionConfig,
+    indexer_config=SnowflakeIndexerConfig,
+    indexer=SnowflakeIndexer,
+    downloader_config=SnowflakeDownloaderConfig,
+    downloader=SnowflakeDownloader,
+)
+
+snowflake_destination_entry = DestinationRegistryEntry(
+    connection_config=SnowflakeConnectionConfig,
+    uploader=SnowflakeUploader,
+    uploader_config=SnowflakeUploaderConfig,
+    upload_stager=SnowflakeUploadStager,
+    upload_stager_config=SnowflakeUploadStagerConfig,
+)
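For orientation, a minimal usage sketch of the new Snowflake connection config (not part of the diff). It assumes the package is installed with the snowflake extra; the account, credential, schema, and role values below are placeholders.

from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
    SnowflakeAccessConfig,
    SnowflakeConnectionConfig,
)

# Placeholder credentials; real values come from your Snowflake account.
connection_config = SnowflakeConnectionConfig(
    access_config=SnowflakeAccessConfig(password="..."),
    account="my-account",
    user="ingest_user",
    database="INGEST_DB",
    schema="PUBLIC",
    role="INGEST_ROLE",
)

# get_cursor() wraps get_connection(), which commits and closes on exit.
with connection_config.get_cursor() as cursor:
    cursor.execute("SELECT 1")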
unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -3,12 +3,14 @@ import json
 import sys
 import uuid
 from abc import ABC, abstractmethod
+from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
 from datetime import date, datetime
 from pathlib import Path
 from time import time
 from typing import Any, Generator, Union
 
+import numpy as np
 import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
@@ -94,7 +96,13 @@ class SQLConnectionConfig(ConnectionConfig, ABC):
     access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
 
     @abstractmethod
-
+    @contextmanager
+    def get_connection(self) -> Generator[Any, None, None]:
+        pass
+
+    @abstractmethod
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
         pass
 
 
@@ -108,16 +116,19 @@ class SQLIndexer(Indexer, ABC):
     connection_config: SQLConnectionConfig
     index_config: SQLIndexerConfig
 
-    @abstractmethod
     def _get_doc_ids(self) -> list[str]:
-
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
 
     def precheck(self) -> None:
         try:
-
-
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -198,7 +209,7 @@ class SQLDownloader(Downloader, ABC):
             f"Downloading results from table {table_name} and id {record_id} to {download_path}"
         )
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        result.to_csv(download_path)
+        result.to_csv(download_path, index=False)
         copied_file_data = replace(file_data)
         copied_file_data.identifier = filename_id
         copied_file_data.doc_type = "file"
@@ -285,6 +296,7 @@ class SQLUploaderConfig(UploaderConfig):
 class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
+    values_delimiter: str = "?"
 
     def precheck(self) -> None:
         try:
@@ -296,15 +308,42 @@ class SQLUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @abstractmethod
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
-
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if column_name in _DATE_COLUMNS:
+                    if value is None:
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
 
-    @abstractmethod
     def upload_contents(self, path: Path) -> None:
-
+        df = pd.read_json(path, orient="records", lines=True)
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = list(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                # for val in values:
+                #     try:
+                #         cursor.execute(stmt, val)
+                #     except Exception as e:
+                #         print(f"Error: {e}")
+                #         print(f"failed to write {len(columns)}, {len(val)}: {stmt} -> {val}")
+                cursor.executemany(stmt, values)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         self.upload_contents(path=path)
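The refactor standardizes connectors on a pair of nested context managers: get_connection() commits and closes on exit, and get_cursor() opens a cursor inside it. A self-contained sketch of that same pattern against the standard-library sqlite3 driver, for illustration only (not the connector classes themselves):

import sqlite3
from contextlib import contextmanager
from typing import Generator


@contextmanager
def get_connection(database: str) -> Generator[sqlite3.Connection, None, None]:
    connection = sqlite3.connect(database)
    try:
        yield connection
    finally:
        connection.commit()
        connection.close()


@contextmanager
def get_cursor(database: str) -> Generator[sqlite3.Cursor, None, None]:
    # The cursor context manager nests inside the connection one, so commit and
    # close always run after the cursor has been closed.
    with get_connection(database) as connection:
        cursor = connection.cursor()
        try:
            yield cursor
        finally:
            cursor.close()


with get_cursor(":memory:") as cursor:
    cursor.execute("SELECT 1")
    print(cursor.fetchone())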
unstructured_ingest/v2/processes/connectors/sql/sqlite.py
@@ -1,15 +1,17 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator
 
-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret, model_validator
 
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
@@ -27,6 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 
 if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
+    from sqlite3 import Cursor as SqliteCursor
 
 CONNECTOR_TYPE = "sqlite"
 
@@ -51,10 +54,25 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
             raise ValueError(f"{self.database_path} is not a valid file")
         return self
 
-
+    @contextmanager
+    def get_connection(self) -> Generator["SqliteConnection", None, None]:
         from sqlite3 import connect
 
-
+        connection = connect(database=self.database_path)
+        try:
+            yield connection
+        finally:
+            connection.commit()
+            connection.close()
+
+    @contextmanager
+    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
+        with self.get_connection() as connection:
+            cursor = connection.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
 
 
 class SQLiteIndexerConfig(SQLIndexerConfig):
@@ -67,16 +85,6 @@ class SQLiteIndexer(SQLIndexer):
     index_config: SQLIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def _get_doc_ids(self) -> list[str]:
-        with self.connection_config.get_connection() as sqlite_connection:
-            cursor = sqlite_connection.cursor()
-            cursor.execute(
-                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
-            )
-            results = cursor.fetchall()
-            ids = [result[0] for result in results]
-            return ids
-
 
 class SQLiteDownloaderConfig(SQLDownloaderConfig):
     pass
@@ -145,23 +153,14 @@ class SQLiteUploader(SQLUploader):
         output.append(tuple(parsed))
         return output
 
-    def upload_contents(self, path: Path) -> None:
-        df = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
-        df.replace({np.nan: None}, inplace=True)
-
-        columns = tuple(df.columns)
-        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
-            VALUES({','.join(['?' for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
-            with self.connection_config.get_connection() as conn:
-                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                conn.executemany(stmt, values)
-                conn.commit()
 
+sqlite_source_entry = SourceRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    indexer_config=SQLiteIndexerConfig,
+    indexer=SQLIndexer,
+    downloader_config=SQLiteDownloaderConfig,
+    downloader=SQLiteDownloader,
+)
 
 sqlite_destination_entry = DestinationRegistryEntry(
     connection_config=SQLiteConnectionConfig,
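As a point of reference, a hypothetical sketch of driving the SQLite connection config directly. It assumes elements.db already exists (the config validates the path); the table and column names are placeholders, and the indexer-config field names are taken from the code above.

from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    SQLiteConnectionConfig,
    SQLiteIndexerConfig,
)

# Placeholder database path and table/column names.
connection_config = SQLiteConnectionConfig(database_path="elements.db")
indexer_config = SQLiteIndexerConfig(table_name="elements", id_column="id")

# The inherited get_cursor() context manager commits and closes for us.
with connection_config.get_cursor() as cursor:
    cursor.execute(f"SELECT {indexer_config.id_column} FROM {indexer_config.table_name}")
    print(cursor.fetchall())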
unstructured_ingest/v2/unstructured_api.py
@@ -26,7 +26,7 @@ def create_partition_request(filename: Path, parameters_dict: dict) -> "Partitio
     # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
     # Prior to this it was a dataclass which doesn't have .__fields
     try:
-        possible_fields = PartitionParameters.
+        possible_fields = PartitionParameters.model_fields
     except AttributeError:
         possible_fields = [f.name for f in fields(PartitionParameters)]
 
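The one-line change swaps the removed attribute access for Pydantic v2's model_fields while keeping the dataclass fallback. A small stand-alone illustration of the same try/except pattern; the two Params classes here are invented for the example.

from dataclasses import dataclass, fields

from pydantic import BaseModel


class PydanticParams(BaseModel):  # stands in for the Pydantic variant
    strategy: str = "auto"


@dataclass
class DataclassParams:  # stands in for the older dataclass variant
    strategy: str = "auto"


def possible_field_names(params_cls) -> list[str]:
    try:
        # Pydantic v2 models expose model_fields as a dict of name -> FieldInfo.
        return list(params_cls.model_fields)
    except AttributeError:
        # Dataclasses have no model_fields, so fall back to dataclasses.fields().
        return [f.name for f in fields(params_cls)]


print(possible_field_names(PydanticParams))   # ['strategy']
print(possible_field_names(DataclassParams))  # ['strategy']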
{unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.1
+Version: 0.2.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,30 +22,30 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: tqdm
 Requires-Dist: python-dateutil
 Requires-Dist: pandas
+Requires-Dist: click
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: click
-Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-cognitive-search
 Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: fsspec; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -88,8 +88,8 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -115,16 +115,16 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -164,12 +164,14 @@ Provides-Extra: sftp
 Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
-Requires-Dist: slack-sdk; extra == "slack"
+Requires-Dist: slack-sdk[optional]; extra == "slack"
+Provides-Extra: snowflake
+Requires-Dist: snowflake; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv