unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic. See the registry's advisory page for details.
- test/integration/connectors/sql/test_singlestore.py +156 -0
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/utils/docker_compose.py +23 -8
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -6
- unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +3 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +2 -4
- unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -2
- unstructured_ingest/v2/unstructured_api.py +1 -1
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA +17 -17
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/RECORD +21 -20
- unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/top_level.txt +0 -0
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from datetime import date, datetime
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from dateutil import parser
|
|
10
|
-
from pydantic import Field, Secret
|
|
11
|
-
|
|
12
|
-
from unstructured_ingest.utils.data_prep import batch_generator
|
|
13
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
-
from unstructured_ingest.utils.table import convert_to_pandas_dataframe
|
|
15
|
-
from unstructured_ingest.v2.interfaces import (
|
|
16
|
-
AccessConfig,
|
|
17
|
-
ConnectionConfig,
|
|
18
|
-
FileData,
|
|
19
|
-
Uploader,
|
|
20
|
-
UploaderConfig,
|
|
21
|
-
UploadStager,
|
|
22
|
-
UploadStagerConfig,
|
|
23
|
-
)
|
|
24
|
-
from unstructured_ingest.v2.logger import logger
|
|
25
|
-
from unstructured_ingest.v2.processes.connector_registry import (
|
|
26
|
-
DestinationRegistryEntry,
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
# Import only for static type checking; singlestoredb is an optional
# runtime dependency that get_connection() imports lazily.
if TYPE_CHECKING:
    from singlestoredb.connection import Connection

# Key under which this connector is registered.
CONNECTOR_TYPE = "singlestore"
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class SingleStoreAccessConfig(AccessConfig):
    """Secret credentials used to authenticate against SingleStore."""

    # Optional: some deployments permit password-less authentication.
    password: Optional[str] = Field(default=None, description="SingleStore password")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class SingleStoreConnectionConfig(ConnectionConfig):
    """Connection parameters for a SingleStore database.

    The password lives inside ``access_config`` so pydantic keeps it
    secret; the remaining fields are plain connection metadata.
    """

    host: Optional[str] = Field(default=None, description="SingleStore host")
    port: Optional[int] = Field(default=None, description="SingleStore port")
    user: Optional[str] = Field(default=None, description="SingleStore user")
    database: Optional[str] = Field(default=None, description="SingleStore database")
    access_config: Secret[SingleStoreAccessConfig]

    @requires_dependencies(["singlestoredb"], extras="singlestore")
    def get_connection(self) -> "Connection":
        """Open and return a new connection to SingleStore."""
        # Imported lazily so the connector can be registered without the
        # optional "singlestore" extra installed.
        import singlestoredb as s2

        return s2.connect(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.access_config.get_secret_value().password,
        )
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class SingleStoreUploadStagerConfig(UploadStagerConfig):
    """Options controlling how element data is shaped before upload."""

    drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@dataclass
class SingleStoreUploadStager(UploadStager):
    """Convert a staged JSON elements file into a CSV ready for SingleStore.

    Flattens the element dicts into a pandas dataframe, normalizes the
    known datetime columns, and JSON-encodes the nested record locator so
    every cell is a scalar the SQL driver can bind.
    """

    upload_stager_config: SingleStoreUploadStagerConfig

    @staticmethod
    def parse_date_string(date_string: str) -> datetime:
        """Parse either a numeric epoch timestamp or a formatted date string.

        Returns:
            A ``datetime`` — both branches produce full datetimes, so the
            annotation is ``datetime`` rather than the looser ``date``.
        """
        try:
            timestamp = float(date_string)
            # NOTE(review): fromtimestamp() uses the local timezone; if the
            # source timestamps are UTC this may shift values — confirm.
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Narrowed from a bare Exception: these are the errors float()
            # and fromtimestamp() raise for non-epoch input. Fall back to
            # flexible string parsing.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
            return parser.parse(date_string)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage one elements JSON file as ``<output_filename>.csv``.

        Returns:
            The path of the CSV that was written under ``output_dir``.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        output_path = Path(output_dir) / Path(f"{output_filename}.csv")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        df = convert_to_pandas_dataframe(
            elements_dict=elements_contents,
            drop_empty_cols=self.upload_stager_config.drop_empty_cols,
        )
        # Columns known to hold datetimes that may arrive either as epoch
        # floats or as formatted strings.
        datetime_columns = [
            "data_source_date_created",
            "data_source_date_modified",
            "data_source_date_processed",
        ]
        for column in filter(lambda x: x in df.columns, datetime_columns):
            df[column] = df[column].apply(self.parse_date_string)
        if "data_source_record_locator" in df.columns:
            # Record locators are nested dicts; serialize so the CSV cell
            # (and later the bound SQL parameter) is a flat string.
            df["data_source_record_locator"] = df["data_source_record_locator"].apply(
                lambda x: json.dumps(x) if x else None
            )

        with output_path.open("w") as output_file:
            df.to_csv(output_file, index=False)
        return output_path
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
class SingleStoreUploaderConfig(UploaderConfig):
    """Destination-side options for the SingleStore uploader."""

    table_name: str = Field(description="SingleStore table to write contents to")
    batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
@dataclass
class SingleStoreUploader(Uploader):
    """Write staged CSV content into a SingleStore table in batches."""

    connection_config: SingleStoreConnectionConfig
    upload_config: SingleStoreUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def upload_csv(self, csv_path: Path) -> None:
        """Insert every row of the CSV into the configured table.

        Rows are bound as parameters and sent with executemany() in
        chunks of ``upload_config.batch_size``.
        """
        df = pd.read_csv(csv_path)
        logger.debug(
            f"uploading {len(df)} entries to {self.connection_config.database} "
            f"db in table {self.upload_config.table_name}"
        )
        # Backtick-quote column identifiers so CSV headers that collide
        # with SQL keywords do not break the statement. Values are bound
        # as %s parameters, never interpolated into the SQL text.
        # NOTE(review): table_name is still interpolated directly and must
        # come from trusted configuration (it may contain a "db.table"
        # qualifier, so it cannot be blindly quoted here).
        columns = list(df.columns)
        stmt = "INSERT INTO {} ({}) VALUES ({})".format(
            self.upload_config.table_name,
            ", ".join(f"`{column}`" for column in columns),
            ", ".join(["%s"] * len(columns)),
        )
        logger.debug(f"sql statement: {stmt}")
        # The driver expects None for missing values, not NaN.
        df.replace({np.nan: None}, inplace=True)
        data_as_tuples = list(df.itertuples(index=False, name=None))
        with self.connection_config.get_connection() as conn:
            with conn.cursor() as cur:
                for chunk in batch_generator(
                    data_as_tuples, batch_size=self.upload_config.batch_size
                ):
                    cur.executemany(stmt, chunk)
                conn.commit()

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Entry point: validate the staged file type, then upload it.

        Raises:
            ValueError: if ``path`` is not a ``.csv`` file.
        """
        if path.suffix != ".csv":
            raise ValueError(f"Only .csv files are supported: {path}")
        self.upload_csv(csv_path=path)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
# Register this connector as a destination so the pipeline can resolve
# its config classes at runtime.
singlestore_destination_entry = DestinationRegistryEntry(
    connection_config=SingleStoreConnectionConfig,
    uploader_config=SingleStoreUploaderConfig,
    uploader=SingleStoreUploader,
    upload_stager_config=SingleStoreUploadStagerConfig,
    upload_stager=SingleStoreUploadStager,
)
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|