unstructured-ingest 0.2.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic; see the registry page for details.
Files changed (22)
  1. test/integration/connectors/sql/test_singlestore.py +156 -0
  2. test/integration/connectors/test_s3.py +1 -1
  3. test/integration/connectors/utils/docker_compose.py +23 -8
  4. unstructured_ingest/__version__.py +1 -1
  5. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  6. unstructured_ingest/v2/processes/connectors/__init__.py +3 -6
  7. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  8. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +3 -1
  9. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -0
  10. unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
  11. unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
  12. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  13. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +2 -4
  14. unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -2
  15. unstructured_ingest/v2/unstructured_api.py +1 -1
  16. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA +17 -17
  17. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/RECORD +21 -20
  18. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  19. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/LICENSE.md +0 -0
  20. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/WHEEL +0 -0
  21. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt +0 -0
  22. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.1.dist-info}/top_level.txt +0 -0
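Per the file list, the SingleStore destination connector appears to have been relocated from unstructured_ingest/v2/processes/connectors/singlestore.py into the sql subpackage (unstructured_ingest/v2/processes/connectors/sql/singlestore.py), with a new integration test added under test/integration/connectors/sql/. The only hunk reproduced below is the removal of the old module: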
@@ -1,156 +0,0 @@
- import json
- from dataclasses import dataclass
- from datetime import date, datetime
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
-
- import numpy as np
- import pandas as pd
- from dateutil import parser
- from pydantic import Field, Secret
-
- from unstructured_ingest.utils.data_prep import batch_generator
- from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.utils.table import convert_to_pandas_dataframe
- from unstructured_ingest.v2.interfaces import (
-     AccessConfig,
-     ConnectionConfig,
-     FileData,
-     Uploader,
-     UploaderConfig,
-     UploadStager,
-     UploadStagerConfig,
- )
- from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     DestinationRegistryEntry,
- )
-
- if TYPE_CHECKING:
-     from singlestoredb.connection import Connection
-
- CONNECTOR_TYPE = "singlestore"
-
-
- class SingleStoreAccessConfig(AccessConfig):
-     password: Optional[str] = Field(default=None, description="SingleStore password")
-
-
- class SingleStoreConnectionConfig(ConnectionConfig):
-     host: Optional[str] = Field(default=None, description="SingleStore host")
-     port: Optional[int] = Field(default=None, description="SingleStore port")
-     user: Optional[str] = Field(default=None, description="SingleStore user")
-     database: Optional[str] = Field(default=None, description="SingleStore database")
-     access_config: Secret[SingleStoreAccessConfig]
-
-     @requires_dependencies(["singlestoredb"], extras="singlestore")
-     def get_connection(self) -> "Connection":
-         import singlestoredb as s2
-
-         conn = s2.connect(
-             host=self.host,
-             port=self.port,
-             database=self.database,
-             user=self.user,
-             password=self.access_config.get_secret_value().password,
-         )
-         return conn
-
-
- class SingleStoreUploadStagerConfig(UploadStagerConfig):
-     drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
-
-
- @dataclass
- class SingleStoreUploadStager(UploadStager):
-     upload_stager_config: SingleStoreUploadStagerConfig
-
-     @staticmethod
-     def parse_date_string(date_string: str) -> date:
-         try:
-             timestamp = float(date_string)
-             return datetime.fromtimestamp(timestamp)
-         except Exception as e:
-             logger.debug(f"date {date_string} string not a timestamp: {e}")
-         return parser.parse(date_string)
-
-     def run(
-         self,
-         elements_filepath: Path,
-         file_data: FileData,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-         output_path = Path(output_dir) / Path(f"{output_filename}.csv")
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-
-         df = convert_to_pandas_dataframe(
-             elements_dict=elements_contents,
-             drop_empty_cols=self.upload_stager_config.drop_empty_cols,
-         )
-         datetime_columns = [
-             "data_source_date_created",
-             "data_source_date_modified",
-             "data_source_date_processed",
-         ]
-         for column in filter(lambda x: x in df.columns, datetime_columns):
-             df[column] = df[column].apply(self.parse_date_string)
-         if "data_source_record_locator" in df.columns:
-             df["data_source_record_locator"] = df["data_source_record_locator"].apply(
-                 lambda x: json.dumps(x) if x else None
-             )
-
-         with output_path.open("w") as output_file:
-             df.to_csv(output_file, index=False)
-         return output_path
-
-
- class SingleStoreUploaderConfig(UploaderConfig):
-     table_name: str = Field(description="SingleStore table to write contents to")
-     batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
-
-
- @dataclass
- class SingleStoreUploader(Uploader):
-     connection_config: SingleStoreConnectionConfig
-     upload_config: SingleStoreUploaderConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def upload_csv(self, csv_path: Path) -> None:
-         df = pd.read_csv(csv_path)
-         logger.debug(
-             f"uploading {len(df)} entries to {self.connection_config.database} "
-             f"db in table {self.upload_config.table_name}"
-         )
-         stmt = "INSERT INTO {} ({}) VALUES ({})".format(
-             self.upload_config.table_name,
-             ", ".join(df.columns),
-             ", ".join(["%s"] * len(df.columns)),
-         )
-         logger.debug(f"sql statement: {stmt}")
-         df.replace({np.nan: None}, inplace=True)
-         data_as_tuples = list(df.itertuples(index=False, name=None))
-         with self.connection_config.get_connection() as conn:
-             with conn.cursor() as cur:
-                 for chunk in batch_generator(
-                     data_as_tuples, batch_size=self.upload_config.batch_size
-                 ):
-                     cur.executemany(stmt, chunk)
-                 conn.commit()
-
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         if path.suffix != ".csv":
-             raise ValueError(f"Only .csv files are supported: {path}")
-         self.upload_csv(csv_path=path)
-
-
- singlestore_destination_entry = DestinationRegistryEntry(
-     connection_config=SingleStoreConnectionConfig,
-     uploader=SingleStoreUploader,
-     uploader_config=SingleStoreUploaderConfig,
-     upload_stager=SingleStoreUploadStager,
-     upload_stager_config=SingleStoreUploadStagerConfig,
- )
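
For reference, a minimal sketch of how the stage-then-upload flow defined by this module would be wired together. The import path assumes the classes keep the same names at their new 0.2.1 location under connectors/sql/ (not confirmed by this diff); the host, credentials, table name, file paths, and the file_data placeholder are illustrative values, not anything taken from the package.

from pathlib import Path

# Assumed 0.2.1 import path (connectors/sql/singlestore.py per the file list);
# in 0.2.0 the same classes lived in connectors/singlestore.py, shown above.
from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
    SingleStoreAccessConfig,
    SingleStoreConnectionConfig,
    SingleStoreUploader,
    SingleStoreUploaderConfig,
    SingleStoreUploadStager,
    SingleStoreUploadStagerConfig,
)

# Placeholder connection details -- none of these values come from the diff.
connection_config = SingleStoreConnectionConfig(
    host="localhost",
    port=3306,
    user="admin",
    database="ingest",
    access_config=SingleStoreAccessConfig(password="example-password"),
)

# Stage a partitioned-elements JSON file into the CSV layout the uploader expects.
stager = SingleStoreUploadStager(
    upload_stager_config=SingleStoreUploadStagerConfig(drop_empty_cols=True),
)
csv_path = stager.run(
    elements_filepath=Path("elements.json"),
    file_data=file_data,  # a FileData instance produced upstream by the source connector
    output_dir=Path("staged"),
    output_filename="elements",
)

# Batch-insert the staged rows into the target table.
uploader = SingleStoreUploader(
    connection_config=connection_config,
    upload_config=SingleStoreUploaderConfig(table_name="elements", batch_size=100),
)
uploader.run(path=csv_path, file_data=file_data)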