unstructured-ingest 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

@@ -1,21 +1,97 @@
 import tempfile
+from contextlib import contextmanager
 from pathlib import Path
 
+import faker
 import pandas as pd
 import pytest
 from psycopg2 import connect
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
     PostgresConnectionConfig,
+    PostgresDownloader,
+    PostgresDownloaderConfig,
+    PostgresIndexer,
+    PostgresIndexerConfig,
     PostgresUploader,
     PostgresUploadStager,
 )
 
+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def postgres_download_setup() -> None:
+    with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
+        connection = connect(
+            user="unstructured",
+            password="test",
+            dbname="test_db",
+            host="localhost",
+            port=5433,
+        )
+        with connection.cursor() as cursor:
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) VALUES "
+                    f"('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+            connection.commit()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_postgres_source():
+    connect_params = {
+        "host": "localhost",
+        "port": 5433,
+        "database": "test_db",
+        "user": "unstructured",
+        "password": "test",
+    }
+    with postgres_download_setup():
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = PostgresConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                username=connect_params["user"],
+                access_config=PostgresAccessConfig(password=connect_params["password"]),
+            )
+            indexer = PostgresIndexer(
+                connection_config=connection_config,
+                index_config=PostgresIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = PostgresDownloader(
+                connection_config=connection_config,
+                download_config=PostgresDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="postgres",
+                    expected_num_files=40,
+                ),
+            )
+
 
 def validate_destination(
     connect_params: dict,
@@ -50,7 +126,9 @@ async def test_postgres_destination(upload_file: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with docker_compose_context(docker_compose_path=env_setup_path / "sql"):
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
+    ):
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = PostgresUploadStager()
             stager_params = {
@@ -3,39 +3,99 @@ import tempfile
 from contextlib import contextmanager
 from pathlib import Path
 
+import faker
 import pandas as pd
 import pytest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
+    SQLiteDownloader,
+    SQLiteDownloaderConfig,
+    SQLiteIndexer,
+    SQLiteIndexerConfig,
     SQLiteUploader,
     SQLiteUploadStager,
 )
 
+faker = faker.Faker()
+
+SEED_DATA_ROWS = 40
+
+
+@contextmanager
+def sqlite_download_setup() -> Path:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = Path(tmpdir) / "mock_database.db"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
+        assert db_init_path.exists()
+        assert db_init_path.is_file()
+        with sqlite3.connect(database=db_path) as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            with db_init_path.open("r") as f:
+                query = f.read()
+            cursor.executescript(query)
+            for _ in range(SEED_DATA_ROWS):
+                sql_statment = (
+                    f"INSERT INTO cars (brand, price) "
+                    f"VALUES ('{faker.word()}', {faker.random_int()})"
+                )
+                cursor.execute(sql_statment)
+
+            sqlite_connection.commit()
+            cursor.close()
+        yield db_path
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_sqlite_source():
+    with sqlite_download_setup() as db_path:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = SQLiteConnectionConfig(database_path=db_path)
+            indexer = SQLiteIndexer(
+                connection_config=connection_config,
+                index_config=SQLiteIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = SQLiteDownloader(
+                connection_config=connection_config,
+                download_config=SQLiteDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="sqlite",
+                    expected_num_files=40,
+                ),
+            )
+
 
 @contextmanager
-def sqlite_setup() -> Path:
+def sqlite_upload_setup() -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite-schema.sql"
+        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
         assert db_init_path.exists()
         assert db_init_path.is_file()
-        connection = None
-        try:
-            connection = sqlite3.connect(database=db_path)
+        with sqlite3.connect(database=db_path) as sqlite_connection:
             with db_init_path.open("r") as f:
                 query = f.read()
-            cursor = connection.cursor()
+            cursor = sqlite_connection.cursor()
             cursor.executescript(query)
-            yield db_path
-        finally:
-            if connection:
-                connection.close()
+        yield db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):
@@ -62,7 +122,7 @@ async def test_sqlite_destination(upload_file: Path):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with sqlite_setup() as db_path:
+    with sqlite_upload_setup() as db_path:
         with tempfile.TemporaryDirectory() as tmpdir:
             stager = SQLiteUploadStager()
             stager_params = {
@@ -3,5 +3,5 @@ from pathlib import Path
 SOURCE_TAG = "source"
 DESTINATION_TAG = "destination"
 
-env_setup_path = Path(__file__).parents[4] / "test_e2e" / "env_setup"
+env_setup_path = Path(__file__).parents[1] / "env_setup"
 expected_results_path = Path(__file__).parents[1] / "expected_results"
@@ -180,8 +180,13 @@ async def source_connector_validation(
             resp = await downloader.run_async(file_data=file_data)
         else:
             resp = downloader.run(file_data=file_data)
-        postdownload_file_data = replace(resp["file_data"])
-        all_postdownload_file_data.append(postdownload_file_data)
+        if isinstance(resp, list):
+            for r in resp:
+                postdownload_file_data = replace(r["file_data"])
+                all_postdownload_file_data.append(postdownload_file_data)
+        else:
+            postdownload_file_data = replace(resp["file_data"])
+            all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         run_all_validations(
             configs=configs,
@@ -1 +1 @@
-__version__ = "0.1.0" # pragma: no cover
+__version__ = "0.1.1" # pragma: no cover
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import random
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -63,6 +64,7 @@ class FileConfig(BaseModel):
 
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    sample_n_files: Optional[int] = None
 
 
 class FsspecAccessConfig(AccessConfig):
@@ -128,8 +130,23 @@ class FsspecIndexer(Indexer):
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
+
+        if self.index_config.sample_n_files:
+            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)
+
         return filtered_files
 
+    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
+        if len(files) <= n:
+            logger.warning(
+                f"number of files to be sampled={n} is not smaller than the number"
+                f" of files found ({len(files)}). Returning all of the files as the"
+                " sample."
+            )
+            return files
+
+        return random.sample(files, n)
+
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()
 
@@ -26,7 +26,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from kdbai_client import Session, Table
+    from kdbai_client import Database, Session, Table
 
 CONNECTOR_TYPE = "kdbai"
 
@@ -99,6 +99,9 @@ class KdbaiUploadStager(UploadStager):
 
 
 class KdbaiUploaderConfig(UploaderConfig):
+    database_name: str = Field(
+        default="default", description="The name of the KDBAI database to write into."
+    )
     table_name: str = Field(description="The name of the KDBAI table to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")
 
@@ -111,24 +114,29 @@ class KdbaiUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            self.get_table()
+            self.get_database()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def get_table(self) -> "Table":
+    def get_database(self) -> "Database":
         session: Session = self.connection_config.get_session()
-        table = session.table(self.upload_config.table_name)
+        db = session.database(self.upload_config.database_name)
+        return db
+
+    def get_table(self) -> "Table":
+        db = self.get_database()
+        table = db.table(self.upload_config.table_name)
         return table
 
     def upsert_batch(self, batch: pd.DataFrame):
         table = self.get_table()
-        table.insert(data=batch)
+        table.insert(batch)
 
     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
-            f"db in table {self.upload_config.table_name}"
+            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
         for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
@@ -7,12 +7,17 @@ import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
@@ -57,6 +62,57 @@ class PostgresConnectionConfig(SQLConnectionConfig):
     )
 
 
+class PostgresIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class PostgresIndexer(SQLIndexer):
+    connection_config: PostgresConnectionConfig
+    index_config: PostgresIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class PostgresDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class PostgresDownloader(SQLDownloader):
+    connection_config: PostgresConnectionConfig
+    download_config: PostgresDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        connection = self.connection_config.get_connection()
+        with connection.cursor() as cursor:
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class PostgresUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
@@ -1,24 +1,34 @@
+import hashlib
 import json
+import sys
 import uuid
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Union
+from time import time
+from typing import Any, Generator, Union
 
 import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 
@@ -88,6 +98,125 @@ class SQLConnectionConfig(ConnectionConfig, ABC):
         pass
 
 
+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    @abstractmethod
+    def _get_doc_ids(self) -> list[str]:
+        pass
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
+
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -5,14 +5,19 @@ from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import pandas as pd
-from pydantic import Field, Secret
+from pydantic import Field, Secret, model_validator
 
+from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
     SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
@@ -37,7 +42,14 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
     database_path: Path = Field(
         description="Path to the .db file.",
     )
-    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self
 
     def get_connection(self) -> "SqliteConnection":
         from sqlite3 import connect
@@ -45,6 +57,57 @@ class SQLiteConnectionConfig(SQLConnectionConfig):
         return connect(database=self.database_path)
 
 
+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
 class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: pydantic>=2.7
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: tqdm
-Requires-Dist: pandas
 Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -44,8 +44,8 @@ Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -87,12 +87,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -116,15 +116,15 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -156,16 +156,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
@@ -5,15 +5,16 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
-test/integration/connectors/test_postgres.py,sha256=9uaqlUmLpVF09cwKSw7Yldq2kjU00WBedbEIgyJG5Cw,3998
 test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
-test/integration/connectors/test_sqlite.py,sha256=NnLdyt3FfM1A53tXPJbgIcsy-iEgYY8OZYOfliFqifM,3507
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
+test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
 test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/utils/constants.py,sha256=OjxLmmzCbDNqH5tK0jWFxDgIkM973cr3SmFIRk7aySc,222
+test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
 test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
-test/integration/connectors/utils/validation.py,sha256=VNvyutfnWbnesavL_V5SjM2H3LoOHnkW7Paq8RO4WbM,8199
+test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -42,7 +43,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
 test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
 test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=J87Ao0q5WoHKbDEbH6O10GOGaMO3yEUCBOxCqbm715I,42
+unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -361,7 +362,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-H
 unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
-unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
+unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
@@ -384,18 +385,18 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
-unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=usLzU2NA5D_a1juhja4jyJP_CzW4h-5rZ22bWVwvZGQ,10853
+unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
-unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=hqNuGYR_9o5LmfVDXnm3jBF5Pk-s7R66d0epF2uBYuM,4083
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=8bDUgyDurQelOabNnSG6ejWWsnLGWf-A-lWrpwYDGQM,5140
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=K-Lquxxqa1m5fk9by-5sasq561TRFAeV_SZ1Hc_b9Hk,3426
-unstructured_ingest-0.1.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.1.0.dist-info/METADATA,sha256=mNOS5HjbygWcTZ5eFlxoPpvt6dVAjkYniNHpk6tLvQw,7181
-unstructured_ingest-0.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.1.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.1.0.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
+unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
+unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.1.1.dist-info/RECORD,,