unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (83)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/v2/cli/utils/click.py +21 -2
  60. unstructured_ingest/v2/interfaces/connector.py +22 -2
  61. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  62. unstructured_ingest/v2/processes/chunker.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  64. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  65. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  71. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  72. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  73. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  74. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  75. unstructured_ingest/v2/processes/embedder.py +13 -0
  76. unstructured_ingest/v2/processes/partitioner.py +2 -1
  77. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
  78. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
  79. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  80. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  83. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql.py (file 80 above, deleted in full)
@@ -1,275 +0,0 @@
-import json
-import uuid
-from dataclasses import dataclass, field
-from datetime import date, datetime
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
-
-import numpy as np
-import pandas as pd
-from dateutil import parser
-from pydantic import Field, Secret
-
-from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import (
-    AccessConfig,
-    ConnectionConfig,
-    FileData,
-    Uploader,
-    UploaderConfig,
-    UploadStager,
-    UploadStagerConfig,
-)
-from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
-
-if TYPE_CHECKING:
-    from sqlite3 import Connection as SqliteConnection
-
-    from psycopg2.extensions import connection as PostgresConnection
-
-CONNECTOR_TYPE = "sql"
-ELEMENTS_TABLE_NAME = "elements"
-SQLITE_DB = "sqlite"
-POSTGRESQL_DB = "postgresql"
-
-
-class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = Field(default=None, description="DB username")
-    password: Optional[str] = Field(default=None, description="DB password")
-
-
-class SQLConnectionConfig(ConnectionConfig):
-    db_type: Literal["sqlite", "postgresql"] = Field(
-        default=SQLITE_DB, description="Type of the database backend"
-    )
-    database: Optional[str] = Field(
-        default=None,
-        description="Database name. For sqlite databases, this is the path to the .db file.",
-    )
-    host: Optional[str] = Field(default=None, description="DB host")
-    port: Optional[int] = Field(default=5432, description="DB host connection port")
-    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
-    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-
-    def __post_init__(self):
-        if (self.db_type == SQLITE_DB) and (self.database is None):
-            raise ValueError(
-                "A sqlite connection requires a path to a *.db file "
-                "through the `database` argument"
-            )
-
-
-class SQLUploadStagerConfig(UploadStagerConfig):
-    pass
-
-
-_COLUMNS = (
-    "id",
-    "element_id",
-    "text",
-    "embeddings",
-    "type",
-    "system",
-    "layout_width",
-    "layout_height",
-    "points",
-    "url",
-    "version",
-    "date_created",
-    "date_modified",
-    "date_processed",
-    "permissions_data",
-    "record_locator",
-    "category_depth",
-    "parent_id",
-    "attached_filename",
-    "filetype",
-    "last_modified",
-    "file_directory",
-    "filename",
-    "languages",
-    "page_number",
-    "links",
-    "page_name",
-    "link_urls",
-    "link_texts",
-    "sent_from",
-    "sent_to",
-    "subject",
-    "section",
-    "header_footer_type",
-    "emphasized_text_contents",
-    "emphasized_text_tags",
-    "text_as_html",
-    "regex_metadata",
-    "detection_class_prob",
-)
-
-_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
-
-
-def parse_date_string(date_value: Union[str, int]) -> date:
-    try:
-        timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
-        return datetime.fromtimestamp(timestamp)
-    except Exception as e:
-        logger.debug(f"date {date_value} string not a timestamp: {e}")
-    return parser.parse(date_value)
-
-
-@dataclass
-class SQLUploadStager(UploadStager):
-    upload_stager_config: SQLUploadStagerConfig = field(
-        default_factory=lambda: SQLUploadStagerConfig()
-    )
-
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
-
-        df = pd.DataFrame.from_dict(output)
-        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
-            df[column] = df[column].apply(parse_date_string)
-        for column in filter(
-            lambda x: x in df.columns,
-            ("permissions_data", "record_locator", "points", "links"),
-        ):
-            df[column] = df[column].apply(
-                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
-            )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
-            df[column] = df[column].apply(str)
-
-        with output_path.open("w") as output_file:
-            df.to_json(output_file, orient="records", lines=True)
-        return output_path
-
-
-class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = Field(default=50, description="Number of records per batch")
-
-
-@dataclass
-class SQLUploader(Uploader):
-    connector_type: str = CONNECTOR_TYPE
-    upload_config: SQLUploaderConfig
-    connection_config: SQLConnectionConfig
-
-    def precheck(self) -> None:
-        try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
-
-    @property
-    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
-        if self.connection_config.db_type == POSTGRESQL_DB:
-            return self._make_psycopg_connection
-        elif self.connection_config.db_type == SQLITE_DB:
-            return self._make_sqlite_connection
-        raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
-
-    def _make_sqlite_connection(self) -> "SqliteConnection":
-        from sqlite3 import connect
-
-        return connect(database=self.connection_config.database)
-
-    @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self) -> "PostgresConnection":
-        from psycopg2 import connect
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        return connect(
-            user=access_config.username,
-            password=access_config.password,
-            dbname=self.connection_config.database,
-            host=self.connection_config.host,
-            port=self.connection_config.port,
-        )
-
-    def prepare_data(
-        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
-    ) -> list[tuple[Any, ...]]:
-        output = []
-        for row in data:
-            parsed = []
-            for column_name, value in zip(columns, row):
-                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
-                    value = json.dumps(value)
-                if column_name in _DATE_COLUMNS:
-                    if value is None:
-                        parsed.append(None)
-                    else:
-                        parsed.append(parse_date_string(value))
-                else:
-                    parsed.append(value)
-            output.append(tuple(parsed))
-        return output
-
-    def upload_contents(self, path: Path) -> None:
-        df = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
-        df.replace({np.nan: None}, inplace=True)
-
-        columns = tuple(df.columns)
-        stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
-            with self.connection() as conn:
-                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type == SQLITE_DB:
-                    conn.executemany(stmt, values)
-                else:
-                    with conn.cursor() as cur:
-                        cur.executemany(stmt, values)

-                conn.commit()
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
-
-
-sql_destination_entry = DestinationRegistryEntry(
-    connection_config=SQLConnectionConfig,
-    uploader=SQLUploader,
-    uploader_config=SQLUploaderConfig,
-    upload_stager=SQLUploadStager,
-    upload_stager_config=SQLUploadStagerConfig,
-)
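The deleted parse_date_string helper normalizes the several date representations that show up in element metadata: integers are treated as epoch milliseconds, numeric strings as epoch seconds, and anything non-numeric falls through to dateutil's flexible parser. Below is a minimal standalone sketch of that behavior (a condensed copy of the helper above with the connector's logger call dropped; the sample inputs are illustrative):

    from datetime import datetime

    from dateutil import parser


    def parse_date_string(date_value):
        # ints arrive as epoch milliseconds; numeric strings as epoch seconds
        try:
            timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
            return datetime.fromtimestamp(timestamp)
        except Exception:
            # non-numeric input falls back to dateutil's flexible parsing
            return parser.parse(date_value)


    print(parse_date_string(1700000000000))  # epoch milliseconds -> datetime
    print(parse_date_string("1700000000"))   # numeric string -> epoch seconds
    print(parse_date_string("2024-06-01"))   # date string -> dateutil.parser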
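The stager and uploader hand data off through newline-delimited JSON: SQLUploadStager.run writes with df.to_json(output_file, orient="records", lines=True), and SQLUploader.upload_contents streams the same file back with pd.read_json(..., lines=True, chunksize=batch_size), so a large staged file never has to be materialized in one batch. A small round-trip sketch of that handoff (the file name and records are illustrative):

    import pandas as pd

    df = pd.DataFrame([{"id": "a1", "text": "hello"}, {"id": "a2", "text": "world"}])

    # stager side: one JSON object per line
    df.to_json("staged.json", orient="records", lines=True)

    # uploader side: re-read in batches; chunksize requires lines=True
    for batch in pd.read_json("staged.json", orient="records", lines=True, chunksize=1):
        print(batch.to_dict(orient="records"))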
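Because SQLUploader drove two DB-API drivers with different parameter styles, the INSERT statement above switches placeholders: ? (qmark style) for sqlite3 and %s (format style) for psycopg2, while prepare_data JSON-encodes list and dict values that SQLite cannot store natively. The sketch below exercises that pattern against an in-memory SQLite database; the three columns are an illustrative subset of the connector's elements schema, not the full column list:

    import json
    import sqlite3

    db_type = "sqlite"  # the deleted connector also accepted "postgresql" via psycopg2
    columns = ("id", "text", "links")
    rows = [
        ("a1", "hello", [{"url": "https://example.com"}]),  # list value, as in element metadata
        ("a2", "world", None),
    ]

    # qmark placeholders for sqlite3, format placeholders for psycopg2
    placeholder = "?" if db_type == "sqlite" else "%s"
    stmt = f"INSERT INTO elements ({','.join(columns)}) VALUES({','.join([placeholder] * len(columns))})"

    # mirror prepare_data: JSON-encode values SQLite cannot store natively
    values = [
        tuple(json.dumps(v) if isinstance(v, (list, dict)) else v for v in row) for row in rows
    ]

    with sqlite3.connect(":memory:") as conn:
        conn.execute("CREATE TABLE elements (id TEXT, text TEXT, links TEXT)")
        conn.executemany(stmt, values)
        conn.commit()
        print(conn.execute("SELECT * FROM elements").fetchall())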