unstructured-ingest 1.0.24__py3-none-any.whl → 1.0.28__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.24" # pragma: no cover
1
+ __version__ = "1.0.28" # pragma: no cover
@@ -34,7 +34,7 @@ ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
34
34
  class VertexAIEmbeddingConfig(EmbeddingConfig):
35
35
  api_key: ApiKeyType = Field(description="API key for Vertex AI")
36
36
  embedder_model_name: Optional[str] = Field(
37
- default="textembedding-gecko@001", alias="model_name", description="Vertex AI model name"
37
+ default="text-embedding-005", alias="model_name", description="Vertex AI model name"
38
38
  )
39
39
 
40
40
  def wrap_error(self, e: Exception) -> Exception:
@@ -51,14 +51,49 @@ class ExternalIcon(FromJSONMixin, GetHTMLMixin):
51
51
  return None
52
52
 
53
53
 
54
+ @dataclass
55
+ class FileIconContent(FromJSONMixin):
56
+ url: str
57
+ expiry_time: Optional[str] = None # Add expiry_time if needed
58
+
59
+ @classmethod
60
+ def from_dict(cls, data: dict):
61
+ # Only include expiry_time if it exists in the dictionary
62
+ # Notion API might not always include it
63
+ init_data = {"url": data.get("url")}
64
+ if "expiry_time" in data:
65
+ init_data["expiry_time"] = data.get("expiry_time")
66
+ return cls(**init_data)
67
+
68
+
69
+ @dataclass
70
+ class FileIcon(FromJSONMixin, GetHTMLMixin):
71
+ file: FileIconContent
72
+ type: str = "file"
73
+
74
+ @classmethod
75
+ def from_dict(cls, data: dict):
76
+ return cls(file=FileIconContent.from_dict(data=data.pop("file")), **data)
77
+
78
+ def get_html(self) -> Optional[HtmlTag]:
79
+ # Render the file URL, similar to how ExternalIcon is handled
80
+ if self.file:
81
+ # Could potentially render an <img> tag, but sticking to URL for consistency
82
+ return A([Href(self.file.url)], [f"[File Icon: {self.file.url}]"])
83
+ else:
84
+ return None
85
+
86
+
54
87
  class Icon(FromJSONMixin):
55
88
  @classmethod
56
- def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon]:
89
+ def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon, FileIcon]:
57
90
  t = data.get("type")
58
91
  if t == "emoji":
59
92
  return EmojiIcon.from_dict(data)
60
93
  elif t == "external":
61
94
  return ExternalIcon.from_dict(data)
95
+ elif t == "file":
96
+ return FileIcon.from_dict(data)
62
97
  else:
63
98
  raise ValueError(f"Unexpected icon type: {t} ({data})")
64
99
 
@@ -66,7 +101,7 @@ class Icon(FromJSONMixin):
66
101
  @dataclass
67
102
  class Callout(BlockBase):
68
103
  color: str
69
- icon: Optional[Union[EmojiIcon, ExternalIcon]] = None
104
+ icon: Optional[Union[EmojiIcon, ExternalIcon, FileIcon]] = None
70
105
  rich_text: List[RichText] = field(default_factory=list)
71
106
 
72
107
  @staticmethod
@@ -76,9 +111,11 @@ class Callout(BlockBase):
76
111
  @classmethod
77
112
  def from_dict(cls, data: dict):
78
113
  rich_text = data.pop("rich_text", [])
114
+ icon_data = data.pop("icon", None)
115
+ icon = Icon.from_dict(icon_data) if icon_data else None
79
116
  return cls(
80
117
  color=data["color"],
81
- icon=Icon.from_dict(data.pop("icon")),
118
+ icon=icon,
82
119
  rich_text=[RichText.from_dict(rt) for rt in rich_text],
83
120
  )
84
121
 
@@ -36,14 +36,16 @@ class User(FromJSONMixin, GetHTMLMixin):
36
36
  def get_text(self) -> Optional[str]:
37
37
  text = self.name
38
38
  if self.avatar_url:
39
- text = f"[{text}]({self.avatar_url}"
39
+ text = f"[{text}]({self.avatar_url})"
40
40
  return text
41
41
 
42
42
  def get_html(self) -> Optional[HtmlTag]:
43
- if self.avatar_url:
43
+ if self.avatar_url and self.name:
44
44
  return A([Href(self.avatar_url)], self.name)
45
- else:
45
+ elif self.name:
46
46
  return Div([], self.name)
47
+ else:
48
+ return Div([], "")
47
49
 
48
50
 
49
51
  @dataclass
@@ -69,11 +71,13 @@ class Bots(FromJSONMixin, GetHTMLMixin):
69
71
  def get_text(self) -> Optional[str]:
70
72
  text = self.name
71
73
  if self.avatar_url:
72
- text = f"[{text}]({self.avatar_url}"
74
+ text = f"[{text}]({self.avatar_url})"
73
75
  return text
74
76
 
75
77
  def get_html(self) -> Optional[HtmlTag]:
76
- if self.avatar_url:
78
+ if self.avatar_url and self.name:
77
79
  return A([Href(self.avatar_url)], self.name)
78
- else:
80
+ elif self.name:
79
81
  return Div([], self.name)
82
+ else:
83
+ return Div([], "")
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import asyncio
4
4
  from dataclasses import dataclass
5
- from typing import TYPE_CHECKING, Any, AsyncIterator
5
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
6
6
 
7
7
  from pydantic import Field
8
8
 
@@ -39,6 +39,10 @@ class SharepointAccessConfig(OnedriveAccessConfig):
39
39
 
40
40
 
41
41
  class SharepointConnectionConfig(OnedriveConnectionConfig):
42
+ user_pname: Optional[str] = Field(
43
+ default=None,
44
+ description="User principal name or service account, usually your Azure AD email.",
45
+ )
42
46
  site: str = Field(
43
47
  description="Sharepoint site url. Process either base url e.g \
44
48
  https://[tenant].sharepoint.com or relative sites \
@@ -37,8 +37,8 @@ if TYPE_CHECKING:
37
37
 
38
38
  CONNECTOR_TYPE = "snowflake"
39
39
 
40
+ EMBEDDINGS_COLUMN = "embeddings"
40
41
  _ARRAY_COLUMNS = (
41
- "embeddings",
42
42
  "languages",
43
43
  "link_urls",
44
44
  "link_texts",
@@ -47,6 +47,7 @@ _ARRAY_COLUMNS = (
47
47
  "emphasized_text_contents",
48
48
  "emphasized_text_tags",
49
49
  )
50
+ _VECTOR_COLUMNS = (EMBEDDINGS_COLUMN,)
50
51
 
51
52
 
52
53
  class SnowflakeAccessConfig(SQLAccessConfig):
@@ -174,6 +175,33 @@ class SnowflakeUploader(SQLUploader):
174
175
  connector_type: str = CONNECTOR_TYPE
175
176
  values_delimiter: str = "?"
176
177
 
178
+ _embeddings_dimension: Optional[int] = None
179
+
180
+ @property
181
+ def embeddings_dimension(self) -> Optional[int]:
182
+ """
183
+ Get the dimension of the embeddings column in the Snowflake table.
184
+ If the column is not present or is not of type VECTOR, returns None.
185
+ """
186
+ if self._embeddings_dimension is None:
187
+ with self.connection_config.get_cursor() as cursor:
188
+ embeddings_column = cursor.execute(
189
+ f"SHOW COLUMNS LIKE '{EMBEDDINGS_COLUMN}' IN {self.upload_config.table_name}"
190
+ ).fetchone()
191
+ if embeddings_column:
192
+ data_type = {}
193
+ if isinstance(embeddings_column, dict):
194
+ data_type = json.loads(embeddings_column.get("data_type", "{}"))
195
+ elif isinstance(embeddings_column, tuple):
196
+ data_type = json.loads(embeddings_column[3] or "{}")
197
+ if isinstance(data_type, dict) and data_type.get("type") == "VECTOR":
198
+ self._embeddings_dimension = data_type.get("dimension")
199
+ # If the _embeddings_dimension is still None, it means the column
200
+ # is not present or not a VECTOR type
201
+ if self._embeddings_dimension is None:
202
+ self._embeddings_dimension = 0
203
+ return self._embeddings_dimension
204
+
177
205
  @requires_dependencies(["pandas"], extras="snowflake")
178
206
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
179
207
  super().run(path=path, file_data=file_data, **kwargs)
@@ -193,7 +221,7 @@ class SnowflakeUploader(SQLUploader):
193
221
  parsed.append(None)
194
222
  else:
195
223
  parsed.append(parse_date_string(value))
196
- elif column_name in _ARRAY_COLUMNS:
224
+ elif column_name in _ARRAY_COLUMNS or column_name in _VECTOR_COLUMNS:
197
225
  if not isinstance(value, list) and (
198
226
  value is None or pd.isna(value)
199
227
  ): # pandas is nan
@@ -206,16 +234,18 @@ class SnowflakeUploader(SQLUploader):
206
234
  return output
207
235
 
208
236
  def _parse_values(self, columns: list[str]) -> str:
209
- return ",".join(
210
- [
211
- (
212
- f"PARSE_JSON({self.values_delimiter})"
213
- if col in _ARRAY_COLUMNS
214
- else self.values_delimiter
237
+ embeddings_dimension = self.embeddings_dimension
238
+ parsed_values = []
239
+ for col in columns:
240
+ if col in _VECTOR_COLUMNS and embeddings_dimension:
241
+ parsed_values.append(
242
+ f"PARSE_JSON({self.values_delimiter})::VECTOR(FLOAT,{embeddings_dimension})"
215
243
  )
216
- for col in columns
217
- ]
218
- )
244
+ elif col in _ARRAY_COLUMNS or col in _VECTOR_COLUMNS:
245
+ parsed_values.append(f"PARSE_JSON({self.values_delimiter})")
246
+ else:
247
+ parsed_values.append(self.values_delimiter)
248
+ return ",".join(parsed_values)
219
249
 
220
250
  def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
221
251
  import numpy as np
@@ -228,8 +258,8 @@ class SnowflakeUploader(SQLUploader):
228
258
  f"record id column "
229
259
  f"{self.upload_config.record_id_key}, skipping delete"
230
260
  )
261
+ df = self._fit_to_schema(df=df, add_missing_columns=True, case_sensitive=False)
231
262
  df.replace({np.nan: None}, inplace=True)
232
- self._fit_to_schema(df=df)
233
263
 
234
264
  columns = list(df.columns)
235
265
  stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
@@ -339,12 +339,16 @@ class SQLUploader(Uploader):
339
339
  output.append(tuple(parsed))
340
340
  return output
341
341
 
342
- def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
342
+ def _fit_to_schema(
343
+ self, df: "DataFrame", add_missing_columns: bool = True, case_sensitive: bool = True
344
+ ) -> "DataFrame":
343
345
  import pandas as pd
344
346
 
345
347
  table_columns = self.get_table_columns()
346
- columns = set(df.columns)
347
- schema_fields = set(table_columns)
348
+ columns = set(df.columns if case_sensitive else df.columns.str.lower())
349
+ schema_fields = set(
350
+ table_columns if case_sensitive else {col.lower() for col in table_columns}
351
+ )
348
352
  columns_to_drop = columns - schema_fields
349
353
  missing_columns = schema_fields - columns
350
354
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.24
3
+ Version: 1.0.28
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=p1Nz9H4WBA_aI3GL1htUsWwzMmx5t9ktPqeOxmax3ms,43
2
+ unstructured_ingest/__version__.py,sha256=9ilMs9aEgY_oAgkTk4JfWz5bhMHPPXDWre-49z0HXgo,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -30,7 +30,7 @@ unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzx
30
30
  unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
31
31
  unstructured_ingest/embed/openai.py,sha256=TMEOPVfm_OSs4tb3Ymd6q5J49R_-YKvO4TOqCHb3bwk,4647
32
32
  unstructured_ingest/embed/togetherai.py,sha256=EehrzTRx4sd_P6AG9JkHAGwTG-o93GMaV5ufmJaxKWs,3629
33
- unstructured_ingest/embed/vertexai.py,sha256=jA3Y-AysVVaYwqkVd_OgRKF0JdHLAgZlRgfgddcZV2o,3763
33
+ unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
34
34
  unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
35
35
  unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
36
36
  unstructured_ingest/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
@@ -84,7 +84,7 @@ unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49
84
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=YzvSlfHs83XWsWMaIC3bV5enKfxejMQ9BQ8CtXfnJ5o,6923
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
87
- unstructured_ingest/processes/connectors/sharepoint.py,sha256=PowaqMzWr-VCW1rnwcAeRhHyE55kJ9J9FCVlrmtzN0E,4827
87
+ unstructured_ingest/processes/connectors/sharepoint.py,sha256=jI-erp4YUfHxPeUTcfHSPEG3w0wjSBYfAnMg1WT6lfw,4996
88
88
  unstructured_ingest/processes/connectors/slack.py,sha256=EkFj9PcAu5_gF2xLogikKDADLbJYq-_jvchzYrTdLO4,9224
89
89
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
90
90
  unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -144,12 +144,12 @@ unstructured_ingest/processes/connectors/notion/types/file.py,sha256=MpEWi7OE0mp
144
144
  unstructured_ingest/processes/connectors/notion/types/page.py,sha256=0fExZsJHXBzaRLwJAKpZwtnfQf_gZ7KnTIbyIyDYC4Q,1471
145
145
  unstructured_ingest/processes/connectors/notion/types/parent.py,sha256=l-EJBKU0HNpDg7p87cATqw0WlUSATD9btyVF7B2A2nI,1706
146
146
  unstructured_ingest/processes/connectors/notion/types/rich_text.py,sha256=LPeyFconK_-8Kl3DSLFiCmxwXH3LWthBiYSzj4FAJKY,5483
147
- unstructured_ingest/processes/connectors/notion/types/user.py,sha256=AKV2ZUcOPe8a98VmyOEaQf-ow6lR5kzudvmTH9BJTKU,1824
147
+ unstructured_ingest/processes/connectors/notion/types/user.py,sha256=Bs9hqsMPsfXtMJq1pf-tSgoexVjx__jKdJdfcCyMggM,1964
148
148
  unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py,sha256=mp-jlTLXntT94jdG3koguXTwQ4q_a-ZRR9M_yYew3Jc,1505
149
149
  unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py,sha256=-SDNODMDsp92-YGtKR0ZDAdcWJ7v-YMyrXf2iat-9oU,1191
150
150
  unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py,sha256=uR2EwXyAMCBSj-nG4Vp_biZg4CMNXQt4HVwCUS2K08Q,493
151
151
  unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py,sha256=VyP0l6mNk4-u9vWQOC0Y_lkKhv81b7z74UMcCSiycbo,993
152
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py,sha256=MUi8TSn2fL-mfd-yiwOEG5nvjN550YzFAiwSC88UBGA,2619
152
+ unstructured_ingest/processes/connectors/notion/types/blocks/callout.py,sha256=F3blVavN8K6DFcqqP-rQBStksKs1RUJmIabiUeJajx4,3849
153
153
  unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py,sha256=0HkAZHFQtRa8rN48JW7x2pJzvJScjCl5yDhmym8UPHc,544
154
154
  unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py,sha256=r4fp_O0NYP4UxUfncJdhOGl7C_spprqzq3etK9tILIE,564
155
155
  unstructured_ingest/processes/connectors/notion/types/blocks/code.py,sha256=DkhXFYkNzSswwuGh0it4p-RucTAuQPPxPuyEUj09OkI,1404
@@ -206,8 +206,8 @@ unstructured_ingest/processes/connectors/sql/__init__.py,sha256=WNO7jSL1ABw7K5Ix
206
206
  unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py,sha256=_IZFFGQUsHV9ScOOISrm6c5FSd4PnX91ePj_COat-gk,9320
207
207
  unstructured_ingest/processes/connectors/sql/postgres.py,sha256=kDIL8Cj45EDpKqit1_araRpP4v3cb__QbYqoINg9f2k,5403
208
208
  unstructured_ingest/processes/connectors/sql/singlestore.py,sha256=B46lpvyAj1AArpACi9MXbXD1-52zF6Dsj3RJtD1g4r0,5955
209
- unstructured_ingest/processes/connectors/sql/snowflake.py,sha256=GSEoNrIoJM7p-Q-PrFiONamoxWzjQG8wZJG3mw5Uwdk,9589
210
- unstructured_ingest/processes/connectors/sql/sql.py,sha256=yUGnv4MF_vT3VHdg7hhGiTD0be94ll-HyhHmRKQp_vQ,15712
209
+ unstructured_ingest/processes/connectors/sql/snowflake.py,sha256=dkGIFz_VIVhew_FjbuO8r3cVluw7VIUdvV6VjkAItP8,11369
210
+ unstructured_ingest/processes/connectors/sql/sql.py,sha256=e2GKJXBKAPpp-H14PMLMUXSa6pfKctEAVOlH9JqfHF4,15885
211
211
  unstructured_ingest/processes/connectors/sql/sqlite.py,sha256=V3OfRrXGGhTa_R2FPA-ysn95HHCv9x_VEBKVDsSGsbs,5549
212
212
  unstructured_ingest/processes/connectors/sql/vastdb.py,sha256=trhvUBumDmj2rLjmxFBKw9L9wF6ZpssF0wfmRaG97H0,9803
213
213
  unstructured_ingest/processes/connectors/weaviate/__init__.py,sha256=1Vnz8hm_Cf3NkQUTz5ZD4QkbLSVql4UvRoY2j2FnC9k,853
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.24.dist-info/METADATA,sha256=Ssmaf7onq6HIFmhR7f2mMPoS2gqGy6dmvxo605W_dWU,8691
235
- unstructured_ingest-1.0.24.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.24.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.24.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.24.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.28.dist-info/METADATA,sha256=_e-2mJSqWwsdod99-didta-wEjNVIaIua_EkdBU5ZHY,8691
235
+ unstructured_ingest-1.0.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.28.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.28.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.28.dist-info/RECORD,,