unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
# Connector-type label assigned to the uploader defined in this module.
CONNECTOR_TYPE: str = "lancedb_gcs"
16
+
17
+
18
# Secret settings for the GCS-backed LanceDB destination: a single required string
# holding the serialized service account key.
class LanceDBGCSAccessConfig(AccessConfig):
    google_service_account_key: str = Field(
        description="The serialized google service account key.",
    )
22
+
23
+
24
# Remote LanceDB connection settings whose secrets are a ``LanceDBGCSAccessConfig``.
class LanceDBGCSConnectionConfig(LanceDBRemoteConnectionConfig):
    access_config: Secret[LanceDBGCSAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access config with the request ``timeout`` added."""
        options = self.access_config.get_secret_value().model_dump()
        # Assigned last so the connection-level timeout wins over any dumped key.
        options["timeout"] = self.timeout
        return options
29
+
30
+
31
@dataclass
class LanceDBGSPUploader(LanceDBUploader):
    """LanceDB uploader bound to the GCS connection configuration.

    Only narrows the field types of ``LanceDBUploader`` and sets the
    connector type to this module's ``CONNECTOR_TYPE``.
    """

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBGCSConnectionConfig
    connector_type: str = CONNECTOR_TYPE
36
+
37
+
38
# Registry entry tying together the GCS connection config, the shared LanceDB
# stager, and the GCS-specific uploader.
lancedb_gcp_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBGCSConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBGSPUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,181 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from abc import ABC, abstractmethod
6
+ from contextlib import asynccontextmanager
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
10
+
11
+ from pydantic import Field
12
+
13
+ from unstructured_ingest.data_types.file_data import FileData
14
+ from unstructured_ingest.error import DestinationConnectionError
15
+ from unstructured_ingest.interfaces import (
16
+ ConnectionConfig,
17
+ Uploader,
18
+ UploaderConfig,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ )
22
+ from unstructured_ingest.logger import logger
23
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
24
+ from unstructured_ingest.utils.data_prep import flatten_dict
25
+ from unstructured_ingest.utils.dep_check import requires_dependencies
26
+
27
# Default connector-type label for the base LanceDB uploader.
CONNECTOR_TYPE: str = "lancedb"
28
+
29
+ if TYPE_CHECKING:
30
+ from lancedb import AsyncConnection
31
+ from lancedb.table import AsyncTable
32
+ from pandas import DataFrame
33
+
34
+
35
# Abstract base for LanceDB connection settings: holds the database uri and leaves
# the storage options to concrete subclasses.
class LanceDBConnectionConfig(ConnectionConfig, ABC):
    uri: str = Field(description="The uri of the database.")

    @abstractmethod
    def get_storage_options(self) -> Optional[dict[str, str]]:
        """Return the storage options passed to ``lancedb.connect_async``."""
        raise NotImplementedError

    @asynccontextmanager
    @requires_dependencies(["lancedb"], extras="lancedb")
    @DestinationConnectionError.wrap
    async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
        """Yield an async LanceDB connection opened against ``self.uri``.

        The connection object is used as a regular (non-async) context manager
        around the ``yield``.
        """
        # Imported lazily: lancedb is an optional extra guarded by requires_dependencies.
        import lancedb

        opened = await lancedb.connect_async(
            self.uri,
            storage_options=self.get_storage_options(),
        )
        with opened as connection:
            yield connection
53
+
54
+
55
# Connection settings shared by the remote LanceDB backends; adds a request timeout
# expressed as digits followed by a unit (for example "30s").
class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
    timeout: str = Field(
        default="30s",
        description=(
            # The trailing space keeps the two literals from fusing into "finishedin".
            "Timeout for the entire request, from connection until the response body has finished "
            "in a [0-9]+(ns|us|ms|[smhdwy]) format."
        ),
        # NOTE(review): the pattern is not anchored with ^/$; whether partial matches
        # such as "1h30m" are meant to be accepted should be confirmed before tightening.
        pattern=r"[0-9]+(ns|us|ms|[smhdwy])",
    )
64
+
65
+
66
# Stager configuration for LanceDB; declares no options of its own.
class LanceDBUploadStagerConfig(UploadStagerConfig):
    ...
68
+
69
+
70
@dataclass
class LanceDBUploadStager(UploadStager):
    """Convert a JSON file of elements into a feather file of flattened rows."""

    upload_stager_config: LanceDBUploadStagerConfig = field(
        default_factory=LanceDBUploadStagerConfig
    )

    @requires_dependencies(["pandas"], extras="lancedb")
    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements in ``elements_filepath`` as a ``.feather`` file.

        Each element dict is passed through ``conform_dict`` and the resulting rows
        are written to ``output_dir / output_filename`` with the suffix replaced by
        ``.feather``.

        Returns:
            The path of the feather file that was written.
        """
        import pandas as pd

        with open(elements_filepath) as elements_file:
            elements: list[dict] = json.load(elements_file)

        rows = [
            self.conform_dict(element_dict=element, file_data=file_data) for element in elements
        ]
        frame = pd.DataFrame(rows)

        staged_path = (output_dir / output_filename).with_suffix(".feather")
        frame.to_feather(staged_path)
        return staged_path

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Build one table row from an element without mutating ``element_dict``.

        The ``embeddings`` entry (``None`` when absent) is moved to ``vector``, the
        record id column is set to ``file_data.identifier``, and the remaining keys
        are flattened with ``-`` as the separator.
        """
        remaining = element_dict.copy()
        vector = remaining.pop("embeddings", None)
        row = {"vector": vector, RECORD_ID_LABEL: file_data.identifier}
        # Flattened keys are merged last, so they override the two keys above on clash.
        row.update(flatten_dict(remaining, separator="-"))
        return row
109
+
110
+
111
# Uploader settings for LanceDB: the required name of the table the uploader opens.
class LanceDBUploaderConfig(UploaderConfig):
    table_name: str = Field(
        description="The name of the table.",
    )
113
+
114
+
115
@dataclass
class LanceDBUploader(Uploader):
    """Upload staged feather files into an existing LanceDB table."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Open and close the configured table once, in a fresh event loop."""

        async def _open_and_close() -> None:
            async with self.connection_config.get_async_connection() as conn:
                opened = await conn.open_table(self.upload_config.table_name)
                opened.close()

        asyncio.run(_open_and_close())

    @asynccontextmanager
    async def get_table(self) -> AsyncGenerator["AsyncTable", None]:
        """Yield the configured table and close it once the caller is done."""
        async with self.connection_config.get_async_connection() as conn:
            table = await conn.open_table(self.upload_config.table_name)
            try:
                yield table
            finally:
                table.close()

    @requires_dependencies(["pandas"], extras="lancedb")
    async def run_async(self, path, file_data, **kwargs):
        """Load the feather file at ``path`` and add its rows to the table.

        The frame is first aligned with the table schema. When the table has the
        record id column, rows whose id equals ``file_data.identifier`` are deleted
        before the add; otherwise a warning is logged and rows are only appended.
        """
        import pandas as pd

        frame = pd.read_feather(path)
        async with self.get_table() as table:
            schema = await table.schema()
            frame = self._fit_to_schema(frame, schema)
            if RECORD_ID_LABEL in schema.names:
                # The identifier is interpolated into the filter expression as-is.
                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
            else:
                logger.warning(
                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
                    " string which is required to support overwriting updates on subsequent"
                    " uploads of the same record. New rows will be appended instead."
                )
            await table.add(data=frame)

    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
        """Return ``df`` with exactly the columns named in ``schema``.

        Columns absent from the schema are dropped; schema fields absent from the
        frame are added as empty series. Both adjustments are logged at info level.
        """
        import pandas as pd

        present = set(df.columns)
        expected = set(schema.names)
        surplus = present - expected
        absent = expected - present

        if surplus:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(surplus)}"
            )
        if absent:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(absent)} "
            )

        df = df.drop(columns=surplus)
        for name in absent:
            df[name] = pd.Series()

        return df
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.processes.connectors.lancedb.lancedb import (
8
+ LanceDBConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
# Connector-type label assigned to the local LanceDB uploader.
CONNECTOR_TYPE: str = "lancedb_local"
16
+
17
+
18
# Access configuration for a local LanceDB database; declares no fields.
class LanceDBLocalAccessConfig(AccessConfig):
    ...
20
+
21
+
22
# Connection settings for a local LanceDB database. The access config defaults to an
# empty ``LanceDBLocalAccessConfig`` built per instance.
class LanceDBLocalConnectionConfig(LanceDBConnectionConfig):
    access_config: Secret[LanceDBLocalAccessConfig] = Field(
        validate_default=True,
        default_factory=LanceDBLocalAccessConfig,
    )

    def get_storage_options(self) -> None:
        """Return ``None``: this configuration supplies no storage options."""
        return None
29
+
30
+
31
@dataclass
class LanceDBLocalUploader(LanceDBUploader):
    """LanceDB uploader bound to the local connection configuration.

    Only narrows the field types of ``LanceDBUploader`` and sets the
    connector type to this module's ``CONNECTOR_TYPE``.
    """

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBLocalConnectionConfig
    connector_type: str = CONNECTOR_TYPE
36
+
37
+
38
# Registry entry tying together the local connection config, the shared LanceDB
# stager, and the local uploader.
lancedb_local_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBLocalConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBLocalUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,227 @@
1
+ import glob
2
+ import json
3
+ import shutil
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import Any, Generator
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ FileData,
13
+ FileDataSourceMetadata,
14
+ SourceIdentifiers,
15
+ )
16
+ from unstructured_ingest.error import FileExistsError
17
+ from unstructured_ingest.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ Downloader,
21
+ DownloaderConfig,
22
+ DownloadResponse,
23
+ Indexer,
24
+ IndexerConfig,
25
+ Uploader,
26
+ UploaderConfig,
27
+ )
28
+ from unstructured_ingest.logger import logger
29
+ from unstructured_ingest.processes.connector_registry import (
30
+ DestinationRegistryEntry,
31
+ SourceRegistryEntry,
32
+ )
33
+ from unstructured_ingest.processes.utils.blob_storage import (
34
+ BlobStoreUploadStager,
35
+ BlobStoreUploadStagerConfig,
36
+ )
37
+
38
# Connector-type label used by the local indexer, downloader and uploader.
CONNECTOR_TYPE: str = "local"
39
+
40
+
41
# Access configuration for the local file system connector; declares no fields.
class LocalAccessConfig(AccessConfig):
    ...
43
+
44
+
45
# Connection settings for the local file system connector. The access config
# defaults to an empty ``LocalAccessConfig`` and the default is validated.
class LocalConnectionConfig(ConnectionConfig):
    access_config: Secret[LocalAccessConfig] = Field(
        validate_default=True,
        default=LocalAccessConfig(),
    )
49
+
50
+
51
# Settings for the local indexer: where to look for files and whether to descend
# into sub-folders.
class LocalIndexerConfig(IndexerConfig):
    input_path: Path = Field(
        description="Path to the location in the local file system that will be processed."
    )
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders "
        "otherwise stop at the files in provided folder level.",
    )

    @property
    def path(self) -> Path:
        """Return ``input_path`` as a resolved path."""
        configured = Path(self.input_path)
        return configured.resolve()
64
+
65
+
66
@dataclass
class LocalIndexer(Indexer):
    """Index files on the local file system, yielding one ``FileData`` per file.

    ``index_config.path`` may be a single file or a directory. A directory is
    listed one level deep unless ``index_config.recursive`` is set.
    """

    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return every regular file selected by the index configuration."""
        input_path = self.index_config.path
        if input_path.is_file():
            # Return the path itself instead of running it through glob: a file name
            # containing glob metacharacters ("[", "*", "?") would be read as a
            # pattern and fail to match the file it names.
            return [input_path]
        walk = input_path.rglob if self.index_config.recursive else input_path.glob
        return [candidate for candidate in walk("*") if candidate.is_file()]

    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
        """Collect stat-based metadata for ``path``.

        Each attribute is read independently; when one cannot be read a warning is
        logged and the corresponding field is set to ``None``.
        """
        stats = path.stat()
        try:
            date_modified = str(stats.st_mtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date modified: {e}")
            date_modified = None

        try:
            # st_birthtime is not provided by every platform's stat result.
            date_created = str(stats.st_birthtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date created: {e}")
            date_created = None

        try:
            mode = stats.st_mode
            permissions_data = [{"mode": mode}]
        except Exception as e:
            logger.warning(f"Couldn't detect file mode: {e}")
            permissions_data = None

        try:
            filesize_bytes = stats.st_size
        except Exception as e:
            logger.warning(f"Couldn't detect file size: {e}")
            filesize_bytes = None

        return FileDataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
            filesize_bytes=filesize_bytes,
        )

    @staticmethod
    def _relative_to_root(resolved: Path, root: Path) -> str:
        """Return ``resolved`` relative to ``root`` as a string.

        Uses path-aware stripping so that only the leading root is removed, even
        when the root string recurs deeper in the path or the root is ``/``.
        """
        try:
            return str(resolved.relative_to(root))
        except ValueError:
            # A resolved symlink target can live outside the root; keep the absolute
            # path without its leading separator in that case.
            return str(resolved)[1:]

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every file returned by ``list_files``.

        ``rel_path`` is the file's path relative to the configured input directory,
        or just the file name when the input path is itself a file.
        """
        # The property resolves on every access, so read it once.
        root = self.index_config.path
        for file_path in self.list_files():
            resolved = file_path.resolve()
            if root.is_file():
                rel_path = root.name
            else:
                rel_path = self._relative_to_root(resolved, root)
            source_identifiers = SourceIdentifiers(
                fullpath=str(resolved),
                filename=file_path.name,
                rel_path=rel_path,
            )
            file_data = FileData(
                identifier=str(resolved),
                connector_type=CONNECTOR_TYPE,
                source_identifiers=source_identifiers,
                metadata=self.get_file_metadata(path=file_path),
                display_name=source_identifiers.fullpath,
            )
            yield file_data
140
+
141
+
142
# Downloader configuration for the local connector; declares no options of its own.
class LocalDownloaderConfig(DownloaderConfig):
    ...
144
+
145
+
146
@dataclass
class LocalDownloader(Downloader):
    """Downloader for local files: points at the source file in place, no copy."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the source file's full path as the download location."""
        source_path = file_data.source_identifiers.fullpath
        return Path(source_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Build a ``DownloadResponse`` whose path is the source file itself."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
159
+
160
+
161
# Settings for the local uploader: the directory that receives the output files.
class LocalUploaderConfig(UploaderConfig):
    output_dir: str = Field(
        default="structured-output", description="Local path to write partitioned output to"
    )

    @property
    def output_path(self) -> Path:
        """Return ``output_dir`` as a resolved path."""
        return Path(self.output_dir).resolve()

    def __post_init__(self):
        """Reject an output path that already exists as a regular file.

        Raises:
            FileExistsError: if ``output_path`` is an existing file.
        """
        # is_file() is already False for a missing path, so no exists() check is needed.
        if self.output_path.is_file():
            raise FileExistsError(f"output path {self.output_path} already exists as a file")

    def model_post_init(self, _context: Any) -> None:
        """Run the output-path check after model validation.

        ``__post_init__`` is a dataclass hook and is not invoked for pydantic models,
        so the check is routed through pydantic's post-init hook.
        NOTE(review): assumes ``UploaderConfig`` is a pydantic v2 ``BaseModel`` — confirm.
        """
        super().model_post_init(_context)
        self.__post_init__()
173
+
174
+
175
@dataclass
class LocalUploader(Uploader):
    """Write pipeline output as ``.json`` files under a local output directory."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """Return ``False``: this uploader is driven through the synchronous methods."""
        return False

    def get_destination_path(self, file_data: FileData) -> Path:
        """Return the output file path for ``file_data``, creating its parent folders.

        With source identifiers present, the relative path (minus one leading ``/``)
        is placed under the output directory and ``.json`` is appended to the file
        name. Otherwise the file is named ``<identifier>.json`` directly under the
        output directory.
        """
        if source_identifiers := file_data.source_identifiers:
            rel_path = source_identifiers.relative_path.removeprefix("/")
            new_path = self.upload_config.output_path / Path(rel_path)
            filename = source_identifiers.filename
            if filename and new_path.name == filename:
                # Rename only the final component. A substring replace over the whole
                # path would also rewrite directories that contain the file name.
                final_path = new_path.with_name(f"{filename}.json")
            else:
                # The last component is not the file name: keep the substring
                # substitution so such paths resolve as they did before.
                final_path = Path(str(new_path).replace(filename, f"{filename}.json"))
        else:
            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
        final_path.parent.mkdir(parents=True, exist_ok=True)
        return final_path

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Serialize ``data`` as JSON into the destination path for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        with final_path.open("w") as f:
            json.dump(data, f)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Copy the file at ``path`` to the destination path for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        logger.debug(f"copying file from {path} to {final_path}")
        shutil.copy(src=str(path), dst=str(final_path))
212
+
213
+
214
# Registry entry for the local connector as a source: connection config, indexer
# and downloader together with their configs.
local_source_entry = SourceRegistryEntry(
    connection_config=LocalConnectionConfig,
    indexer_config=LocalIndexerConfig,
    indexer=LocalIndexer,
    downloader_config=LocalDownloaderConfig,
    downloader=LocalDownloader,
)
221
+
222
# Registry entry for the local connector as a destination: the blob-store stager
# paired with the local uploader.
local_destination_entry = DestinationRegistryEntry(
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
    uploader_config=LocalUploaderConfig,
    uploader=LocalUploader,
)