unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ import unstructured_ingest.processes.connectors.databricks # noqa: F401
4
+ import unstructured_ingest.processes.connectors.duckdb # noqa: F401
5
+ import unstructured_ingest.processes.connectors.elasticsearch # noqa: F401
6
+ import unstructured_ingest.processes.connectors.fsspec # noqa: F401
7
+ import unstructured_ingest.processes.connectors.ibm_watsonx # noqa: F401
8
+ import unstructured_ingest.processes.connectors.kafka # noqa: F401
9
+ import unstructured_ingest.processes.connectors.lancedb # noqa: F401
10
+ import unstructured_ingest.processes.connectors.qdrant # noqa: F401
11
+ import unstructured_ingest.processes.connectors.sql # noqa: F401
12
+ import unstructured_ingest.processes.connectors.weaviate # noqa: F401
13
+ from unstructured_ingest.processes.connector_registry import (
14
+ add_destination_entry,
15
+ add_source_entry,
16
+ )
17
+
18
+ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
19
+ from .airtable import airtable_source_entry
20
+ from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
21
+ from .astradb import astra_db_destination_entry, astra_db_source_entry
22
+ from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
23
+ from .azure_ai_search import azure_ai_search_destination_entry
24
+ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
25
+ from .chroma import chroma_destination_entry
26
+ from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
27
+ from .confluence import confluence_source_entry
28
+ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
29
+ from .couchbase import couchbase_destination_entry, couchbase_source_entry
30
+ from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
31
+ from .delta_table import delta_table_destination_entry
32
+ from .discord import CONNECTOR_TYPE as DISCORD_CONNECTOR_TYPE
33
+ from .discord import discord_source_entry
34
+ from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE
35
+ from .github import github_source_entry
36
+ from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
37
+ from .gitlab import gitlab_source_entry
38
+ from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
39
+ from .google_drive import google_drive_source_entry
40
+ from .jira import CONNECTOR_TYPE as JIRA_CONNECTOR_TYPE
41
+ from .jira import jira_source_entry
42
+ from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
43
+ from .kdbai import kdbai_destination_entry
44
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
45
+ from .local import local_destination_entry, local_source_entry
46
+ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
47
+ from .milvus import milvus_destination_entry
48
+ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
49
+ from .mongodb import mongodb_destination_entry, mongodb_source_entry
50
+ from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
51
+ from .neo4j import neo4j_destination_entry
52
+ from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE
53
+ from .notion.connector import notion_source_entry
54
+ from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
55
+ from .onedrive import onedrive_destination_entry, onedrive_source_entry
56
+ from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
57
+ from .outlook import outlook_source_entry
58
+ from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
59
+ from .pinecone import pinecone_destination_entry
60
+ from .redisdb import CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE
61
+ from .redisdb import redis_destination_entry
62
+ from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
63
+ from .salesforce import salesforce_source_entry
64
+ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
65
+ from .sharepoint import sharepoint_source_entry
66
+ from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
67
+ from .slack import slack_source_entry
68
+ from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
69
+ from .vectara import vectara_destination_entry
70
+ from .zendesk.zendesk import CONNECTOR_TYPE as ZENDESK_CONNECTOR_TYPE
71
+ from .zendesk.zendesk import zendesk_source_entry
72
+
73
# Register every connector defined directly in this package with the global
# source/destination registries. Each call hands the registry the connector's
# CONNECTOR_TYPE string together with its registry entry. The call order is
# kept exactly as it was.

# Astra DB
add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)

# Chroma
add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

# Couchbase
add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)

# Delta table
add_destination_entry(
    destination_type=DELTA_TABLE_CONNECTOR_TYPE,
    entry=delta_table_destination_entry,
)

# Google Drive
add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

# Local filesystem
add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)

# OneDrive
add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

# Neo4j
add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)

# Salesforce
add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

# MongoDB
add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)

# Pinecone / SharePoint
add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)

# Milvus / Azure AI Search
add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
add_destination_entry(
    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
    entry=azure_ai_search_destination_entry,
)

# KDB.AI / Airtable / Notion
add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
add_source_entry(source_type=NOTION_CONNECTOR_TYPE, entry=notion_source_entry)

# Outlook
add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)

# GitLab
add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)

# Slack
add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)

# Vectara / Confluence
add_destination_entry(destination_type=VECTARA_CONNECTOR_TYPE, entry=vectara_destination_entry)
add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)

# Discord / Redis
add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

# Jira
add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)

# Zendesk / GitHub
add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
add_source_entry(source_type=GITHUB_CONNECTOR_TYPE, entry=github_source_entry)
@@ -0,0 +1,238 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Any, Generator, Optional
4
+ from uuid import NAMESPACE_DNS, uuid5
5
+
6
+ from pydantic import BaseModel, Field, Secret, field_validator
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
9
+ from unstructured_ingest.error import ValueError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ ConnectionConfig,
13
+ Downloader,
14
+ DownloaderConfig,
15
+ DownloadResponse,
16
+ Indexer,
17
+ IndexerConfig,
18
+ )
19
+ from unstructured_ingest.processes.connector_registry import (
20
+ SourceRegistryEntry,
21
+ )
22
+ from unstructured_ingest.utils.dep_check import requires_dependencies
23
+
24
+ if TYPE_CHECKING:
25
+ from pyairtable import Api
26
+ from pyairtable.api.types import RecordDict
27
+
28
+ CONNECTOR_TYPE = "airtable"
29
+
30
+
31
class AirtableTableMeta(BaseModel):
    """Coordinates of one Airtable table to ingest.

    Holds the id of the base the table is stored in, the table id, and an
    optional view id for when only the rows and fields of a particular view
    are to be ingested.
    """

    base_id: str
    table_id: str
    view_id: Optional[str] = None

    def get_id(self) -> str:
        """Return a deterministic UUIDv5 string built from the base, table and view ids."""
        seed_parts = [self.base_id, self.table_id]
        # A falsy view id (None or "") contributes nothing to the seed.
        if self.view_id:
            seed_parts.append(self.view_id)
        return str(uuid5(NAMESPACE_DNS, "".join(seed_parts)))
43
+
44
+
45
class AirtableAccessConfig(AccessConfig):
    """Credentials used to authenticate against Airtable."""

    personal_access_token: str = Field(
        description=(
            "Personal access token to authenticate into Airtable. Check: "
            "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
            "for more info"
        ),
    )
51
+
52
+
53
class AirtableConnectionConfig(ConnectionConfig):
    """Connection settings for Airtable; the access config is held inside a ``Secret``."""

    access_config: Secret[AirtableAccessConfig]

    @requires_dependencies(["pyairtable"], extras="airtable")
    def get_client(self) -> "Api":
        """Build a ``pyairtable.Api`` client using the personal access token."""
        # pyairtable is guarded by the decorator above, so it is imported at
        # call time rather than at module import time.
        from pyairtable import Api

        token = self.access_config.get_secret_value().personal_access_token
        return Api(api_key=token)
62
+
63
+
64
class AirtableIndexerConfig(IndexerConfig):
    """Indexer settings selecting which Airtable bases, tables and views to ingest."""

    list_of_paths: Optional[list[str]] = Field(
        default=None,
        description="""
        A list of paths that specify the locations to ingest data from within Airtable.

        If this argument is not set, the connector ingests all tables within each and every base.
        --list-of-paths: path1 path2 path3 ….
        path: base_id/table_id(optional)/view_id(optional)/

        To obtain (base, table, view) ids in bulk, check:
        https://airtable.com/developers/web/api/list-bases (base ids)
        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)

        To obtain specific ids from Airtable UI, go to your workspace, and copy any
        relevant id from the URL structure:
        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
        appAbcDeF1ghijKlm -> base_id
        tblABcdEfG1HIJkLm -> table_id
        viwABCDEfg6hijKLM -> view_id

        You can also check: https://support.airtable.com/docs/finding-airtable-ids

        Here is an example for one --list-of-paths:
        base1/ → gets the entirety of all tables inside base1
        base1/table1 → gets all rows and columns within table1 in base1
        base1/table1/view1 → gets the rows and columns that are
        visible in view1 for the table1 in base1

        Examples to invalid airtable_paths:
        table1 → has to mention base to be valid
        base1/view1 → has to mention table to be valid
        """,
    )

    @classmethod
    def validate_path(cls, path: str):
        """Check that *path* has the form ``base_id[/table_id[/view_id]]``.

        Trailing slashes are ignored, because the field description documents
        paths such as ``base1/``.

        Raises:
            ValueError: if the path has more than three components or any
                component is empty (e.g. ``""``, ``/base`` or ``base//view``).
        """
        components = path.rstrip("/").split("/")
        if len(components) > 3 or not all(components):
            raise ValueError(
                f"Path must be of the format: base_id/table_id/view_id, "
                f"where table id and view id are optional. Got: {path}"
            )

    @field_validator("list_of_paths")
    @classmethod
    def validate_format(cls, v: Optional[list[str]]) -> Optional[list[str]]:
        """Validate every configured path and return them without trailing slashes.

        ``None`` (no paths configured) is passed through untouched; iterating
        it would otherwise raise ``TypeError`` when a caller sets the field to
        ``None`` explicitly.
        """
        if v is None:
            return v
        normalized = []
        for path in v:
            cls.validate_path(path=path)
            # Store the canonical form so later code that splits on "/" sees
            # "base1/" as one component rather than a base plus an empty table id.
            normalized.append(path.rstrip("/"))
        return normalized
115
+
116
+
117
+ @dataclass
118
+ class AirtableIndexer(Indexer):
119
+ connector_type: str = CONNECTOR_TYPE
120
+ connection_config: AirtableConnectionConfig
121
+ index_config: AirtableIndexerConfig
122
+
123
+ def get_all_table_meta(self) -> list[AirtableTableMeta]:
124
+ client = self.connection_config.get_client()
125
+ bases = client.bases()
126
+ airtable_meta = []
127
+ for base in bases:
128
+ for table in base.schema().tables:
129
+ airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
130
+ return airtable_meta
131
+
132
+ def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
133
+ client = self.connection_config.get_client()
134
+ base = client.base(base_id=base_id)
135
+ airtable_meta = []
136
+ for table in base.tables():
137
+ airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
138
+ return airtable_meta
139
+
140
+ def get_meta_from_list(self) -> list[AirtableTableMeta]:
141
+ airtable_meta = []
142
+ for path in self.index_config.list_of_paths:
143
+ components = path.split("/")
144
+ if len(components) == 1:
145
+ airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
146
+ elif len(components) == 2:
147
+ airtable_meta.append(
148
+ AirtableTableMeta(base_id=components[0], table_id=components[1])
149
+ )
150
+ elif len(components) == 3:
151
+ airtable_meta.append(
152
+ AirtableTableMeta(
153
+ base_id=components[0], table_id=components[1], view_id=components[2]
154
+ )
155
+ )
156
+ else:
157
+ raise ValueError(
158
+ f"Path must be of the format: base_id/table_id/view_id, "
159
+ f"where table id and view id are optional. Got: {path}"
160
+ )
161
+ return airtable_meta
162
+
163
+ def get_table_metas(self) -> list[AirtableTableMeta]:
164
+ if not self.index_config.list_of_paths:
165
+ return self.get_all_table_meta()
166
+ return self.get_meta_from_list()
167
+
168
+ def precheck(self) -> None:
169
+ client = self.connection_config.get_client()
170
+ client.request(method="HEAD", url=client.build_url("meta", "bases"))
171
+
172
+ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
173
+ table_metas = self.get_table_metas()
174
+ for table_meta in table_metas:
175
+ fullpath = (
176
+ f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
177
+ if table_meta.view_id
178
+ else f"{table_meta.base_id}/{table_meta.table_id}.csv"
179
+ )
180
+ yield FileData(
181
+ identifier=table_meta.get_id(),
182
+ connector_type=CONNECTOR_TYPE,
183
+ additional_metadata=table_meta.model_dump(),
184
+ source_identifiers=SourceIdentifiers(
185
+ filename=str(Path(fullpath).name),
186
+ fullpath=fullpath,
187
+ ),
188
+ display_name=fullpath,
189
+ )
190
+
191
+
192
# Downloader settings for Airtable; adds no fields beyond those of DownloaderConfig.
class AirtableDownloaderConfig(DownloaderConfig):
    pass
194
+
195
+
196
+ @dataclass
197
+ class AirtableDownloader(Downloader):
198
+ connection_config: AirtableConnectionConfig
199
+ download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
200
+ connector_type: str = CONNECTOR_TYPE
201
+
202
+ def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
203
+ client = self.connection_config.get_client()
204
+ table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
205
+ table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
206
+ rows = table.all(**table_fetch_kwargs)
207
+ return rows
208
+
209
+ def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
210
+ row_dict = {
211
+ "id": table_row["id"],
212
+ "created_time": table_row["createdTime"],
213
+ }
214
+ row_dict.update(table_row["fields"])
215
+ return row_dict
216
+
217
+ @requires_dependencies(["pandas"], extras="airtable")
218
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
219
+ import pandas as pd
220
+
221
+ table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
222
+ table_contents = self.get_table_contents(table_meta=table_meta)
223
+ df = pd.DataFrame.from_dict(
224
+ data=[self._table_row_to_dict(table_row=row) for row in table_contents]
225
+ ).sort_index(axis=1)
226
+ download_path = self.get_download_path(file_data=file_data)
227
+ download_path.parent.mkdir(parents=True, exist_ok=True)
228
+ df.to_csv(path_or_buf=download_path)
229
+ return self.generate_download_response(file_data=file_data, download_path=download_path)
230
+
231
+
232
# Registry entry bundling the Airtable source classes defined in this module:
# connection config, indexer (+ config) and downloader (+ config).
airtable_source_entry = SourceRegistryEntry(
    connection_config=AirtableConnectionConfig,
    indexer=AirtableIndexer,
    indexer_config=AirtableIndexerConfig,
    downloader=AirtableDownloader,
    downloader_config=AirtableDownloaderConfig,
)
@@ -0,0 +1,9 @@
1
-- Creates the `elements` table when it does not exist yet; `id` is the primary key.
CREATE TABLE IF NOT EXISTS `elements` (
    id         STRING NOT NULL PRIMARY KEY,
    record_id  STRING NOT NULL,
    element_id STRING NOT NULL,
    text       STRING,
    embeddings ARRAY<FLOAT>,
    type       STRING,
    metadata   VARIANT
);
@@ -0,0 +1,23 @@
1
+ {
2
+ "properties": [
3
+ {
4
+ "dataType": [
5
+ "text"
6
+ ],
7
+ "indexFilterable": true,
8
+ "indexSearchable": true,
9
+ "name": "record_id",
10
+ "tokenization": "word"
11
+ },
12
+ {
13
+ "dataType": [
14
+ "text"
15
+ ],
16
+ "indexFilterable": true,
17
+ "indexSearchable": true,
18
+ "name": "text",
19
+ "tokenization": "word"
20
+ }
21
+ ],
22
+ "vectorizer": "none"
23
+ }