unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568):
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,82 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
- from pathlib import Path
4
- from typing import Type
5
-
6
- from unstructured_ingest.connector.fsspec.fsspec import (
7
- FsspecDestinationConnector,
8
- FsspecIngestDoc,
9
- FsspecSourceConnector,
10
- FsspecWriteConfig,
11
- SimpleFsspecConfig,
12
- )
13
- from unstructured_ingest.enhanced_dataclass import enhanced_field
14
- from unstructured_ingest.error import SourceConnectionError
15
- from unstructured_ingest.interfaces import AccessConfig
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
- from unstructured_ingest.utils.string_and_date_utils import json_to_dict
18
-
19
-
20
- @dataclass
21
- class GcsAccessConfig(AccessConfig):
22
- token: t.Optional[str] = enhanced_field(
23
- default=None, sensitive=True, overload_name="service_account_key"
24
- )
25
-
26
- def __post_init__(self):
27
- ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
28
-
29
- # Case: null value
30
- if not self.token:
31
- return
32
- # Case: one of auth constants
33
- if self.token in ALLOWED_AUTH_VALUES:
34
- return
35
- # Case: token as json
36
- if isinstance(json_to_dict(self.token), dict):
37
- self.token = json_to_dict(self.token)
38
- return
39
- # Case: path to token
40
- if Path(self.token).is_file():
41
- return
42
-
43
- raise ValueError("Invalid auth token value")
44
-
45
-
46
- @dataclass
47
- class GcsWriteConfig(FsspecWriteConfig):
48
- pass
49
-
50
-
51
- @dataclass
52
- class SimpleGcsConfig(SimpleFsspecConfig):
53
- access_config: GcsAccessConfig = None
54
-
55
-
56
- @dataclass
57
- class GcsIngestDoc(FsspecIngestDoc):
58
- connector_config: SimpleGcsConfig
59
- registry_name: str = "gcs"
60
-
61
- @SourceConnectionError.wrap
62
- @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
63
- def get_file(self):
64
- super().get_file()
65
-
66
-
67
- @dataclass
68
- class GcsSourceConnector(FsspecSourceConnector):
69
- connector_config: SimpleGcsConfig
70
-
71
- @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
72
- def initialize(self):
73
- super().initialize()
74
-
75
- def __post_init__(self):
76
- self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc
77
-
78
-
79
- @dataclass
80
- class GcsDestinationConnector(FsspecDestinationConnector):
81
- connector_config: SimpleGcsConfig
82
- write_config: GcsWriteConfig
@@ -1,62 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
- from typing import Type
4
-
5
- from unstructured_ingest.connector.fsspec.fsspec import (
6
- FsspecDestinationConnector,
7
- FsspecIngestDoc,
8
- FsspecSourceConnector,
9
- FsspecWriteConfig,
10
- SimpleFsspecConfig,
11
- )
12
- from unstructured_ingest.enhanced_dataclass import enhanced_field
13
- from unstructured_ingest.interfaces import AccessConfig
14
- from unstructured_ingest.utils.dep_check import requires_dependencies
15
-
16
-
17
- @dataclass
18
- class S3AccessConfig(AccessConfig):
19
- anon: bool = enhanced_field(default=False, overload_name="anonymous")
20
- endpoint_url: t.Optional[str] = None
21
- key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
22
- secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
23
- token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
24
-
25
-
26
- @dataclass
27
- class S3WriteConfig(FsspecWriteConfig):
28
- pass
29
-
30
-
31
- @dataclass
32
- class SimpleS3Config(SimpleFsspecConfig):
33
- access_config: S3AccessConfig = enhanced_field(default=None)
34
-
35
-
36
- @dataclass
37
- class S3IngestDoc(FsspecIngestDoc):
38
- connector_config: SimpleS3Config
39
- remote_file_path: str
40
- registry_name: str = "s3"
41
-
42
- @requires_dependencies(["s3fs", "fsspec"], extras="s3")
43
- def get_file(self):
44
- super().get_file()
45
-
46
-
47
- @dataclass
48
- class S3SourceConnector(FsspecSourceConnector):
49
- connector_config: SimpleS3Config
50
-
51
- def __post_init__(self):
52
- self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc
53
-
54
-
55
- @dataclass
56
- class S3DestinationConnector(FsspecDestinationConnector):
57
- connector_config: SimpleS3Config
58
- write_config: S3WriteConfig
59
-
60
- @requires_dependencies(["s3fs", "fsspec"], extras="s3")
61
- def initialize(self):
62
- super().initialize()
@@ -1,81 +0,0 @@
1
- import os
2
- from dataclasses import dataclass
3
- from pathlib import Path
4
- from typing import Type
5
- from urllib.parse import urlparse
6
-
7
- from unstructured_ingest.connector.fsspec.fsspec import (
8
- FsspecIngestDoc,
9
- FsspecSourceConnector,
10
- SimpleFsspecConfig,
11
- )
12
- from unstructured_ingest.enhanced_dataclass import enhanced_field
13
- from unstructured_ingest.error import SourceConnectionError
14
- from unstructured_ingest.interfaces import AccessConfig
15
- from unstructured_ingest.logger import logger
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
-
18
-
19
- @dataclass
20
- class SftpAccessConfig(AccessConfig):
21
- username: str
22
- password: str = enhanced_field(sensitive=True)
23
- host: str = ""
24
- port: int = 22
25
- look_for_keys: bool = False
26
- allow_agent: bool = False
27
-
28
-
29
- @dataclass
30
- class SimpleSftpConfig(SimpleFsspecConfig):
31
- access_config: SftpAccessConfig = None
32
-
33
- def __post_init__(self):
34
- super().__post_init__()
35
-
36
- _, ext = os.path.splitext(self.remote_url)
37
- parsed_url = urlparse(self.remote_url)
38
- if ext:
39
- # We only want the file_path if it has an extension
40
- self.file_path = Path(self.remote_url).name
41
- self.dir_path = Path(parsed_url.path).parent.as_posix().lstrip("/")
42
- self.path_without_protocol = self.dir_path
43
- else:
44
- self.file_path = ""
45
- self.dir_path = parsed_url.path.lstrip("/")
46
- self.path_without_protocol = self.dir_path
47
- self.access_config.host = parsed_url.hostname or self.access_config.host
48
- self.access_config.port = parsed_url.port or self.access_config.port
49
-
50
-
51
- @dataclass
52
- class SftpIngestDoc(FsspecIngestDoc):
53
- connector_config: SimpleSftpConfig
54
- registry_name: str = "sftp"
55
-
56
- @SourceConnectionError.wrap
57
- @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
58
- def get_file(self):
59
- super().get_file()
60
-
61
-
62
- @dataclass
63
- class SftpSourceConnector(FsspecSourceConnector):
64
- connector_config: SimpleSftpConfig
65
-
66
- @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
67
- def initialize(self):
68
- super().initialize()
69
-
70
- @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
71
- def check_connection(self):
72
- from fsspec.implementations.sftp import SFTPFileSystem
73
-
74
- try:
75
- SFTPFileSystem(**self.connector_config.get_access_config())
76
- except Exception as e:
77
- logger.error(f"failed to validate connection: {e}", exc_info=True)
78
- raise SourceConnectionError(f"failed to validate connection: {e}")
79
-
80
- def __post_init__(self):
81
- self.ingest_doc_cls: Type[SftpIngestDoc] = SftpIngestDoc
@@ -1,124 +0,0 @@
1
- import fnmatch
2
- import typing as t
3
- from dataclasses import dataclass, field
4
- from pathlib import Path
5
-
6
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
- from unstructured_ingest.error import SourceConnectionError
8
- from unstructured_ingest.interfaces import (
9
- AccessConfig,
10
- BaseConnectorConfig,
11
- BaseSingleIngestDoc,
12
- BaseSourceConnector,
13
- IngestDocCleanupMixin,
14
- SourceConnectorCleanupMixin,
15
- )
16
- from unstructured_ingest.logger import logger
17
-
18
-
19
- @dataclass
20
- class GitAccessConfig(AccessConfig):
21
- access_token: t.Optional[str] = enhanced_field(
22
- default=None, sensitive=True, overload_name="git_access_token"
23
- )
24
-
25
-
26
- @dataclass
27
- class SimpleGitConfig(BaseConnectorConfig):
28
- url: str
29
- access_config: GitAccessConfig
30
- branch: t.Optional[str] = enhanced_field(default=None, overload_name="git_branch")
31
- file_glob: t.Optional[t.List[str]] = enhanced_field(default=None, overload_name="git_file_glob")
32
- repo_path: str = field(init=False, repr=False)
33
-
34
-
35
- @dataclass
36
- class GitIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
37
- connector_config: SimpleGitConfig = field(repr=False)
38
- path: str
39
-
40
- @property
41
- def filename(self):
42
- return (Path(self.read_config.download_dir) / self.path).resolve()
43
-
44
- @property
45
- def _output_filename(self):
46
- return Path(self.processor_config.output_dir) / f"{self.path}.json"
47
-
48
- @property
49
- def record_locator(self) -> t.Dict[str, t.Any]:
50
- record_locator = {
51
- "repo_path": self.connector_config.repo_path,
52
- "file_path": self.path,
53
- }
54
- if self.connector_config.branch is not None:
55
- record_locator["branch"] = self.connector_config.branch
56
- return record_locator
57
-
58
- def _create_full_tmp_dir_path(self):
59
- """includes directories in in the gitlab repository"""
60
- self.filename.parent.mkdir(parents=True, exist_ok=True)
61
-
62
- def update_source_metadata(self, **kwargs):
63
- raise NotImplementedError()
64
-
65
- @SourceConnectionError.wrap
66
- @BaseSingleIngestDoc.skip_if_file_exists
67
- def get_file(self):
68
- """Fetches the "remote" doc and stores it locally on the filesystem."""
69
- self._create_full_tmp_dir_path()
70
- self._fetch_and_write()
71
-
72
- def _fetch_content(self) -> None:
73
- raise NotImplementedError()
74
-
75
- def _fetch_and_write(self) -> None:
76
- raise NotImplementedError()
77
-
78
-
79
- @dataclass
80
- class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
81
- connector_config: SimpleGitConfig
82
-
83
- def initialize(self):
84
- pass
85
-
86
- def check_connection(self):
87
- pass
88
-
89
- @staticmethod
90
- def is_file_type_supported(path: str) -> bool:
91
- # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
92
- # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
93
- supported = path.endswith(
94
- (
95
- ".md",
96
- ".txt",
97
- ".pdf",
98
- ".doc",
99
- ".docx",
100
- ".eml",
101
- ".heic",
102
- ".html",
103
- ".png",
104
- ".jpg",
105
- ".ppt",
106
- ".pptx",
107
- ".xml",
108
- ),
109
- )
110
- if not supported:
111
- logger.debug(
112
- f"The file {path!r} is discarded as it does not contain a supported filetype.",
113
- )
114
- return supported
115
-
116
- def does_path_match_glob(self, path: str) -> bool:
117
- if not self.connector_config.file_glob:
118
- return True
119
- patterns = self.connector_config.file_glob
120
- for pattern in patterns:
121
- if fnmatch.filter([path], pattern):
122
- return True
123
- logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
124
- return False
@@ -1,174 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
- from datetime import datetime
4
- from urllib.parse import urlparse
5
-
6
- from unstructured_ingest.connector.git import (
7
- GitIngestDoc,
8
- GitSourceConnector,
9
- SimpleGitConfig,
10
- )
11
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
- from unstructured_ingest.interfaces import SourceMetadata
13
- from unstructured_ingest.logger import logger
14
- from unstructured_ingest.utils.dep_check import requires_dependencies
15
-
16
- if t.TYPE_CHECKING:
17
- from github.Repository import Repository
18
-
19
-
20
- @dataclass
21
- class SimpleGitHubConfig(SimpleGitConfig):
22
- def __post_init__(self):
23
- parsed_gh_url = urlparse(self.url)
24
- path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment]
25
-
26
- # If a scheme and netloc are provided, ensure they are correct
27
- # Additionally, ensure that the path contains two fragments
28
- if (
29
- (parsed_gh_url.scheme and parsed_gh_url.scheme != "https")
30
- or (parsed_gh_url.netloc and parsed_gh_url.netloc != "github.com")
31
- or len(path_fragments) != 2
32
- ):
33
- raise ValueError(
34
- 'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
35
- ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
36
- )
37
-
38
- # If there's no issues, store the core repository info
39
- self.repo_path = parsed_gh_url.path
40
-
41
- @SourceConnectionError.wrap
42
- @requires_dependencies(["github"], extras="github")
43
- def get_repo(self) -> "Repository":
44
- from github import Github
45
-
46
- github = Github(self.access_config.access_token)
47
- return github.get_repo(self.repo_path)
48
-
49
-
50
- @dataclass
51
- class GitHubIngestDoc(GitIngestDoc):
52
- connector_config: SimpleGitHubConfig
53
- registry_name: str = "github"
54
-
55
- @property
56
- def date_created(self) -> t.Optional[str]:
57
- return None
58
-
59
- @requires_dependencies(["github"], extras="github")
60
- def _fetch_file(self):
61
- from github.GithubException import UnknownObjectException
62
-
63
- try:
64
- content_file = self.connector_config.get_repo().get_contents(self.path)
65
- except UnknownObjectException:
66
- logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
67
- return None
68
-
69
- return content_file
70
-
71
- @SourceConnectionNetworkError.wrap
72
- @requires_dependencies(["requests"], extras="github")
73
- def _fetch_content(self, content_file):
74
- import requests
75
-
76
- contents = b""
77
- if (
78
- not content_file.content # type: ignore
79
- and content_file.encoding == "none" # type: ignore
80
- and content_file.size # type: ignore
81
- ):
82
- logger.info("File too large for the GitHub API, using direct download link instead.")
83
- # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors?
84
- response = requests.get(content_file.download_url) # type: ignore
85
- if response.status_code != 200:
86
- logger.info("Direct download link has failed... Skipping this file.")
87
- return None
88
- else:
89
- contents = response.content
90
- else:
91
- contents = content_file.decoded_content # type: ignore
92
- return contents
93
-
94
- def update_source_metadata(self, **kwargs):
95
- content_file = kwargs.get("content_file", self._fetch_file())
96
- if content_file is None:
97
- self.source_metadata = SourceMetadata(
98
- exists=False,
99
- )
100
- return
101
-
102
- date_modified = datetime.strptime(
103
- content_file.last_modified,
104
- "%a, %d %b %Y %H:%M:%S %Z",
105
- ).isoformat()
106
- self.source_metadata = SourceMetadata(
107
- date_modified=date_modified,
108
- version=content_file.etag,
109
- source_url=content_file.download_url,
110
- exists=True,
111
- )
112
-
113
- def _fetch_and_write(self) -> None:
114
- content_file = self._fetch_file()
115
- self.update_source_metadata(content_file=content_file)
116
- contents = self._fetch_content(content_file)
117
- if contents is None:
118
- raise ValueError(
119
- f"Failed to retrieve file from repo "
120
- f"{self.connector_config.url}/{self.path}. Check logs",
121
- )
122
- with open(self.filename, "wb") as f:
123
- f.write(contents)
124
-
125
-
126
- @dataclass
127
- class GitHubSourceConnector(GitSourceConnector):
128
- connector_config: SimpleGitHubConfig
129
-
130
- @requires_dependencies(["github"], extras="github")
131
- def check_connection(self):
132
- from github import Consts
133
- from github.GithubRetry import GithubRetry
134
- from github.Requester import Requester
135
-
136
- try:
137
- requester = Requester(
138
- auth=self.connector_config.access_config.access_token,
139
- base_url=Consts.DEFAULT_BASE_URL,
140
- timeout=Consts.DEFAULT_TIMEOUT,
141
- user_agent=Consts.DEFAULT_USER_AGENT,
142
- per_page=Consts.DEFAULT_PER_PAGE,
143
- verify=True,
144
- retry=GithubRetry(),
145
- pool_size=None,
146
- )
147
- url_base = (
148
- "/repositories/" if isinstance(self.connector_config.repo_path, int) else "/repos/"
149
- )
150
- url = f"{url_base}{self.connector_config.repo_path}"
151
- headers, _ = requester.requestJsonAndCheck("HEAD", url)
152
- logger.debug(f"headers from HEAD request: {headers}")
153
- except Exception as e:
154
- logger.error(f"failed to validate connection: {e}", exc_info=True)
155
- raise SourceConnectionError(f"failed to validate connection: {e}")
156
-
157
- def get_ingest_docs(self):
158
- repo = self.connector_config.get_repo()
159
- # Load the Git tree with all files, and then create Ingest docs
160
- # for all blobs, i.e. all files, ignoring directories
161
- sha = self.connector_config.branch or repo.default_branch
162
- git_tree = repo.get_git_tree(sha, recursive=True)
163
- return [
164
- GitHubIngestDoc(
165
- connector_config=self.connector_config,
166
- processor_config=self.processor_config,
167
- read_config=self.read_config,
168
- path=element.path,
169
- )
170
- for element in git_tree.tree
171
- if element.type == "blob"
172
- and self.is_file_type_supported(element.path)
173
- and (not self.connector_config.file_glob or self.does_path_match_glob(element.path))
174
- ]
@@ -1,142 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
- from urllib.parse import urlparse
4
-
5
- from unstructured_ingest.connector.git import (
6
- GitIngestDoc,
7
- GitSourceConnector,
8
- SimpleGitConfig,
9
- )
10
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
- from unstructured_ingest.interfaces import SourceMetadata
12
- from unstructured_ingest.logger import logger
13
- from unstructured_ingest.utils.dep_check import requires_dependencies
14
-
15
- if t.TYPE_CHECKING:
16
- from gitlab.v4.objects.projects import Project
17
-
18
-
19
- @dataclass
20
- class SimpleGitlabConfig(SimpleGitConfig):
21
- base_url: str = "https://gitlab.com"
22
-
23
- def __post_init__(self):
24
- parsed_gh_url = urlparse(self.url)
25
- # If a scheme or netloc are provided, use the parsed base url
26
- if parsed_gh_url.scheme or parsed_gh_url.netloc:
27
- self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
28
- self.repo_path = parsed_gh_url.path
29
- while self.repo_path.startswith("/"):
30
- self.repo_path = self.repo_path[1:]
31
-
32
- @SourceConnectionError.wrap
33
- @requires_dependencies(["gitlab"], extras="gitlab")
34
- def get_project(self) -> "Project":
35
- from gitlab import Gitlab
36
-
37
- gitlab = Gitlab(self.base_url, private_token=self.access_config.access_token)
38
- return gitlab.projects.get(self.repo_path)
39
-
40
-
41
- @dataclass
42
- class GitLabIngestDoc(GitIngestDoc):
43
- connector_config: SimpleGitlabConfig
44
- registry_name: str = "gitlab"
45
-
46
- @property
47
- def date_created(self) -> t.Optional[str]:
48
- return None
49
-
50
- @property
51
- def date_modified(self) -> t.Optional[str]:
52
- return None
53
-
54
- @property
55
- def source_url(self) -> t.Optional[str]:
56
- return None
57
-
58
- @SourceConnectionNetworkError.wrap
59
- @requires_dependencies(["gitlab"], extras="gitlab")
60
- def _fetch_content(self):
61
- from gitlab.exceptions import GitlabHttpError
62
-
63
- try:
64
- project = self.connector_config.get_project()
65
- content_file = project.files.get(
66
- self.path,
67
- ref=self.connector_config.branch or project.default_branch,
68
- )
69
- except GitlabHttpError as e:
70
- if e.response_code == 404:
71
- logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
72
- return None
73
- raise
74
- return content_file
75
-
76
- def update_source_metadata(self, **kwargs):
77
- content_file = kwargs.get("content_file", self._fetch_content())
78
- if content_file is None:
79
- self.source_metadata = SourceMetadata(
80
- exists=None,
81
- )
82
- return
83
- self.source_metadata = SourceMetadata(
84
- version=content_file.attributes.get("last_commit_id", ""),
85
- exists=True,
86
- )
87
-
88
- def _fetch_and_write(self) -> None:
89
- content_file = self._fetch_content()
90
- self.update_source_metadata(content_file=content_file)
91
- if content_file is None:
92
- raise ValueError(
93
- f"Failed to retrieve file from repo "
94
- f"{self.connector_config.url}/{self.path}. Check logs.",
95
- )
96
- contents = content_file.decode()
97
- with open(self.filename, "wb") as f:
98
- f.write(contents)
99
-
100
-
101
- @dataclass
102
- class GitLabSourceConnector(GitSourceConnector):
103
- connector_config: SimpleGitlabConfig
104
-
105
- @requires_dependencies(["gitlab"], extras="gitlab")
106
- def check_connection(self):
107
- from gitlab import Gitlab
108
- from gitlab.exceptions import GitlabError
109
-
110
- try:
111
- gitlab = Gitlab(
112
- self.connector_config.base_url,
113
- private_token=self.connector_config.access_config.access_token,
114
- )
115
- gitlab.auth()
116
- except GitlabError as gitlab_error:
117
- logger.error(f"failed to validate connection: {gitlab_error}", exc_info=True)
118
- raise SourceConnectionError(f"failed to validate connection: {gitlab_error}")
119
-
120
- def get_ingest_docs(self):
121
- # Load the Git tree with all files, and then create Ingest docs
122
- # for all blobs, i.e. all files, ignoring directories
123
- project = self.connector_config.get_project()
124
- ref = self.connector_config.branch or project.default_branch
125
- git_tree = project.repository_tree(
126
- ref=ref,
127
- recursive=True,
128
- iterator=True,
129
- all=True,
130
- )
131
- return [
132
- GitLabIngestDoc(
133
- connector_config=self.connector_config,
134
- processor_config=self.processor_config,
135
- read_config=self.read_config,
136
- path=element["path"],
137
- )
138
- for element in git_tree
139
- if element["type"] == "blob"
140
- and self.is_file_type_supported(element["path"])
141
- and (not self.connector_config.file_glob or self.does_path_match_glob(element["path"]))
142
- ]