unstructured-ingest 0.6.2 (py3-none-any.whl) → 0.7.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,4 @@
1
- from unstructured_ingest.v2.processes.connector_registry import (
1
+ from unstructured_ingest.processes.connector_registry import (
2
2
  add_destination_entry,
3
3
  add_source_entry,
4
4
  )
@@ -8,20 +8,20 @@ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
8
8
 
9
9
  from pydantic import BaseModel, Field, Secret, SecretStr, field_validator
10
10
 
11
+ from unstructured_ingest.data_types.file_data import (
12
+ BatchFileData,
13
+ BatchItem,
14
+ FileData,
15
+ FileDataSourceMetadata,
16
+ SourceIdentifiers,
17
+ )
11
18
  from unstructured_ingest.error import (
12
19
  DestinationConnectionError,
13
20
  SourceConnectionError,
14
21
  SourceConnectionNetworkError,
15
22
  WriteError,
16
23
  )
17
- from unstructured_ingest.utils.data_prep import (
18
- batch_generator,
19
- flatten_dict,
20
- generator_batching_wbytes,
21
- )
22
- from unstructured_ingest.utils.dep_check import requires_dependencies
23
- from unstructured_ingest.v2.constants import RECORD_ID_LABEL
24
- from unstructured_ingest.v2.interfaces import (
24
+ from unstructured_ingest.interfaces import (
25
25
  AccessConfig,
26
26
  ConnectionConfig,
27
27
  Downloader,
@@ -35,19 +35,19 @@ from unstructured_ingest.v2.interfaces import (
35
35
  UploadStagerConfig,
36
36
  download_responses,
37
37
  )
38
- from unstructured_ingest.v2.logger import logger
39
- from unstructured_ingest.v2.processes.connector_registry import (
38
+ from unstructured_ingest.logger import logger
39
+ from unstructured_ingest.processes.connector_registry import (
40
40
  DestinationRegistryEntry,
41
41
  SourceRegistryEntry,
42
42
  )
43
- from unstructured_ingest.v2.types.file_data import (
44
- BatchFileData,
45
- BatchItem,
46
- FileData,
47
- FileDataSourceMetadata,
48
- SourceIdentifiers,
43
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
44
+ from unstructured_ingest.utils.data_prep import (
45
+ batch_generator,
46
+ flatten_dict,
47
+ generator_batching_wbytes,
48
+ get_enhanced_element_id,
49
49
  )
50
- from unstructured_ingest.v2.utils import get_enhanced_element_id
50
+ from unstructured_ingest.utils.dep_check import requires_dependencies
51
51
 
52
52
  if TYPE_CHECKING:
53
53
  from elasticsearch import Elasticsearch as ElasticsearchClient
@@ -7,17 +7,16 @@ from pydantic import BaseModel, Field, Secret, field_validator
7
7
  from unstructured_ingest.error import (
8
8
  DestinationConnectionError,
9
9
  )
10
- from unstructured_ingest.utils.dep_check import requires_dependencies
11
- from unstructured_ingest.v2.interfaces import (
10
+ from unstructured_ingest.interfaces import (
12
11
  AccessConfig,
13
12
  ConnectionConfig,
14
13
  )
15
- from unstructured_ingest.v2.logger import logger
16
- from unstructured_ingest.v2.processes.connector_registry import (
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
17
16
  DestinationRegistryEntry,
18
17
  SourceRegistryEntry,
19
18
  )
20
- from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
19
+ from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
21
20
  ElasticsearchDownloader,
22
21
  ElasticsearchDownloaderConfig,
23
22
  ElasticsearchIndexer,
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch imp
27
26
  ElasticsearchUploadStager,
28
27
  ElasticsearchUploadStagerConfig,
29
28
  )
29
+ from unstructured_ingest.utils.dep_check import requires_dependencies
30
30
 
31
31
  if TYPE_CHECKING:
32
32
  from opensearchpy import OpenSearch
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from unstructured_ingest.v2.processes.connector_registry import (
3
+ from unstructured_ingest.processes.connector_registry import (
4
4
  add_destination_entry,
5
5
  add_source_entry,
6
6
  )
@@ -7,14 +7,14 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
7
7
 
8
8
  from pydantic import Field, Secret
9
9
 
10
- from unstructured_ingest.utils.dep_check import requires_dependencies
11
- from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
12
- from unstructured_ingest.v2.logger import logger
13
- from unstructured_ingest.v2.processes.connector_registry import (
10
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
11
+ from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.processes.connector_registry import (
14
14
  DestinationRegistryEntry,
15
15
  SourceRegistryEntry,
16
16
  )
17
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
17
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
18
18
  FsspecAccessConfig,
19
19
  FsspecConnectionConfig,
20
20
  FsspecDownloader,
@@ -24,12 +24,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
24
24
  FsspecUploader,
25
25
  FsspecUploaderConfig,
26
26
  )
27
- from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
28
- from unstructured_ingest.v2.processes.utils.blob_storage import (
27
+ from unstructured_ingest.processes.connectors.fsspec.utils import json_serial, sterilize_dict
28
+ from unstructured_ingest.processes.utils.blob_storage import (
29
29
  BlobStoreUploadStager,
30
30
  BlobStoreUploadStagerConfig,
31
31
  )
32
- from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  from adlfs import AzureBlobFileSystem
@@ -9,14 +9,14 @@ from dateutil import parser
9
9
  from pydantic import Field, Secret
10
10
  from pydantic.functional_validators import BeforeValidator
11
11
 
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
14
- from unstructured_ingest.v2.logger import logger
15
- from unstructured_ingest.v2.processes.connector_registry import (
12
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
13
+ from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
16
16
  DestinationRegistryEntry,
17
17
  SourceRegistryEntry,
18
18
  )
19
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
19
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
20
20
  FsspecAccessConfig,
21
21
  FsspecConnectionConfig,
22
22
  FsspecDownloader,
@@ -26,12 +26,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
26
  FsspecUploader,
27
27
  FsspecUploaderConfig,
28
28
  )
29
- from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
30
- from unstructured_ingest.v2.processes.utils.blob_storage import (
29
+ from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
30
+ from unstructured_ingest.processes.utils.blob_storage import (
31
31
  BlobStoreUploadStager,
32
32
  BlobStoreUploadStagerConfig,
33
33
  )
34
- from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
34
+ from unstructured_ingest.utils.dep_check import requires_dependencies
35
35
 
36
36
  if TYPE_CHECKING:
37
37
  from boxfs import BoxFileSystem
@@ -6,21 +6,21 @@ from typing import TYPE_CHECKING, Any, Optional
6
6
 
7
7
  from pydantic import Field, Secret
8
8
 
9
- from unstructured_ingest.utils.dep_check import requires_dependencies
10
- from unstructured_ingest.v2.errors import (
9
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
10
+ from unstructured_ingest.errors_v2 import (
11
11
  ProviderError,
12
12
  UserAuthError,
13
13
  UserError,
14
14
  )
15
- from unstructured_ingest.v2.errors import (
15
+ from unstructured_ingest.errors_v2 import (
16
16
  RateLimitError as CustomRateLimitError,
17
17
  )
18
- from unstructured_ingest.v2.logger import logger
19
- from unstructured_ingest.v2.processes.connector_registry import (
18
+ from unstructured_ingest.logger import logger
19
+ from unstructured_ingest.processes.connector_registry import (
20
20
  DestinationRegistryEntry,
21
21
  SourceRegistryEntry,
22
22
  )
23
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
23
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
24
24
  FsspecAccessConfig,
25
25
  FsspecConnectionConfig,
26
26
  FsspecDownloader,
@@ -30,11 +30,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
30
30
  FsspecUploader,
31
31
  FsspecUploaderConfig,
32
32
  )
33
- from unstructured_ingest.v2.processes.utils.blob_storage import (
33
+ from unstructured_ingest.processes.utils.blob_storage import (
34
34
  BlobStoreUploadStager,
35
35
  BlobStoreUploadStagerConfig,
36
36
  )
37
- from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
37
+ from unstructured_ingest.utils.dep_check import requires_dependencies
38
38
 
39
39
  if TYPE_CHECKING:
40
40
  pass
@@ -12,7 +12,12 @@ from uuid import NAMESPACE_DNS, uuid5
12
12
 
13
13
  from pydantic import BaseModel, Field, Secret
14
14
 
15
- from unstructured_ingest.v2.interfaces import (
15
+ from unstructured_ingest.data_types.file_data import (
16
+ FileData,
17
+ FileDataSourceMetadata,
18
+ SourceIdentifiers,
19
+ )
20
+ from unstructured_ingest.interfaces import (
16
21
  AccessConfig,
17
22
  ConnectionConfig,
18
23
  Downloader,
@@ -23,13 +28,8 @@ from unstructured_ingest.v2.interfaces import (
23
28
  Uploader,
24
29
  UploaderConfig,
25
30
  )
26
- from unstructured_ingest.v2.logger import logger
27
- from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
28
- from unstructured_ingest.v2.types.file_data import (
29
- FileData,
30
- FileDataSourceMetadata,
31
- SourceIdentifiers,
32
- )
31
+ from unstructured_ingest.logger import logger
32
+ from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  from fsspec import AbstractFileSystem
@@ -9,15 +9,14 @@ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
9
9
  from dateutil import parser
10
10
  from pydantic import Field, Secret
11
11
 
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.utils.string_and_date_utils import json_to_dict
14
- from unstructured_ingest.v2.errors import ProviderError, UserError
15
- from unstructured_ingest.v2.logger import logger
16
- from unstructured_ingest.v2.processes.connector_registry import (
12
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
13
+ from unstructured_ingest.errors_v2 import ProviderError, UserError
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
17
16
  DestinationRegistryEntry,
18
17
  SourceRegistryEntry,
19
18
  )
20
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
19
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
21
20
  FsspecAccessConfig,
22
21
  FsspecConnectionConfig,
23
22
  FsspecDownloader,
@@ -27,11 +26,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
27
26
  FsspecUploader,
28
27
  FsspecUploaderConfig,
29
28
  )
30
- from unstructured_ingest.v2.processes.utils.blob_storage import (
29
+ from unstructured_ingest.processes.utils.blob_storage import (
31
30
  BlobStoreUploadStager,
32
31
  BlobStoreUploadStagerConfig,
33
32
  )
34
- from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
33
+ from unstructured_ingest.utils.dep_check import requires_dependencies
34
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
35
35
 
36
36
  if TYPE_CHECKING:
37
37
  from gcsfs import GCSFileSystem
@@ -6,14 +6,16 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
6
6
 
7
7
  from pydantic import Field, Secret
8
8
 
9
- from unstructured_ingest.utils.dep_check import requires_dependencies
10
- from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
11
- from unstructured_ingest.v2.logger import logger
12
- from unstructured_ingest.v2.processes.connector_registry import (
9
+ from unstructured_ingest.data_types.file_data import (
10
+ FileDataSourceMetadata,
11
+ )
12
+ from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
13
+ from unstructured_ingest.logger import logger
14
+ from unstructured_ingest.processes.connector_registry import (
13
15
  DestinationRegistryEntry,
14
16
  SourceRegistryEntry,
15
17
  )
16
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
18
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
17
19
  FsspecAccessConfig,
18
20
  FsspecConnectionConfig,
19
21
  FsspecDownloader,
@@ -23,13 +25,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
23
25
  FsspecUploader,
24
26
  FsspecUploaderConfig,
25
27
  )
26
- from unstructured_ingest.v2.processes.utils.blob_storage import (
28
+ from unstructured_ingest.processes.utils.blob_storage import (
27
29
  BlobStoreUploadStager,
28
30
  BlobStoreUploadStagerConfig,
29
31
  )
30
- from unstructured_ingest.v2.types.file_data import (
31
- FileDataSourceMetadata,
32
- )
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
33
 
34
34
  CONNECTOR_TYPE = "s3"
35
35
 
@@ -10,12 +10,12 @@ from urllib.parse import urlparse
10
10
 
11
11
  from pydantic import Field, Secret
12
12
 
13
- from unstructured_ingest.utils.dep_check import requires_dependencies
14
- from unstructured_ingest.v2.processes.connector_registry import (
13
+ from unstructured_ingest.data_types.file_data import FileData, FileDataSourceMetadata
14
+ from unstructured_ingest.processes.connector_registry import (
15
15
  DestinationRegistryEntry,
16
16
  SourceRegistryEntry,
17
17
  )
18
- from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
18
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
19
19
  FsspecAccessConfig,
20
20
  FsspecConnectionConfig,
21
21
  FsspecDownloader,
@@ -25,11 +25,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
25
25
  FsspecUploader,
26
26
  FsspecUploaderConfig,
27
27
  )
28
- from unstructured_ingest.v2.processes.utils.blob_storage import (
28
+ from unstructured_ingest.processes.utils.blob_storage import (
29
29
  BlobStoreUploadStager,
30
30
  BlobStoreUploadStagerConfig,
31
31
  )
32
- from unstructured_ingest.v2.types.file_data import FileData, FileDataSourceMetadata
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  from fsspec.implementations.sftp import SFTPFileSystem
@@ -0,0 +1,221 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from time import time
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+ from urllib.parse import urlparse
6
+ from uuid import NAMESPACE_DNS, uuid5
7
+
8
+ from pydantic import Field, Secret, field_validator
9
+
10
+ from unstructured_ingest.data_types.file_data import (
11
+ FileData,
12
+ FileDataSourceMetadata,
13
+ SourceIdentifiers,
14
+ )
15
+ from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
16
+ from unstructured_ingest.interfaces import (
17
+ AccessConfig,
18
+ ConnectionConfig,
19
+ Downloader,
20
+ DownloaderConfig,
21
+ Indexer,
22
+ IndexerConfig,
23
+ download_responses,
24
+ )
25
+ from unstructured_ingest.logger import logger
26
+ from unstructured_ingest.processes.connector_registry import (
27
+ SourceRegistryEntry,
28
+ )
29
+ from unstructured_ingest.utils.dep_check import requires_dependencies
30
+
31
+ if TYPE_CHECKING:
32
+ from github import ContentFile, GitTreeElement, Repository
33
+ from github import Github as GithubClient
34
+ from github.GithubException import GithubException
35
+ from requests import HTTPError
36
+
37
# Connector identifier; used below as the default `connector_type` of both
# GithubIndexer and GithubDownloader.
CONNECTOR_TYPE = "github"
38
+
39
+
40
class GithubAccessConfig(AccessConfig):
    """Credentials for Github: the access token handed to the Github client."""

    access_token: str = Field(
        description="Github acess token",
    )
42
+
43
+
44
class GithubConnectionConfig(ConnectionConfig):
    """Connection settings for a single Github repository.

    ``url`` accepts either a full Github url or an ``owner/name`` pair; it is
    reduced to the bare ``owner/name`` form when the model is validated.
    """

    access_config: Secret[GithubAccessConfig]
    url: str = Field(description="Github url or repository owner/name pair")

    @field_validator("url", mode="after")
    def conform_url(cls, value: str):
        """Reduce ``value`` to the repository path (``owner/name``).

        Surrounding slashes are stripped so that a full url such as
        ``https://github.com/owner/name/`` yields ``owner/name`` instead of
        ``/owner/name/``; the latter would put a doubled slash into the url
        built by ``get_full_url`` and into the name passed to ``get_repo``.
        """
        parsed_url = urlparse(value)
        return parsed_url.path.strip("/")

    def get_full_url(self):
        """Return the browser url of the repository on github.com."""
        return f"https://github.com/{self.url}"

    @requires_dependencies(["github"], extras="github")
    def get_client(self) -> "GithubClient":
        """Build a Github client authenticated with the configured access token."""
        from github import Github as GithubClient

        return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)

    def get_repo(self) -> "Repository":
        """Create a fresh client and look up the repository named by ``url``."""
        client = self.get_client()
        return client.get_repo(self.url)

    def wrap_github_exception(self, e: "GithubException") -> Exception:
        """Translate a ``GithubException`` into one of the connector error types.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for any other 4xx,
            ``ProviderError`` for 5xx, otherwise the original exception.
        """
        data = e.data
        status_code = e.status
        # The payload is presumably the decoded JSON error body; fall back to
        # the raw payload when it is not a mapping instead of failing inside
        # the error handler itself.
        message = data.get("message") if isinstance(data, dict) else data
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {message}")
        if 400 <= status_code < 500:
            return UserError(message)
        # `>=` so that a plain 500 is also reported as a provider failure.
        if status_code >= 500:
            return ProviderError(message)
        logger.debug(f"unhandled github error: {e}")
        return e

    def wrap_http_error(self, e: "HTTPError") -> Exception:
        """Translate a ``requests`` ``HTTPError`` into one of the connector error types.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for any other 4xx,
            ``ProviderError`` for 5xx, otherwise the original exception.
        """
        status_code = e.response.status_code
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
        if 400 <= status_code < 500:
            return UserError(e.response.text)
        # `>=` so that a plain 500 is also reported as a provider failure.
        if status_code >= 500:
            return ProviderError(e.response.text)
        logger.debug(f"unhandled http error: {e}")
        return e

    # Both packages are imported in the body, so both are declared here.
    @requires_dependencies(["github", "requests"], extras="github")
    def wrap_error(self, e: Exception) -> Exception:
        """Map ``e`` onto a connector error type when it is a known Github/HTTP error.

        Any other exception is logged at debug level and returned unchanged,
        so callers can always ``raise`` the result.
        """
        from github.GithubException import GithubException
        from requests import HTTPError

        if isinstance(e, GithubException):
            return self.wrap_github_exception(e=e)
        if isinstance(e, HTTPError):
            return self.wrap_http_error(e=e)
        logger.debug(f"unhandled error: {e}")
        return e
101
+
102
+
103
class GithubIndexerConfig(IndexerConfig):
    """Options controlling which part of the repository tree is indexed."""

    branch: Optional[str] = Field(
        default=None,
        description="Branch to index, use the default if one isn't provided",
    )
    recursive: bool = Field(
        default=True,
        description="Recursively index all files in the repository",
    )
110
+
111
+
112
@dataclass
class GithubIndexer(Indexer):
    """Lists the files of a Github repository branch as ``FileData`` records."""

    connection_config: GithubConnectionConfig
    index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify the repository can be reached with the configured credentials.

        Raises:
            Exception: whatever ``wrap_error`` maps the underlying failure to.
        """
        try:
            self.connection_config.get_repo()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)

    def get_branch(self) -> str:
        """Return the configured branch, or the repository's default branch."""
        # Only fetch the repository when its default branch is actually needed.
        if self.index_config.branch:
            return self.index_config.branch
        return self.connection_config.get_repo().default_branch

    def list_files(self) -> list["GitTreeElement"]:
        """Return the tree elements of the branch that have a non-zero size.

        Elements without a size, and empty files, are filtered out.
        """
        repo = self.connection_config.get_repo()
        sha = self.index_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
        return [
            element for element in git_tree.tree if element.size is not None and element.size > 0
        ]

    def convert_element(
        self, element: "GitTreeElement", branch: Optional[str] = None
    ) -> FileData:
        """Build the ``FileData`` record describing ``element``.

        Args:
            element: tree element returned by ``list_files``.
            branch: branch the element was listed from. Resolved through
                ``get_branch`` when omitted; ``run`` passes it in so the
                lookup happens once per run rather than per element.
        """
        if branch is None:
            branch = self.get_branch()
        full_path = f"{self.connection_config.get_full_url()}/blob/{branch}/{element.path}"

        # Guard against a missing last-modified value rather than failing on
        # `.timestamp()`; the field is then left unset.
        modified = element.last_modified_datetime
        date_modified = str(modified.timestamp()) if modified is not None else None

        return FileData(
            identifier=str(uuid5(NAMESPACE_DNS, full_path)),
            connector_type=self.connector_type,
            display_name=full_path,
            source_identifiers=SourceIdentifiers(
                filename=Path(element.path).name,
                fullpath=(Path(branch) / element.path).as_posix(),
                rel_path=element.path,
            ),
            metadata=FileDataSourceMetadata(
                url=element.url,
                version=element.etag,
                record_locator={},
                date_modified=date_modified,
                date_processed=str(time()),
                filesize_bytes=element.size,
                permissions_data=[{"mode": element.mode}],
            ),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per indexed file of the repository."""
        # Resolve the branch once; previously it was looked up (with a fresh
        # repository fetch) twice for every element.
        branch = self.get_branch()
        for element in self.list_files():
            yield self.convert_element(element=element, branch=branch)
166
+
167
+
168
class GithubDownloaderConfig(DownloaderConfig):
    """Downloader options for Github; adds nothing to the base config."""
170
+
171
+
172
@dataclass
class GithubDownloader(Downloader):
    """Downloads the content of an indexed Github file to the local download path."""

    download_config: GithubDownloaderConfig
    connection_config: GithubConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def _get_ref(self, file_data: FileData) -> Optional[str]:
        """Recover the branch a file was indexed from, if it can be determined.

        The indexer in this module stores ``fullpath`` as ``<branch>/<rel_path>``,
        so the branch is whatever precedes ``/<rel_path>``. Returns ``None``
        when the identifiers do not have that shape, in which case the
        repository's default branch is used.
        """
        identifiers = file_data.source_identifiers
        rel_path = identifiers.rel_path
        fullpath = identifiers.fullpath
        if rel_path and fullpath and fullpath.endswith(f"/{rel_path}"):
            return fullpath[: -(len(rel_path) + 1)] or None
        return None

    @requires_dependencies(["github"], extras="github")
    def get_file(self, file_data: FileData) -> "ContentFile":
        """Fetch the content-file object for ``file_data`` from the repository.

        Raises:
            UserError: if the path does not exist in the repository.
        """
        from github.GithubException import UnknownObjectException

        path = file_data.source_identifiers.relative_path
        # Read from the branch that was indexed; without a ref the lookup
        # always goes to the repository's default branch.
        ref = self._get_ref(file_data)
        repo = self.connection_config.get_repo()

        try:
            content_file = repo.get_contents(path, ref=ref) if ref else repo.get_contents(path)
        except UnknownObjectException as e:
            logger.error(f"File doesn't exists {self.connection_config.url}/{path}: {e}")
            # Chain the original error so the Github response stays in the traceback.
            raise UserError(f"File not found: {path}") from e
        return content_file

    @requires_dependencies(["requests"], extras="github")
    def get_contents(self, content_file: "ContentFile") -> bytes:
        """Return the raw bytes of ``content_file``.

        Uses the decoded inline content when it is non-empty; otherwise
        fetches the file's download url.

        Raises:
            Exception: the result of ``wrap_error`` when the download request fails.
        """
        import requests

        if content_file.decoded_content:
            return content_file.decoded_content
        download_url = content_file.download_url
        resp = requests.get(download_url)
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise self.connection_config.wrap_error(e=e)
        return resp.content

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the file described by ``file_data`` and write it to disk."""
        content_file = self.get_file(file_data)
        contents = self.get_contents(content_file)
        download_path = self.get_download_path(file_data)
        # Repository paths are usually nested; make sure the target directory exists.
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with download_path.open("wb") as f:
            f.write(contents)
        return self.generate_download_response(file_data=file_data, download_path=download_path)
213
+
214
+
215
# Source registry entry bundling the Github connection, indexer and downloader classes.
github_source_entry = SourceRegistryEntry(
    connection_config=GithubConnectionConfig,
    indexer_config=GithubIndexerConfig,
    indexer=GithubIndexer,
    downloader_config=GithubDownloaderConfig,
    downloader=GithubDownloader,
)
@@ -8,9 +8,13 @@ from urllib.parse import urlparse
8
8
 
9
9
  from pydantic import Field, Secret, model_validator
10
10
 
11
+ from unstructured_ingest.data_types.file_data import (
12
+ FileData,
13
+ FileDataSourceMetadata,
14
+ SourceIdentifiers,
15
+ )
11
16
  from unstructured_ingest.error import SourceConnectionError
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.interfaces import (
17
+ from unstructured_ingest.interfaces import (
14
18
  AccessConfig,
15
19
  ConnectionConfig,
16
20
  Downloader,
@@ -19,13 +23,9 @@ from unstructured_ingest.v2.interfaces import (
19
23
  Indexer,
20
24
  IndexerConfig,
21
25
  )
22
- from unstructured_ingest.v2.logger import logger
23
- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
24
- from unstructured_ingest.v2.types.file_data import (
25
- FileData,
26
- FileDataSourceMetadata,
27
- SourceIdentifiers,
28
- )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
28
+ from unstructured_ingest.utils.dep_check import requires_dependencies
29
29
 
30
30
  CONNECTOR_TYPE = "gitlab"
31
31
  if TYPE_CHECKING:
@@ -9,13 +9,16 @@ from dateutil import parser
9
9
  from pydantic import Field, Secret
10
10
  from pydantic.functional_validators import BeforeValidator
11
11
 
12
+ from unstructured_ingest.data_types.file_data import (
13
+ FileData,
14
+ FileDataSourceMetadata,
15
+ SourceIdentifiers,
16
+ )
12
17
  from unstructured_ingest.error import (
13
18
  SourceConnectionError,
14
19
  SourceConnectionNetworkError,
15
20
  )
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
- from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
18
- from unstructured_ingest.v2.interfaces import (
21
+ from unstructured_ingest.interfaces import (
19
22
  AccessConfig,
20
23
  ConnectionConfig,
21
24
  Downloader,
@@ -24,21 +27,27 @@ from unstructured_ingest.v2.interfaces import (
24
27
  Indexer,
25
28
  IndexerConfig,
26
29
  )
27
- from unstructured_ingest.v2.logger import logger
28
- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
29
- from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
30
- from unstructured_ingest.v2.types.file_data import (
31
- FileData,
32
- FileDataSourceMetadata,
33
- SourceIdentifiers,
34
- )
35
-
36
- CONNECTOR_TYPE = "google_drive"
30
+ from unstructured_ingest.logger import logger
31
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
32
+ from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
33
+ from unstructured_ingest.utils.dep_check import requires_dependencies
37
34
 
38
35
  if TYPE_CHECKING:
39
36
  from googleapiclient.discovery import Resource as GoogleAPIResource
40
37
  from googleapiclient.http import MediaIoBaseDownload
41
38
 
39
CONNECTOR_TYPE = "google_drive"

# Maps a Google-native MIME type (key) to the MIME type used as its export
# target (value). Each value is a single parenthesised literal so that no
# entry relies on implicit string concatenation after the colon.
GOOGLE_DRIVE_EXPORT_TYPES = {
    "application/vnd.google-apps.document": (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ),
    "application/vnd.google-apps.spreadsheet": (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ),
    "application/vnd.google-apps.presentation": (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ),
    "application/vnd.google-apps.photo": "image/jpeg",
}
50
+
42
51
 
43
52
  class GoogleDriveAccessConfig(AccessConfig):
44
53
  service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(