unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,348 +0,0 @@
1
- import io
2
- import json
3
- import os
4
- import typing as t
5
- from dataclasses import dataclass, field
6
- from datetime import datetime
7
- from mimetypes import guess_extension
8
- from pathlib import Path
9
-
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
11
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
- from unstructured_ingest.interfaces import (
13
- AccessConfig,
14
- BaseConnectorConfig,
15
- BaseSessionHandle,
16
- BaseSingleIngestDoc,
17
- BaseSourceConnector,
18
- ConfigSessionHandleMixin,
19
- IngestDocCleanupMixin,
20
- IngestDocSessionHandleMixin,
21
- SourceConnectorCleanupMixin,
22
- SourceMetadata,
23
- )
24
- from unstructured_ingest.logger import logger
25
- from unstructured_ingest.utils.dep_check import requires_dependencies
26
- from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
27
- from unstructured_ingest.utils.string_and_date_utils import json_to_dict
28
-
29
- if t.TYPE_CHECKING:
30
- from googleapiclient.discovery import Resource as GoogleAPIResource
31
- from googleapiclient.http import MediaIoBaseDownload
32
-
33
# Template for the local name of a downloaded file: Drive file id, the Drive
# file name, and an extension to append (may be an empty string).
FILE_FORMAT = "{id}-{name}{ext}"
# Template for the local sub-directory created for a Drive folder: folder id
# and folder name.
DIRECTORY_FORMAT = "{id}-{name}"
35
-
36
-
37
@dataclass
class GoogleDriveSessionHandle(BaseSessionHandle):
    """Session handle that carries the Google Drive API service object."""

    # Drive service resource as returned by create_service_account_object().
    service: "GoogleAPIResource"
40
-
41
-
42
@requires_dependencies(["googleapiclient"], extras="google-drive")
def create_service_account_object(key_path: t.Union[str, dict], id=None):
    """
    Creates a service object for interacting with Google Drive.

    Providing a drive id enforces a key validation process.

    Args:
        key_path: Path to Google Drive service account json file. (or the actual json)
        id: ID of a file on Google Drive. File has to be either publicly accessible or accessible
            to the service account.

    Returns:
        Service account object

    Raises:
        ValueError: if ``key_path`` is neither a dict nor a string, if the
            validation request fails with an HTTP error, or if default
            credentials cannot be resolved. The underlying exception is
            chained as ``__cause__``.

    Note:
        When ``key_path`` is a file path, the process-wide environment variable
        ``GOOGLE_APPLICATION_CREDENTIALS`` is set to it as a side effect.
    """
    from google.auth import default, exceptions
    from google.oauth2 import service_account
    from googleapiclient.discovery import build
    from googleapiclient.errors import HttpError

    # Service account key can be a dict or a file path(str)
    # But the dict may come in as a string
    key_path = json_to_dict(key_path)

    try:
        if isinstance(key_path, dict):
            creds = service_account.Credentials.from_service_account_info(key_path)
        elif isinstance(key_path, str):
            # google.auth.default() picks the key file up from the environment.
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
            creds, _ = default()
        else:
            raise ValueError(
                f"key path not recognized as a dictionary or a file path: "
                f"[{type(key_path)}] {key_path}",
            )
        service = build("drive", "v3", credentials=creds)

        if id:
            # Issue a cheap listing request so that bad credentials or an
            # inaccessible id surface here rather than later in the run.
            service.files().list(
                spaces="drive",
                fields="files(id)",
                pageToken=None,
                corpora="user",
                q=f"'{id}' in parents",
            ).execute()

    except HttpError as exc:
        # Chain the cause so the original HTTP failure stays in the traceback.
        raise ValueError(f"{exc.reason}") from exc
    except exceptions.DefaultCredentialsError as exc:
        raise ValueError("The provided API key is invalid.") from exc

    return service
94
-
95
-
96
@dataclass
class GoogleDriveAccessConfig(AccessConfig):
    """Credentials used to authenticate against Google Drive."""

    # Service account key: either a path to the key json file or the key json
    # itself (as a dict or a string). Declared as a sensitive field.
    service_account_key: t.Union[str, dict] = enhanced_field(sensitive=True)
99
-
100
-
101
@dataclass
class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connector config where drive_id is the id of the document to process or
    the folder to process all documents from."""

    # Google Drive Specific Options
    drive_id: str
    access_config: GoogleDriveAccessConfig
    # When set, only files whose extension equals this value are listed.
    extension: t.Optional[str] = None
    # When True, sub-folders are traversed as well.
    recursive: bool = False

    def create_session_handle(self) -> GoogleDriveSessionHandle:
        """Build a Drive service from the configured key and wrap it in a handle."""
        return GoogleDriveSessionHandle(
            service=create_service_account_object(self.access_config.service_account_key),
        )
117
-
118
-
119
@dataclass
class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Google Drive file to download and process.

    ``meta`` holds the Drive listing entry for the file (``id``, ``name``,
    ``mimeType``) together with the ``download_dir``, ``download_filepath``,
    ``output_dir`` and ``output_filepath`` values read by this class.
    """

    connector_config: SimpleGoogleDriveConfig
    meta: t.Dict[str, str] = field(default_factory=dict)
    registry_name: str = "google_drive"

    @property
    def filename(self):
        """Absolute local path the file is downloaded to."""
        return Path(self.meta.get("download_filepath")).resolve()  # type: ignore

    @property
    def _output_filename(self):
        """Absolute path of the json result: the output filepath plus ``.json``."""
        return Path(f"{self.meta.get('output_filepath')}.json").resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify the record by the configured drive id and the file id."""
        return {
            "drive_id": self.connector_config.drive_id,
            "file_id": self.meta["id"],
        }

    @staticmethod
    def _to_isoformat(timestamp: str) -> t.Optional[str]:
        """Convert a ``%Y-%m-%dT%H:%M:%S.%fZ`` timestamp to ISO format, or None if empty."""
        if not timestamp:
            return None
        return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ").isoformat()

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def update_source_metadata(self):
        """Fetch the file's metadata from Drive and store it in ``source_metadata``.

        A 404 response is logged and recorded as ``exists=False``; any other
        HTTP error is re-raised.
        """
        from googleapiclient.errors import HttpError

        try:
            file_obj = (
                self.session_handle.service.files()
                .get(
                    fileId=self.meta["id"],
                    fields="id, createdTime, modifiedTime, version, webContentLink",
                )
                .execute()
            )
        except HttpError as e:
            if e.status_code == 404:
                logger.error(f"File {self.meta['name']} not found")
                # The file is gone: it must not be reported as existing.
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            raise

        self.source_metadata = SourceMetadata(
            date_created=self._to_isoformat(file_obj.get("createdTime", "")),
            date_modified=self._to_isoformat(file_obj.get("modifiedTime", "")),
            version=file_obj.get("version", ""),
            source_url=file_obj.get("webContentLink", ""),
            exists=True,
        )

    @SourceConnectionNetworkError.wrap
    def _run_downloader(self, downloader: "MediaIoBaseDownload") -> bool:
        """Pull chunks from ``downloader`` until it reports completion."""
        downloaded = False
        while downloaded is False:
            _, downloaded = downloader.next_chunk()
        return downloaded

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the file (exporting Google-native documents) to ``self.filename``.

        Google-native documents without a known export type are logged and
        skipped without downloading anything.
        """
        from googleapiclient.http import MediaIoBaseDownload

        mime_type = self.meta.get("mimeType", "")
        if mime_type.startswith("application/vnd.google-apps"):
            # Google-native documents cannot be fetched as-is; they are exported.
            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
                self.meta.get("mimeType"),  # type: ignore
            )
            if not export_mime:
                logger.info(
                    f"File not supported. Name: {self.meta.get('name')} "
                    f"ID: {self.meta.get('id')} "
                    f"MimeType: {self.meta.get('mimeType')}",
                )
                return

            request = self.session_handle.service.files().export_media(
                fileId=self.meta.get("id"),
                mimeType=export_mime,
            )
        else:
            request = self.session_handle.service.files().get_media(fileId=self.meta.get("id"))

        file = io.BytesIO()
        downloader = MediaIoBaseDownload(file, request)
        self.update_source_metadata()
        downloaded = self._run_downloader(downloader=downloader)

        if not downloaded:
            logger.error(f"Error while downloading and saving file: {self.filename}.")
            return

        download_dir = Path(self.meta["download_dir"])
        if not download_dir.is_dir():
            logger.debug(f"creating directory: {self.meta.get('download_dir')}")
        download_dir.mkdir(parents=True, exist_ok=True)

        with open(self.filename, "wb") as handler:
            handler.write(file.getbuffer())
        logger.debug(f"file downloaded: {self.filename}.")

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        if self.read_config.download_only:
            return
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self._output_filename, "w") as output_f:
            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
        logger.info(f"wrote {self._output_filename}")
245
-
246
-
247
@dataclass
class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Google Drive"""

    connector_config: SimpleGoogleDriveConfig

    def _list_objects(self, drive_id, recursive=False):
        """List the files under ``drive_id`` and attach local path info to each.

        Args:
            drive_id: id used as the parent in the Drive listing query.
            recursive: when True, sub-folders are traversed as well.

        Returns:
            A list of Drive listing entries (dicts with ``id``, ``name`` and
            ``mimeType``), each extended with ``download_dir``,
            ``download_filepath``, ``output_dir`` and ``output_filepath``.
            Google-native files without an export type, and files that do not
            match the configured extension, are left out.
        """
        files = []
        service = self.connector_config.create_session_handle().service
        wanted_ext = self.connector_config.extension

        def traverse(drive_id, download_dir, output_dir, recursive=False):
            page_token = None
            while True:
                response = (
                    service.files()
                    .list(
                        spaces="drive",
                        fields="nextPageToken, files(id, name, mimeType)",
                        pageToken=page_token,
                        corpora="user",
                        q=f"'{drive_id}' in parents",
                    )
                    .execute()
                )

                for meta in response.get("files", []):
                    if meta.get("mimeType") == "application/vnd.google-apps.folder":
                        if recursive:
                            dir_ = DIRECTORY_FORMAT.format(name=meta.get("name"), id=meta.get("id"))
                            download_sub_dir = (download_dir / dir_).resolve()
                            output_sub_dir = (output_dir / dir_).resolve()
                            traverse(meta.get("id"), download_sub_dir, output_sub_dir, True)
                        continue

                    # `ext` is the extension to APPEND to the local file name;
                    # it stays empty when the Drive name already has a suffix.
                    ext = ""
                    if not Path(meta.get("name")).suffixes:
                        ext = guess_extension(meta.get("mimeType")) or ""

                    if meta.get("mimeType", "").startswith("application/vnd.google-apps"):
                        export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(meta.get("mimeType"))
                        if not export_mime:
                            logger.info(
                                f"File {meta.get('name')} has an "
                                f"unsupported MimeType {meta.get('mimeType')}",
                            )
                            continue

                        if not ext:
                            ext = guess_extension(export_mime) or ""

                    # TODO(Habeeb): Consider filtering at the query level.
                    # Filter on the extension the local file will actually end
                    # with. Comparing against `ext` alone would reject every
                    # file whose Drive name already carries a suffix, because
                    # `ext` is empty for those.
                    effective_ext = ext or Path(meta.get("name")).suffix
                    if wanted_ext and wanted_ext != effective_ext:
                        logger.debug(
                            f"File {meta.get('name')} does not match "
                            f"the file type {self.connector_config.extension}",
                        )
                        continue

                    name = FILE_FORMAT.format(name=meta.get("name"), id=meta.get("id"), ext=ext)
                    meta["download_dir"] = str(download_dir)
                    meta["download_filepath"] = (download_dir / name).resolve().as_posix()
                    meta["output_dir"] = str(output_dir)
                    meta["output_filepath"] = (output_dir / name).resolve().as_posix()
                    files.append(meta)

                page_token = response.get("nextPageToken", None)
                if page_token is None:
                    break

        traverse(
            drive_id,
            Path(self.read_config.download_dir),
            Path(self.processor_config.output_dir),
            recursive,
        )
        return files

    def initialize(self):
        """Nothing to set up; a session handle is created where it is needed."""
        pass

    def check_connection(self):
        """Validate the configuration by building a Drive service.

        Raises:
            SourceConnectionError: if the service cannot be created; the
                original exception is chained as the cause.
        """
        try:
            self.connector_config.create_session_handle().service
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one GoogleDriveIngestDoc per file listed under the configured drive id."""
        files = self._list_objects(self.connector_config.drive_id, self.connector_config.recursive)
        return [
            GoogleDriveIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                meta=file,
            )
            for file in files
        ]
@@ -1,278 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
- from enum import Enum
4
- from functools import reduce
5
- from pathlib import Path
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.error import SourceConnectionError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseSessionHandle,
13
- BaseSingleIngestDoc,
14
- BaseSourceConnector,
15
- ConfigSessionHandleMixin,
16
- IngestDocCleanupMixin,
17
- IngestDocSessionHandleMixin,
18
- SourceConnectorCleanupMixin,
19
- SourceMetadata,
20
- )
21
- from unstructured_ingest.logger import logger
22
- from unstructured_ingest.utils.dep_check import requires_dependencies
23
-
24
- if t.TYPE_CHECKING:
25
- from hubspot import HubSpot
26
-
27
# Tag name constant; not referenced by any other code in this module.
CONTENT_TAG = "content"
28
-
29
-
30
class HubSpotObjectTypes(Enum):
    """HubSpot object types this connector can ingest; values are the API names."""

    # Reached through the client's ``crm.objects.<type>`` namespace.
    CALLS = "calls"
    COMMUNICATIONS = "communications"
    EMAILS = "emails"
    NOTES = "notes"
    # Reached through the client's ``crm.<type>`` namespace.
    PRODUCTS = "products"
    TICKETS = "tickets"
37
-
38
-
39
@dataclass
class HubSpotSessionHandle(BaseSessionHandle):
    """Session handle that carries the HubSpot API client."""

    # HubSpot client instance created from the configured access token.
    service: "HubSpot"
42
-
43
-
44
@dataclass
class HubSpotAccessConfig(AccessConfig):
    """Credentials used to authenticate against HubSpot."""

    # Access token handed to the HubSpot client; excluded from repr and
    # declared as a sensitive field.
    api_token: str = enhanced_field(repr=False, sensitive=True)
47
-
48
-
49
@dataclass
class SimpleHubSpotConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connector config for HubSpot: credentials plus optional object selection."""

    access_config: HubSpotAccessConfig
    params: t.Optional[str] = None
    properties: t.Optional[dict] = None
    # Names of the object types to ingest; None means every supported type.
    object_types: t.Optional[t.List[str]] = None
    # Extra property names to read, keyed by object type name.
    custom_properties: t.Optional[t.Dict[str, t.List[str]]] = None

    @requires_dependencies(["hubspot"], extras="hubspot")
    def create_session_handle(self) -> HubSpotSessionHandle:
        """Create a HubSpot client from the API token and wrap it in a handle."""
        from hubspot import HubSpot

        return HubSpotSessionHandle(
            service=HubSpot(access_token=self.access_config.api_token),
        )
63
-
64
-
65
@dataclass
class HubSpotIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single HubSpot object whose selected properties are saved as a text file.

    ``content_properties`` names the properties whose values make up the file
    content; custom properties configured for the object type are appended to
    it on construction.
    """

    connector_config: SimpleHubSpotConfig
    object_id: str
    object_type: str
    content_properties: t.List[str]
    registry_name: str = "hubspot"

    def __post_init__(self):
        self._add_custom_properties()

    @property
    def filename(self):
        """Absolute path ``<download_dir>/<object_type>/<object_id>.txt``."""
        return (
            Path(self.read_config.download_dir)
            / f"{self.object_type}/{self.object_id}.txt"  # type: ignore
        ).resolve()

    @property
    def _output_filename(self):
        """Absolute path ``<output_dir>/<object_type>/<object_id>.json``."""
        return (
            Path(self.processor_config.output_dir)
            / f"{self.object_type}/{self.object_id}.json"  # type: ignore
        ).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify the record by its object id under the key ``<registry_name>_id``."""
        return {
            f"{self.registry_name}_id": self.object_id,
        }

    @property
    def version(self) -> t.Optional[str]:
        """No version is reported for HubSpot objects."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """No source url is reported for HubSpot objects."""
        return None

    def _add_custom_properties(self):
        """Extend ``content_properties`` with the custom properties of this object type."""
        custom_properties = self.connector_config.custom_properties
        if custom_properties is None:
            return
        cprops = custom_properties.get(self.object_type)
        if cprops is None:
            return
        # Build a new list instead of `+=`: the incoming list may be shared by
        # several docs, and extending it in place would add the custom
        # properties again for every doc created from it.
        self.content_properties = [*self.content_properties, *cprops]

    def _join_object_properties(self, obj) -> str:
        """Join the non-None content property values of ``obj`` with newlines."""
        return "\n".join(
            obj.properties[cprop]
            for cprop in self.content_properties
            if obj.properties.get(cprop) is not None
        )

    def _resolve_getter(self):
        """Return the client's ``get_by_id`` method for this doc's object type.

        Raises:
            ValueError: if the object type is not one of HubSpotObjectTypes.
        """
        if self.object_type in [
            HubSpotObjectTypes.CALLS.value,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            HubSpotObjectTypes.EMAILS.value,
            HubSpotObjectTypes.NOTES.value,
        ]:
            method_path = f"crm.objects.{self.object_type}.basic_api.get_by_id"
        elif self.object_type in [
            HubSpotObjectTypes.PRODUCTS.value,
            HubSpotObjectTypes.TICKETS.value,
        ]:
            method_path = f"crm.{self.object_type}.basic_api.get_by_id"
        else:
            # Fail with a clear message rather than an attribute lookup on "".
            raise ValueError(f"unsupported HubSpot object type: {self.object_type}")

        # Walk the dotted path attribute by attribute, starting at the client.
        return reduce(getattr, method_path.split("."), self.session_handle.service)

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _fetch_obj(self, check_only=False):
        """Fetch this object from HubSpot.

        Args:
            check_only: when True, request no properties.

        Returns:
            The API response, or None when HubSpot reports the object as not found.
        """
        from hubspot.crm.objects.exceptions import NotFoundException

        get_by_id_method = self._resolve_getter()
        try:
            response = get_by_id_method(
                self.object_id,
                properties=([] if check_only else self.content_properties),
            )
        except NotFoundException as e:
            logger.error(e)
            return None
        return response

    def update_source_metadata(self, **kwargs) -> None:
        """Store creation/modification dates of the object in ``source_metadata``.

        Args:
            **kwargs: may contain ``object``, an already fetched API response;
                the object is fetched from HubSpot only when it is absent.
        """
        # Only hit the API when no object was handed in; a `.get()` default
        # would be evaluated (and the request sent) unconditionally.
        if "object" in kwargs:
            obj = kwargs["object"]
        else:
            obj = self._fetch_obj(check_only=True)
        if obj is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            date_created=obj.created_at.isoformat(),
            date_modified=obj.updated_at.isoformat(),
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the object and write its content properties to ``self.filename``.

        Raises:
            ValueError: if the object cannot be retrieved.
        """
        obj = self._fetch_obj()
        if obj is None:
            # One message string; separate arguments would render as a tuple.
            raise ValueError(
                f"Failed to retrieve object {self.registry_name} "
                f"with ID {self.object_id}",
            )
        self.update_source_metadata(object=obj)
        output = self._join_object_properties(obj)
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(output)
        return
180
-
181
-
182
@dataclass
class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists HubSpot CRM objects as ingest docs."""

    connector_config: SimpleHubSpotConfig

    def initialize(self):
        """Create the HubSpot client used by the listing methods."""
        self.hubspot = self.connector_config.create_session_handle().service

    def check_connection(self):
        """Create a session handle and return its client."""
        return self.connector_config.create_session_handle().service

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _list_objects(self, get_page_method, object_type: str, content_properties: t.List[str]):
        """Call ``get_page_method`` once and turn its results into ingest docs.

        Args:
            get_page_method: zero-argument callable whose return value exposes
                a ``results`` sequence of objects with an ``id``.
            object_type: object type name recorded on each doc.
            content_properties: property names that make up each doc's content.

        Returns:
            A list of HubSpotIngestDoc, or an empty list when the call fails
            (the failure is logged, not raised).
        """
        try:
            objects = get_page_method()
        except Exception as e:
            logger.error(e)
            logger.error(
                f"Failed to retrieve {object_type}, omitting processing...",
            )
            return []
        return [
            HubSpotIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                object_id=obj.id,
                object_type=object_type,
                # Give every doc its own list so that a doc extending its
                # properties cannot affect its siblings.
                content_properties=list(content_properties),
            )
            for obj in objects.results
        ]

    def _get_calls(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.calls.basic_api.get_page,
            HubSpotObjectTypes.CALLS.value,
            ["hs_call_title", "hs_call_body"],
        )

    def _get_communications(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.communications.basic_api.get_page,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            ["hs_communication_body"],
        )

    def _get_emails(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.emails.basic_api.get_page,
            HubSpotObjectTypes.EMAILS.value,
            ["hs_email_subject", "hs_email_text"],
        )

    def _get_notes(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.notes.basic_api.get_page,
            HubSpotObjectTypes.NOTES.value,
            ["hs_note_body"],
        )

    def _get_products(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.products.basic_api.get_page,
            HubSpotObjectTypes.PRODUCTS.value,
            ["description"],
        )

    def _get_tickets(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.tickets.basic_api.get_page,
            HubSpotObjectTypes.TICKETS.value,
            ["subject", "content"],
        )

    def get_ingest_docs(self):
        """Collect ingest docs for the configured object types (all types if unset).

        Raises:
            ValueError: if ``object_types`` names a type this connector does not support.
        """
        obj_method_resolver = {
            HubSpotObjectTypes.CALLS.value: self._get_calls,
            HubSpotObjectTypes.COMMUNICATIONS.value: self._get_communications,
            HubSpotObjectTypes.EMAILS.value: self._get_emails,
            HubSpotObjectTypes.NOTES.value: self._get_notes,
            HubSpotObjectTypes.PRODUCTS.value: self._get_products,
            HubSpotObjectTypes.TICKETS.value: self._get_tickets,
        }

        requested = self.connector_config.object_types
        if requested is not None:
            # Reject unknown names up front; otherwise they would map to None
            # and fail later with "'NoneType' object is not callable".
            unknown = [name for name in requested if name not in obj_method_resolver]
            if unknown:
                raise ValueError(
                    f"unsupported HubSpot object types: {unknown}; "
                    f"supported types: {sorted(obj_method_resolver)}",
                )
            obj_method_resolver = {name: obj_method_resolver[name] for name in requested}

        ingest_docs: t.List[HubSpotIngestDoc] = []
        for obj_name, obj_method in obj_method_resolver.items():
            logger.info(f"retrieving - {obj_name}")
            ingest_docs += obj_method()

        return ingest_docs