unstructured-ingest: unstructured_ingest-0.6.2-py3-none-any.whl → unstructured_ingest-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,285 +0,0 @@
1
- import math
2
- import typing as t
3
- from dataclasses import dataclass, field
4
- from datetime import datetime
5
- from pathlib import Path
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseSingleIngestDoc,
13
- BaseSourceConnector,
14
- IngestDocCleanupMixin,
15
- SourceConnectorCleanupMixin,
16
- SourceMetadata,
17
- )
18
- from unstructured_ingest.logger import logger
19
- from unstructured_ingest.utils.dep_check import requires_dependencies
20
-
21
- if t.TYPE_CHECKING:
22
- from atlassian import Confluence
23
-
24
-
25
@dataclass
class ConfluenceAccessConfig(AccessConfig):
    """Secret credentials used to authenticate against Confluence."""

    # API token, flagged as sensitive through enhanced_field. The connector
    # classes in this module hand it to the Confluence client as `password`.
    api_token: str = enhanced_field(sensitive=True)
28
-
29
-
30
@dataclass
class SimpleConfluenceConfig(BaseConnectorConfig):
    """Connector config where:
    user_email is the email to authenticate into Confluence Cloud,
    access_config.api_token is the api token to authenticate into Confluence Cloud,
    and url is the URL pointing to the Confluence Cloud instance.

    Check https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/
    for more info on the api_token.
    """

    user_email: str
    access_config: ConfluenceAccessConfig
    url: str
    # Upper bound on the number of spaces requested when spaces are discovered
    # through the API (i.e. when `spaces` is empty).
    max_num_of_spaces: int = 500
    # Upper bound on the number of pages requested from each space.
    max_num_of_docs_from_each_space: int = 100
    # Explicit list of spaces to ingest; when non-empty it is used instead of
    # discovering spaces through the API.
    spaces: t.List[str] = field(default_factory=list)
47
-
48
-
49
@dataclass
class ConfluenceDocumentMeta:
    """Identifies one Confluence document.

    Holds the identifier of the space the document lives in and the identifier
    of the document itself.
    """

    # The source connector in this module fills this with the space "key".
    space_id: str
    # The page id used when the document is fetched.
    document_id: str
58
-
59
-
60
def scroll_wrapper(func):
    """Wrap a paginated getter so a single call scrolls through several pages.

    The returned callable accepts the same arguments as ``func`` plus a required
    keyword argument ``number_of_items_to_fetch``. It repeatedly calls ``func``
    with ``start``/``limit`` keyword arguments (page size capped at 100) and
    concatenates the pages.

    ``func`` may return either a list of items or a dict holding the items
    under the ``"results"`` key; any other response type contributes nothing.

    Returns:
        A list with at most ``number_of_items_to_fetch`` items.

    Raises:
        KeyError: if ``number_of_items_to_fetch`` is not passed as a keyword.
    """

    def wrapper(*args, **kwargs):
        """Wraps a function to obtain scroll functionality."""
        number_of_items_to_fetch = kwargs.pop("number_of_items_to_fetch")
        # Nothing requested: avoid a zero page size (and the division by zero
        # it would cause below) as well as a pointless request.
        if number_of_items_to_fetch <= 0:
            return []

        kwargs["limit"] = min(100, number_of_items_to_fetch)
        kwargs["start"] = kwargs.get("start", 0)

        all_results = []
        num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"])

        for _ in range(num_iterations):
            # Issue exactly one request per page and reuse its response.
            response = func(*args, **kwargs)
            if isinstance(response, dict):
                page = response["results"]
            elif isinstance(response, list):
                page = response
            else:
                page = []

            # An empty page means the source is exhausted; further requests
            # would only return more empty pages.
            if not page:
                break

            all_results += page
            kwargs["start"] += kwargs["limit"]

        return all_results[:number_of_items_to_fetch]

    return wrapper
84
-
85
-
86
@dataclass
class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing).

    Current implementation creates a Confluence connection object
    to fetch each doc, rather than creating one for each thread.
    """

    connector_config: SimpleConfluenceConfig
    document_meta: ConfluenceDocumentMeta
    registry_name: str = "confluence"

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Resolved local path of the downloaded HTML, or None without a download dir."""
        if not self.read_config.download_dir:
            return None
        return (
            Path(self.read_config.download_dir)
            / self.document_meta.space_id
            / f"{self.document_meta.document_id}.html"
        ).resolve()

    @property
    def _output_filename(self):
        """Create output file path based on output directory, space id and document id."""
        output_file = f"{self.document_meta.document_id}.json"
        return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the Confluence URL and page id that locate this document."""
        return {
            "url": self.connector_config.url,
            "page_id": self.document_meta.document_id,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_page(self):
        """Fetch the page by id, returning None (after logging) on an ApiError."""
        from atlassian import Confluence
        from atlassian.errors import ApiError

        try:
            confluence = Confluence(
                self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
            result = confluence.get_page_by_id(
                page_id=self.document_meta.document_id,
                expand="history.lastUpdated,version,body.view",
            )
        except ApiError as e:
            logger.error(e)
            return None
        return result

    def update_source_metadata(self, **kwargs):
        """Fetches file metadata from the current page.

        An already fetched page can be supplied through the ``page`` keyword
        argument (``None`` meaning the page could not be retrieved); otherwise
        the page is fetched here.
        """
        # A `kwargs.get("page", self._get_page())` default would be evaluated
        # eagerly and issue a second request even when a page was supplied.
        page = kwargs["page"] if "page" in kwargs else self._get_page()
        if page is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        document_history = page["history"]
        date_created = datetime.strptime(
            document_history["createdDate"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ).isoformat()
        if last_updated := document_history.get("lastUpdated", {}).get("when", ""):
            date_modified = datetime.strptime(
                last_updated,
                "%Y-%m-%dT%H:%M:%S.%fZ",
            ).isoformat()
        else:
            # No modification record: fall back to the creation date.
            date_modified = date_created
        version = page["version"]["number"]
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=version,
            source_url=page["_links"].get("self", None),
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["atlassian"], extras="confluence")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the page's rendered body and write it to ``self.filename``.

        Raises:
            ValueError: if the page could not be retrieved.
        """
        # TODO: instead of having a separate connection object for each doc,
        # have a separate connection object for each process

        result = self._get_page()
        # Record metadata first so a missing page is still marked as non-existent.
        self.update_source_metadata(page=result)
        if result is None:
            raise ValueError(f"Failed to retrieve page with ID {self.document_meta.document_id}")
        self.document = result["body"]["view"]["value"]
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.document)
188
-
189
-
190
@dataclass
class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches body fields from all documents within all spaces in a Confluence Cloud instance."""

    connector_config: SimpleConfluenceConfig
    _confluence: t.Optional["Confluence"] = field(init=False, default=None)

    @property
    def confluence(self) -> "Confluence":
        """Lazily created Confluence client, built once and then reused."""
        from atlassian import Confluence

        if self._confluence is None:
            self._confluence = Confluence(
                url=self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
        return self._confluence

    @requires_dependencies(["atlassian", "requests"], extras="Confluence")
    def check_connection(self):
        """Validate the connection with a HEAD request against the space endpoint.

        Raises:
            SourceConnectionError: if the request fails with an HTTP error.
        """
        import requests

        url = "rest/api/space"
        try:
            self.confluence.request(method="HEAD", path=url)
        except requests.HTTPError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            # Chain the original error so its traceback is preserved.
            raise SourceConnectionError(
                f"failed to validate connection: {http_error}"
            ) from http_error

    @requires_dependencies(["atlassian"], extras="Confluence")
    def initialize(self):
        """Decide whether spaces come from the config or must be discovered."""
        self.list_of_spaces = None
        if self.connector_config.spaces:
            self.list_of_spaces = self.connector_config.spaces
            if self.connector_config.max_num_of_spaces:
                logger.warning(
                    """--confluence-list-of-spaces and --confluence-num-of-spaces cannot
                    be used at the same time. Connector will only fetch the
                    --confluence-list-of-spaces that you've provided.""",
                )

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_space_ids(self):
        """Fetches spaces in a confluence domain."""

        get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces)

        all_results = get_spaces_with_scroll(
            number_of_items_to_fetch=self.connector_config.max_num_of_spaces,
        )

        return [space["key"] for space in all_results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_docs_ids_within_one_space(
        self,
        space_id: str,
        content_type: str = "page",
    ):
        """Return ``(space_id, doc_id)`` pairs for the documents of one space."""
        get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space)
        results = get_pages_with_scroll(
            space=space_id,
            number_of_items_to_fetch=self.connector_config.max_num_of_docs_from_each_space,
            content_type=content_type,
        )

        return [(space_id, doc["id"]) for doc in results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_doc_ids_within_spaces(self):
        """Return ``(space_id, doc_id)`` pairs across every selected space."""
        space_ids = self._get_space_ids() if not self.list_of_spaces else self.list_of_spaces

        # Named `space_id` rather than `id` to avoid shadowing the builtin.
        return [
            pair
            for space_id in space_ids
            for pair in self._get_docs_ids_within_one_space(space_id=space_id)
        ]

    def get_ingest_docs(self):
        """Build one ingest doc for every document found in the selected spaces."""
        doc_ids = self._get_doc_ids_within_spaces()
        return [
            ConfluenceIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                document_meta=ConfluenceDocumentMeta(space_id, doc_id),
            )
            for space_id, doc_id in doc_ids
        ]
@@ -1,137 +0,0 @@
1
- import copy
2
- import json
3
- import os
4
- import typing as t
5
- from dataclasses import dataclass, field
6
- from io import BytesIO
7
- from pathlib import PurePath
8
-
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
- from unstructured_ingest.enhanced_dataclass.core import _asdict
11
- from unstructured_ingest.error import DestinationConnectionError
12
- from unstructured_ingest.interfaces import (
13
- AccessConfig,
14
- BaseConnectorConfig,
15
- BaseDestinationConnector,
16
- BaseSingleIngestDoc,
17
- WriteConfig,
18
- )
19
- from unstructured_ingest.logger import logger
20
- from unstructured_ingest.utils.dep_check import requires_dependencies
21
-
22
- if t.TYPE_CHECKING:
23
- from databricks.sdk import WorkspaceClient
24
-
25
-
26
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Authentication settings for Databricks; every field is optional.

    The destination connector in this module expands this config (via
    ``to_dict()``) into keyword arguments for the Databricks workspace client.
    """

    account_id: t.Optional[str] = None
    username: t.Optional[str] = None
    # Flagged as sensitive through enhanced_field.
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    client_id: t.Optional[str] = None
    # Flagged as sensitive through enhanced_field.
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Flagged as sensitive through enhanced_field.
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    profile: t.Optional[str] = None
    azure_workspace_resource_id: t.Optional[str] = None
    # Flagged as sensitive through enhanced_field.
    azure_client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    azure_client_id: t.Optional[str] = None
    azure_tenant_id: t.Optional[str] = None
    azure_environment: t.Optional[str] = None
    auth_type: t.Optional[str] = None
    cluster_id: t.Optional[str] = None
    # NOTE(review): not flagged as sensitive, unlike the other secrets above -
    # confirm whether this can carry credential material.
    google_credentials: t.Optional[str] = None
    google_service_account: t.Optional[str] = None
44
-
45
-
46
@dataclass
class SimpleDatabricksVolumesConfig(BaseConnectorConfig):
    """Connection settings for a Databricks workspace."""

    # Authentication fields handed to the workspace client.
    access_config: DatabricksVolumesAccessConfig
    # Passed as `host` to the workspace client; may be left unset.
    host: t.Optional[str] = None
50
-
51
-
52
@dataclass
class DatabricksVolumesWriteConfig(WriteConfig):
    """Where and how uploaded content is written inside a Databricks volume."""

    volume: str
    catalog: str
    volume_path: t.Optional[str] = None
    overwrite: bool = False
    encoding: str = "utf-8"
    schema: str = "default"

    @property
    def path(self) -> str:
        """Return ``/Volumes/<catalog>/<schema>/<volume>``, plus ``volume_path`` when set."""
        volume_root = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
        return f"{volume_root}/{self.volume_path}" if self.volume_path else volume_root
67
-
68
-
69
@dataclass
class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
    """Uploads processed JSON output into a Databricks volume."""

    write_config: DatabricksVolumesWriteConfig
    connector_config: SimpleDatabricksVolumesConfig
    _client: t.Optional["WorkspaceClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """Serialize the connector, leaving out the live client object."""
        # Work on a shallow copy so the connector keeps its own client.
        self_cp = copy.copy(self)
        self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def generate_client(self) -> "WorkspaceClient":
        """Build a new workspace client from the host and the access config."""
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient(
            host=self.connector_config.host, **self.connector_config.access_config.to_dict()
        )

    @property
    def client(self) -> "WorkspaceClient":
        """Lazily created workspace client, built once and then reused."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Validate the connection by checking that the current user is active.

        Raises:
            DestinationConnectionError: if the lookup fails or the user is inactive.
        """
        try:
            # Explicit check instead of `assert`, which is removed under `python -O`.
            if not self.client.current_user.me().active:
                raise ValueError("current user is not active")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Upload ``elements_dict`` as JSON below the configured volume path.

        Args:
            elements_dict: elements to serialize with ``json.dumps``.
            filename: path relative to the volume path; when empty, the volume
                path itself is used as the upload target.
            indent: accepted for interface compatibility; not used here.
            encoding: accepted for interface compatibility; the encoding from
                ``write_config`` is the one applied.
        """
        output_folder = self.write_config.path
        # Drop separators at both ends so the filename cannot be taken as absolute.
        relative_name = filename.strip(os.sep) if filename else filename
        if relative_name:
            output_path = str(PurePath(output_folder, relative_name))
        else:
            output_path = output_folder
        logger.debug(f"uploading content to {output_path}")
        self.client.files.upload(
            file_path=output_path,
            contents=BytesIO(json.dumps(elements_dict).encode(encoding=self.write_config.encoding)),
            overwrite=self.write_config.overwrite,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        """Not implemented for this connector; currently returns None."""
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Upload the processed JSON output of every doc."""
        for doc in docs:
            file_path = doc.base_output_filename
            filename = file_path if file_path else None
            with open(doc._output_filename) as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
            self.write_dict(elements_dict=json_list, filename=filename)
@@ -1,203 +0,0 @@
1
- import os
2
- import typing as t
3
- from dataclasses import dataclass
4
- from datetime import datetime as dt
5
- from multiprocessing import Process
6
- from pathlib import Path
7
-
8
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
9
- from unstructured_ingest.interfaces import (
10
- BaseConnectorConfig,
11
- BaseDestinationConnector,
12
- BaseSingleIngestDoc,
13
- BaseSourceConnector,
14
- IngestDocCleanupMixin,
15
- SourceConnectorCleanupMixin,
16
- SourceMetadata,
17
- WriteConfig,
18
- )
19
- from unstructured_ingest.logger import logger
20
- from unstructured_ingest.utils.dep_check import requires_dependencies
21
-
22
- if t.TYPE_CHECKING:
23
- from deltalake import DeltaTable
24
-
25
-
26
@dataclass
class SimpleDeltaTableConfig(BaseConnectorConfig):
    """Location and open options for a Delta table.

    The source connector in this module forwards all four fields to the
    ``DeltaTable`` constructor; the destination connector uses ``table_uri``
    as the write target.
    """

    table_uri: t.Union[str, Path]
    version: t.Optional[int] = None
    storage_options: t.Optional[t.Dict[str, str]] = None
    without_files: bool = False
32
-
33
-
34
@dataclass
class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One data file of a Delta table, downloaded locally as CSV."""

    connector_config: SimpleDeltaTableConfig
    uri: str
    modified_date: str
    created_at: str
    registry_name: str = "delta-table"

    def uri_filename(self) -> str:
        """Return the base name of ``uri`` without its extension."""
        basename = os.path.basename(self.uri)
        return os.path.splitext(basename)[0]

    @property
    def filename(self):
        """Resolved local CSV path inside the download directory."""
        return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve()

    @property
    def _output_filename(self):
        """Create the output JSON path from the output directory and the uri's file stem."""
        return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json"

    def _create_full_tmp_dir_path(self):
        """Make sure the parent directories of the download and output files exist."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["fsspec"], extras="delta-table")
    def _get_fs_from_uri(self):
        """Resolve the fsspec filesystem that serves ``uri``.

        Raises:
            ImportError: if the filesystem for this uri needs a missing dependency.
        """
        from fsspec.core import url_to_fs

        try:
            fs, _ = url_to_fs(self.uri)
        except ImportError as error:
            # Chain the original error so the missing module stays visible.
            raise ImportError(
                f"uri {self.uri} may be associated with a filesystem that "
                f"requires additional dependencies: {error}",
            ) from error
        return fs

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the filesystem entry behind ``uri``.

        An already resolved filesystem can be supplied through the ``fs``
        keyword argument; otherwise it is resolved from the uri.
        """
        # A `kwargs.get("fs", self._get_fs_from_uri())` default would be evaluated
        # eagerly and resolve a second filesystem even when one was supplied.
        fs = kwargs["fs"] if "fs" in kwargs else self._get_fs_from_uri()
        # For the "gs" protocol the etag is used as version instead of a checksum.
        version = (
            fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "")
        )
        file_exists = fs.exists(self.uri)
        self.source_metadata = SourceMetadata(
            date_created=self.created_at,
            date_modified=self.modified_date,
            version=version,
            source_url=self.uri,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Read the file behind ``uri`` and write its rows to ``self.filename`` as CSV."""
        fs = self._get_fs_from_uri()
        self.update_source_metadata(fs=fs)
        logger.info(f"using a {fs} filesystem to collect table data")
        self._create_full_tmp_dir_path()

        df = self._get_df(filesystem=fs)

        logger.info(f"writing {len(df)} rows to {self.filename}")
        df.to_csv(self.filename)

    @SourceConnectionNetworkError.wrap
    def _get_df(self, filesystem):
        """Load the parquet data behind ``uri`` into a pandas DataFrame."""
        import pyarrow.parquet as pq

        return pq.ParquetDataset(self.uri, filesystem=filesystem).read_pandas().to_pandas()
105
-
106
-
107
@dataclass
class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists the data files of a Delta table as individual ingest docs."""

    connector_config: SimpleDeltaTableConfig
    delta_table: t.Optional["DeltaTable"] = None

    def check_connection(self):
        """No connection check is performed for this connector."""

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Open the configured table and fail when it holds no rows."""
        from deltalake import DeltaTable

        config = self.connector_config
        self.delta_table = DeltaTable(
            table_uri=config.table_uri,
            version=config.version,
            storage_options=config.storage_options,
            without_files=config.without_files,
        )
        row_count = self.delta_table.to_pyarrow_dataset().count_rows()
        if row_count <= 0:
            raise ValueError(f"no data found at {self.connector_config.table_uri}")
        logger.info(f"processing {row_count} rows of data")

    def get_ingest_docs(self):
        """Create one ingest doc per file uri reported by the table."""
        if not self.delta_table:
            raise ValueError("delta table was never initialized")

        # Map each add-action path to its modification time rendered as text.
        add_actions = self.delta_table.get_add_actions().to_pandas()
        modified_by_path = {}
        for _, action in add_actions.iterrows():
            modified_by_path[action["path"]] = str(action["modification_time"])

        # created_time is divided by 1000 before being read as a timestamp.
        table_created = dt.fromtimestamp(self.delta_table.metadata().created_time / 1000)

        ingest_docs = []
        for file_uri in self.delta_table.file_uris():
            ingest_docs.append(
                DeltaTableIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    uri=file_uri,
                    modified_date=modified_by_path[os.path.basename(file_uri)],
                    created_at=str(table_created),
                )
            )
        return ingest_docs
150
-
151
-
152
@dataclass
class DeltaTableWriteConfig(WriteConfig):
    """Options applied when elements are written to a Delta table."""

    # Forwarded to the dataframe conversion helper used before writing.
    drop_empty_cols: bool = False
    # Forwarded to the Delta writer as `mode`.
    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
    # Forwarded to the Delta writer as `schema_mode` only when it is not None.
    schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
    # Forwarded to the Delta writer as `engine`.
    engine: t.Literal["pyarrow", "rust"] = "pyarrow"
158
-
159
-
160
@dataclass
class DeltaTableDestinationConnector(BaseDestinationConnector):
    """Writes processed elements into a Delta table."""

    write_config: DeltaTableWriteConfig
    connector_config: SimpleDeltaTableConfig

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Nothing to set up beyond the dependency check done by the decorator."""
        pass

    def check_connection(self):
        """No connection check is performed for this connector."""
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Convert ``elements_dict`` to a dataframe and write it to the table.

        The write runs in a child process and this method blocks until that
        process ends. A non-zero exit code is logged but not raised.
        """
        from deltalake.writer import write_deltalake

        from unstructured_ingest.utils.table import convert_to_pandas_dataframe

        df = convert_to_pandas_dataframe(
            elements_dict=elements_dict,
            drop_empty_cols=self.write_config.drop_empty_cols,
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
        )
        writer_kwargs = {
            "table_or_uri": self.connector_config.table_uri,
            "data": df,
            "mode": self.write_config.mode,
            "engine": self.write_config.engine,
        }
        if self.write_config.schema_mode is not None:
            writer_kwargs["schema_mode"] = self.write_config.schema_mode
        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(
            target=write_deltalake,
            kwargs=writer_kwargs,
        )
        writer.start()
        writer.join()
        # Surface an abnormal exit instead of discarding it. This is logged rather
        # than raised because, per the note above, the child may abort after a
        # write that actually completed.
        if writer.exitcode != 0:
            logger.error(
                f"delta table writer process exited with code {writer.exitcode} "
                f"while writing to {self.connector_config.table_uri}",
            )