unstructured-ingest 0.6.2 (py3-none-any.whl) → 0.7.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,573 +0,0 @@
1
- import json
2
- import os
3
- import typing as t
4
- from dataclasses import dataclass
5
- from html import unescape
6
- from pathlib import Path
7
- from urllib.parse import urlparse
8
-
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
- from unstructured_ingest.interfaces import (
12
- AccessConfig,
13
- BaseConnectorConfig,
14
- BaseSingleIngestDoc,
15
- BaseSourceConnector,
16
- IngestDocCleanupMixin,
17
- SourceConnectorCleanupMixin,
18
- SourceMetadata,
19
- )
20
- from unstructured_ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig
21
- from unstructured_ingest.logger import logger
22
- from unstructured_ingest.utils.dep_check import requires_dependencies
23
- from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime
24
-
25
- if t.TYPE_CHECKING:
26
- from office365.sharepoint.client_context import ClientContext
27
- from office365.sharepoint.files.file import File
28
- from office365.sharepoint.publishing.pages.page import SitePage
29
-
30
# Size threshold in bytes (despite the "MB" in the name); files larger than
# this are downloaded in chunks rather than in a single request.
MAX_MB_SIZE = 512 * 1_000_000
# List-item fields selected when a site page is fetched.
CONTENT_LABELS = [
    "CanvasContent1",
    "LayoutWebpartsContent1",
    "TimeCreated",
]
32
-
33
-
34
@dataclass
class SharepointAccessConfig(AccessConfig):
    """Secret part of the SharePoint configuration.

    ``client_cred`` is the client credential paired with the connector's
    ``client_id``; it is flagged sensitive and kept out of ``repr`` output.
    """

    client_cred: str = enhanced_field(sensitive=True, repr=False)
37
-
38
-
39
@dataclass
class SimpleSharepointConfig(BaseConnectorConfig):
    """Connection settings for the SharePoint source connector.

    ``client_id``, ``access_config.client_cred`` and ``site`` are mandatory and
    validated in ``__post_init__``. ``process_pages`` is not an init argument;
    it is derived as the inverse of ``files_only``.
    """

    access_config: SharepointAccessConfig
    client_id: str
    site: str
    path: str
    process_pages: bool = enhanced_field(default=True, init=False)
    recursive: bool = False
    files_only: bool = False
    permissions_config: t.Optional[SharepointPermissionsConfig] = None

    def __post_init__(self):
        """Validate the mandatory values and derive ``process_pages``.

        Raises:
            ValueError: if ``client_id``, ``client_cred`` or ``site`` is empty.
        """
        if not (self.client_id and self.access_config.client_cred and self.site):
            raise ValueError(
                "Please provide one of the following mandatory values:"
                "\n--client-id\n--client-cred\n--site",
            )
        self.process_pages = not self.files_only

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_site_client(self, site_url: str = "") -> "ClientContext":
        """Build a credentialed ``ClientContext`` for ``site_url``.

        Args:
            site_url: site to connect to; falls back to ``self.site`` when empty.

        Returns:
            The client context with client credentials attached.

        Raises:
            Exception: whatever the client construction raises, after logging.
        """
        from office365.runtime.auth.client_credential import ClientCredential
        from office365.sharepoint.client_context import ClientContext

        try:
            site_client = ClientContext(site_url or self.site).with_credentials(
                ClientCredential(self.client_id, self.access_config.client_cred),
            )
        except Exception:
            logger.error("Couldn't set Sharepoint client.")
            raise
        return site_client

    def get_permissions_client(self):
        """Create the permissions connector, or return ``None`` on any failure.

        Best effort: every error (including an empty access token) is logged
        and swallowed so that document ingestion can continue without
        permissions data.
        """
        try:
            permissions_connector = SharepointPermissionsConnector(self.permissions_config)
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if not permissions_connector.access_token:
                raise ValueError("received an empty access token")
            return permissions_connector
        except Exception as e:
            # The exception needs a %s placeholder; without one logging cannot
            # format the record and the message is lost.
            logger.error("Couldn't obtain Sharepoint permissions ingestion access token: %s", e)
            return None
79
-
80
-
81
@dataclass
class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single SharePoint file or site page to download and process.

    ``server_path`` locates the object on the SharePoint server, ``file_path``
    is the relative path used to lay out the local download/output files, and
    ``is_page`` selects between site-page and regular-file handling.
    """

    connector_config: SimpleSharepointConfig
    site_url: str
    server_path: str
    is_page: bool
    file_path: str
    registry_name: str = "sharepoint"

    def __post_init__(self):
        """Resolve the local file extension and the download/output paths.

        Raises:
            ValueError: if a non-page file has no extension.
        """
        self.extension = Path(self.file_path).suffix if not self.is_page else ".html"
        # .aspx content is saved locally as HTML.
        self.extension = ".html" if self.extension == ".aspx" else self.extension
        if not self.extension:
            raise ValueError("Unsupported file without extension.")

        self._set_download_paths()

    def _set_download_paths(self) -> None:
        """Parses the folder structure from the source and creates the download and output paths"""
        download_path = Path(f"{self.read_config.download_dir}")
        output_path = Path(f"{self.processor_config.output_dir}")
        parent = Path(self.file_path).with_suffix(self.extension)
        self.download_dir = (download_path / parent.parent).resolve()
        self.download_filepath = (download_path / parent).resolve()
        output_filename = str(parent) + ".json"
        self.output_dir = (output_path / parent.parent).resolve()
        self.output_filepath = (output_path / output_filename).resolve()

    @property
    def filename(self):
        """Absolute path of the downloaded file."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Absolute path of the JSON output file."""
        return Path(self.output_filepath).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Server path and site URL identifying this document at the source."""
        return {
            "server_path": self.server_path,
            "site_url": self.site_url,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["office365"], extras="sharepoint")
    def _fetch_file(self, properties_only: bool = False):
        """Retrieves the actual page/file from the Sharepoint instance

        For pages the list-item fields in ``CONTENT_LABELS`` are always queried.
        For regular files the query is only executed when ``properties_only``
        is true; otherwise the unexecuted file handle is returned.

        Returns:
            The file/list-item object, or ``None`` when the server answers 404.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        site_client = self.connector_config.get_site_client(self.site_url)

        try:
            if self.is_page:
                file = site_client.web.get_file_by_server_relative_path("/" + self.server_path)
                file = file.listItemAllFields.select(CONTENT_LABELS).get().execute_query()
            else:
                file = site_client.web.get_file_by_server_relative_url(self.server_path)
                if properties_only:
                    file = file.get().execute_query()
        except ClientRequestException as e:
            if e.response.status_code == 404:
                return None
            raise
        return file

    def _fetch_page(self):
        """Fetch the site page with its publishing metadata; ``None`` on any error."""
        site_client = self.connector_config.get_site_client(self.site_url)
        try:
            page = (
                site_client.site_pages.pages.get_by_url(self.server_path)
                .expand(["FirstPublished", "Modified", "Version"])
                .get()
                .execute_query()
            )
        except Exception as e:
            logger.error(f"Failed to retrieve page {self.server_path} from site {self.site_url}")
            logger.error(e)
            return None
        return page

    def update_permissions_data(self):
        """Load previously written permissions data matching this document.

        Looks in ``<output_dir>/permissions_data/{sites,other}`` for files named
        ``<parent>_SEP_<document name>.json``.

        Returns:
            The parsed JSON content of the matching file, or ``None`` when the
            directory is missing or nothing matches.
        """
        permissions_dir = Path(self.processor_config.output_dir) / "permissions_data"
        if not permissions_dir.is_dir():
            return None

        path_parts = self.file_path.split("/")
        parent_type = path_parts[0]
        read_dir = permissions_dir / ("sites" if parent_type == "sites" else "other")
        # The subdirectory only exists if at least one such permissions file was written.
        if not read_dir.is_dir():
            return None

        ingestdoc_docname = path_parts[-1]
        permissions_data = None
        for filename in os.listdir(read_dir):
            parent_name, sep, permissions_docname = os.path.splitext(filename)[0].partition(
                "_SEP_"
            )
            # Skip stray files that do not follow the <parent>_SEP_<name> scheme.
            if not sep or permissions_docname != ingestdoc_docname:
                continue

            if parent_type == "sites":
                parent_matches = len(path_parts) > 1 and parent_name == path_parts[1]
            else:
                parent_matches = parent_type in ("SitePages", "Shared Documents")

            if parent_matches:
                with open(read_dir / filename, encoding="utf-8") as f:
                    permissions_data = json.load(f)

        return permissions_data

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` from the remote page or file.

        Sets ``exists=False`` when the remote object cannot be retrieved.
        Permissions data is attached only when a permissions config is set.
        """
        if self.is_page:
            page = self._fetch_page()
            if page is None:
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            self.source_metadata = SourceMetadata(
                date_created=page.get_property("FirstPublished", None),
                date_modified=page.get_property("Modified", None),
                version=page.get_property("Version", ""),
                source_url=page.absolute_url,
                exists=True,
                permissions_data=(
                    self.update_permissions_data()
                    if self.connector_config.permissions_config
                    else None
                ),
            )
            return

        file = self._fetch_file(True)
        if file is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            date_created=ensure_isoformat_datetime(timestamp=file.time_created),
            date_modified=ensure_isoformat_datetime(timestamp=file.time_last_modified),
            version=file.major_version,
            source_url=file.properties.get("LinkingUrl", None),
            exists=True,
            permissions_data=(
                self.update_permissions_data() if self.connector_config.permissions_config else None
            ),
        )

    def _download_page(self):
        """Formats and saves locally page content

        Raises:
            FileNotFoundError: if the page no longer exists on the server.
        """
        content = self._fetch_file()
        self.update_source_metadata()
        # _fetch_file returns None on a 404; fail with a clear message instead
        # of an AttributeError on `content.properties`.
        if content is None:
            raise FileNotFoundError(
                f"Page {self.server_path} not found on site {self.site_url}",
            )
        pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + (
            content.properties.get("CanvasContent1", "") or ""
        )
        if pld != "":
            pld = unescape(pld)
        else:
            logger.info(
                f"Page {self.server_path} has no retrievable content. Dumping empty doc.",
            )
            pld = "<div></div>"

        self.output_dir.mkdir(parents=True, exist_ok=True)
        if not self.download_dir.is_dir():
            logger.debug(f"creating directory: {self.download_dir}")
            self.download_dir.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so non-ASCII page content does not depend on the
        # platform default.
        with self.filename.open(mode="w", encoding="utf-8") as f:
            f.write(pld)
        logger.info(f"file downloaded: {self.filename}")

    def _download_file(self):
        """Download the file to ``self.filename``, chunked above ``MAX_MB_SIZE`` bytes.

        Raises:
            FileNotFoundError: if the file no longer exists on the server.
        """
        file = self._fetch_file()
        self.update_source_metadata()
        # _fetch_file returns None on a 404; fail with a clear message instead
        # of an AttributeError on `file.length`.
        if file is None:
            raise FileNotFoundError(
                f"File {self.server_path} not found on site {self.site_url}",
            )
        fsize = file.length
        self.output_dir.mkdir(parents=True, exist_ok=True)

        if not self.download_dir.is_dir():
            logger.debug(f"creating directory: {self.download_dir}")
            self.download_dir.mkdir(parents=True, exist_ok=True)

        if fsize > MAX_MB_SIZE:
            logger.info(f"downloading file with size: {fsize} bytes in chunks")
            with self.filename.open(mode="wb") as f:
                file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
        else:
            with self.filename.open(mode="wb") as f:
                file.download(f).execute_query()
        logger.info(f"file downloaded: {self.filename}")

    @BaseSingleIngestDoc.skip_if_file_exists
    @SourceConnectionError.wrap
    @requires_dependencies(["office365"], extras="sharepoint")
    def get_file(self):
        """Download the page or file this document represents."""
        if self.is_page:
            self._download_page()
        else:
            self._download_file()
        return
290
-
291
-
292
@dataclass
class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists files and site pages of a SharePoint site (or of every tenant site)."""

    connector_config: SimpleSharepointConfig

    def check_connection(self):
        """Verify the configured site is reachable by listing its pages.

        Raises:
            SourceConnectionError: if the query fails for any reason.
        """
        try:
            site_client = self.connector_config.get_site_client()
            site_client.site_pages.pages.get().execute_query()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["office365"], extras="sharepoint")
    def _list_files(self, folder, recursive) -> t.List["File"]:
        """Return the files in ``folder``, descending into subfolders if ``recursive``.

        Folders whose server-relative URL contains ``/Forms`` are skipped.
        A request error yields an empty list; it is logged unless it is a 404.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            objects = folder.expand(["Files", "Folders"]).get().execute_query()
            files = list(objects.files)
            if not recursive:
                return files
            for subfolder in objects.folders:
                if "/Forms" in subfolder.serverRelativeUrl:
                    continue
                files.extend(self._list_files(subfolder, recursive))
            return files
        except ClientRequestException as e:
            if e.response.status_code != 404:
                logger.info("Caught an error while processing documents %s", e.response.text)
            return []

    def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_page=False):
        """Build a ``SharepointIngestDoc`` for a file or site page.

        Args:
            obj: the SharePoint file or page object.
            base_url: URL of the site the object belongs to.
            is_page: whether ``obj`` is a site page.

        Raises:
            ValueError: propagated from ``SharepointIngestDoc`` for files
                without an extension.
        """
        if is_page:
            file_path = obj.get_property("Url", "")
            # startswith() instead of indexing: "Url" defaults to "" and
            # file_path[0] would raise IndexError on an empty string.
            server_path = file_path[1:] if file_path.startswith("/") else file_path
            if (url_path := (urlparse(base_url).path)) and (url_path != "/"):
                file_path = url_path[1:] + "/" + file_path
        else:
            server_path = obj.serverRelativeUrl
            file_path = obj.serverRelativeUrl[1:]

        return SharepointIngestDoc(
            processor_config=self.processor_config,
            read_config=self.read_config,
            connector_config=self.connector_config,
            site_url=base_url,
            server_path=server_path,
            is_page=is_page,
            file_path=file_path,
        )

    @requires_dependencies(["office365"], extras="sharepoint")
    def _list_pages(self, site_client) -> list:
        """Return ingest docs for every site page; empty list on a request error."""
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            site_pages = site_client.site_pages.pages.get().execute_query()
        except ClientRequestException as e:
            logger.info(
                "Caught an error while retrieving site pages from %s \n%s",
                site_client.base_url,
                e.response.text,
            )
            return []

        return [self._prepare_ingest_doc(page, site_client.base_url, True) for page in site_pages]

    def _ingest_site_docs(self, site_client) -> t.List["SharepointIngestDoc"]:
        """Collect ingest docs for the files (and optionally pages) of one site."""
        root_folder = site_client.web.get_folder_by_server_relative_path(self.connector_config.path)
        files = self._list_files(root_folder, self.connector_config.recursive)
        if not files:
            logger.info(
                f"No processable files at path {self.connector_config.path} "
                f"for site {site_client.base_url}",
            )
        output = []
        for file in files:
            try:
                output.append(self._prepare_ingest_doc(file, site_client.base_url))
            except ValueError as e:
                # e.g. a file without an extension; skip it and keep going.
                logger.error("Unable to process file %s", file.properties["Name"])
                logger.error(e)
        if self.connector_config.process_pages:
            page_output = self._list_pages(site_client)
            if not page_output:
                logger.info(f"couldn't process pages for site {site_client.base_url}")
            output = output + page_output
        return output

    def initialize(self):
        """No setup is required for this connector."""
        pass

    def get_ingest_docs(self):
        """Return ingest docs for the configured site, or for all tenant sites.

        When the permissions config carries ``application_id``, ``client_cred``
        and ``tenant``, permissions data is first written under the output
        directory; otherwise that step is skipped.
        """
        base_site_client = self.connector_config.get_site_client()

        if not all(
            getattr(self.connector_config.permissions_config, attr, False)
            for attr in ["application_id", "client_cred", "tenant"]
        ):
            logger.info(
                "Permissions config is not fed with 'application_id', 'client_cred' and 'tenant'. "
                "Skipping permissions ingestion.",
            )
        else:
            permissions_client = self.connector_config.get_permissions_client()
            if permissions_client:
                permissions_client.write_all_permissions(self.processor_config.output_dir)

        if not base_site_client.is_tenant:
            return self._ingest_site_docs(base_site_client)
        tenant = base_site_client.tenant
        tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query()
        # De-duplicate site URLs and drop entries without one.
        tenant_sites = {s.url for s in tenant_sites if (s.url is not None)}
        ingest_docs: t.List[SharepointIngestDoc] = []
        for site_url in tenant_sites:
            logger.info(f"processing docs for site: {site_url}")
            site_client = self.connector_config.get_site_client(site_url)
            ingest_docs.extend(self._ingest_site_docs(site_client))
        return ingest_docs
411
-
412
-
413
@dataclass
class SharepointPermissionsConnector:
    """Fetches drive-item permissions via Microsoft Graph and writes them to disk.

    An access token is requested as soon as the connector is constructed.
    """

    def __init__(self, permissions_config):
        self.permissions_config: SharepointPermissionsConfig = permissions_config
        self.initialize()

    def initialize(self):
        """Obtain and store the Graph access token."""
        self.access_token: str = self.get_access_token()

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_access_token(self) -> str:
        """Request a client-credentials token for the Graph API.

        Raises:
            KeyError: if the response carries no ``access_token`` (e.g. on an
                authentication failure).
        """
        import requests

        url = (
            f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token"
        )
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        data = {
            "client_id": self.permissions_config.application_id,
            "scope": "https://graph.microsoft.com/.default",
            "client_secret": self.permissions_config.client_cred,
            "grant_type": "client_credentials",
        }
        response = requests.post(url, headers=headers, data=data)
        return response.json()["access_token"]

    def validated_response(self, response):
        """Return the decoded JSON body for a 200 response, otherwise log and return ``None``."""
        if response.status_code == 200:
            return response.json()

        logger.info(f"request failed with status code {response.status_code}:")
        logger.info(response.text)
        return None

    def _auth_headers(self):
        """Build the bearer-token header shared by all Graph requests."""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_sites(self):
        """List sites (``webUrl`` and ``id``); ``None`` if the request fails."""
        import requests

        url = "https://graph.microsoft.com/v1.0/sites"
        params = {
            "$select": "webUrl, id",
        }

        response = requests.get(url, params=params, headers=self._auth_headers())
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drives(self, site):
        """List the drives of ``site``; ``None`` if the request fails."""
        import requests

        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives"

        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_drive_items(self, site, drive_id):
        """List the root-level children of a drive; ``None`` if the request fails."""
        import requests

        url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children"

        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    def extract_site_name_from_weburl(self, weburl):
        """Classify a drive item's web URL by the first segment of its path.

        Returns:
            A ``(parent_type, parent_name)`` tuple, or ``(None, None)`` when
            the URL structure is not recognised.
        """
        split_path = urlparse(weburl).path.lstrip("/").split("/")

        # Require the site-name segment so a bare "/sites" URL falls through
        # to the warning below instead of raising IndexError.
        if split_path[0] == "sites" and len(split_path) > 1:
            return "sites", split_path[1]

        elif split_path[0] == "Shared%20Documents":
            return "Shared Documents", "Shared Documents"

        elif split_path[0] == "personal":
            return "Personal", "Personal"

        elif split_path[0] == "_layouts":
            return "layouts", "layouts"

        # if other weburl structures are found, additional logic might need to be implemented

        # The URL needs a %s placeholder; without one logging cannot format
        # the record and the message is lost.
        logger.warning(
            "Couldn't extract sitename, unknown site or parent type. Skipping permissions "
            "ingestion for the document with the URL: %s",
            weburl,
        )

        return None, None

    @requires_dependencies(["requests"], extras="sharepoint")
    def get_permissions_for_drive_item(self, site, drive_id, item_id):
        """Fetch the permissions of one drive item; ``None`` if the request fails."""
        import requests

        # Built from adjacent literals: a backslash continuation inside the
        # string would embed whitespace in the URL.
        url = (
            f"https://graph.microsoft.com/v1.0/sites/{site}"
            f"/drives/{drive_id}/items/{item_id}/permissions"
        )

        response = requests.get(url, headers=self._auth_headers())
        return self.validated_response(response)

    def write_all_permissions(self, output_dir):
        """Write permissions of all root-level drive items under ``output_dir``.

        Files are written to ``<output_dir>/permissions_data/sites`` for items
        under a ``/sites/<name>`` URL and to ``.../permissions_data/other`` for
        everything else, named ``<parent_name>_SEP_<item_name>.json``.
        """
        sites_response = self.get_sites()
        # get_sites() returns None on a non-200 response.
        if not sites_response:
            logger.warning("Couldn't list sites; skipping permissions ingestion")
            return
        sites = [(site["id"], site["webUrl"]) for site in sites_response["value"]]
        drive_ids = []

        logger.info("Obtaining drive data for sites for permissions (rbac)")
        for site_id, _site_url in sites:
            drives = self.get_drives(site_id)
            if drives:
                drive_ids.extend((site_id, drive["id"]) for drive in drives["value"])

        logger.info("Obtaining item data from drives for permissions (rbac)")
        item_ids = []
        for site, drive_id in drive_ids:
            drive_items = self.get_drive_items(site, drive_id)
            if drive_items:
                item_ids.extend(
                    (site, drive_id, item["id"], item["name"], item["webUrl"])
                    for item in drive_items["value"]
                )

        permissions_dir = Path(output_dir) / "permissions_data"

        logger.info("Writing permissions data to disk")
        for site, drive_id, item_id, item_name, item_web_url in item_ids:
            res = self.get_permissions_for_drive_item(site, drive_id, item_id)
            if not res:
                continue

            parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url)
            # NOTE(review): items with an unrecognised URL are still written to
            # "other" with a "None" parent name, although the warning logged by
            # extract_site_name_from_weburl says they are skipped - confirm intent.
            subdir = "sites" if parent_type == "sites" else "other"
            write_path = permissions_dir / subdir / f"{parent_name}_SEP_{item_name}.json"

            write_path.parent.mkdir(parents=True, exist_ok=True)

            with open(write_path, "w", encoding="utf-8") as f:
                json.dump(res["value"], f)