unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,19 +0,0 @@
1
- import os
2
- import shutil
3
- from pathlib import Path
4
-
5
- from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.pipeline.interfaces import CopyNode
8
-
9
-
10
- class Copier(CopyNode):
11
- def run(self, json_path: str):
12
- filename = os.path.basename(json_path)
13
- doc_hash = os.path.splitext(filename)[0]
14
- ingest_doc_dict = self.pipeline_context.ingest_docs_map[doc_hash]
15
- ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
16
- desired_output = ingest_doc._output_filename
17
- Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
18
- logger.info(f"copying {json_path} -> {desired_output}")
19
- shutil.copy(json_path, desired_output)
@@ -1,12 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.pipeline.interfaces import DocFactoryNode
5
-
6
-
7
- @dataclass
8
- class DocFactory(DocFactoryNode):
9
- def run(self, *args, **kwargs) -> t.Iterable[dict]:
10
- docs = self.source_doc_connector.get_ingest_docs()
11
- json_docs = [doc.to_dict() for doc in docs]
12
- return json_docs
@@ -1,60 +0,0 @@
1
- import hashlib
2
- import json
3
- import typing as t
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
9
- from unstructured_ingest.error import PartitionError
10
- from unstructured_ingest.logger import logger
11
- from unstructured_ingest.pipeline.interfaces import PartitionNode
12
- from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
13
-
14
-
15
- @dataclass
16
- class Partitioner(PartitionNode):
17
- @PartitionError.wrap
18
- def run(self, ingest_doc_dict) -> Optional[str]:
19
- try:
20
- doc = create_ingest_doc_from_dict(ingest_doc_dict)
21
- doc_filename_hash = get_ingest_doc_hash(ingest_doc_dict)
22
- hashed_filename = hashlib.sha256(
23
- f"{self.create_hash()}{doc_filename_hash}".encode(),
24
- ).hexdigest()[:32]
25
- self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_dict
26
- doc_filename = f"{hashed_filename}.json"
27
- json_path = (Path(self.get_path()) / doc_filename).resolve()
28
- if (
29
- not self.pipeline_context.reprocess
30
- and json_path.is_file()
31
- and json_path.stat().st_size
32
- ):
33
- logger.info(f"file exists: {json_path}, skipping partition")
34
- return str(json_path)
35
- partition_kwargs: t.Dict[str, t.Any] = {
36
- "strategy": self.partition_config.strategy,
37
- "encoding": self.partition_config.encoding,
38
- "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
39
- "languages": self.partition_config.ocr_languages,
40
- "hi_res_model_name": self.partition_config.hi_res_model_name,
41
- }
42
- if self.partition_config.skip_infer_table_types:
43
- partition_kwargs["skip_infer_table_types"] = (
44
- self.partition_config.skip_infer_table_types
45
- )
46
- if self.partition_config.additional_partition_args:
47
- partition_kwargs.update(self.partition_config.additional_partition_args)
48
- elements = doc.process_file(
49
- partition_config=self.partition_config,
50
- **partition_kwargs,
51
- )
52
- with open(json_path, "w", encoding="utf8") as output_f:
53
- logger.info(f"writing partitioned content to {json_path}")
54
- json.dump(elements, output_f, ensure_ascii=False, indent=2, sort_keys=True)
55
- return str(json_path)
56
- except Exception as e:
57
- if self.pipeline_context.raise_on_error:
58
- raise
59
- logger.error(f"failed to partition doc: {ingest_doc_dict}, {e}", exc_info=True)
60
- return None
@@ -1,12 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- from unstructured_ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
4
- from unstructured_ingest.pipeline.interfaces import PermissionsNode
5
-
6
-
7
- @dataclass
8
- class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
9
- processor_config: ProcessorConfig
10
-
11
- def run(self):
12
- self.cleanup_permissions()
@@ -1,134 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import hashlib
4
- import json
5
- import os.path
6
- from dataclasses import dataclass
7
- from pathlib import Path
8
- from typing import TYPE_CHECKING, Optional
9
-
10
- from unstructured_ingest.interfaces import ChunkingConfig, PartitionConfig
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.pipeline.interfaces import ReformatNode
13
- from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
14
-
15
- if TYPE_CHECKING:
16
- from unstructured.documents.elements import Element
17
-
18
-
19
- @dataclass
20
- class Chunker(ReformatNode):
21
- """Implementation for the chunking node in the ingest pipeline.
22
-
23
- Parameters
24
- ----------
25
- pipeline_context: PipelineContext (inherited from parent class)
26
- chunking_config: ChunkingConfig
27
- partition_config: PartitionConfig
28
- """
29
-
30
- chunking_config: ChunkingConfig
31
- partition_config: PartitionConfig
32
-
33
- def initialize(self):
34
- logger.info(
35
- f"Running chunking node. Chunking config: {self.chunking_config.to_json()}]",
36
- )
37
- super().initialize()
38
-
39
- def create_hash(self) -> str:
40
- hash_dict = self.chunking_config.to_dict()
41
- return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
42
-
43
- def run(self, elements_json: str) -> Optional[str]:
44
- try:
45
- elements_json_filename = os.path.basename(elements_json)
46
- filename_ext = os.path.basename(elements_json_filename)
47
- filename = os.path.splitext(filename_ext)[0]
48
- hashed_filename = hashlib.sha256(
49
- f"{self.create_hash()}{filename}".encode(),
50
- ).hexdigest()[:32]
51
- json_filename = f"{hashed_filename}.json"
52
- json_path = (Path(self.get_path()) / json_filename).resolve()
53
- self.pipeline_context.ingest_docs_map[hashed_filename] = (
54
- self.pipeline_context.ingest_docs_map[filename]
55
- )
56
- if (
57
- not self.pipeline_context.reprocess
58
- and json_path.is_file()
59
- and json_path.stat().st_size
60
- ):
61
- logger.debug(f"file exists: {json_path}, skipping chunking")
62
- return str(json_path)
63
-
64
- chunked_elements = self.chunk(elements_json)
65
-
66
- # -- return if chunking_strategy is None --
67
- if chunked_elements is None:
68
- logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
69
- return
70
-
71
- element_dicts = [e.to_dict() for e in chunked_elements]
72
- assign_and_map_hash_ids(elements=element_dicts)
73
-
74
- with open(json_path, "w", encoding="utf8") as output_f:
75
- logger.info(f"writing chunking content to {json_path}")
76
- json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
77
- return str(json_path)
78
-
79
- except Exception as e:
80
- if self.pipeline_context.raise_on_error:
81
- raise
82
- logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
83
- return None
84
-
85
- def get_path(self) -> Path:
86
- return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
87
-
88
- def chunk(self, elements_json_file: str) -> Optional[list["Element"]]:
89
- """Called by Chunker.run() to properly execute the defined chunking_strategy."""
90
- # -- No chunking_strategy means no chunking --
91
- if self.chunking_config.chunking_strategy is None:
92
- return
93
- # -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
94
- if self.chunking_config.chunking_strategy in ("basic", "by_title"):
95
- from unstructured.chunking import dispatch
96
- from unstructured.staging.base import elements_from_json
97
-
98
- return dispatch.chunk(
99
- elements=elements_from_json(filename=elements_json_file),
100
- chunking_strategy=self.chunking_config.chunking_strategy,
101
- combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars,
102
- include_orig_elements=self.chunking_config.include_orig_elements,
103
- max_characters=self.chunking_config.max_characters,
104
- multipage_sections=self.chunking_config.multipage_sections,
105
- new_after_n_chars=self.chunking_config.new_after_n_chars,
106
- overlap=self.chunking_config.overlap,
107
- overlap_all=self.chunking_config.overlap_all,
108
- )
109
- # -- Chunk remotely --
110
- if self.partition_config.partition_by_api:
111
- from unstructured.partition.api import partition_via_api
112
-
113
- return partition_via_api(
114
- filename=elements_json_file,
115
- # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
116
- # -- error, which will be caught and logged by Chunker.run()
117
- api_key=self.partition_config.api_key, # type: ignore
118
- api_url=self.partition_config.partition_endpoint, # type: ignore
119
- chunking_strategy=self.chunking_config.chunking_strategy,
120
- combine_under_n_chars=self.chunking_config.combine_text_under_n_chars,
121
- include_orig_elements=self.chunking_config.include_orig_elements,
122
- max_characters=self.chunking_config.max_characters,
123
- multipage_sections=self.chunking_config.multipage_sections,
124
- new_after_n_chars=self.chunking_config.new_after_n_chars,
125
- overlap=self.chunking_config.overlap,
126
- overlap_all=self.chunking_config.overlap_all,
127
- )
128
- # -- Warn that the defined chunking_strategy is not locally available --
129
- logger.warning(
130
- f"There is no locally available chunking_strategy:"
131
- f" {self.chunking_config.chunking_strategy}."
132
- f" If trying to partition remotely, check that `partition_by_api`, `api_url`,"
133
- f" and `api_key` are correctly defined."
134
- )
@@ -1,64 +0,0 @@
1
- import hashlib
2
- import json
3
- import os.path
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from unstructured_ingest.interfaces import (
9
- EmbeddingConfig,
10
- )
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.pipeline.interfaces import ReformatNode
13
-
14
-
15
- @dataclass
16
- class Embedder(ReformatNode):
17
- embedder_config: EmbeddingConfig
18
-
19
- def initialize(self):
20
- logger.info(
21
- f"Running embedding node. Embedding config: {self.embedder_config.to_json()}]",
22
- )
23
- super().initialize()
24
-
25
- def create_hash(self) -> str:
26
- hash_dict = self.embedder_config.to_dict()
27
- return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
28
-
29
- def run(self, elements_json: str) -> Optional[str]:
30
- try:
31
- elements_json_filename = os.path.basename(elements_json)
32
- filename_ext = os.path.basename(elements_json_filename)
33
- filename = os.path.splitext(filename_ext)[0]
34
- hashed_filename = hashlib.sha256(
35
- f"{self.create_hash()}{filename}".encode(),
36
- ).hexdigest()[:32]
37
- json_filename = f"{hashed_filename}.json"
38
- json_path = (Path(self.get_path()) / json_filename).resolve()
39
- self.pipeline_context.ingest_docs_map[hashed_filename] = (
40
- self.pipeline_context.ingest_docs_map[filename]
41
- )
42
- if (
43
- not self.pipeline_context.reprocess
44
- and json_path.is_file()
45
- and json_path.stat().st_size
46
- ):
47
- logger.debug(f"file exists: {json_path}, skipping embedding")
48
- return str(json_path)
49
- with open(elements_json) as f:
50
- elements = json.load(f)
51
- embedder = self.embedder_config.get_embedder()
52
- element_dicts = embedder.embed_documents(elements=elements)
53
- with open(json_path, "w", encoding="utf8") as output_f:
54
- logger.info(f"writing embeddings content to {json_path}")
55
- json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
56
- return str(json_path)
57
- except Exception as e:
58
- if self.pipeline_context.raise_on_error:
59
- raise
60
- logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
61
- return None
62
-
63
- def get_path(self) -> Path:
64
- return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
@@ -1,77 +0,0 @@
1
- import os
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
- from unstructured_ingest.interfaces import (
7
- BaseIngestDocBatch,
8
- BaseSessionHandle,
9
- BaseSingleIngestDoc,
10
- IngestDocSessionHandleMixin,
11
- )
12
- from unstructured_ingest.logger import logger
13
- from unstructured_ingest.pipeline.interfaces import SourceNode
14
-
15
- # module-level variable to store session handle
16
- session_handle: t.Optional[BaseSessionHandle] = None
17
-
18
-
19
- @dataclass
20
- class Reader(SourceNode):
21
- def get_single(self, doc: BaseSingleIngestDoc, ingest_doc_dict: dict) -> str:
22
- if (
23
- not self.read_config.re_download
24
- and doc.filename.is_file()
25
- and doc.filename.stat().st_size
26
- ):
27
- logger.info(f"file exists: {doc.filename}, skipping download")
28
- # Still need to fetch metadata if file exists locally
29
- doc.update_source_metadata()
30
- else:
31
- serialized_doc = doc.to_json(redact_sensitive=True)
32
- logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
33
- if self.retry_strategy:
34
- self.retry_strategy(doc.get_file)
35
- else:
36
- doc.get_file()
37
- for k, v in doc.to_dict().items():
38
- ingest_doc_dict[k] = v
39
- return doc.filename
40
-
41
- def get_batch(self, doc_batch: BaseIngestDocBatch, ingest_doc_dict: dict) -> t.List[str]:
42
- if self.retry_strategy:
43
- self.retry_strategy(doc_batch.get_files)
44
- else:
45
- doc_batch.get_files()
46
- for k, v in doc_batch.to_dict().items():
47
- ingest_doc_dict[k] = v
48
- return [doc.filename for doc in doc_batch.ingest_docs]
49
-
50
- def run(self, ingest_doc_dict: dict) -> t.Optional[t.Union[str, t.List[str]]]:
51
- try:
52
- global session_handle
53
- doc = create_ingest_doc_from_dict(ingest_doc_dict)
54
- if isinstance(doc, IngestDocSessionHandleMixin):
55
- if session_handle is None:
56
- # create via doc.session_handle, which is a property that creates a
57
- # session handle if one is not already defined
58
- session_handle = doc.session_handle
59
- else:
60
- doc._session_handle = session_handle
61
- if isinstance(doc, BaseSingleIngestDoc):
62
- return self.get_single(doc=doc, ingest_doc_dict=ingest_doc_dict)
63
- elif isinstance(doc, BaseIngestDocBatch):
64
- return self.get_batch(doc_batch=doc, ingest_doc_dict=ingest_doc_dict)
65
- else:
66
- raise ValueError(
67
- f"type of doc ({type(doc)}) is not a recognized type: "
68
- f"BaseSingleIngestDoc or BaseSingleIngestDoc"
69
- )
70
- except Exception as e:
71
- if self.pipeline_context.raise_on_error:
72
- raise
73
- logger.error(
74
- f"failed to get data associated with source doc: {ingest_doc_dict}, {e}",
75
- exc_info=True,
76
- )
77
- return None
@@ -1,6 +0,0 @@
1
- import hashlib
2
-
3
-
4
- def get_ingest_doc_hash(json_as_dict: dict) -> str:
5
- hashed = hashlib.sha256(json_as_dict["unique_id"].encode()).hexdigest()[:32]
6
- return hashed
@@ -1,18 +0,0 @@
1
- import os.path
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
- from unstructured_ingest.pipeline.interfaces import WriteNode
7
-
8
-
9
- @dataclass
10
- class Writer(WriteNode):
11
- def run(self, json_paths: t.List[str]):
12
- ingest_docs = []
13
- for json_path in json_paths:
14
- filename = os.path.basename(json_path)
15
- doc_hash = os.path.splitext(filename)[0]
16
- ingest_doc_dict = self.pipeline_context.ingest_docs_map[doc_hash]
17
- ingest_docs.append(create_ingest_doc_from_dict(ingest_doc_dict))
18
- self.dest_doc_connector.write(docs=ingest_docs)
@@ -1,93 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import multiprocessing as mp
4
- from contextlib import suppress
5
- from typing import Optional
6
-
7
- from unstructured_ingest.interfaces import (
8
- BaseDestinationConnector,
9
- BaseSourceConnector,
10
- ChunkingConfig,
11
- EmbeddingConfig,
12
- PartitionConfig,
13
- PermissionsConfig,
14
- ProcessorConfig,
15
- RetryStrategyConfig,
16
- )
17
- from unstructured_ingest.pipeline import (
18
- Chunker,
19
- DocFactory,
20
- Embedder,
21
- Partitioner,
22
- PermissionsDataCleaner,
23
- Pipeline,
24
- PipelineContext,
25
- Reader,
26
- ReformatNode,
27
- Writer,
28
- )
29
-
30
- with suppress(RuntimeError):
31
- mp.set_start_method("spawn")
32
-
33
-
34
- def process_documents(
35
- processor_config: ProcessorConfig,
36
- source_doc_connector: BaseSourceConnector,
37
- partition_config: PartitionConfig,
38
- dest_doc_connector: Optional[BaseDestinationConnector] = None,
39
- chunking_config: Optional[ChunkingConfig] = None,
40
- embedder_config: Optional[EmbeddingConfig] = None,
41
- permissions_config: Optional[PermissionsConfig] = None,
42
- retry_strategy_config: Optional[RetryStrategyConfig] = None,
43
- ) -> None:
44
- pipeline_config = PipelineContext.from_dict(processor_config.to_dict())
45
- doc_factory = DocFactory(
46
- pipeline_context=pipeline_config,
47
- source_doc_connector=source_doc_connector,
48
- )
49
- reader = Reader(
50
- pipeline_context=pipeline_config,
51
- retry_strategy_config=retry_strategy_config,
52
- read_config=source_doc_connector.read_config,
53
- )
54
- partitioner = Partitioner(pipeline_context=pipeline_config, partition_config=partition_config)
55
- reformat_nodes: list[ReformatNode] = []
56
- if chunking_config:
57
- reformat_nodes.append(
58
- Chunker(
59
- pipeline_context=pipeline_config,
60
- chunking_config=chunking_config,
61
- partition_config=partition_config,
62
- ),
63
- )
64
- if embedder_config:
65
- reformat_nodes.append(
66
- Embedder(
67
- pipeline_context=pipeline_config,
68
- embedder_config=embedder_config,
69
- ),
70
- )
71
- writer = (
72
- Writer(
73
- pipeline_context=pipeline_config,
74
- dest_doc_connector=dest_doc_connector,
75
- )
76
- if dest_doc_connector
77
- else None
78
- )
79
- permissions_data_cleaner = (
80
- PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config)
81
- if permissions_config
82
- else None
83
- )
84
- pipeline = Pipeline(
85
- pipeline_context=pipeline_config,
86
- doc_factory_node=doc_factory,
87
- source_node=reader,
88
- partition_node=partitioner,
89
- reformat_nodes=reformat_nodes,
90
- write_node=writer,
91
- permissions_node=permissions_data_cleaner,
92
- )
93
- pipeline.run()
@@ -1,104 +0,0 @@
1
- import typing as t
2
- from typing import Type
3
-
4
- from .airtable import AirtableRunner
5
- from .astradb import AstraDBRunner
6
- from .base_runner import Runner
7
- from .biomed import BiomedRunner
8
- from .confluence import ConfluenceRunner
9
- from .delta_table import DeltaTableRunner
10
- from .discord import DiscordRunner
11
- from .elasticsearch import ElasticSearchRunner
12
- from .fsspec.azure import AzureRunner
13
- from .fsspec.box import BoxRunner
14
- from .fsspec.dropbox import DropboxRunner
15
- from .fsspec.fsspec import FsspecRunner
16
- from .fsspec.gcs import GCSRunner
17
- from .fsspec.s3 import S3Runner
18
- from .fsspec.sftp import SftpRunner
19
- from .github import GithubRunner
20
- from .gitlab import GitlabRunner
21
- from .google_drive import GoogleDriveRunner
22
- from .hubspot import HubSpotRunner
23
- from .jira import JiraRunner
24
- from .kafka import KafkaRunner
25
- from .local import LocalRunner
26
- from .mongodb import MongoDBRunner
27
- from .notion import NotionRunner
28
- from .onedrive import OneDriveRunner
29
- from .opensearch import OpenSearchRunner
30
- from .outlook import OutlookRunner
31
- from .reddit import RedditRunner
32
- from .salesforce import SalesforceRunner
33
- from .sharepoint import SharePointRunner
34
- from .slack import SlackRunner
35
- from .wikipedia import WikipediaRunner
36
-
37
- runner_map: t.Dict[str, Type[Runner]] = {
38
- "airtable": AirtableRunner,
39
- "astradb": AstraDBRunner,
40
- "azure": AzureRunner,
41
- "biomed": BiomedRunner,
42
- "box": BoxRunner,
43
- "confluence": ConfluenceRunner,
44
- "delta_table": DeltaTableRunner,
45
- "discord": DiscordRunner,
46
- "dropbox": DropboxRunner,
47
- "elasticsearch": ElasticSearchRunner,
48
- "fsspec": FsspecRunner,
49
- "gcs": GCSRunner,
50
- "github": GithubRunner,
51
- "gitlab": GitlabRunner,
52
- "gdrive": GoogleDriveRunner,
53
- "google_drive": GoogleDriveRunner,
54
- "hubspot": HubSpotRunner,
55
- "jira": JiraRunner,
56
- "kafka": KafkaRunner,
57
- "local": LocalRunner,
58
- "mongodb": MongoDBRunner,
59
- "notion": NotionRunner,
60
- "onedrive": OneDriveRunner,
61
- "opensearch": OpenSearchRunner,
62
- "outlook": OutlookRunner,
63
- "reddit": RedditRunner,
64
- "s3": S3Runner,
65
- "salesforce": SalesforceRunner,
66
- "sftp": SftpRunner,
67
- "sharepoint": SharePointRunner,
68
- "slack": SlackRunner,
69
- "wikipedia": WikipediaRunner,
70
- }
71
-
72
- __all__ = [
73
- "AirtableRunner",
74
- "AstraRunner",
75
- "AzureRunner",
76
- "BiomedRunner",
77
- "BoxRunner",
78
- "ConfluenceRunner",
79
- "DeltaTableRunner",
80
- "DiscordRunner",
81
- "DropboxRunner",
82
- "ElasticSearchRunner",
83
- "FsspecRunner",
84
- "GCSRunner",
85
- "GoogleDriveRunner",
86
- "GithubRunner",
87
- "GitlabRunner",
88
- "JiraRunner",
89
- "KafkaRunner",
90
- "LocalRunner",
91
- "MongoDBRunner",
92
- "NotionRunner",
93
- "OneDriveRunner",
94
- "OpenSearchRunner",
95
- "OutlookRunner",
96
- "RedditRunner",
97
- "S3Runner",
98
- "SalesforceRunner",
99
- "SharePointRunner",
100
- "SlackRunner",
101
- "WikipediaRunner",
102
- "runner_map",
103
- "Runner",
104
- ]
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.airtable import SimpleAirtableConfig
12
-
13
-
14
- @dataclass
15
- class AirtableRunner(Runner):
16
- connector_config: "SimpleAirtableConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.access_config.personal_access_token.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="airtable",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.airtable import (
32
- AirtableSourceConnector,
33
- )
34
-
35
- return AirtableSourceConnector
@@ -1,34 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.astradb import SimpleAstraDBConfig
12
-
13
-
14
- @dataclass
15
- class AstraDBRunner(Runner):
16
- connector_config: "SimpleAstraDBConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
21
- )
22
- self.read_config.download_dir = update_download_dir_hash(
23
- connector_name="astradb",
24
- read_config=self.read_config,
25
- hashed_dir_name=hashed_dir_name,
26
- logger=logger,
27
- )
28
-
29
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
30
- from unstructured_ingest.connector.astradb import (
31
- AstraDBSourceConnector,
32
- )
33
-
34
- return AstraDBSourceConnector