unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,852 +0,0 @@
1
- """Defines Abstract Base Classes (ABC's) core to batch processing documents
2
- through Unstructured."""
3
-
4
- from __future__ import annotations
5
-
6
- import functools
7
- import json
8
- import os
9
- import re
10
- from abc import ABC, abstractmethod
11
- from dataclasses import InitVar, dataclass, field
12
- from datetime import datetime
13
- from pathlib import Path
14
- from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar
15
-
16
- from dataclasses_json import DataClassJsonMixin
17
- from dataclasses_json.core import Json, _decode_dataclass
18
-
19
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
20
- from unstructured_ingest.enhanced_dataclass.core import _asdict
21
- from unstructured_ingest.error import PartitionError, SourceConnectionError
22
- from unstructured_ingest.logger import logger
23
- from unstructured_ingest.utils.data_prep import flatten_dict
24
- from unstructured_ingest.v2.unstructured_api import call_api
25
-
26
- if TYPE_CHECKING:
27
- from unstructured.documents.elements import Element
28
-
29
- from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
30
-
31
# Type variable used by the `from_dict` classmethods below so they can be
# annotated as returning an instance of the class they are invoked on.
A = TypeVar("A", bound="DataClassJsonMixin")

# -- Workaround for the TypeError raised when InitVar is combined with
# -- `from __future__ import annotations`.
# -- Background: https://stackoverflow.com/questions/70400639/
InitVar.__call__ = lambda *args: None  # type: ignore

# URL schemes that FsspecConfig accepts in `remote_url`; any other scheme is
# rejected with a ValueError.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
    "s3", "s3a",
    "abfs", "az",
    "gs", "gcs",
    "box",
    "dropbox",
    "sftp",
]
48
-
49
-
50
@dataclass
class BaseSessionHandle(ABC):
    """Base class for resources that are shared within, and local to, a single
    process -- for example a connection used to request and fetch documents."""
54
-
55
-
56
@dataclass
class BaseConfig(EnhancedDataClassJsonMixin, ABC):
    """Common root of every config dataclass in this module; declares no fields
    of its own and takes its serialization from EnhancedDataClassJsonMixin."""
59
-
60
-
61
@dataclass
class AccessConfig(BaseConfig):
    """Holder for access-specific settings, including any sensitive values
    (credentials and the like) that belong to another config."""
65
-
66
-
67
@dataclass
class RetryStrategyConfig(BaseConfig):
    """
    Everything the backoff/retry decorator reads from `self` when an exception
    triggers a retry.

    Args:
        max_retries: Upper bound on the number of attempts. When the attempts
            are used up the exception is allowed to escape. ``None`` (the
            default) places no limit on the number of tries. If a callable
            is passed, it will be evaluated at runtime and its return value
            used.
        max_retry_time: Upper bound on the total time spent retrying. When it
            expires the exception is allowed to escape. If a callable is
            passed, it will be evaluated at runtime and its return value used.
    """

    max_retries: Optional[int] = None
    max_retry_time: Optional[float] = None
87
-
88
-
89
@dataclass
class PartitionConfig(BaseConfig):
    """Settings for partitioning a document and for filtering the element dicts
    that partitioning produces."""

    pdf_infer_table_structure: bool = False
    strategy: str = "auto"
    ocr_languages: Optional[list[str]] = None
    encoding: Optional[str] = None
    additional_partition_args: dict[str, Any] = field(default_factory=dict)
    skip_infer_table_types: Optional[list[str]] = None
    # top-level keys of each element dict that are kept in the output
    fields_include: list[str] = field(
        default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
    )
    # when set, the "metadata" dict is flattened into the element dict itself
    flatten_metadata: bool = False
    # metadata_exclude and metadata_include are mutually exclusive
    metadata_exclude: list[str] = field(default_factory=list)
    metadata_include: list[str] = field(default_factory=list)
    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
    # when True the document is sent to `partition_endpoint` instead of being
    # partitioned locally
    partition_by_api: bool = False
    # Use the field object directly: wrapping it in str() made the default a
    # truthy "Field(...)" repr string instead of None and dropped the
    # `sensitive` marker.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    hi_res_model_name: Optional[str] = None
108
-
109
-
110
@dataclass
class ProcessorConfig(BaseConfig):
    """Run-level processing options."""

    reprocess: bool = False
    verbose: bool = False
    # default is resolved once, when this class body is evaluated
    work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
    output_dir: str = "structured-output"
    num_processes: int = 2
    raise_on_error: bool = False
118
-
119
-
120
@dataclass
class FileStorageConfig(BaseConfig):
    """Location and traversal options for a file-storage source."""

    # the only required field; FsspecConfig parses it as <protocol>://<path>
    remote_url: str
    uncompress: bool = False
    recursive: bool = False
    file_glob: Optional[list[str]] = None
126
-
127
-
128
@dataclass
class FsspecConfig(FileStorageConfig):
    """File-storage config for fsspec-backed remotes.

    `protocol`, `path_without_protocol`, `dir_path` and `file_path` are not
    constructor arguments; they are derived from `remote_url` in
    `__post_init__`.
    """

    access_config: Optional[AccessConfig] = None
    protocol: str = field(init=False)
    path_without_protocol: str = field(init=False)
    dir_path: str = field(init=False)
    file_path: str = field(init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the access config as a dict, or an empty dict when none is set."""
        if self.access_config:
            return self.access_config.to_dict(apply_name_overload=False)
        return {}

    def __post_init__(self):
        """Split `remote_url` into protocol, dir path and file path.

        Raises:
            ValueError: if `remote_url` has no ``://`` separator, uses a protocol
                outside SUPPORTED_REMOTE_FSSPEC_PROTOCOLS, or does not match
                ``<protocol>://<dir-path>/<file-or-dir-path>``.
        """
        # Split on the first "://" only. A plain split() with tuple unpacking
        # failed with an opaque "values to unpack" error when the separator was
        # missing, and rejected paths that themselves contain "://".
        protocol, separator, remainder = self.remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.protocol = protocol
        self.path_without_protocol = remainder
        if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS:
            raise ValueError(
                f"Protocol {self.protocol} not supported yet, only "
                f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.",
            )

        is_dropbox = self.protocol == "dropbox"

        # dropbox root is an empty string
        match = re.match(rf"{self.protocol}://([\s])/", self.remote_url)
        if match and is_dropbox:
            self.dir_path = " "
            self.file_path = ""
            return

        # dropbox paths can start with slash
        match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url)
        if match and is_dropbox:
            self.dir_path = match.group(1)
            self.file_path = match.group(2) or ""
            return

        # just a path with no trailing prefix
        match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url)
        if match:
            self.dir_path = match.group(1)
            self.file_path = ""
            return

        # valid path with a dir and/or file
        match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.remote_url)
        if not match:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.dir_path = match.group(1)
        self.file_path = match.group(2) or ""
180
-
181
-
182
@dataclass
class ReadConfig(BaseConfig):
    """Options governing how raw documents are downloaded and retained."""

    # raw documents are stored here for processing and removed afterwards
    # unless preserve_downloads is set
    download_dir: Optional[str] = ""
    re_download: bool = False
    preserve_downloads: bool = False
    download_only: bool = False
    max_docs: Optional[int] = None
190
-
191
-
192
@dataclass
class EmbeddingConfig(BaseConfig):
    """Selects an embedding provider and carries the settings needed to build
    its encoder."""

    provider: str
    # Use the field object directly: wrapping it in str() made the default a
    # truthy "Field(...)" repr string, so `if self.api_key` in get_embedder()
    # was always true and the repr was forwarded as an API key.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    model_name: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_region: Optional[str] = None

    def get_embedder(self) -> "BaseEmbeddingEncoder":
        """Build the encoder that matches `self.provider`.

        Provider modules are imported lazily, inside the matching branch only.

        Returns:
            The encoder instance for the configured provider.

        Raises:
            ValueError: if `self.provider` is not one of the handled names.
        """
        # only forward the options that were actually set
        kwargs: dict[str, Any] = {}
        if self.api_key:
            kwargs["api_key"] = self.api_key
        if self.model_name:
            kwargs["model_name"] = self.model_name
        # TODO make this more dynamic to map to encoder configs
        if self.provider == "openai":
            from unstructured_ingest.embed.openai import (
                OpenAIEmbeddingConfig,
                OpenAIEmbeddingEncoder,
            )

            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
        if self.provider == "huggingface":
            from unstructured_ingest.embed.huggingface import (
                HuggingFaceEmbeddingConfig,
                HuggingFaceEmbeddingEncoder,
            )

            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
        if self.provider == "octoai":
            from unstructured_ingest.embed.octoai import (
                OctoAiEmbeddingConfig,
                OctoAIEmbeddingEncoder,
            )

            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
        if self.provider == "bedrock":
            from unstructured_ingest.embed.bedrock import (
                BedrockEmbeddingConfig,
                BedrockEmbeddingEncoder,
            )

            # bedrock is configured from the aws_* fields; api_key/model_name
            # are not forwarded here
            return BedrockEmbeddingEncoder(
                config=BedrockEmbeddingConfig(
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                    region_name=self.aws_region,
                )
            )
        if self.provider == "vertexai":
            from unstructured_ingest.embed.vertexai import (
                VertexAIEmbeddingConfig,
                VertexAIEmbeddingEncoder,
            )

            return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
        if self.provider == "voyageai":
            from unstructured_ingest.embed.voyageai import (
                VoyageAIEmbeddingConfig,
                VoyageAIEmbeddingEncoder,
            )

            return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
        raise ValueError(f"{self.provider} not a recognized encoder")
258
-
259
-
260
@dataclass
class ChunkingConfig(BaseConfig):
    """Chunking options.

    `chunk_elements` is an init-only flag: it is handed to `__post_init__` and
    is not stored as a field.
    """

    chunk_elements: InitVar[bool] = False
    chunking_strategy: Optional[str] = None
    combine_text_under_n_chars: Optional[int] = None
    include_orig_elements: Optional[bool] = None
    max_characters: Optional[int] = None
    multipage_sections: Optional[bool] = None
    new_after_n_chars: Optional[int] = None
    overlap: Optional[int] = None
    overlap_all: Optional[bool] = None

    def __post_init__(self, chunk_elements: bool) -> None:
        """Default `chunking_strategy` to 'by_title' when chunking was requested.

        The default is applied only if `chunk_elements` is truthy and no
        strategy was given; an explicitly provided strategy is never changed.
        """
        if not chunk_elements:
            return
        if self.chunking_strategy is None:
            self.chunking_strategy = "by_title"
280
-
281
-
282
@dataclass
class PermissionsConfig(BaseConfig):
    """Credentials used for permissions data. Each field is declared with a
    `permissions_`-prefixed overload name."""

    application_id: Optional[str] = enhanced_field(overload_name="permissions_application_id")
    tenant: Optional[str] = enhanced_field(overload_name="permissions_tenant")
    # the only field here flagged as sensitive
    client_cred: Optional[str] = enhanced_field(
        default=None,
        sensitive=True,
        overload_name="permissions_client_cred",
    )
289
-
290
-
291
# Module-level slot holding the session handle used for writes; starts unset.
global_write_session_handle: Optional[BaseSessionHandle] = None
293
-
294
-
295
@dataclass
class WriteConfig(BaseConfig):
    """Base config for write-side options; declares no fields of its own."""
298
-
299
-
300
@dataclass
class BaseConnectorConfig(BaseConfig, ABC):
    """Abstract base on which connector-specific attributes are declared."""
303
-
304
-
305
@dataclass
class SourceMetadata(EnhancedDataClassJsonMixin, ABC):
    """Metadata describing a document on its source system. Every field is
    optional and defaults to None."""

    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    version: Optional[str] = None
    source_url: Optional[str] = None
    exists: Optional[bool] = None
    permissions_data: Optional[list[dict[str, Any]]] = None
313
-
314
-
315
class IngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """
    DataClassJsonMixin leaves @property values out of the json/dict it builds
    from a dataclass. This mixin names the IngestDoc properties that must be
    added explicitly when serializing.
    """

    # added only when the doc's source metadata has been populated
    metadata_properties = [
        "date_created",
        "date_modified",
        "date_processed",
        "exists",
        "permissions_data",
        "version",
        "source_url",
    ]
    # always added
    properties_to_serialize = [
        "base_filename",
        "filename",
        "_output_filename",
        "record_locator",
        "_source_metadata",
        "unique_id",
    ]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in `props` from `self` into `as_dict`.

        Path values are stored as strings; DataClassJsonMixin values are stored
        as their (non-JSON-encoded) dict form.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, **kwargs) -> dict[str, Json]:
        """Serialize the doc, dropping any session handle and adding the listed
        properties."""
        serialized = _asdict(self, **kwargs)
        serialized.pop("_session_handle", None)
        self.add_props(as_dict=serialized, props=self.properties_to_serialize)
        # the metadata-backed properties are only included once metadata exists
        if self._source_metadata is not None:
            self.add_props(as_dict=serialized, props=self.metadata_properties)
        return serialized

    @classmethod
    def from_dict(
        cls: Type[A], kvs: Json, *, infer_missing=False, apply_name_overload: bool = True
    ) -> A:
        """Rebuild a doc from `kvs`, restoring `_source_metadata` and
        `_date_processed` when `kvs` carries non-empty values for them."""
        doc = super().from_dict(
            kvs=kvs, infer_missing=infer_missing, apply_name_overload=apply_name_overload
        )
        meta = kvs.get("_source_metadata")
        if meta:
            doc._source_metadata = SourceMetadata.from_dict(meta)
        date_processed = kvs.get("_date_processed")
        if date_processed:
            doc._date_processed = date_processed
        return doc
370
-
371
-
372
class BatchIngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """
    DataClassJsonMixin leaves @property values out of the json/dict it builds
    from a dataclass. This mixin names the properties that must be added
    explicitly when serializing.
    """

    properties_to_serialize = ["unique_id"]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in `props` from `self` into `as_dict`.

        Path values are stored as strings; DataClassJsonMixin values are stored
        as their (non-JSON-encoded) dict form.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, encode_json=False) -> dict[str, Json]:
        """Serialize the dataclass fields and then add the listed properties."""
        serialized = _asdict(self, encode_json=encode_json)
        self.add_props(as_dict=serialized, props=self.properties_to_serialize)
        return serialized

    @classmethod
    def from_dict(cls: Type[A], kvs: Json, *, infer_missing=False) -> A:
        """Decode `kvs` into an instance of `cls`."""
        return _decode_dataclass(cls, kvs, infer_missing)
399
-
400
-
401
@dataclass
class BaseIngestDoc(ABC):
    """Common base for ingest docs: carries the three configs and requires
    subclasses to provide a `unique_id`."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @property
    @abstractmethod
    def unique_id(self) -> str:
        """Identifier for this doc; must be supplied by subclasses."""
411
-
412
-
413
@dataclass
class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
    """An "ingest document" is specific to a connector, and provides
    methods to fetch a single raw document, store it locally for processing, any cleanup
    needed after successful processing of the doc, and the ability to write the doc's
    structured outputs once processed.

    Crucially, it is not responsible for the actual processing of the raw document.
    """

    _source_metadata: Optional[SourceMetadata] = field(init=False, default=None)
    _date_processed: Optional[str] = field(init=False, default=None)

    @property
    def source_metadata(self) -> SourceMetadata:
        """Source metadata, populated on first access via update_source_metadata().

        Raises:
            ValueError: if update_source_metadata() left the metadata unset.
        """
        if self._source_metadata is None:
            self.update_source_metadata()
        # Provide guarantee that the field was set by update_source_metadata()
        if self._source_metadata is None:
            raise ValueError("failed to set source metadata")
        return self._source_metadata

    @source_metadata.setter
    def source_metadata(self, value: SourceMetadata):
        self._source_metadata = value

    @property
    def date_created(self) -> Optional[str]:
        """The date the document was created on the source system."""
        return self.source_metadata.date_created

    @property
    def date_modified(self) -> Optional[str]:
        """The date the document was last modified on the source system."""
        return self.source_metadata.date_modified

    @property
    def date_processed(self) -> Optional[str]:
        """The date the document was last processed by Unstructured.
        self._date_processed is assigned internally in self.process_file()"""
        return self._date_processed

    @property
    def exists(self) -> Optional[bool]:
        """Whether the document exists on the remote source."""
        return self.source_metadata.exists

    @property
    @abstractmethod
    def filename(self):
        """The local filename of the document after fetching from remote source."""

    @property
    def base_filename(self) -> Optional[str]:
        """`filename` with the resolved download dir removed, or None when either
        the download dir or the filename is unset."""
        if self.read_config.download_dir and self.filename:
            download_path = str(Path(self.read_config.download_dir).resolve())
            full_path = str(self.filename)
            base_path = full_path.replace(download_path, "")
            return base_path
        return None

    @property
    def base_output_filename(self) -> Optional[str]:
        """`_output_filename` with the resolved output dir removed, or None when
        either the output dir or the output filename is unset."""
        if self.processor_config.output_dir and self._output_filename:
            output_path = str(Path(self.processor_config.output_dir).resolve())
            full_path = str(self._output_filename)
            base_path = full_path.replace(output_path, "")
            return base_path
        return None

    @property
    @abstractmethod
    def _output_filename(self):
        """Filename of the structured output for this doc."""

    @property
    def record_locator(self) -> Optional[dict[str, Any]]:  # Values must be JSON-serializable
        """A dictionary with any data necessary to uniquely identify the document on
        the source system."""
        return None

    @property
    def unique_id(self) -> str:
        return self.filename

    @property
    def source_url(self) -> Optional[str]:
        """The url of the source document."""
        return self.source_metadata.source_url  # type: ignore

    @property
    def version(self) -> Optional[str]:
        """The version of the source document, this could be the last modified date, an
        explicit version number, or anything else that can be used to uniquely identify
        the version of the document."""
        return self.source_metadata.version  # type: ignore

    @property
    def permissions_data(self) -> Optional[list[dict[str, Any]]]:
        """Access control data, aka permissions or sharing, from the source system."""
        # The source_metadata property already populates the metadata on demand
        # and never returns None, so no extra None check is needed here.
        return self.source_metadata.permissions_data  # type: ignore

    @abstractmethod
    def cleanup_file(self):
        """Removes the local copy the file (or anything else) after successful processing."""

    @staticmethod
    def skip_if_file_exists(func):
        """Decorator that checks if a file exists, is not empty, and should not re-download,
        if so log a message indicating as much and skip the decorated function."""

        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if (
                not self.read_config.re_download
                and self.filename.is_file()
                and self.filename.stat().st_size
            ):
                logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
                return None
            return func(self, *args, **kwargs)

        return wrapper

    # TODO: set as @abstractmethod and pass or raise NotImplementedError
    def update_source_metadata(self, **kwargs) -> None:
        """Sets the SourceMetadata and the properties for the doc"""
        self._source_metadata = SourceMetadata()

    def update_permissions_data(self):
        """Sets the _permissions_data property for the doc.
        This property is later used to fill the corresponding SourceMetadata.permissions_data field,
        and after that carries on to the permissions_data property."""
        self._permissions_data: Optional[list[dict[str, Any]]] = None

    # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
    # in addition to or instead of get_file()
    @abstractmethod
    @SourceConnectionError.wrap
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""

    def has_output(self) -> bool:
        """Determine if structured output for this doc already exists."""
        output = self._output_filename
        # compare explicitly so a real bool is returned rather than the raw byte count
        return output.is_file() and output.stat().st_size > 0

    @PartitionError.wrap
    def partition_file(
        self,
        partition_config: PartitionConfig,
        **partition_kwargs,
    ) -> list["Element"]:
        """Partition the local file into elements.

        Partitions locally unless `partition_config.partition_by_api` is set, in
        which case the file is sent to `partition_config.partition_endpoint`.
        """
        from unstructured.documents.elements import DataSourceMetadata
        from unstructured.partition.auto import partition
        from unstructured.staging.base import elements_from_dicts

        if not partition_config.partition_by_api:
            logger.debug("Using local partition")
            elements = partition(
                filename=str(self.filename),
                data_source_metadata=DataSourceMetadata(
                    url=self.source_url,
                    version=self.version,
                    record_locator=self.record_locator,
                    date_created=self.date_created,
                    date_modified=self.date_modified,
                    date_processed=self.date_processed,
                    permissions_data=self.permissions_data,
                ),
                **partition_kwargs,
            )
        else:
            endpoint = partition_config.partition_endpoint

            logger.debug(f"using remote partition ({endpoint})")
            elements_dicts = call_api(
                server_url=endpoint,
                api_key=partition_config.api_key,
                filename=Path(self.filename),
                api_parameters=partition_kwargs,
            )
            elements = elements_from_dicts(elements_dicts)
            # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
            # pass the stringified json here
        return elements

    @staticmethod
    def _pop_nested_field(elem: dict[str, Any], dotted_path: str) -> None:
        """Remove the value at `dotted_path` (e.g. "metadata.a.b") from `elem`.

        Does nothing unless every intermediate key exists and maps to a dict, so
        a leaf is never removed from the wrong nesting level.
        """
        *parents, leaf = dotted_path.split(".")
        current: Any = elem
        for key in parents:
            if not isinstance(current, dict) or key not in current:
                return
            current = current[key]
        if isinstance(current, dict):
            current.pop(leaf, None)

    def process_file(
        self,
        partition_config: PartitionConfig,
        **partition_kwargs,
    ) -> Optional[list[dict[str, Any]]]:
        """Partition the file and return its filtered element dicts.

        Records the processing time, then returns None when
        `read_config.download_only` is set. Otherwise each element dict has its
        metadata filtered by `metadata_exclude` / `metadata_include`, its
        top-level keys restricted to `fields_include`, and its metadata
        optionally flattened. The result is also stored on
        `self.isd_elems_no_filename`.

        Raises:
            ValueError: if both `metadata_include` and `metadata_exclude` are set.
        """
        self._date_processed = datetime.utcnow().isoformat()
        if self.read_config.download_only:
            return None

        exclude = partition_config.metadata_exclude
        include = partition_config.metadata_include
        # Loop-invariant validation: reject the misconfiguration before doing
        # any partitioning work rather than once per element.
        if exclude and include:
            raise ValueError(
                "Arguments `--metadata-include` and `--metadata-exclude` are "
                "mutually exclusive with each other.",
            )
        logger.info(f"processing {self.filename}")

        elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
        element_dicts = [e.to_dict() for e in elements]

        self.isd_elems_no_filename: list[dict[str, Any]] = []
        for elem in element_dicts:
            if exclude:
                for ex in exclude:
                    if "." in ex:  # handle nested fields
                        self._pop_nested_field(elem, ex)
                    else:  # handle top-level fields
                        elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
            elif include:
                for k in list(elem["metadata"].keys()):  # type: ignore[attr-defined]
                    if k not in include:
                        elem["metadata"].pop(k, None)  # type: ignore[attr-defined]
            in_list = partition_config.fields_include
            elem = {k: v for k, v in elem.items() if k in in_list}

            if partition_config.flatten_metadata and "metadata" in elem:
                metadata = elem.pop("metadata")
                elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))

            self.isd_elems_no_filename.append(elem)

        return self.isd_elems_no_filename
650
-
651
-
652
@dataclass
class BaseIngestDocBatch(BaseIngestDoc, BatchIngestDocJsonMixin, ABC):
    """An ingest doc that groups several single ingest docs into one batch."""

    # Members of the batch; every instance starts with its own empty list.
    ingest_docs: list[BaseSingleIngestDoc] = field(default_factory=list)

    @abstractmethod
    @SourceConnectionError.wrap
    def get_files(self):
        """Fetch the "remote" docs of this batch and store them on the local filesystem."""
660
-
661
-
662
@dataclass
class BaseConnector(EnhancedDataClassJsonMixin, ABC):
    """Common abstract base for connectors; subclasses must implement a connection check."""

    @abstractmethod
    def check_connection(self):
        """Check the connection used by this connector. Must be implemented by subclasses."""
667
-
668
-
669
@dataclass
class BaseSourceConnector(BaseConnector, ABC):
    """Abstract base class for a connector to a remote source, e.g. S3 or Google Drive."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @abstractmethod
    def cleanup(self, cur_dir=None):
        """Perform any additional cleanup needed once processing is complete, e.g.
        removing temporary download dirs that are empty.

        By convention, documents that failed to process are typically not cleaned up."""

    @abstractmethod
    def initialize(self):
        """Initialize the connector. Should also validate that the connector is
        properly configured, e.g. by listing a single document from the source."""

    @abstractmethod
    def get_ingest_docs(self):
        """Return all ingest docs (derived from BaseIngestDoc).

        This does not imply downloading all the raw documents themselves;
        rather, each IngestDoc is capable of fetching its content (in another
        process) with IngestDoc.get_file()."""
695
-
696
-
697
@dataclass
class BaseDestinationConnector(BaseConnector, ABC):
    """Abstract base class for connectors that write processed elements to a destination.

    Subclasses implement ``initialize()`` and ``write_dict()``; they may override
    ``conform_dict()`` / ``normalize_dict()`` to adapt element dicts before writing.
    """

    write_config: WriteConfig
    connector_config: BaseConnectorConfig

    def __init__(self, write_config: WriteConfig, connector_config: BaseConnectorConfig):
        self.write_config = write_config
        self.connector_config = connector_config

    def conform_dict(self, data: dict[str, Any]) -> None:
        """
        Hook for when the original dictionary needs to be modified in place.
        The default implementation does nothing.
        """
        return

    def normalize_dict(self, element_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Hook for when the original dictionary needs to be mapped to a new one.
        The default implementation returns the dictionary unchanged.
        """
        return element_dict

    @abstractmethod
    def initialize(self):
        """Initializes the connector. Should also validate the connector is properly
        configured."""

    def write(self, docs: list[BaseSingleIngestDoc]) -> None:
        """Load the element dicts produced for ``docs`` and write them to the destination."""
        elements_dict = self.get_elements_dict(docs=docs)
        self.modify_and_write_dict(elements_dict=elements_dict)

    def get_elements_dict(self, docs: list[BaseSingleIngestDoc]) -> list[dict[str, Any]]:
        """Read the JSON file at each doc's ``_output_filename`` and return all of the
        loaded elements concatenated into a single list, in doc order."""
        dict_list: list[dict[str, Any]] = []
        for doc in docs:
            local_path = doc._output_filename
            # JSON text is UTF-8; do not depend on the platform's default encoding,
            # which would mis-decode non-ASCII content on non-UTF-8 locales.
            with open(local_path, encoding="utf-8") as json_file:
                dict_content = json.load(json_file)
            logger.info(
                f"Extending {len(dict_content)} json elements from content in {local_path}",
            )
            dict_list.extend(dict_content)
        return dict_list

    @abstractmethod
    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """Write the (already conformed and normalized) element dicts to the destination."""

    def modify_and_write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """
        Modify in this instance means this method wraps calls to conform_dict() and
        normalize_dict() before actually processing the content via write_dict()
        """
        # First pass mutates each dict in place, second pass maps it to its final form.
        for d in elements_dict:
            self.conform_dict(data=d)
        elements_dict_normalized = [self.normalize_dict(element_dict=d) for d in elements_dict]
        return self.write_dict(*args, elements_dict=elements_dict_normalized, **kwargs)

    def write_elements(self, elements: list["Element"], *args, **kwargs) -> None:
        """Convert ``elements`` to dicts via ``to_dict()`` and write them to the destination."""
        elements_dict = [e.to_dict() for e in elements]
        self.modify_and_write_dict(*args, elements_dict=elements_dict, **kwargs)
756
-
757
-
758
class SourceConnectorCleanupMixin:
    """Mixin that removes empty download directories once processing is done."""

    read_config: ReadConfig

    def cleanup(self, cur_dir=None):
        """Recursively remove empty directories below the download directory.

        Does nothing when downloads are to be preserved (``preserve_downloads`` or
        ``download_only``) or when the directory does not exist. Files are never
        deleted here; a directory is removed only if it ends up empty.

        Args:
            cur_dir: directory to clean; defaults to ``read_config.download_dir``.
        """
        if self.read_config.preserve_downloads or self.read_config.download_only:
            return
        if cur_dir is None:
            cur_dir = self.read_config.download_dir
        if cur_dir is None or not Path(cur_dir).is_dir():
            return

        root = Path(cur_dir)
        # Walk with explicit paths rather than os.chdir(): changing the working
        # directory is a process-wide side effect and breaks on nested relative paths.
        for entry in list(root.iterdir()):
            # don't traverse symlinks, not that there ever should be any
            if entry.is_dir() and not entry.is_symlink():
                self.cleanup(str(entry))
        if not any(root.iterdir()):
            root.rmdir()
778
-
779
-
780
class PermissionsCleanupMixin:
    """Mixin that removes the permissions data written below the output directory."""

    processor_config: ProcessorConfig

    def cleanup_permissions(self, cur_dir=None):
        """Recursively clean up permissions files and directories.

        Regular files are deleted; a directory is removed once it no longer
        contains any sub-directory. Symlinks are never followed or removed.

        Args:
            cur_dir: file or directory to clean; defaults to
                ``<processor_config.output_dir>/permissions_data``.
        """
        if cur_dir is None:
            cur_dir = Path(self.processor_config.output_dir, "permissions_data")
        target = Path(cur_dir)
        if not target.exists():
            return
        if target.is_file():
            target.unlink()
            return

        # Walk with explicit paths rather than os.chdir(): changing the working
        # directory is a process-wide side effect and breaks on nested relative paths.
        for entry in list(target.iterdir()):
            # don't traverse symlinks, not that there ever should be any
            if not entry.is_symlink():
                self.cleanup_permissions(str(entry))
        if not any(child.is_dir() for child in target.iterdir()):
            target.rmdir()
810
-
811
-
812
class IngestDocCleanupMixin:
    """Mixin that deletes an ingest doc's local file after it has been processed."""

    read_config: ReadConfig

    @property
    @abstractmethod
    def filename(self):
        """Local filename of the document once it has been fetched from the remote source."""

    def cleanup_file(self):
        """Delete the local copy of the file after successful processing.

        Nothing is deleted when downloads are to be preserved, when the file does
        not exist, or when running in download-only mode.
        """
        # Guards are checked in the same order as the conditions they replace.
        if self.read_config.preserve_downloads:
            return
        if not self.filename.is_file():
            return
        if self.read_config.download_only:
            return
        logger.debug(f"cleaning up {self}")
        os.unlink(self.filename)
829
-
830
-
831
class ConfigSessionHandleMixin:
    """Mixin for connector configs that can create session handles."""

    @abstractmethod
    def create_session_handle(self) -> BaseSessionHandle:
        """Create the session handle that gets assigned to each IngestDoc, so that
        session-related resources are shared across all document handling within a
        given subprocess."""
836
-
837
-
838
@dataclass
class IngestDocSessionHandleMixin:
    """Mixin giving an ingest doc a lazily created, assignable session handle."""

    connector_config: ConfigSessionHandleMixin
    # Cached handle; excluded from __init__ and created on first access if never assigned.
    _session_handle: Optional[BaseSessionHandle] = field(default=None, init=False)

    @property
    def session_handle(self):
        """Return the assigned session handle, creating and caching one via the
        connector config when none has been assigned yet."""
        handle = self._session_handle
        if handle is None:
            handle = self.connector_config.create_session_handle()
            self._session_handle = handle
        return handle

    @session_handle.setter
    def session_handle(self, session_handle: BaseSessionHandle):
        self._session_handle = session_handle