unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,584 +0,0 @@
1
- import enum
2
- import logging
3
- from dataclasses import dataclass, field
4
- from typing import List, Optional, Tuple
5
- from urllib.parse import urlparse
6
- from uuid import UUID
7
-
8
- from htmlBuilder.attributes import Style, Type
9
- from htmlBuilder.tags import (
10
- Body,
11
- Div,
12
- Head,
13
- Html,
14
- HtmlTag,
15
- Ol,
16
- Table,
17
- Td,
18
- Th,
19
- Title,
20
- Tr,
21
- Ul,
22
- )
23
- from notion_client.errors import APIResponseError
24
-
25
- import unstructured_ingest.connector.notion.types.blocks as notion_blocks
26
- from unstructured_ingest.connector.notion.client import Client
27
- from unstructured_ingest.connector.notion.interfaces import BlockBase
28
- from unstructured_ingest.connector.notion.types.block import Block
29
- from unstructured_ingest.connector.notion.types.database import Database
30
-
31
-
32
@dataclass
class TextExtractionResponse:
    """Result of extracting plain text from a Notion object.

    Every field has a default, so the response can be created empty and
    filled in afterwards.
    """

    # Extracted text; None when nothing was produced.
    text: Optional[str] = None
    # Identifiers of child pages found during extraction.
    child_pages: List[str] = field(default_factory=list)
    # Identifiers of child databases found during extraction.
    child_databases: List[str] = field(default_factory=list)
37
-
38
-
39
@dataclass
class HtmlExtractionResponse:
    """Result of rendering a Notion page or database to HTML.

    Every field has a default, so the response can be created empty and
    filled in afterwards.
    """

    # Root tag of the rendered HTML; None when nothing was rendered.
    html: Optional[HtmlTag] = None
    # Identifiers of child pages found while rendering.
    child_pages: List[str] = field(default_factory=list)
    # Identifiers of child databases found while rendering.
    child_databases: List[str] = field(default_factory=list)
44
-
45
-
46
def extract_page_html(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion page and its nested blocks as a single HTML document.

    Blocks are visited breadth-first starting from the page's own block.
    Child pages (other than the page itself) and child databases are not
    rendered; their ids are collected and returned instead. Tables, column
    lists and list items are rendered through their dedicated builders.

    Args:
        client: Notion client used to retrieve blocks and their children.
        page_id: Id of the page to render; must parse as a UUID.
        logger: Logger that receives debug traces.

    Returns:
        An ``HtmlExtractionResponse`` with the ``Html`` root tag and the ids
        of the child pages and child databases that were encountered.

    Raises:
        ValueError: If ``page_id`` is not a valid UUID string.
    """
    # Local import: the module's import block does not provide deque.
    from collections import deque

    page_id_uuid = UUID(page_id)
    html_elements: List[Tuple[BlockBase, HtmlTag]] = []
    parent_block: Block = client.blocks.retrieve(block_id=page_id)  # type: ignore
    head = None
    if isinstance(parent_block.block, notion_blocks.ChildPage):
        head = Head([], Title([], parent_block.block.title))
    child_pages: List[str] = []
    child_databases: List[str] = []
    # FIFO queue of (depth, block) pairs; popleft() is O(1), unlike list.pop(0).
    parents = deque([(0, parent_block)])
    # Ids (not Block objects) so the membership test against child.id can match.
    processed_block_ids = set()
    while parents:
        level, parent = parents.popleft()
        parent_html = parent.get_html()
        if parent_html:
            html_elements.append((parent.block, parent_html))
        logger.debug(f"processing block: {parent}")
        if isinstance(parent.block, notion_blocks.ChildPage) and parent.id != str(page_id_uuid):
            child_pages.append(parent.id)
            continue
        if isinstance(parent.block, notion_blocks.ChildDatabase):
            child_databases.append(parent.id)
            continue
        if isinstance(parent.block, notion_blocks.Table):
            table_response = build_table(client=client, table=parent)
            html_elements.append((parent.block, table_response.table_html))
            child_pages.extend(table_response.child_pages)
            child_databases.extend(table_response.child_databases)
            continue
        if isinstance(parent.block, notion_blocks.ColumnList):
            column_html = build_columned_list(client=client, column_parent=parent)
            html_elements.append((parent.block, column_html))
            continue
        if isinstance(parent.block, notion_blocks.BulletedListItem):
            bullet_list_resp = build_bulleted_list_children(
                client=client,
                bulleted_list_item_parent=parent,
            )
            if bullet_list_children := bullet_list_resp.child_list:
                html_elements.append((parent.block, bullet_list_children))
            continue
        if isinstance(parent.block, notion_blocks.NumberedListItem):
            numbered_list_resp = build_numbered_list_children(
                client=client,
                numbered_list_item_parent=parent,
            )
            if numbered_list_children := numbered_list_resp.child_list:
                html_elements.append((parent.block, numbered_list_children))
            continue
        if parent.block.can_have_children() and parent.has_children:
            children = []
            for children_block in client.blocks.children.iterate_list(  # type: ignore
                block_id=parent.id,
            ):
                children.extend(children_block)
            if children:
                logger.debug(f"adding {len(children)} children from parent: {parent}")
                for child in children:
                    if child.id not in processed_block_ids:
                        parents.append((level + 1, child))
        processed_block_ids.add(parent.id)

    # Join consecutive list items into a single <ol>/<ul>, keeping document
    # order: a pending run of one list kind is closed as soon as an element
    # of any other kind shows up.
    joined_html_elements = []
    numbered_list_items = []
    bullet_list_items = []
    for block, html in html_elements:
        if isinstance(block, notion_blocks.BulletedListItem):
            if numbered_list_items:
                joined_html_elements.append(Ol([], numbered_list_items))
                numbered_list_items = []
            bullet_list_items.append(html)
            continue
        if isinstance(block, notion_blocks.NumberedListItem):
            if bullet_list_items:
                joined_html_elements.append(Ul([], bullet_list_items))
                bullet_list_items = []
            numbered_list_items.append(html)
            continue
        if numbered_list_items:
            joined_html_elements.append(Ol([], numbered_list_items))
            numbered_list_items = []
        if bullet_list_items:
            joined_html_elements.append(Ul([], bullet_list_items))
            bullet_list_items = []
        joined_html_elements.append(html)
    # A page that ends with list items would otherwise lose them.
    if numbered_list_items:
        joined_html_elements.append(Ol([], numbered_list_items))
    if bullet_list_items:
        joined_html_elements.append(Ul([], bullet_list_items))

    body = Body([], joined_html_elements)
    all_elements = [body]
    if head:
        all_elements = [head] + all_elements
    full_html = Html([], all_elements)
    return HtmlExtractionResponse(
        full_html,
        child_pages=child_pages,
        child_databases=child_databases,
    )
141
-
142
-
143
def extract_database_html(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion database as an HTML table.

    The header row lists the database's property names in sorted order.
    Each queried page becomes one row, with an empty ``Div`` standing in for
    properties that render to nothing. Pages whose URL resolves to a
    database or a page are reported as children.

    Args:
        client: Notion client used to retrieve and query the database.
        database_id: Id of the database to render.
        logger: Logger that receives debug traces.

    Returns:
        An ``HtmlExtractionResponse`` holding the ``Table`` tag and the ids
        of the child pages and child databases that were found.
    """
    logger.debug(f"processing database id: {database_id}")
    database: Database = client.databases.retrieve(database_id=database_id)  # type: ignore
    column_names = sorted(database.properties.keys())
    child_pages: List[str] = []
    child_databases: List[str] = []

    # The header row comes first, one <th> per property.
    header_cells = [Th([], name) for name in column_names]
    rendered_rows = [Tr([], header_cells)]

    pages = []
    for batch in client.databases.iterate_query(database_id=database_id):  # type: ignore
        pages.extend(batch)

    logger.debug(f"creating {len(pages)} rows")
    for page in pages:
        if is_database_url(client=client, url=page.url):
            child_databases.append(page.id)
        if is_page_url(client=client, url=page.url):
            child_pages.append(page.id)
        page_properties = page.properties
        rendered_values = [
            page_properties.get(name).get_html() for name in column_names  # type: ignore
        ]
        row_cells = []
        for value in rendered_values:
            # Properties with no HTML still need a cell to keep columns aligned.
            row_cells.append(Td([], value if value else Div([], [])))
        rendered_rows.append(Tr([], row_cells))

    return HtmlExtractionResponse(
        html=Table([], rendered_rows),
        child_pages=child_pages,
        child_databases=child_databases,
    )
184
-
185
-
186
@dataclass
class ChildExtractionResponse:
    """Ids of the child pages and databases found under a Notion object."""

    # Identifiers of the child pages that were found.
    child_pages: List[str] = field(default_factory=list)
    # Identifiers of the child databases that were found.
    child_databases: List[str] = field(default_factory=list)
190
-
191
-
192
class QueueEntryType(enum.Enum):
    """Kind of Notion object held by an entry of the traversal queue."""

    DATABASE = "database"
    PAGE = "page"
195
-
196
-
197
@dataclass
class QueueEntry:
    """A Notion object waiting to be visited by the recursive traversal."""

    # Whether the entry refers to a page or to a database.
    type: QueueEntryType
    # Identifier of the page or database.
    id: UUID
201
-
202
-
203
def get_recursive_content_from_page(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect the pages and databases reachable from a page.

    Args:
        client: Notion client handed to the traversal.
        page_id: Id of the starting page; must parse as a UUID.
        logger: Logger handed to the traversal.

    Returns:
        Whatever ``get_recursive_content`` returns for a PAGE start entry.

    Raises:
        ValueError: If ``page_id`` is not a valid UUID string.
    """
    start = QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))
    return get_recursive_content(client=client, init_entry=start, logger=logger)
213
-
214
-
215
def get_recursive_content_from_database(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect the pages and databases reachable from a database.

    Args:
        client: Notion client handed to the traversal.
        database_id: Id of the starting database; must parse as a UUID.
        logger: Logger handed to the traversal.

    Returns:
        Whatever ``get_recursive_content`` returns for a DATABASE start entry.

    Raises:
        ValueError: If ``database_id`` is not a valid UUID string.
    """
    start = QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id))
    return get_recursive_content(client=client, init_entry=start, logger=logger)
225
-
226
-
227
def get_recursive_content(
    client: Client,
    init_entry: QueueEntry,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Walk a Notion page/database tree and collect every reachable child id.

    Entries are taken from the end of the work list (last in, first out).
    For a page, its child-page blocks, child-database blocks and
    link-to-page blocks are followed. For a database, every queried row
    whose URL resolves to a page or a database is followed. An entry that
    cannot be fetched is logged and removed from the result.

    Each id is queued, fetched and reported at most once, and ids are
    reported in the order in which they were first discovered.

    Args:
        client: Notion client used to list block children and query databases.
        init_entry: Page or database at which the traversal starts. Its own
            id is not part of the result unless something links back to it
            before it is marked processed.
        logger: Logger that receives debug and error traces.

    Returns:
        A ``ChildExtractionResponse`` with the discovered page and database ids.
    """
    parents: List[QueueEntry] = [init_entry]
    child_pages: List[str] = []
    child_dbs: List[str] = []
    # Sets give O(1) membership tests; the lists above keep discovery order.
    processed = set()  # ids that have been taken off the work list
    queued_pages = set()  # page ids ever added to child_pages
    queued_dbs = set()  # database ids ever added to child_dbs

    def enqueue(candidate_ids, entry_type, collected, queued) -> None:
        # Record and schedule each id that has not been seen yet.
        for candidate in candidate_ids:
            if candidate in processed or candidate in queued:
                continue
            queued.add(candidate)
            collected.append(candidate)
            parents.append(QueueEntry(type=entry_type, id=UUID(candidate)))

    while parents:
        parent: QueueEntry = parents.pop()
        parent_id = str(parent.id)
        processed.add(parent_id)
        if parent.type == QueueEntryType.PAGE:
            logger.debug(f"getting child data from page: {parent.id}")
            page_children = []
            try:
                for children_block in client.blocks.children.iterate_list(  # type: ignore
                    block_id=parent_id,
                ):
                    page_children.extend(children_block)
            except APIResponseError as api_error:
                logger.error(f"failed to get page with id {parent.id}: {api_error}")
                # An unreachable page must not be reported as a child.
                if parent_id in child_pages:
                    child_pages.remove(parent_id)
                continue
            if not page_children:
                continue

            # Extract child pages
            child_pages_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildPage)
            ]
            if child_pages_from_page:
                logger.debug(
                    "found child pages from parent page {}: {}".format(
                        parent.id,
                        ", ".join([p.block.title for p in child_pages_from_page]),
                    ),
                )
                enqueue(
                    [p.id for p in child_pages_from_page],
                    QueueEntryType.PAGE,
                    child_pages,
                    queued_pages,
                )

            # Extract child databases
            child_dbs_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase)
            ]
            if child_dbs_from_page:
                logger.debug(
                    "found child database from parent page {}: {}".format(
                        parent.id,
                        ", ".join([db.block.title for db in child_dbs_from_page]),
                    ),
                )
                enqueue(
                    [db.id for db in child_dbs_from_page],
                    QueueEntryType.DATABASE,
                    child_dbs,
                    queued_dbs,
                )

            # Follow link-to-page blocks, which may target a page or a database.
            linked_to_others: List[notion_blocks.LinkToPage] = [
                c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage)
            ]
            for link in linked_to_others:
                if page_id := link.page_id:
                    enqueue([page_id], QueueEntryType.PAGE, child_pages, queued_pages)
                if database_id := link.database_id:
                    enqueue([database_id], QueueEntryType.DATABASE, child_dbs, queued_dbs)

        elif parent.type == QueueEntryType.DATABASE:
            logger.debug(f"getting child data from database: {parent.id}")
            database_pages = []
            try:
                for page_entries in client.databases.iterate_query(  # type: ignore
                    database_id=parent_id,
                ):
                    database_pages.extend(page_entries)
            except APIResponseError as api_error:
                logger.error(f"failed to get database with id {parent.id}: {api_error}")
                # An unreachable database must not be reported as a child.
                if parent_id in child_dbs:
                    child_dbs.remove(parent_id)
                continue
            if not database_pages:
                continue

            child_pages_from_db = [
                p for p in database_pages if is_page_url(client=client, url=p.url)
            ]
            if child_pages_from_db:
                logger.debug(
                    "found child pages from parent database {}: {}".format(
                        parent.id,
                        ", ".join([p.url for p in child_pages_from_db]),
                    ),
                )
                enqueue(
                    [p.id for p in child_pages_from_db],
                    QueueEntryType.PAGE,
                    child_pages,
                    queued_pages,
                )

            child_dbs_from_db = [
                p for p in database_pages if is_database_url(client=client, url=p.url)
            ]
            if child_dbs_from_db:
                logger.debug(
                    "found child database from parent database {}: {}".format(
                        parent.id,
                        ", ".join([db.url for db in child_dbs_from_db]),
                    ),
                )
                enqueue(
                    [db.id for db in child_dbs_from_db],
                    QueueEntryType.DATABASE,
                    child_dbs,
                    queued_dbs,
                )

    return ChildExtractionResponse(
        child_pages=child_pages,
        child_databases=child_dbs,
    )
370
-
371
-
372
def is_valid_uuid(uuid_str: str) -> bool:
    """Return True when ``uuid_str`` can be parsed by ``uuid.UUID``.

    Args:
        uuid_str: Candidate UUID, with or without hyphens.

    Returns:
        True if ``UUID(uuid_str)`` succeeds, False otherwise. Non-string
        input such as None yields False rather than raising.
    """
    try:
        UUID(uuid_str)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed string. TypeError / AttributeError: the
        # argument is not a string (for example None or an int).
        return False
    return True
378
-
379
-
380
def get_uuid_from_url(path: str) -> Optional[str]:
    """Return the trailing UUID of a hyphen-separated path segment.

    Args:
        path: Text whose part after the last ``-`` may be a UUID.

    Returns:
        The text after the last hyphen (the whole string when there is no
        hyphen) if it is a valid UUID, otherwise None.
    """
    # str.split always yields at least one element, so [-1] is safe.
    candidate = path.split("-")[-1]
    return candidate if is_valid_uuid(candidate) else None
385
-
386
-
387
def is_page_url(client: Client, url: str):
    """Tell whether ``url`` points at a Notion page the client can reach.

    Args:
        client: Notion client used for the page status lookup.
        url: URL to inspect.

    Returns:
        False when the host is not ``www.notion.so`` or the last path
        segment carries no UUID. Otherwise, whether the client's page
        status lookup for that UUID equals 200.
    """
    parsed_url = urlparse(url)
    last_segment = parsed_url.path.split("/")[-1]
    if parsed_url.netloc != "www.notion.so":
        return False
    page_uuid = get_uuid_from_url(path=last_segment)
    if page_uuid:
        return client.pages.retrieve_status(page_id=page_uuid) == 200
    return False
397
-
398
-
399
def is_database_url(client: Client, url: str):
    """Tell whether ``url`` points at a Notion database the client can reach.

    Args:
        client: Notion client used for the database status lookup.
        url: URL to inspect.

    Returns:
        False when the host is not ``www.notion.so`` or the last path
        segment carries no UUID. Otherwise, whether the client's database
        status lookup for that UUID equals 200.
    """
    parsed_url = urlparse(url)
    last_segment = parsed_url.path.split("/")[-1]
    if parsed_url.netloc != "www.notion.so":
        return False
    database_uuid = get_uuid_from_url(path=last_segment)
    if database_uuid:
        return client.databases.retrieve_status(database_id=database_uuid) == 200
    return False
409
-
410
-
411
@dataclass
class BuildTableResponse:
    """Result of rendering a Notion table block."""

    # The rendered table tag.
    table_html: HtmlTag
    # Ids of pages mentioned inside the table's cells.
    child_pages: List[str] = field(default_factory=list)
    # Ids of databases mentioned inside the table's cells.
    child_databases: List[str] = field(default_factory=list)
416
-
417
-
418
def build_table(client: Client, table: Block) -> BuildTableResponse:
    """Render a Notion table block and collect the objects its cells mention.

    Args:
        client: Notion client used to list the table's row blocks.
        table: Block wrapping a ``notion_blocks.Table``.

    Returns:
        A ``BuildTableResponse`` holding the ``Table`` tag plus the ids of
        the pages and databases mentioned in the cells' rich text.

    Raises:
        ValueError: If ``table.block`` is not a ``notion_blocks.Table``.
    """
    if not isinstance(table.block, notion_blocks.Table):
        raise ValueError(f"block type not table: {type(table.block)}")
    rows: List[notion_blocks.TableRow] = []
    child_pages: List[str] = []
    child_databases: List[str] = []
    for row_chunk in client.blocks.children.iterate_list(  # type: ignore
        block_id=table.id,
    ):
        # Only table-row children are rendered; anything else is skipped.
        rows.extend(
            row.block for row in row_chunk if isinstance(row.block, notion_blocks.TableRow)
        )

    # Extract child databases and pages
    for row in rows:
        for cell in row.cells:
            for rich_text in cell.rich_texts:
                mention = rich_text.mention
                if not mention:
                    continue
                if mention.type == "page" and (page := mention.page):
                    child_pages.append(page.id)
                if mention.type == "database" and (database := mention.database):
                    child_databases.append(database.id)

    table_html_rows = []
    # Guard on ``rows``: a header flag with no fetched rows would otherwise
    # make pop(0) raise IndexError.
    if table.block.has_column_header and rows:
        header = rows.pop(0)
        header.is_header = True
        table_html_rows.append(header.get_html())
    table_html_rows.extend(row.get_html() for row in rows)

    return BuildTableResponse(
        table_html=Table([], table_html_rows),
        child_pages=child_pages,
        child_databases=child_databases,
    )
456
-
457
-
458
def build_columned_list(client: Client, column_parent: Block) -> HtmlTag:
    """Render a Notion column list as floated, equal-width ``Div`` elements.

    Args:
        client: Notion client used to list the columns and their content.
        column_parent: Block wrapping a ``notion_blocks.ColumnList``.

    Returns:
        A ``Div`` containing one inner ``Div`` per chunk of content fetched
        for each column. Every inner ``Div`` is ``100 / <column count>``
        percent wide and floated left.

    Raises:
        ValueError: If ``column_parent.block`` is not a ``notion_blocks.ColumnList``.
    """
    if not isinstance(column_parent.block, notion_blocks.ColumnList):
        raise ValueError(f"block type not column list: {type(column_parent.block)}")

    columns: List[Block] = []
    for batch in client.blocks.children.iterate_list(  # type: ignore
        block_id=column_parent.id,
    ):
        columns.extend(batch)

    column_count = len(columns)
    rendered_columns = []
    for column in columns:
        for content_batch in client.blocks.children.iterate_list(  # type: ignore
            block_id=column.id,
        ):
            # The width is computed here, not before the loop, so an empty
            # column list never divides by zero.
            width_style = Style(f"width:{100 / column_count}%; float: left")
            batch_html = [content.block.get_html() for content in content_batch]
            rendered_columns.append(Div([width_style], batch_html))

    return Div([], rendered_columns)
480
-
481
-
482
@dataclass
class BulletedListResponse:
    """Rendered bulleted list item together with its nested list, if any."""

    # HTML of the list item itself.
    html: HtmlTag
    # Nested list built from the item's children; None when there are none.
    child_list: Optional[HtmlTag] = None
486
-
487
-
488
# CSS list-style-type values; each nesting level uses the next one, wrapping around.
bulleted_list_styles = ["circle", "square", "disc"]


def build_bulleted_list_children(
    client: Client,
    bulleted_list_item_parent: Block,
    list_style_ind: int = 0,
) -> BulletedListResponse:
    """Render a bulleted list item and, recursively, its nested children.

    Args:
        client: Notion client used to list the item's children.
        bulleted_list_item_parent: Block wrapping a
            ``notion_blocks.BulletedListItem``.
        list_style_ind: Index into ``bulleted_list_styles`` for the nested
            ``Ul`` built from this item's children.

    Returns:
        A ``BulletedListResponse`` whose ``html`` is the item itself (given a
        left margin when it renders to something) and whose ``child_list`` is
        a ``Ul`` of the rendered children, or None when there are none.

    Raises:
        ValueError: If the block is not a ``notion_blocks.BulletedListItem``.
    """
    if not isinstance(bulleted_list_item_parent.block, notion_blocks.BulletedListItem):
        raise ValueError(
            f"block type not bulleted list item: {type(bulleted_list_item_parent.block)}",
        )
    html = bulleted_list_item_parent.get_html()
    if html:
        html.attributes = [Style("margin-left: 10px")]
    if not bulleted_list_item_parent.has_children:
        return BulletedListResponse(
            html=html,
        )
    children = []
    for child_block in client.blocks.children.iterate_list(  # type: ignore
        block_id=bulleted_list_item_parent.id,
    ):
        children.extend(child_block)
    if not children:
        # Return the already styled tag so this path matches the others.
        return BulletedListResponse(
            html=html,
        )
    next_style_ind = (list_style_ind + 1) % len(bulleted_list_styles)
    child_html = []
    for child in children:
        if not isinstance(child.block, notion_blocks.BulletedListItem):
            # A bullet may nest other block kinds; render them as they are
            # rather than recursing into the type check above.
            if other_html := child.get_html():
                child_html.append(other_html)
            continue
        child_resp = build_bulleted_list_children(
            client=client,
            bulleted_list_item_parent=child,
            list_style_ind=next_style_ind,
        )
        child_html.append(child_resp.html)
        if child_children := child_resp.child_list:
            child_html.append(child_children)

    return BulletedListResponse(
        html=html,
        child_list=Ul(
            [Style(f"list-style-type: {bulleted_list_styles[list_style_ind]}")],
            child_html,
        ),
    )
534
-
535
-
536
@dataclass
class NumberedListResponse:
    """Rendered numbered list item together with its nested list, if any."""

    # HTML of the list item itself.
    html: HtmlTag
    # Nested list built from the item's children; None when there are none.
    child_list: Optional[HtmlTag] = None
540
-
541
-
542
# Values for the <ol type="..."> attribute; each nesting level uses the next one, wrapping around.
numbered_list_types = ["a", "i", "1"]


def build_numbered_list_children(
    client: Client,
    numbered_list_item_parent: Block,
    type_attr_ind=0,
) -> NumberedListResponse:
    """Render a numbered list item and, recursively, its nested children.

    Args:
        client: Notion client used to list the item's children.
        numbered_list_item_parent: Block wrapping a
            ``notion_blocks.NumberedListItem``.
        type_attr_ind: Index into ``numbered_list_types`` for the nested
            ``Ol`` built from this item's children.

    Returns:
        A ``NumberedListResponse`` whose ``html`` is the item itself (given a
        left margin when it renders to something) and whose ``child_list`` is
        an ``Ol`` of the rendered children, or None when there are none.

    Raises:
        ValueError: If the block is not a ``notion_blocks.NumberedListItem``.
    """
    if not isinstance(numbered_list_item_parent.block, notion_blocks.NumberedListItem):
        raise ValueError(
            f"block type not numbered list item: {type(numbered_list_item_parent.block)}",
        )
    html = numbered_list_item_parent.get_html()
    if html:
        html.attributes = [Style("margin-left: 10px")]
    if not numbered_list_item_parent.has_children:
        return NumberedListResponse(
            html=html,
        )
    children = []
    for child_block in client.blocks.children.iterate_list(  # type: ignore
        block_id=numbered_list_item_parent.id,
    ):
        children.extend(child_block)
    if not children:
        # Return the already styled tag so this path matches the others.
        return NumberedListResponse(
            html=html,
        )
    next_type_ind = (type_attr_ind + 1) % len(numbered_list_types)
    child_html = []
    for child in children:
        if not isinstance(child.block, notion_blocks.NumberedListItem):
            # A numbered item may nest other block kinds; render them as
            # they are rather than recursing into the type check above.
            if other_html := child.get_html():
                child_html.append(other_html)
            continue
        child_resp = build_numbered_list_children(
            client=client,
            numbered_list_item_parent=child,
            type_attr_ind=next_type_ind,
        )
        child_html.append(child_resp.html)
        if child_children := child_resp.child_list:
            child_html.append(child_children)

    return NumberedListResponse(
        html=html,
        child_list=Ol([Type(numbered_list_types[type_attr_ind])], child_html),
    )
@@ -1,40 +0,0 @@
1
- # https://developers.notion.com/reference/block#bookmark
2
- from dataclasses import dataclass, field
3
- from typing import List, Optional
4
-
5
- from htmlBuilder.attributes import Href
6
- from htmlBuilder.tags import A, Br, Div, HtmlTag
7
-
8
- from unstructured_ingest.connector.notion.interfaces import BlockBase
9
- from unstructured_ingest.connector.notion.types.rich_text import RichText
10
-
11
-
12
@dataclass
class Bookmark(BlockBase):
    """Notion bookmark block: a URL with an optional rich-text caption."""

    # Target of the bookmark.
    url: str
    # Caption fragments; empty when the bookmark has no caption.
    caption: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Bookmark`` from its dictionary payload.

        The input dictionary is left unmodified. A missing or null
        ``caption`` yields an empty caption list.

        Args:
            data: Mapping with a required ``url`` key and an optional
                ``caption`` list of rich-text dictionaries.

        Returns:
            The populated ``Bookmark``.

        Raises:
            KeyError: If ``data`` has no ``url`` key.
        """
        # .get() rather than .pop(): parsing must not alter the caller's dict.
        captions = data.get("caption") or []
        return cls(
            url=data["url"],
            caption=[RichText.from_dict(c) for c in captions],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render the bookmark as a link followed by its caption.

        Returns:
            A ``Div`` holding the link and the caption separated by a line
            break, or None when both the URL and the caption are empty.
        """
        texts = []
        if self.url:
            texts.append(A([Href(self.url)], self.url))
        if self.caption:
            texts.append(Div([], [rt.get_html() for rt in self.caption]))
        if not texts:
            return None
        # Put the parts at the even indexes so a <br> sits between neighbours.
        joined = [Br()] * (len(texts) * 2 - 1)
        joined[0::2] = texts

        return Div([], joined)

    @staticmethod
    def can_have_children() -> bool:
        """Bookmark blocks never contain child blocks."""
        return False