unstructured-ingest 0.0.2__tar.gz → 0.0.2.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (373)
  1. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/PKG-INFO +1 -1
  2. unstructured-ingest-0.0.2.dev0/unstructured_ingest/__version__.py +1 -0
  3. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/base/cmd.py +0 -10
  4. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/base/src.py +0 -2
  5. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +9 -1
  6. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/local.py +8 -0
  7. unstructured-ingest-0.0.2.dev0/unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  8. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/__init__.py +1 -2
  9. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/downloader.py +3 -9
  10. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/file_data.py +1 -6
  11. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/process.py +0 -3
  12. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/interfaces.py +5 -3
  13. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/pipeline.py +2 -72
  14. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/download.py +13 -77
  15. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/astra.py +0 -8
  16. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -8
  17. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/chroma.py +6 -8
  18. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -9
  19. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/elasticsearch.py +9 -23
  20. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -12
  21. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/s3.py +5 -13
  22. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/google_drive.py +9 -13
  23. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/local.py +15 -15
  24. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/mongodb.py +4 -10
  25. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/onedrive.py +2 -14
  26. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
  27. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/salesforce.py +8 -10
  28. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/sharepoint.py +8 -14
  29. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/sql.py +9 -24
  30. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/weaviate.py +5 -13
  31. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/PKG-INFO +1 -1
  32. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/SOURCES.txt +0 -3
  33. unstructured-ingest-0.0.2/unstructured_ingest/__version__.py +0 -1
  34. unstructured-ingest-0.0.2/unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  35. unstructured-ingest-0.0.2/unstructured_ingest/v2/cli/configs/filter.py +0 -28
  36. unstructured-ingest-0.0.2/unstructured_ingest/v2/pipeline/steps/filter.py +0 -40
  37. unstructured-ingest-0.0.2/unstructured_ingest/v2/processes/filter.py +0 -54
  38. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/README.md +0 -0
  39. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/pyproject.toml +0 -0
  40. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/setup.cfg +0 -0
  41. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/setup.py +0 -0
  42. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/test/test_error.py +0 -0
  43. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/test/test_interfaces.py +0 -0
  44. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/test/test_logger.py +0 -0
  45. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/test/test_utils.py +0 -0
  46. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/__init__.py +0 -0
  47. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/__init__.py +0 -0
  48. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/base/__init__.py +0 -0
  49. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/base/cmd.py +0 -0
  50. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/base/dest.py +0 -0
  51. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/base/src.py +0 -0
  52. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cli.py +0 -0
  53. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmd_factory.py +0 -0
  54. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/__init__.py +0 -0
  55. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/airtable.py +0 -0
  56. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/astra.py +0 -0
  57. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/azure_cognitive_search.py +0 -0
  58. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/biomed.py +0 -0
  59. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/chroma.py +0 -0
  60. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/clarifai.py +0 -0
  61. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/confluence.py +0 -0
  62. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/databricks_volumes.py +0 -0
  63. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/delta_table.py +0 -0
  64. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/discord.py +0 -0
  65. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/elasticsearch.py +0 -0
  66. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  67. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/azure.py +0 -0
  68. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/box.py +0 -0
  69. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -0
  70. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -0
  71. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -0
  72. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/s3.py +0 -0
  73. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -0
  74. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/github.py +0 -0
  75. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/gitlab.py +0 -0
  76. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/google_drive.py +0 -0
  77. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/hubspot.py +0 -0
  78. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/jira.py +0 -0
  79. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/kafka.py +0 -0
  80. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/local.py +0 -0
  81. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/mongodb.py +0 -0
  82. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/notion.py +0 -0
  83. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/onedrive.py +0 -0
  84. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/opensearch.py +0 -0
  85. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/outlook.py +0 -0
  86. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/pinecone.py +0 -0
  87. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/qdrant.py +0 -0
  88. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/reddit.py +0 -0
  89. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/salesforce.py +0 -0
  90. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/sharepoint.py +0 -0
  91. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/slack.py +0 -0
  92. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/sql.py +0 -0
  93. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/vectara.py +0 -0
  94. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/weaviate.py +0 -0
  95. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/cmds/wikipedia.py +0 -0
  96. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/common.py +0 -0
  97. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/interfaces.py +0 -0
  98. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/cli/utils.py +0 -0
  99. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/__init__.py +0 -0
  100. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/airtable.py +0 -0
  101. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/astra.py +0 -0
  102. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/azure_cognitive_search.py +0 -0
  103. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/biomed.py +0 -0
  104. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/chroma.py +0 -0
  105. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/clarifai.py +0 -0
  106. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/confluence.py +0 -0
  107. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/databricks_volumes.py +0 -0
  108. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/delta_table.py +0 -0
  109. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/discord.py +0 -0
  110. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/elasticsearch.py +0 -0
  111. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/__init__.py +0 -0
  112. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/azure.py +0 -0
  113. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/box.py +0 -0
  114. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/dropbox.py +0 -0
  115. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/fsspec.py +0 -0
  116. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/gcs.py +0 -0
  117. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/s3.py +0 -0
  118. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/fsspec/sftp.py +0 -0
  119. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/git.py +0 -0
  120. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/github.py +0 -0
  121. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/gitlab.py +0 -0
  122. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/google_drive.py +0 -0
  123. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/hubspot.py +0 -0
  124. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/jira.py +0 -0
  125. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/kafka.py +0 -0
  126. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/local.py +0 -0
  127. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/mongodb.py +0 -0
  128. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/__init__.py +0 -0
  129. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/client.py +0 -0
  130. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/connector.py +0 -0
  131. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/helpers.py +0 -0
  132. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/interfaces.py +0 -0
  133. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/__init__.py +0 -0
  134. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/block.py +0 -0
  135. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/__init__.py +0 -0
  136. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -0
  137. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +0 -0
  138. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +0 -0
  139. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/callout.py +0 -0
  140. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -0
  141. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/child_page.py +0 -0
  142. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/code.py +0 -0
  143. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -0
  144. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/divider.py +0 -0
  145. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/embed.py +0 -0
  146. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/equation.py +0 -0
  147. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/file.py +0 -0
  148. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/heading.py +0 -0
  149. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/image.py +0 -0
  150. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -0
  151. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/link_to_page.py +0 -0
  152. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -0
  153. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/paragraph.py +0 -0
  154. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/pdf.py +0 -0
  155. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/quote.py +0 -0
  156. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -0
  157. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/table.py +0 -0
  158. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -0
  159. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/template.py +0 -0
  160. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/todo.py +0 -0
  161. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/toggle.py +0 -0
  162. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -0
  163. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/blocks/video.py +0 -0
  164. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database.py +0 -0
  165. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -0
  166. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -0
  167. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/created_by.py +0 -0
  168. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/created_time.py +0 -0
  169. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/date.py +0 -0
  170. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/email.py +0 -0
  171. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/files.py +0 -0
  172. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -0
  173. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +0 -0
  174. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -0
  175. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -0
  176. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/number.py +0 -0
  177. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/people.py +0 -0
  178. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -0
  179. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -0
  180. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/rich_text.py +0 -0
  181. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/rollup.py +0 -0
  182. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/select.py +0 -0
  183. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/status.py +0 -0
  184. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/title.py +0 -0
  185. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -0
  186. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/url.py +0 -0
  187. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/database_properties/verification.py +0 -0
  188. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/date.py +0 -0
  189. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/file.py +0 -0
  190. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/page.py +0 -0
  191. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/parent.py +0 -0
  192. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/rich_text.py +0 -0
  193. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/notion/types/user.py +0 -0
  194. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/onedrive.py +0 -0
  195. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/opensearch.py +0 -0
  196. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/outlook.py +0 -0
  197. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/pinecone.py +0 -0
  198. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/qdrant.py +0 -0
  199. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/reddit.py +0 -0
  200. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/registry.py +0 -0
  201. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/salesforce.py +0 -0
  202. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/sharepoint.py +0 -0
  203. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/slack.py +0 -0
  204. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/sql.py +0 -0
  205. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/vectara.py +0 -0
  206. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/weaviate.py +0 -0
  207. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/connector/wikipedia.py +0 -0
  208. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/enhanced_dataclass/__init__.py +0 -0
  209. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/enhanced_dataclass/core.py +0 -0
  210. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -0
  211. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -0
  212. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/error.py +0 -0
  213. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/evaluate.py +0 -0
  214. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/ingest_backoff/__init__.py +0 -0
  215. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/ingest_backoff/_common.py +0 -0
  216. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/ingest_backoff/_wrapper.py +0 -0
  217. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/interfaces.py +0 -0
  218. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/logger.py +0 -0
  219. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/main.py +0 -0
  220. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/__init__.py +0 -0
  221. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/copy.py +0 -0
  222. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/doc_factory.py +0 -0
  223. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/interfaces.py +0 -0
  224. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/partition.py +0 -0
  225. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/permissions.py +0 -0
  226. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/pipeline.py +0 -0
  227. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  228. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/reformat/chunking.py +0 -0
  229. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/reformat/embedding.py +0 -0
  230. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/source.py +0 -0
  231. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/utils.py +0 -0
  232. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/pipeline/write.py +0 -0
  233. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/processor.py +0 -0
  234. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/__init__.py +0 -0
  235. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/airtable.py +0 -0
  236. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/astra.py +0 -0
  237. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/base_runner.py +0 -0
  238. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/biomed.py +0 -0
  239. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/confluence.py +0 -0
  240. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/delta_table.py +0 -0
  241. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/discord.py +0 -0
  242. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/elasticsearch.py +0 -0
  243. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/__init__.py +0 -0
  244. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/azure.py +0 -0
  245. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/box.py +0 -0
  246. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/dropbox.py +0 -0
  247. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/fsspec.py +0 -0
  248. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/gcs.py +0 -0
  249. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/s3.py +0 -0
  250. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/fsspec/sftp.py +0 -0
  251. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/github.py +0 -0
  252. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/gitlab.py +0 -0
  253. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/google_drive.py +0 -0
  254. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/hubspot.py +0 -0
  255. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/jira.py +0 -0
  256. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/kafka.py +0 -0
  257. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/local.py +0 -0
  258. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/mongodb.py +0 -0
  259. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/notion.py +0 -0
  260. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/onedrive.py +0 -0
  261. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/opensearch.py +0 -0
  262. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/outlook.py +0 -0
  263. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/reddit.py +0 -0
  264. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/salesforce.py +0 -0
  265. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/sharepoint.py +0 -0
  266. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/slack.py +0 -0
  267. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/utils.py +0 -0
  268. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/wikipedia.py +0 -0
  269. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/__init__.py +0 -0
  270. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/astra.py +0 -0
  271. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -0
  272. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/base_writer.py +0 -0
  273. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/chroma.py +0 -0
  274. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/clarifai.py +0 -0
  275. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/databricks_volumes.py +0 -0
  276. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/delta_table.py +0 -0
  277. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/elasticsearch.py +0 -0
  278. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  279. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/azure.py +0 -0
  280. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/box.py +0 -0
  281. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -0
  282. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/gcs.py +0 -0
  283. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/fsspec/s3.py +0 -0
  284. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/kafka.py +0 -0
  285. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/mongodb.py +0 -0
  286. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/opensearch.py +0 -0
  287. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/pinecone.py +0 -0
  288. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/qdrant.py +0 -0
  289. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/sql.py +0 -0
  290. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/vectara.py +0 -0
  291. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/runner/writers/weaviate.py +0 -0
  292. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/__init__.py +0 -0
  293. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/compression.py +0 -0
  294. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/data_prep.py +0 -0
  295. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/dep_check.py +0 -0
  296. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  297. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/utils/table.py +0 -0
  298. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/__init__.py +0 -0
  299. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/__init__.py +0 -0
  300. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/base/__init__.py +0 -0
  301. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/base/dest.py +0 -0
  302. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/base/importer.py +0 -0
  303. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cli.py +0 -0
  304. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/__init__.py +0 -0
  305. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/astra.py +0 -0
  306. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -0
  307. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/chroma.py +0 -0
  308. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -0
  309. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -0
  310. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  311. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -0
  312. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -0
  313. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -0
  314. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -0
  315. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -0
  316. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -0
  317. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/google_drive.py +0 -0
  318. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/milvus.py +0 -0
  319. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/mongodb.py +0 -0
  320. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/onedrive.py +0 -0
  321. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/opensearch.py +0 -0
  322. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/pinecone.py +0 -0
  323. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/salesforce.py +0 -0
  324. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -0
  325. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/singlestore.py +0 -0
  326. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/sql.py +0 -0
  327. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/cmds/weaviate.py +0 -0
  328. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/configs/chunk.py +0 -0
  329. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/configs/embed.py +0 -0
  330. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/configs/partition.py +0 -0
  331. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/configs/processor.py +0 -0
  332. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/interfaces.py +0 -0
  333. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/cli/utils.py +0 -0
  334. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/example.py +0 -0
  335. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/connector.py +0 -0
  336. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/indexer.py +0 -0
  337. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/processor.py +0 -0
  338. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/upload_stager.py +0 -0
  339. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/interfaces/uploader.py +0 -0
  340. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/logger.py +0 -0
  341. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/main.py +0 -0
  342. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/__init__.py +0 -0
  343. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  344. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/chunk.py +0 -0
  345. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/embed.py +0 -0
  346. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/index.py +0 -0
  347. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/partition.py +0 -0
  348. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/stage.py +0 -0
  349. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/uncompress.py +0 -0
  350. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/steps/upload.py +0 -0
  351. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/pipeline/utils.py +0 -0
  352. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/__init__.py +0 -0
  353. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/chunker.py +0 -0
  354. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connector_registry.py +0 -0
  355. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/__init__.py +0 -0
  356. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +0 -0
  357. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/azure.py +0 -0
  358. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/box.py +0 -0
  359. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +0 -0
  360. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +0 -0
  361. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +0 -0
  362. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/fsspec/utils.py +0 -0
  363. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/milvus.py +0 -0
  364. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/opensearch.py +0 -0
  365. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/singlestore.py +0 -0
  366. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/connectors/utils.py +0 -0
  367. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/embedder.py +0 -0
  368. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/partitioner.py +0 -0
  369. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest/v2/processes/uncompress.py +0 -0
  370. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/dependency_links.txt +0 -0
  371. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/entry_points.txt +0 -0
  372. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/requires.txt +12 -12
  373. {unstructured-ingest-0.0.2 → unstructured-ingest-0.0.2.dev0}/unstructured_ingest.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.2
3
+ Version: 0.0.2.dev0
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -0,0 +1 @@
1
+ __version__ = "0.0.2-dev0" # pragma: no cover
@@ -24,7 +24,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
24
24
  )
25
25
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
26
26
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
27
- from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
28
27
  from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
29
28
 
30
29
  CommandT = TypeVar("CommandT", bound=click.Command)
@@ -76,8 +75,6 @@ class BaseCmd(ABC):
76
75
  }
77
76
  if chunker := self.get_chunker(options=source_options):
78
77
  pipeline_kwargs["chunker"] = chunker
79
- if filterer := self.get_filterer(options=source_options):
80
- pipeline_kwargs["filterer"] = filterer
81
78
  if embedder := self.get_embeder(options=source_options):
82
79
  pipeline_kwargs["embedder"] = embedder
83
80
  if dest:
@@ -108,13 +105,6 @@ class BaseCmd(ABC):
108
105
  return None
109
106
  return Chunker(config=chunker_config)
110
107
 
111
- @staticmethod
112
- def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
113
- filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
114
- if not filterer_configs.to_dict():
115
- return None
116
- return Filterer(config=filterer_configs)
117
-
118
108
  @staticmethod
119
109
  def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
120
110
  embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
@@ -8,7 +8,6 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
8
  from unstructured_ingest.v2.cli.configs import (
9
9
  ChunkerCliConfig,
10
10
  EmbedderCliConfig,
11
- FilterCliConfig,
12
11
  PartitionerCliConfig,
13
12
  ProcessorCliConfig,
14
13
  )
@@ -27,7 +26,6 @@ class SrcCmd(BaseCmd):
27
26
  ProcessorCliConfig,
28
27
  PartitionerCliConfig,
29
28
  EmbedderCliConfig,
30
- FilterCliConfig,
31
29
  ChunkerCliConfig,
32
30
  ]
33
31
  )
@@ -3,6 +3,7 @@ from dataclasses import dataclass
3
3
  import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.interfaces import CliConfig
6
+ from unstructured_ingest.v2.cli.utils import DelimitedString
6
7
 
7
8
 
8
9
  @dataclass
@@ -13,7 +14,7 @@ class FsspecCliDownloadConfig(CliConfig):
13
14
  click.Option(
14
15
  ["--download-dir"],
15
16
  help="Where files are downloaded to, defaults to a location at"
16
- "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
17
+ "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
17
18
  ),
18
19
  ]
19
20
 
@@ -64,6 +65,13 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
64
65
  help="Recursively download files in their respective folders "
65
66
  "otherwise stop at the files in provided folder level.",
66
67
  ),
68
+ click.Option(
69
+ ["--file-glob"],
70
+ default=None,
71
+ type=DelimitedString(),
72
+ help="A comma-separated list of file globs to limit which types of "
73
+ "local files are accepted, e.g. '*.html,*.txt'",
74
+ ),
67
75
  ]
68
76
  )
69
77
  return options
@@ -4,6 +4,7 @@ import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
6
  from unstructured_ingest.v2.cli.interfaces import CliConfig
7
+ from unstructured_ingest.v2.cli.utils import DelimitedString
7
8
  from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
8
9
 
9
10
 
@@ -18,6 +19,13 @@ class LocalCliIndexerConfig(CliConfig):
18
19
  type=click.Path(file_okay=True, dir_okay=True, exists=True),
19
20
  help="Path to the location in the local file system that will be processed.",
20
21
  ),
22
+ click.Option(
23
+ ["--file-glob"],
24
+ default=None,
25
+ type=DelimitedString(),
26
+ help="A comma-separated list of file globs to limit which types of "
27
+ "local files are accepted, e.g. '*.html,*.txt'",
28
+ ),
21
29
  click.Option(
22
30
  ["--recursive"],
23
31
  is_flag=True,
@@ -0,0 +1,6 @@
1
+ from .chunk import ChunkerCliConfig
2
+ from .embed import EmbedderCliConfig
3
+ from .partition import PartitionerCliConfig
4
+ from .processor import ProcessorCliConfig
5
+
6
+ __all__ = ["ChunkerCliConfig", "ProcessorCliConfig", "PartitionerCliConfig", "EmbedderCliConfig"]
@@ -1,6 +1,6 @@
1
1
  from .connector import AccessConfig, BaseConnector, ConnectionConfig
2
2
  from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
3
- from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
3
+ from .file_data import FileData, SourceIdentifiers
4
4
  from .indexer import Indexer, IndexerConfig
5
5
  from .process import BaseProcess
6
6
  from .processor import ProcessorConfig
@@ -26,5 +26,4 @@ __all__ = [
26
26
  "AccessConfig",
27
27
  "ConnectionConfig",
28
28
  "BaseConnector",
29
- "FileDataSourceMetadata",
30
29
  ]
@@ -30,15 +30,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
30
30
  connector_type: str
31
31
  download_config: DownloaderConfigT
32
32
 
33
- def get_download_path(self, file_data: FileData) -> Optional[Path]:
34
- if not file_data.source_identifiers:
35
- return None
36
- rel_path = file_data.source_identifiers.relative_path
37
- if not rel_path:
38
- return None
39
- rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
40
- return self.download_dir / Path(rel_path)
41
-
42
33
  @staticmethod
43
34
  def is_float(value: str):
44
35
  try:
@@ -77,6 +68,9 @@ class Downloader(BaseProcess, BaseConnector, ABC):
77
68
  def is_async(self) -> bool:
78
69
  return True
79
70
 
71
+ def get_download_path(self, file_data: FileData) -> Optional[Path]:
72
+ return None
73
+
80
74
  @abstractmethod
81
75
  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
82
76
  pass
@@ -22,18 +22,13 @@ class SourceIdentifiers:
22
22
  return self.rel_path or self.fullpath
23
23
 
24
24
 
25
- @dataclass
26
- class FileDataSourceMetadata(DataSourceMetadata):
27
- filesize_bytes: Optional[int] = None
28
-
29
-
30
25
  @dataclass
31
26
  class FileData(DataClassJsonMixin):
32
27
  identifier: str
33
28
  connector_type: str
34
29
  source_identifiers: Optional[SourceIdentifiers] = None
35
30
  doc_type: Literal["file", "batch"] = field(default="file")
36
- metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
31
+ metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
37
32
  additional_metadata: dict[str, Any] = field(default_factory=dict)
38
33
  reprocess: bool = False
39
34
 
@@ -8,9 +8,6 @@ class BaseProcess(ABC):
8
8
  def is_async(self) -> bool:
9
9
  return False
10
10
 
11
- def precheck(self) -> None:
12
- pass
13
-
14
11
  @abstractmethod
15
12
  def run(self, **kwargs: Any) -> Any:
16
13
  pass
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
92
92
 
93
93
  if iterable:
94
94
  if len(iterable) == 1:
95
- return self.process_serially(iterable)
95
+ return [self.process_serially(iterable)]
96
96
  if self.context.num_processes == 1:
97
97
  return self.process_serially(iterable)
98
98
  with mp.Pool(
@@ -126,8 +126,6 @@ class PipelineStep(ABC):
126
126
  logger.info(
127
127
  f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
128
128
  )
129
- else:
130
- logger.info(f"Calling {self.__class__.__name__} with no inputs")
131
129
  if self.context.async_supported and self.process.is_async():
132
130
  return self.process_async(iterable=iterable)
133
131
  if self.context.mp_supported:
@@ -148,6 +146,8 @@ class PipelineStep(ABC):
148
146
  logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
149
147
  if "file_data_path" in kwargs:
150
148
  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
149
+ else:
150
+ self.context.status[self.identifier] = {"step_error": str(e)}
151
151
  if self.context.raise_on_error:
152
152
  raise e
153
153
  return None
@@ -160,6 +160,8 @@ class PipelineStep(ABC):
160
160
  logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
161
161
  if "file_data_path" in kwargs:
162
162
  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
163
+ else:
164
+ self.context.status[self.identifier] = {"step_error": str(e)}
163
165
  if self.context.raise_on_error:
164
166
  raise e
165
167
  return None
@@ -9,7 +9,6 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
9
9
  from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
10
10
  from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
11
11
  from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
12
- from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
13
12
  from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
14
13
  from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
15
14
  from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -28,7 +27,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
28
27
  )
29
28
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader
30
29
  from unstructured_ingest.v2.processes.embedder import EmbedderConfig
31
- from unstructured_ingest.v2.processes.filter import FiltererConfig
32
30
  from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
33
31
 
34
32
 
@@ -39,33 +37,22 @@ class PipelineError(Exception):
39
37
  @dataclass
40
38
  class Pipeline:
41
39
  context: ProcessorConfig
42
-
43
40
  indexer: InitVar[IndexerT]
44
41
  indexer_step: IndexStep = field(init=False)
45
-
46
42
  downloader: InitVar[DownloaderT]
47
43
  downloader_step: DownloadStep = field(init=False)
48
-
49
44
  partitioner: InitVar[Partitioner]
50
45
  partitioner_step: PartitionStep = field(init=False)
51
-
52
46
  chunker: InitVar[Optional[Chunker]] = None
53
47
  chunker_step: ChunkStep = field(init=False, default=None)
54
-
55
48
  embedder: InitVar[Optional[Embedder]] = None
56
49
  embedder_step: EmbedStep = field(init=False, default=None)
57
-
58
50
  stager: InitVar[Optional[UploadStager]] = None
59
51
  stager_step: UploadStageStep = field(init=False, default=None)
60
-
61
52
  uploader: InitVar[Uploader] = field(default=LocalUploader())
62
53
  uploader_step: UploadStep = field(init=False, default=None)
63
-
64
54
  uncompress_step: UncompressStep = field(init=False, default=None)
65
55
 
66
- filterer: InitVar[Optional[Filterer]] = None
67
- filter_step: FilterStep = field(init=False, default=None)
68
-
69
56
  def __post_init__(
70
57
  self,
71
58
  indexer: IndexerT,
@@ -75,12 +62,10 @@ class Pipeline:
75
62
  embedder: Embedder = None,
76
63
  stager: UploadStager = None,
77
64
  uploader: Uploader = None,
78
- filterer: Filterer = None,
79
65
  ):
80
66
  make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
81
67
  self.indexer_step = IndexStep(process=indexer, context=self.context)
82
68
  self.downloader_step = DownloadStep(process=downloader, context=self.context)
83
- self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
84
69
  self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
85
70
  self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
86
71
 
@@ -124,7 +109,6 @@ class Pipeline:
124
109
  def run(self):
125
110
  try:
126
111
  start_time = time()
127
- self._run_prechecks()
128
112
  self._run()
129
113
  logger.info(f"Finished ingest process in {time() - start_time}s")
130
114
  finally:
@@ -146,37 +130,6 @@ class Pipeline:
146
130
  final = [f for f in flat if f]
147
131
  return final or None
148
132
 
149
- def _run_prechecks(self):
150
- steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
151
- if self.chunker_step:
152
- steps.append(self.chunker_step)
153
- if self.embedder_step:
154
- steps.append(self.embedder_step)
155
- if self.uncompress_step:
156
- steps.append(self.uncompress_step)
157
- if self.stager_step:
158
- steps.append(self.stager_step)
159
- failures = {}
160
- for step in steps:
161
- try:
162
- step.process.precheck()
163
- except Exception as e:
164
- failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
165
- if failures:
166
- for k, v in failures.items():
167
- logger.error(f"Step precheck failure: {k}: {v}")
168
- raise PipelineError("Precheck failed")
169
-
170
- def apply_filter(self, records: list[dict]) -> list[dict]:
171
- if not self.filter_step:
172
- return records
173
- data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
174
- filtered_data = self.filter_step(data_to_filter)
175
- filtered_data = [f for f in filtered_data if f is not None]
176
- filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
177
- filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
178
- return filtered_records
179
-
180
133
  def _run(self):
181
134
  logger.info(
182
135
  f"Running local pipline: {self} with configs: "
@@ -194,33 +147,18 @@ class Pipeline:
194
147
  if not indices_inputs:
195
148
  return
196
149
 
197
- # Initial filtering on indexed content
198
- indices_inputs = self.apply_filter(records=indices_inputs)
199
- if not indices_inputs:
200
- return
201
-
202
150
  # Download associated content to local file system
203
151
  downloaded_data = self.downloader_step(indices_inputs)
204
152
  downloaded_data = self.clean_results(results=downloaded_data)
205
153
  if not downloaded_data:
206
154
  return
207
155
 
208
- # Post download filtering
209
- downloaded_data = self.apply_filter(records=downloaded_data)
210
- if not downloaded_data:
211
- return
212
-
213
156
  # Run uncompress if available
214
157
  if self.uncompress_step:
215
158
  downloaded_data = self.uncompress_step(downloaded_data)
216
159
  # Flatten list of lists
217
160
  downloaded_data = self.clean_results(results=downloaded_data)
218
161
 
219
- # Post uncompress filtering
220
- downloaded_data = self.apply_filter(records=downloaded_data)
221
- if not downloaded_data:
222
- return
223
-
224
162
  if not downloaded_data:
225
163
  return
226
164
 
@@ -241,14 +179,9 @@ class Pipeline:
241
179
  self.uploader_step(iterable=elements)
242
180
 
243
181
  def __str__(self):
244
- s = [str(self.indexer_step)]
245
- if filter_step := self.filter_step:
246
- s.append(str(filter_step))
247
- s.append(str(self.downloader_step))
248
- if filter_step := self.filter_step:
249
- s.append(str(filter_step))
182
+ s = [str(self.indexer_step), str(self.downloader_step)]
250
183
  if uncompress_step := self.uncompress_step:
251
- s.extend([str(uncompress_step), str(filter_step)])
184
+ s.append(str(uncompress_step))
252
185
  s.append(str(self.partitioner_step))
253
186
  if chunker_step := self.chunker_step:
254
187
  s.append(str(chunker_step))
@@ -267,7 +200,6 @@ class Pipeline:
267
200
  downloader_config: DownloaderConfigT,
268
201
  source_connection_config: ConnectionConfig,
269
202
  partitioner_config: PartitionerConfig,
270
- filterer_config: FiltererConfig = None,
271
203
  chunker_config: Optional[ChunkerConfig] = None,
272
204
  embedder_config: Optional[EmbedderConfig] = None,
273
205
  destination_connection_config: Optional[ConnectionConfig] = None,
@@ -303,8 +235,6 @@ class Pipeline:
303
235
  ),
304
236
  "partitioner": Partitioner(config=partitioner_config),
305
237
  }
306
- if filterer_config:
307
- pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
308
238
  if chunker_config:
309
239
  pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
310
240
  if embedder_config:
@@ -2,7 +2,6 @@ import asyncio
2
2
  import hashlib
3
3
  import json
4
4
  from dataclasses import dataclass
5
- from pathlib import Path
6
5
  from typing import Callable, Optional, TypedDict, TypeVar
7
6
 
8
7
  from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -71,40 +70,11 @@ class DownloadStep(PipelineStep):
71
70
  return True
72
71
  return False
73
72
 
74
- def update_file_data(
75
- self, file_data: FileData, file_data_path: Path, download_path: Path
76
- ) -> None:
77
- file_size_bytes = download_path.stat().st_size
78
- changed = False
79
- if not file_data.metadata.filesize_bytes and file_size_bytes:
80
- changed = True
81
- file_data.metadata.filesize_bytes = file_size_bytes
82
- if (
83
- file_data.metadata.filesize_bytes
84
- and file_data.metadata.filesize_bytes != file_size_bytes
85
- ):
86
- logger.warning(
87
- f"file size in original file data "
88
- f"({file_data.metadata.filesize_bytes}) doesn't "
89
- f"match size of local file: {file_size_bytes}, updating"
90
- )
91
- changed = True
92
- file_data.metadata.filesize_bytes = file_size_bytes
93
- if changed:
94
- logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
95
- with file_data_path.open("w") as file:
96
- json.dump(file_data.to_dict(), file, indent=2)
97
-
98
73
  async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
99
74
  file_data = FileData.from_file(path=file_data_path)
100
75
  download_path = self.process.get_download_path(file_data=file_data)
101
76
  if not self.should_download(file_data=file_data, file_data_path=file_data_path):
102
77
  logger.debug(f"Skipping download, file already exists locally: {download_path}")
103
- self.update_file_data(
104
- file_data=file_data,
105
- file_data_path=Path(file_data_path),
106
- download_path=download_path,
107
- )
108
78
  return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
109
79
  fn_kwargs = {"file_data": file_data}
110
80
  if not asyncio.iscoroutinefunction(fn):
@@ -115,60 +85,26 @@ class DownloadStep(PipelineStep):
115
85
  else:
116
86
  download_results = await fn(**fn_kwargs)
117
87
  return self.create_step_results(
118
- current_file_data_path=file_data_path,
119
- download_results=download_results,
120
- current_file_data=file_data,
88
+ current_file_data_path=file_data_path, download_results=download_results
121
89
  )
122
90
 
123
91
  def create_step_results(
124
- self,
125
- current_file_data_path: str,
126
- current_file_data: FileData,
127
- download_results: download_responses,
92
+ self, current_file_data_path: str, download_results: download_responses
128
93
  ) -> list[DownloadStepResponse]:
129
- responses = []
130
94
  if not isinstance(download_results, list):
131
- file_data = current_file_data
132
- file_data_path = current_file_data_path
133
- download_path = download_results["path"]
134
- if download_results["file_data"].identifier == current_file_data.identifier:
135
- self.update_file_data(
136
- file_data=file_data,
137
- file_data_path=Path(file_data_path),
138
- download_path=download_path,
139
- )
140
- responses = [
141
- DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
142
- ]
143
- else:
144
- file_data = download_results["file_data"]
145
- file_data_path = self.persist_new_file_data(file_data=file_data)
146
- self.update_file_data(
147
- file_data=file_data,
148
- file_data_path=Path(file_data_path),
149
- download_path=download_path,
95
+ return [
96
+ DownloadStepResponse(
97
+ file_data_path=current_file_data_path, path=str(download_results["path"])
150
98
  )
151
- responses = [
152
- DownloadStepResponse(
153
- file_data_path=current_file_data_path, path=str(download_results["path"])
154
- )
155
- ]
156
- else:
99
+ ]
157
100
  # Supplemental results generated as part of the download process
158
- for res in download_results:
159
- file_data = res["file_data"]
160
- file_data_path = self.persist_new_file_data(file_data=file_data)
161
- download_path = res["path"]
162
- self.update_file_data(
163
- file_data=file_data,
164
- file_data_path=Path(file_data_path),
165
- download_path=download_path,
166
- )
167
- responses.append(
168
- DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
169
- )
170
-
171
- return responses
101
+ download_step_results = []
102
+ for res in download_results:
103
+ file_data_path = self.persist_new_file_data(file_data=res["file_data"])
104
+ download_step_results.append(
105
+ DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
106
+ )
107
+ return download_step_results
172
108
 
173
109
  def persist_new_file_data(self, file_data: FileData) -> str:
174
110
  record_hash = self.get_hash(extras=[file_data.identifier])
@@ -7,7 +7,6 @@ from unstructured import __name__ as integration_name
7
7
  from unstructured.__version__ import __version__ as integration_version
8
8
 
9
9
  from unstructured_ingest.enhanced_dataclass import enhanced_field
10
- from unstructured_ingest.error import DestinationConnectionError
11
10
  from unstructured_ingest.utils.data_prep import batch_generator
12
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
12
  from unstructured_ingest.v2.interfaces import (
@@ -95,13 +94,6 @@ class AstraUploader(Uploader):
95
94
  upload_config: AstraUploaderConfig
96
95
  connector_type: str = CONNECTOR_TYPE
97
96
 
98
- def precheck(self) -> None:
99
- try:
100
- self.get_collection()
101
- except Exception as e:
102
- logger.error(f"Failed to validate connection {e}", exc_info=True)
103
- raise DestinationConnectionError(f"failed to validate connection: {e}")
104
-
105
97
  @requires_dependencies(["astrapy"], extras="astra")
106
98
  def get_collection(self) -> "AstraDBCollection":
107
99
  from astrapy.db import AstraDB
@@ -175,14 +175,6 @@ class AzureCognitiveSearchUploader(Uploader):
175
175
  ),
176
176
  )
177
177
 
178
- def precheck(self) -> None:
179
- try:
180
- client = self.connection_config.generate_client()
181
- client.get_document_count()
182
- except Exception as e:
183
- logger.error(f"failed to validate connection: {e}", exc_info=True)
184
- raise DestinationConnectionError(f"failed to validate connection: {e}")
185
-
186
178
  def write_dict_wrapper(self, elements_dict):
187
179
  return self.write_dict(elements_dict=elements_dict)
188
180
 
@@ -111,13 +111,10 @@ class ChromaUploader(Uploader):
111
111
  connector_type: str = CONNECTOR_TYPE
112
112
  upload_config: ChromaUploaderConfig
113
113
  connection_config: ChromaConnectionConfig
114
+ client: Optional["Client"] = field(init=False)
114
115
 
115
- def precheck(self) -> None:
116
- try:
117
- self.create_client()
118
- except Exception as e:
119
- logger.error(f"failed to validate connection: {e}", exc_info=True)
120
- raise DestinationConnectionError(f"failed to validate connection: {e}")
116
+ def __post_init__(self):
117
+ self.client = self.create_client()
121
118
 
122
119
  @requires_dependencies(["chromadb"], extras="chroma")
123
120
  def create_client(self) -> "Client":
@@ -190,9 +187,10 @@ class ChromaUploader(Uploader):
190
187
  f"collection {self.connection_config.collection_name} "
191
188
  f"at {self.connection_config.host}",
192
189
  )
193
- client = self.create_client()
194
190
 
195
- collection = client.get_or_create_collection(name=self.connection_config.collection_name)
191
+ collection = self.client.get_or_create_collection(
192
+ name=self.connection_config.collection_name
193
+ )
196
194
  for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
197
195
  self.upsert_batch(collection, self.prepare_chroma_list(chunk))
198
196
 
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
3
3
  from typing import TYPE_CHECKING, Any, Optional
4
4
 
5
5
  from unstructured_ingest.enhanced_dataclass import enhanced_field
6
- from unstructured_ingest.error import DestinationConnectionError
7
6
  from unstructured_ingest.utils.dep_check import requires_dependencies
8
7
  from unstructured_ingest.v2.interfaces import (
9
8
  AccessConfig,
@@ -12,7 +11,6 @@ from unstructured_ingest.v2.interfaces import (
12
11
  Uploader,
13
12
  UploaderConfig,
14
13
  )
15
- from unstructured_ingest.v2.logger import logger
16
14
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
17
15
 
18
16
  if TYPE_CHECKING:
@@ -80,13 +78,6 @@ class DatabricksVolumesUploader(Uploader):
80
78
  host=self.connection_config.host, **self.connection_config.access_config.to_dict()
81
79
  )
82
80
 
83
- def precheck(self) -> None:
84
- try:
85
- assert self.client.current_user.me().active
86
- except Exception as e:
87
- logger.error(f"failed to validate connection: {e}", exc_info=True)
88
- raise DestinationConnectionError(f"failed to validate connection: {e}")
89
-
90
81
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
91
82
  for content in contents:
92
83
  with open(content.path, "rb") as elements_file: