unstructured-ingest 0.0.0__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (373)
  1. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/PKG-INFO +2 -1
  2. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/setup.py +1 -0
  3. unstructured-ingest-0.0.2/unstructured_ingest/__version__.py +1 -0
  4. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/helpers.py +1 -1
  5. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/logger.py +2 -2
  6. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/base/cmd.py +10 -0
  7. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/base/src.py +2 -0
  8. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
  9. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  10. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/local.py +0 -8
  11. unstructured-ingest-0.0.2/unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
  12. unstructured-ingest-0.0.2/unstructured_ingest/v2/cli/configs/__init__.py +13 -0
  13. unstructured-ingest-0.0.2/unstructured_ingest/v2/cli/configs/filter.py +28 -0
  14. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/__init__.py +2 -1
  15. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/downloader.py +9 -3
  16. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/file_data.py +6 -1
  17. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/process.py +3 -0
  18. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/logger.py +1 -1
  19. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/interfaces.py +3 -1
  20. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  21. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  22. unstructured-ingest-0.0.2/unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  23. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
  24. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  25. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  26. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  27. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  28. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  29. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
  30. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
  31. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  32. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/local.py +15 -15
  33. unstructured-ingest-0.0.2/unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
  34. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  35. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  36. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
  37. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  38. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  39. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  40. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  41. unstructured-ingest-0.0.2/unstructured_ingest/v2/processes/filter.py +54 -0
  42. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/PKG-INFO +2 -1
  43. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/SOURCES.txt +5 -0
  44. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/requires.txt +17 -14
  45. unstructured-ingest-0.0.0/unstructured_ingest/__version__.py +0 -1
  46. unstructured-ingest-0.0.0/unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  47. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/README.md +0 -0
  48. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/pyproject.toml +0 -0
  49. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/setup.cfg +0 -0
  50. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/test/test_error.py +0 -0
  51. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/test/test_interfaces.py +0 -0
  52. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/test/test_logger.py +0 -0
  53. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/test/test_utils.py +0 -0
  54. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/__init__.py +0 -0
  55. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/__init__.py +0 -0
  56. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/base/__init__.py +0 -0
  57. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/base/cmd.py +0 -0
  58. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/base/dest.py +0 -0
  59. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/base/src.py +0 -0
  60. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cli.py +0 -0
  61. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmd_factory.py +0 -0
  62. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/__init__.py +0 -0
  63. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/airtable.py +0 -0
  64. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/astra.py +0 -0
  65. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/azure_cognitive_search.py +0 -0
  66. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/biomed.py +0 -0
  67. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/chroma.py +0 -0
  68. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/clarifai.py +0 -0
  69. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/confluence.py +0 -0
  70. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/databricks_volumes.py +0 -0
  71. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/delta_table.py +0 -0
  72. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/discord.py +0 -0
  73. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/elasticsearch.py +0 -0
  74. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  75. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/azure.py +0 -0
  76. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/box.py +0 -0
  77. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -0
  78. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -0
  79. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -0
  80. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/s3.py +0 -0
  81. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -0
  82. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/github.py +0 -0
  83. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/gitlab.py +0 -0
  84. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/google_drive.py +0 -0
  85. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/hubspot.py +0 -0
  86. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/jira.py +0 -0
  87. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/kafka.py +0 -0
  88. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/local.py +0 -0
  89. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/mongodb.py +0 -0
  90. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/notion.py +0 -0
  91. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/onedrive.py +0 -0
  92. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/opensearch.py +0 -0
  93. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/outlook.py +0 -0
  94. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/pinecone.py +0 -0
  95. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/qdrant.py +0 -0
  96. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/reddit.py +0 -0
  97. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/salesforce.py +0 -0
  98. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/sharepoint.py +0 -0
  99. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/slack.py +0 -0
  100. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/sql.py +0 -0
  101. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/vectara.py +0 -0
  102. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/weaviate.py +0 -0
  103. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/cmds/wikipedia.py +0 -0
  104. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/common.py +0 -0
  105. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/interfaces.py +0 -0
  106. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/cli/utils.py +0 -0
  107. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/__init__.py +0 -0
  108. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/airtable.py +0 -0
  109. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/astra.py +0 -0
  110. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/azure_cognitive_search.py +0 -0
  111. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/biomed.py +0 -0
  112. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/chroma.py +0 -0
  113. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/clarifai.py +0 -0
  114. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/confluence.py +0 -0
  115. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/databricks_volumes.py +0 -0
  116. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/delta_table.py +0 -0
  117. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/discord.py +0 -0
  118. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/elasticsearch.py +0 -0
  119. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/__init__.py +0 -0
  120. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/azure.py +0 -0
  121. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/box.py +0 -0
  122. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/dropbox.py +0 -0
  123. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/fsspec.py +0 -0
  124. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/gcs.py +0 -0
  125. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/s3.py +0 -0
  126. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/fsspec/sftp.py +0 -0
  127. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/git.py +0 -0
  128. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/github.py +0 -0
  129. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/gitlab.py +0 -0
  130. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/google_drive.py +0 -0
  131. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/hubspot.py +0 -0
  132. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/jira.py +0 -0
  133. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/kafka.py +0 -0
  134. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/local.py +0 -0
  135. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/mongodb.py +0 -0
  136. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/__init__.py +0 -0
  137. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/client.py +0 -0
  138. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/connector.py +0 -0
  139. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/interfaces.py +0 -0
  140. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/__init__.py +0 -0
  141. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/block.py +0 -0
  142. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/__init__.py +0 -0
  143. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -0
  144. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +0 -0
  145. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +0 -0
  146. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/callout.py +0 -0
  147. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -0
  148. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/child_page.py +0 -0
  149. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/code.py +0 -0
  150. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -0
  151. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/divider.py +0 -0
  152. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/embed.py +0 -0
  153. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/equation.py +0 -0
  154. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/file.py +0 -0
  155. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/heading.py +0 -0
  156. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/image.py +0 -0
  157. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -0
  158. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/link_to_page.py +0 -0
  159. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -0
  160. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/paragraph.py +0 -0
  161. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/pdf.py +0 -0
  162. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/quote.py +0 -0
  163. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -0
  164. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/table.py +0 -0
  165. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -0
  166. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/template.py +0 -0
  167. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/todo.py +0 -0
  168. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/toggle.py +0 -0
  169. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -0
  170. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/blocks/video.py +0 -0
  171. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database.py +0 -0
  172. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -0
  173. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -0
  174. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/created_by.py +0 -0
  175. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/created_time.py +0 -0
  176. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/date.py +0 -0
  177. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/email.py +0 -0
  178. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/files.py +0 -0
  179. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -0
  180. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +0 -0
  181. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -0
  182. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -0
  183. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/number.py +0 -0
  184. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/people.py +0 -0
  185. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -0
  186. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -0
  187. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/rich_text.py +0 -0
  188. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/rollup.py +0 -0
  189. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/select.py +0 -0
  190. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/status.py +0 -0
  191. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/title.py +0 -0
  192. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -0
  193. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/url.py +0 -0
  194. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/database_properties/verification.py +0 -0
  195. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/date.py +0 -0
  196. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/file.py +0 -0
  197. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/page.py +0 -0
  198. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/parent.py +0 -0
  199. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/rich_text.py +0 -0
  200. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/notion/types/user.py +0 -0
  201. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/onedrive.py +0 -0
  202. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/opensearch.py +0 -0
  203. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/outlook.py +0 -0
  204. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/pinecone.py +0 -0
  205. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/qdrant.py +0 -0
  206. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/reddit.py +0 -0
  207. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/registry.py +0 -0
  208. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/salesforce.py +0 -0
  209. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/sharepoint.py +0 -0
  210. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/slack.py +0 -0
  211. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/sql.py +0 -0
  212. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/vectara.py +0 -0
  213. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/weaviate.py +0 -0
  214. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/connector/wikipedia.py +0 -0
  215. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/enhanced_dataclass/__init__.py +0 -0
  216. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/enhanced_dataclass/core.py +0 -0
  217. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -0
  218. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -0
  219. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/error.py +0 -0
  220. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/evaluate.py +0 -0
  221. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/ingest_backoff/__init__.py +0 -0
  222. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/ingest_backoff/_common.py +0 -0
  223. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/ingest_backoff/_wrapper.py +0 -0
  224. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/interfaces.py +0 -0
  225. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/main.py +0 -0
  226. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/__init__.py +0 -0
  227. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/copy.py +0 -0
  228. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/doc_factory.py +0 -0
  229. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/interfaces.py +0 -0
  230. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/partition.py +0 -0
  231. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/permissions.py +0 -0
  232. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/pipeline.py +0 -0
  233. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  234. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/reformat/chunking.py +0 -0
  235. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/reformat/embedding.py +0 -0
  236. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/source.py +0 -0
  237. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/utils.py +0 -0
  238. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/pipeline/write.py +0 -0
  239. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/processor.py +0 -0
  240. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/__init__.py +0 -0
  241. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/airtable.py +0 -0
  242. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/astra.py +0 -0
  243. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/base_runner.py +0 -0
  244. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/biomed.py +0 -0
  245. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/confluence.py +0 -0
  246. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/delta_table.py +0 -0
  247. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/discord.py +0 -0
  248. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/elasticsearch.py +0 -0
  249. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/__init__.py +0 -0
  250. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/azure.py +0 -0
  251. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/box.py +0 -0
  252. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/dropbox.py +0 -0
  253. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/fsspec.py +0 -0
  254. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/gcs.py +0 -0
  255. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/s3.py +0 -0
  256. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/fsspec/sftp.py +0 -0
  257. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/github.py +0 -0
  258. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/gitlab.py +0 -0
  259. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/google_drive.py +0 -0
  260. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/hubspot.py +0 -0
  261. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/jira.py +0 -0
  262. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/kafka.py +0 -0
  263. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/local.py +0 -0
  264. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/mongodb.py +0 -0
  265. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/notion.py +0 -0
  266. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/onedrive.py +0 -0
  267. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/opensearch.py +0 -0
  268. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/outlook.py +0 -0
  269. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/reddit.py +0 -0
  270. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/salesforce.py +0 -0
  271. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/sharepoint.py +0 -0
  272. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/slack.py +0 -0
  273. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/utils.py +0 -0
  274. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/wikipedia.py +0 -0
  275. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/__init__.py +0 -0
  276. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/astra.py +0 -0
  277. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -0
  278. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/base_writer.py +0 -0
  279. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/chroma.py +0 -0
  280. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/clarifai.py +0 -0
  281. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/databricks_volumes.py +0 -0
  282. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/delta_table.py +0 -0
  283. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/elasticsearch.py +0 -0
  284. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  285. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/azure.py +0 -0
  286. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/box.py +0 -0
  287. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -0
  288. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/gcs.py +0 -0
  289. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/fsspec/s3.py +0 -0
  290. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/kafka.py +0 -0
  291. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/mongodb.py +0 -0
  292. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/opensearch.py +0 -0
  293. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/pinecone.py +0 -0
  294. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/qdrant.py +0 -0
  295. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/sql.py +0 -0
  296. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/vectara.py +0 -0
  297. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/runner/writers/weaviate.py +0 -0
  298. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/__init__.py +0 -0
  299. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/compression.py +0 -0
  300. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/data_prep.py +0 -0
  301. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/dep_check.py +0 -0
  302. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  303. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/utils/table.py +0 -0
  304. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/__init__.py +0 -0
  305. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/__init__.py +0 -0
  306. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/base/__init__.py +0 -0
  307. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/base/dest.py +0 -0
  308. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/base/importer.py +0 -0
  309. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cli.py +0 -0
  310. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/astra.py +0 -0
  311. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -0
  312. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/chroma.py +0 -0
  313. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -0
  314. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -0
  315. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  316. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -0
  317. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -0
  318. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -0
  319. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -0
  320. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -0
  321. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -0
  322. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/google_drive.py +0 -0
  323. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/mongodb.py +0 -0
  324. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/onedrive.py +0 -0
  325. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/opensearch.py +0 -0
  326. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/pinecone.py +0 -0
  327. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/salesforce.py +0 -0
  328. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -0
  329. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/singlestore.py +0 -0
  330. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/sql.py +0 -0
  331. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/cmds/weaviate.py +0 -0
  332. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/configs/chunk.py +0 -0
  333. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/configs/embed.py +0 -0
  334. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/configs/partition.py +0 -0
  335. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/configs/processor.py +0 -0
  336. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/interfaces.py +0 -0
  337. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/cli/utils.py +0 -0
  338. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/example.py +0 -0
  339. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/connector.py +0 -0
  340. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/indexer.py +0 -0
  341. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/processor.py +0 -0
  342. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/upload_stager.py +0 -0
  343. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/interfaces/uploader.py +0 -0
  344. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/main.py +0 -0
  345. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/__init__.py +0 -0
  346. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  347. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/chunk.py +0 -0
  348. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/embed.py +0 -0
  349. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/index.py +0 -0
  350. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/partition.py +0 -0
  351. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/stage.py +0 -0
  352. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/uncompress.py +0 -0
  353. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/steps/upload.py +0 -0
  354. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/pipeline/utils.py +0 -0
  355. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/__init__.py +0 -0
  356. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/chunker.py +0 -0
  357. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connector_registry.py +0 -0
  358. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +0 -0
  359. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/azure.py +0 -0
  360. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/box.py +0 -0
  361. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +0 -0
  362. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +0 -0
  363. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +0 -0
  364. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/fsspec/utils.py +0 -0
  365. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/opensearch.py +0 -0
  366. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/singlestore.py +0 -0
  367. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/connectors/utils.py +0 -0
  368. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/embedder.py +0 -0
  369. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/partitioner.py +0 -0
  370. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest/v2/processes/uncompress.py +0 -0
  371. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/dependency_links.txt +0 -0
  372. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/entry_points.txt +0 -0
  373. {unstructured-ingest-0.0.0 → unstructured-ingest-0.0.2}/unstructured_ingest.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.0
3
+ Version: 0.0.2
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -56,6 +56,7 @@ Provides-Extra: google-drive
56
56
  Provides-Extra: hubspot
57
57
  Provides-Extra: jira
58
58
  Provides-Extra: kafka
59
+ Provides-Extra: milvus
59
60
  Provides-Extra: mongodb
60
61
  Provides-Extra: notion
61
62
  Provides-Extra: onedrive
@@ -102,6 +102,7 @@ connectors_reqs = {
102
102
  "hubspot": load_requirements("requirements/connectors/hubspot.in"),
103
103
  "jira": load_requirements("requirements/connectors/jira.in"),
104
104
  "kafka": load_requirements("requirements/connectors/kafka.in"),
105
+ "milvus": load_requirements("requirements/connectors/milvus.in"),
105
106
  "mongodb": load_requirements("requirements/connectors/mongodb.in"),
106
107
  "notion": load_requirements("requirements/connectors/notion.in"),
107
108
  "onedrive": load_requirements("requirements/connectors/onedrive.in"),
@@ -0,0 +1 @@
1
+ __version__ = "0.0.2" # pragma: no cover
@@ -5,7 +5,6 @@ from typing import List, Optional, Tuple
5
5
  from urllib.parse import urlparse
6
6
  from uuid import UUID
7
7
 
8
- import unstructured.ingest.connector.notion.types.blocks as notion_blocks
9
8
  from htmlBuilder.attributes import Style, Type
10
9
  from htmlBuilder.tags import (
11
10
  Body,
@@ -23,6 +22,7 @@ from htmlBuilder.tags import (
23
22
  )
24
23
  from notion_client.errors import APIResponseError
25
24
 
25
+ import unstructured_ingest.connector.notion.types.blocks as notion_blocks
26
26
  from unstructured_ingest.connector.notion.client import Client
27
27
  from unstructured_ingest.connector.notion.interfaces import BlockBase
28
28
  from unstructured_ingest.connector.notion.types.block import Block
@@ -3,7 +3,7 @@ import json
3
3
  import logging
4
4
  import typing as t
5
5
 
6
- logger = logging.getLogger("unstructured.ingest")
6
+ logger = logging.getLogger("unstructured_ingest")
7
7
 
8
8
 
9
9
  def default_is_data_sensitive(k: str, v: t.Any) -> bool:
@@ -119,7 +119,7 @@ def ingest_log_streaming_init(level: int) -> None:
119
119
 
120
120
  def make_default_logger(level: int) -> logging.Logger:
121
121
  """Return a custom logger."""
122
- logger = logging.getLogger("unstructured.ingest")
122
+ logger = logging.getLogger("unstructured_ingest")
123
123
  handler = logging.StreamHandler()
124
124
  handler.name = "ingest_log_handler"
125
125
  formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
24
24
  )
25
25
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
26
26
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
27
+ from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
27
28
  from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
28
29
 
29
30
  CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
75
76
  }
76
77
  if chunker := self.get_chunker(options=source_options):
77
78
  pipeline_kwargs["chunker"] = chunker
79
+ if filterer := self.get_filterer(options=source_options):
80
+ pipeline_kwargs["filterer"] = filterer
78
81
  if embedder := self.get_embeder(options=source_options):
79
82
  pipeline_kwargs["embedder"] = embedder
80
83
  if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
105
108
  return None
106
109
  return Chunker(config=chunker_config)
107
110
 
111
+ @staticmethod
112
+ def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
113
+ filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
114
+ if not filterer_configs.to_dict():
115
+ return None
116
+ return Filterer(config=filterer_configs)
117
+
108
118
  @staticmethod
109
119
  def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
110
120
  embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
8
  from unstructured_ingest.v2.cli.configs import (
9
9
  ChunkerCliConfig,
10
10
  EmbedderCliConfig,
11
+ FilterCliConfig,
11
12
  PartitionerCliConfig,
12
13
  ProcessorCliConfig,
13
14
  )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
26
27
  ProcessorCliConfig,
27
28
  PartitionerCliConfig,
28
29
  EmbedderCliConfig,
30
+ FilterCliConfig,
29
31
  ChunkerCliConfig,
30
32
  ]
31
33
  )
@@ -15,6 +15,7 @@ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
15
15
  from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
16
16
  from .google_drive import google_drive_src_cmd
17
17
  from .local import local_dest_cmd, local_src_cmd
18
+ from .milvus import milvus_dest_cmd
18
19
  from .mongodb import mongodb_dest_cmd
19
20
  from .onedrive import onedrive_drive_src_cmd
20
21
  from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
@@ -60,6 +61,7 @@ dest_cmds = [
60
61
  elasticsearch_dest_cmd,
61
62
  gcs_dest_cmd,
62
63
  local_dest_cmd,
64
+ milvus_dest_cmd,
63
65
  opensearch_dest_cmd,
64
66
  pinecone_dest_cmd,
65
67
  s3_dest_cmd,
@@ -3,7 +3,6 @@ from dataclasses import dataclass
3
3
  import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.interfaces import CliConfig
6
- from unstructured_ingest.v2.cli.utils import DelimitedString
7
6
 
8
7
 
9
8
  @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
14
13
  click.Option(
15
14
  ["--download-dir"],
16
15
  help="Where files are downloaded to, defaults to a location at"
17
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
16
+ "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
18
17
  ),
19
18
  ]
20
19
 
@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
65
64
  help="Recursively download files in their respective folders "
66
65
  "otherwise stop at the files in provided folder level.",
67
66
  ),
68
- click.Option(
69
- ["--file-glob"],
70
- default=None,
71
- type=DelimitedString(),
72
- help="A comma-separated list of file globs to limit which types of "
73
- "local files are accepted, e.g. '*.html,*.txt'",
74
- ),
75
67
  ]
76
68
  )
77
69
  return options
@@ -4,7 +4,6 @@ import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
6
  from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
7
  from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
9
8
 
10
9
 
@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
19
18
  type=click.Path(file_okay=True, dir_okay=True, exists=True),
20
19
  help="Path to the location in the local file system that will be processed.",
21
20
  ),
22
- click.Option(
23
- ["--file-glob"],
24
- default=None,
25
- type=DelimitedString(),
26
- help="A comma-separated list of file globs to limit which types of "
27
- "local files are accepted, e.g. '*.html,*.txt'",
28
- ),
29
21
  click.Option(
30
22
  ["--recursive"],
31
23
  is_flag=True,
@@ -0,0 +1,72 @@
1
+ from dataclasses import dataclass
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.v2.cli.base import DestCmd
6
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
7
+ from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
8
+
9
+
10
+ @dataclass
11
+ class MilvusCliConnectionConfig(CliConfig):
12
+ @staticmethod
13
+ def get_cli_options() -> list[click.Option]:
14
+ options = [
15
+ click.Option(
16
+ ["--uri"],
17
+ required=False,
18
+ type=str,
19
+ default=None,
20
+ help="Milvus uri, eg 'http://localhost:19530",
21
+ ),
22
+ click.Option(
23
+ ["--user"],
24
+ required=False,
25
+ type=str,
26
+ default=None,
27
+ help="Milvus user",
28
+ ),
29
+ click.Option(
30
+ ["--password"],
31
+ required=False,
32
+ type=str,
33
+ default=None,
34
+ help="Milvus password",
35
+ ),
36
+ click.Option(
37
+ ["--db-name"],
38
+ required=False,
39
+ type=str,
40
+ default=None,
41
+ help="Milvus database name",
42
+ ),
43
+ ]
44
+ return options
45
+
46
+
47
+ @dataclass
48
+ class MilvusCliUploaderConfig(CliConfig):
49
+ @staticmethod
50
+ def get_cli_options() -> list[click.Option]:
51
+ options = [
52
+ click.Option(
53
+ ["--collection-name"],
54
+ required=True,
55
+ type=str,
56
+ help="Milvus collections to write to",
57
+ ),
58
+ click.Option(
59
+ ["--num-of-processes"],
60
+ type=click.IntRange(min=1),
61
+ default=4,
62
+ help="number of processes to use when writing to support parallel writes",
63
+ ),
64
+ ]
65
+ return options
66
+
67
+
68
+ milvus_dest_cmd = DestCmd(
69
+ cmd_name=CONNECTOR_TYPE,
70
+ connection_config=MilvusCliConnectionConfig,
71
+ uploader_config=MilvusCliUploaderConfig,
72
+ )
@@ -0,0 +1,13 @@
1
+ from .chunk import ChunkerCliConfig
2
+ from .embed import EmbedderCliConfig
3
+ from .filter import FilterCliConfig
4
+ from .partition import PartitionerCliConfig
5
+ from .processor import ProcessorCliConfig
6
+
7
+ __all__ = [
8
+ "ChunkerCliConfig",
9
+ "ProcessorCliConfig",
10
+ "PartitionerCliConfig",
11
+ "EmbedderCliConfig",
12
+ "FilterCliConfig",
13
+ ]
@@ -0,0 +1,28 @@
1
+ from dataclasses import dataclass
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
6
+ from unstructured_ingest.v2.cli.utils import DelimitedString
7
+
8
+
9
+ @dataclass
10
+ class FilterCliConfig(CliConfig):
11
+ @staticmethod
12
+ def get_cli_options() -> list[click.Option]:
13
+ options = [
14
+ click.Option(
15
+ ["--file-glob"],
16
+ default=None,
17
+ type=DelimitedString(),
18
+ help="A comma-separated list of file globs to limit which types of "
19
+ "local files are accepted, e.g. '*.html,*.txt'",
20
+ ),
21
+ click.Option(
22
+ ["--max-file-size"],
23
+ default=None,
24
+ type=click.IntRange(min=1),
25
+ help="Max file size to process in bytes",
26
+ ),
27
+ ]
28
+ return options
@@ -1,6 +1,6 @@
1
1
  from .connector import AccessConfig, BaseConnector, ConnectionConfig
2
2
  from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
3
- from .file_data import FileData, SourceIdentifiers
3
+ from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
4
4
  from .indexer import Indexer, IndexerConfig
5
5
  from .process import BaseProcess
6
6
  from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
26
26
  "AccessConfig",
27
27
  "ConnectionConfig",
28
28
  "BaseConnector",
29
+ "FileDataSourceMetadata",
29
30
  ]
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
30
30
  connector_type: str
31
31
  download_config: DownloaderConfigT
32
32
 
33
+ def get_download_path(self, file_data: FileData) -> Optional[Path]:
34
+ if not file_data.source_identifiers:
35
+ return None
36
+ rel_path = file_data.source_identifiers.relative_path
37
+ if not rel_path:
38
+ return None
39
+ rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
40
+ return self.download_dir / Path(rel_path)
41
+
33
42
  @staticmethod
34
43
  def is_float(value: str):
35
44
  try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
68
77
  def is_async(self) -> bool:
69
78
  return True
70
79
 
71
- def get_download_path(self, file_data: FileData) -> Optional[Path]:
72
- return None
73
-
74
80
  @abstractmethod
75
81
  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
76
82
  pass
@@ -22,13 +22,18 @@ class SourceIdentifiers:
22
22
  return self.rel_path or self.fullpath
23
23
 
24
24
 
25
+ @dataclass
26
+ class FileDataSourceMetadata(DataSourceMetadata):
27
+ filesize_bytes: Optional[int] = None
28
+
29
+
25
30
  @dataclass
26
31
  class FileData(DataClassJsonMixin):
27
32
  identifier: str
28
33
  connector_type: str
29
34
  source_identifiers: Optional[SourceIdentifiers] = None
30
35
  doc_type: Literal["file", "batch"] = field(default="file")
31
- metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
36
+ metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
32
37
  additional_metadata: dict[str, Any] = field(default_factory=dict)
33
38
  reprocess: bool = False
34
39
 
@@ -8,6 +8,9 @@ class BaseProcess(ABC):
8
8
  def is_async(self) -> bool:
9
9
  return False
10
10
 
11
+ def precheck(self) -> None:
12
+ pass
13
+
11
14
  @abstractmethod
12
15
  def run(self, **kwargs: Any) -> Any:
13
16
  pass
@@ -5,7 +5,7 @@ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
5
5
  from typing import Any, Callable
6
6
 
7
7
  log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
8
- LOGGER_NAME = "unstructured.ingest.v2"
8
+ LOGGER_NAME = "unstructured_ingest.v2"
9
9
 
10
10
 
11
11
  def default_is_data_sensitive(k: str, v: Any) -> bool:
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
92
92
 
93
93
  if iterable:
94
94
  if len(iterable) == 1:
95
- return [self.process_serially(iterable)]
95
+ return self.process_serially(iterable)
96
96
  if self.context.num_processes == 1:
97
97
  return self.process_serially(iterable)
98
98
  with mp.Pool(
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
126
126
  logger.info(
127
127
  f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
128
128
  )
129
+ else:
130
+ logger.info(f"Calling {self.__class__.__name__} with no inputs")
129
131
  if self.context.async_supported and self.process.is_async():
130
132
  return self.process_async(iterable=iterable)
131
133
  if self.context.mp_supported:
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
9
9
  from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
10
10
  from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
11
11
  from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
12
+ from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
12
13
  from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
13
14
  from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
14
15
  from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
27
28
  )
28
29
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader
29
30
  from unstructured_ingest.v2.processes.embedder import EmbedderConfig
31
+ from unstructured_ingest.v2.processes.filter import FiltererConfig
30
32
  from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
31
33
 
32
34
 
@@ -37,22 +39,33 @@ class PipelineError(Exception):
37
39
  @dataclass
38
40
  class Pipeline:
39
41
  context: ProcessorConfig
42
+
40
43
  indexer: InitVar[IndexerT]
41
44
  indexer_step: IndexStep = field(init=False)
45
+
42
46
  downloader: InitVar[DownloaderT]
43
47
  downloader_step: DownloadStep = field(init=False)
48
+
44
49
  partitioner: InitVar[Partitioner]
45
50
  partitioner_step: PartitionStep = field(init=False)
51
+
46
52
  chunker: InitVar[Optional[Chunker]] = None
47
53
  chunker_step: ChunkStep = field(init=False, default=None)
54
+
48
55
  embedder: InitVar[Optional[Embedder]] = None
49
56
  embedder_step: EmbedStep = field(init=False, default=None)
57
+
50
58
  stager: InitVar[Optional[UploadStager]] = None
51
59
  stager_step: UploadStageStep = field(init=False, default=None)
60
+
52
61
  uploader: InitVar[Uploader] = field(default=LocalUploader())
53
62
  uploader_step: UploadStep = field(init=False, default=None)
63
+
54
64
  uncompress_step: UncompressStep = field(init=False, default=None)
55
65
 
66
+ filterer: InitVar[Optional[Filterer]] = None
67
+ filter_step: FilterStep = field(init=False, default=None)
68
+
56
69
  def __post_init__(
57
70
  self,
58
71
  indexer: IndexerT,
@@ -62,10 +75,12 @@ class Pipeline:
62
75
  embedder: Embedder = None,
63
76
  stager: UploadStager = None,
64
77
  uploader: Uploader = None,
78
+ filterer: Filterer = None,
65
79
  ):
66
80
  make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
67
81
  self.indexer_step = IndexStep(process=indexer, context=self.context)
68
82
  self.downloader_step = DownloadStep(process=downloader, context=self.context)
83
+ self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
69
84
  self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
70
85
  self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
71
86
 
@@ -109,6 +124,7 @@ class Pipeline:
109
124
  def run(self):
110
125
  try:
111
126
  start_time = time()
127
+ self._run_prechecks()
112
128
  self._run()
113
129
  logger.info(f"Finished ingest process in {time() - start_time}s")
114
130
  finally:
@@ -130,6 +146,37 @@ class Pipeline:
130
146
  final = [f for f in flat if f]
131
147
  return final or None
132
148
 
149
+ def _run_prechecks(self):
150
+ steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
151
+ if self.chunker_step:
152
+ steps.append(self.chunker_step)
153
+ if self.embedder_step:
154
+ steps.append(self.embedder_step)
155
+ if self.uncompress_step:
156
+ steps.append(self.uncompress_step)
157
+ if self.stager_step:
158
+ steps.append(self.stager_step)
159
+ failures = {}
160
+ for step in steps:
161
+ try:
162
+ step.process.precheck()
163
+ except Exception as e:
164
+ failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
165
+ if failures:
166
+ for k, v in failures.items():
167
+ logger.error(f"Step precheck failure: {k}: {v}")
168
+ raise PipelineError("Precheck failed")
169
+
170
+ def apply_filter(self, records: list[dict]) -> list[dict]:
171
+ if not self.filter_step:
172
+ return records
173
+ data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
174
+ filtered_data = self.filter_step(data_to_filter)
175
+ filtered_data = [f for f in filtered_data if f is not None]
176
+ filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
177
+ filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
178
+ return filtered_records
179
+
133
180
  def _run(self):
134
181
  logger.info(
135
182
  f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@ class Pipeline:
147
194
  if not indices_inputs:
148
195
  return
149
196
 
197
+ # Initial filtering on indexed content
198
+ indices_inputs = self.apply_filter(records=indices_inputs)
199
+ if not indices_inputs:
200
+ return
201
+
150
202
  # Download associated content to local file system
151
203
  downloaded_data = self.downloader_step(indices_inputs)
152
204
  downloaded_data = self.clean_results(results=downloaded_data)
153
205
  if not downloaded_data:
154
206
  return
155
207
 
208
+ # Post download filtering
209
+ downloaded_data = self.apply_filter(records=downloaded_data)
210
+ if not downloaded_data:
211
+ return
212
+
156
213
  # Run uncompress if available
157
214
  if self.uncompress_step:
158
215
  downloaded_data = self.uncompress_step(downloaded_data)
159
216
  # Flatten list of lists
160
217
  downloaded_data = self.clean_results(results=downloaded_data)
161
218
 
219
+ # Post uncompress filtering
220
+ downloaded_data = self.apply_filter(records=downloaded_data)
221
+ if not downloaded_data:
222
+ return
223
+
162
224
  if not downloaded_data:
163
225
  return
164
226
 
@@ -179,9 +241,14 @@ class Pipeline:
179
241
  self.uploader_step(iterable=elements)
180
242
 
181
243
  def __str__(self):
182
- s = [str(self.indexer_step), str(self.downloader_step)]
244
+ s = [str(self.indexer_step)]
245
+ if filter_step := self.filter_step:
246
+ s.append(str(filter_step))
247
+ s.append(str(self.downloader_step))
248
+ if filter_step := self.filter_step:
249
+ s.append(str(filter_step))
183
250
  if uncompress_step := self.uncompress_step:
184
- s.append(str(uncompress_step))
251
+ s.extend([str(uncompress_step), str(filter_step)])
185
252
  s.append(str(self.partitioner_step))
186
253
  if chunker_step := self.chunker_step:
187
254
  s.append(str(chunker_step))
@@ -200,6 +267,7 @@ class Pipeline:
200
267
  downloader_config: DownloaderConfigT,
201
268
  source_connection_config: ConnectionConfig,
202
269
  partitioner_config: PartitionerConfig,
270
+ filterer_config: FiltererConfig = None,
203
271
  chunker_config: Optional[ChunkerConfig] = None,
204
272
  embedder_config: Optional[EmbedderConfig] = None,
205
273
  destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@ class Pipeline:
235
303
  ),
236
304
  "partitioner": Partitioner(config=partitioner_config),
237
305
  }
306
+ if filterer_config:
307
+ pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
238
308
  if chunker_config:
239
309
  pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
240
310
  if embedder_config: