unstructured-ingest 0.0.2.dev0__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (376)
  1. unstructured-ingest-0.0.3/PKG-INFO +87 -0
  2. unstructured-ingest-0.0.3/README.md +3 -0
  3. unstructured-ingest-0.0.3/unstructured_ingest/__version__.py +1 -0
  4. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/base/cmd.py +10 -0
  5. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/base/src.py +2 -0
  6. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  7. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/local.py +0 -8
  8. unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/__init__.py +13 -0
  9. unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/filter.py +28 -0
  10. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/__init__.py +2 -1
  11. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/downloader.py +9 -3
  12. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/file_data.py +6 -1
  13. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/process.py +3 -4
  14. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  15. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  16. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  17. unstructured-ingest-0.0.3/unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  18. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  19. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  20. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  21. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  22. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  23. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -0
  24. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/box.py +8 -0
  25. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +8 -0
  26. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +30 -28
  27. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +8 -0
  28. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/s3.py +21 -5
  29. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +8 -0
  30. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  31. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/local.py +15 -15
  32. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  33. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  34. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/opensearch.py +33 -5
  35. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
  36. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  37. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  38. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  39. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  40. unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/filter.py +54 -0
  41. unstructured-ingest-0.0.3/unstructured_ingest.egg-info/PKG-INFO +87 -0
  42. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest.egg-info/SOURCES.txt +3 -0
  43. unstructured-ingest-0.0.2.dev0/PKG-INFO +0 -233
  44. unstructured-ingest-0.0.2.dev0/README.md +0 -149
  45. unstructured-ingest-0.0.2.dev0/unstructured_ingest/__version__.py +0 -1
  46. unstructured-ingest-0.0.2.dev0/unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  47. unstructured-ingest-0.0.2.dev0/unstructured_ingest.egg-info/PKG-INFO +0 -233
  48. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/pyproject.toml +0 -0
  49. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/setup.cfg +0 -0
  50. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/setup.py +0 -0
  51. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/test/test_error.py +0 -0
  52. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/test/test_interfaces.py +0 -0
  53. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/test/test_logger.py +0 -0
  54. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/test/test_utils.py +0 -0
  55. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/__init__.py +0 -0
  56. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/__init__.py +0 -0
  57. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/base/__init__.py +0 -0
  58. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/base/cmd.py +0 -0
  59. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/base/dest.py +0 -0
  60. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/base/src.py +0 -0
  61. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cli.py +0 -0
  62. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmd_factory.py +0 -0
  63. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/__init__.py +0 -0
  64. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/airtable.py +0 -0
  65. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/astra.py +0 -0
  66. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/azure_cognitive_search.py +0 -0
  67. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/biomed.py +0 -0
  68. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/chroma.py +0 -0
  69. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/clarifai.py +0 -0
  70. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/confluence.py +0 -0
  71. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/databricks_volumes.py +0 -0
  72. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/delta_table.py +0 -0
  73. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/discord.py +0 -0
  74. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/elasticsearch.py +0 -0
  75. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  76. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/azure.py +0 -0
  77. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/box.py +0 -0
  78. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -0
  79. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -0
  80. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -0
  81. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/s3.py +0 -0
  82. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -0
  83. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/github.py +0 -0
  84. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/gitlab.py +0 -0
  85. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/google_drive.py +0 -0
  86. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/hubspot.py +0 -0
  87. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/jira.py +0 -0
  88. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/kafka.py +0 -0
  89. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/local.py +0 -0
  90. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/mongodb.py +0 -0
  91. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/notion.py +0 -0
  92. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/onedrive.py +0 -0
  93. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/opensearch.py +0 -0
  94. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/outlook.py +0 -0
  95. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/pinecone.py +0 -0
  96. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/qdrant.py +0 -0
  97. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/reddit.py +0 -0
  98. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/salesforce.py +0 -0
  99. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/sharepoint.py +0 -0
  100. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/slack.py +0 -0
  101. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/sql.py +0 -0
  102. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/vectara.py +0 -0
  103. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/weaviate.py +0 -0
  104. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/cmds/wikipedia.py +0 -0
  105. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/common.py +0 -0
  106. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/interfaces.py +0 -0
  107. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/cli/utils.py +0 -0
  108. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/__init__.py +0 -0
  109. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/airtable.py +0 -0
  110. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/astra.py +0 -0
  111. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/azure_cognitive_search.py +0 -0
  112. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/biomed.py +0 -0
  113. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/chroma.py +0 -0
  114. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/clarifai.py +0 -0
  115. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/confluence.py +0 -0
  116. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/databricks_volumes.py +0 -0
  117. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/delta_table.py +0 -0
  118. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/discord.py +0 -0
  119. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/elasticsearch.py +0 -0
  120. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/__init__.py +0 -0
  121. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/azure.py +0 -0
  122. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/box.py +0 -0
  123. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/dropbox.py +0 -0
  124. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/fsspec.py +0 -0
  125. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/gcs.py +0 -0
  126. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/s3.py +0 -0
  127. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/fsspec/sftp.py +0 -0
  128. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/git.py +0 -0
  129. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/github.py +0 -0
  130. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/gitlab.py +0 -0
  131. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/google_drive.py +0 -0
  132. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/hubspot.py +0 -0
  133. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/jira.py +0 -0
  134. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/kafka.py +0 -0
  135. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/local.py +0 -0
  136. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/mongodb.py +0 -0
  137. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/__init__.py +0 -0
  138. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/client.py +0 -0
  139. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/connector.py +0 -0
  140. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/helpers.py +0 -0
  141. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/interfaces.py +0 -0
  142. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/__init__.py +0 -0
  143. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/block.py +0 -0
  144. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/__init__.py +0 -0
  145. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -0
  146. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +0 -0
  147. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +0 -0
  148. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/callout.py +0 -0
  149. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -0
  150. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/child_page.py +0 -0
  151. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/code.py +0 -0
  152. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -0
  153. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/divider.py +0 -0
  154. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/embed.py +0 -0
  155. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/equation.py +0 -0
  156. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/file.py +0 -0
  157. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/heading.py +0 -0
  158. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/image.py +0 -0
  159. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -0
  160. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/link_to_page.py +0 -0
  161. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -0
  162. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/paragraph.py +0 -0
  163. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/pdf.py +0 -0
  164. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/quote.py +0 -0
  165. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -0
  166. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/table.py +0 -0
  167. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -0
  168. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/template.py +0 -0
  169. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/todo.py +0 -0
  170. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/toggle.py +0 -0
  171. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -0
  172. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/blocks/video.py +0 -0
  173. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database.py +0 -0
  174. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -0
  175. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -0
  176. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/created_by.py +0 -0
  177. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/created_time.py +0 -0
  178. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/date.py +0 -0
  179. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/email.py +0 -0
  180. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/files.py +0 -0
  181. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -0
  182. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +0 -0
  183. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -0
  184. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -0
  185. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/number.py +0 -0
  186. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/people.py +0 -0
  187. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -0
  188. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -0
  189. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/rich_text.py +0 -0
  190. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/rollup.py +0 -0
  191. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/select.py +0 -0
  192. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/status.py +0 -0
  193. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/title.py +0 -0
  194. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -0
  195. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/url.py +0 -0
  196. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/database_properties/verification.py +0 -0
  197. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/date.py +0 -0
  198. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/file.py +0 -0
  199. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/page.py +0 -0
  200. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/parent.py +0 -0
  201. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/rich_text.py +0 -0
  202. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/notion/types/user.py +0 -0
  203. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/onedrive.py +0 -0
  204. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/opensearch.py +0 -0
  205. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/outlook.py +0 -0
  206. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/pinecone.py +0 -0
  207. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/qdrant.py +0 -0
  208. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/reddit.py +0 -0
  209. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/registry.py +0 -0
  210. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/salesforce.py +0 -0
  211. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/sharepoint.py +0 -0
  212. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/slack.py +0 -0
  213. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/sql.py +0 -0
  214. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/vectara.py +0 -0
  215. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/weaviate.py +0 -0
  216. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/connector/wikipedia.py +0 -0
  217. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/enhanced_dataclass/__init__.py +0 -0
  218. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/enhanced_dataclass/core.py +0 -0
  219. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -0
  220. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -0
  221. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/error.py +0 -0
  222. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/evaluate.py +0 -0
  223. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/ingest_backoff/__init__.py +0 -0
  224. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/ingest_backoff/_common.py +0 -0
  225. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/ingest_backoff/_wrapper.py +0 -0
  226. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/interfaces.py +0 -0
  227. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/logger.py +0 -0
  228. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/main.py +0 -0
  229. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/__init__.py +0 -0
  230. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/copy.py +0 -0
  231. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/doc_factory.py +0 -0
  232. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/interfaces.py +0 -0
  233. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/partition.py +0 -0
  234. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/permissions.py +0 -0
  235. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/pipeline.py +0 -0
  236. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  237. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/reformat/chunking.py +0 -0
  238. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/reformat/embedding.py +0 -0
  239. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/source.py +0 -0
  240. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/utils.py +0 -0
  241. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/pipeline/write.py +0 -0
  242. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/processor.py +0 -0
  243. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/__init__.py +0 -0
  244. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/airtable.py +0 -0
  245. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/astra.py +0 -0
  246. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/base_runner.py +0 -0
  247. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/biomed.py +0 -0
  248. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/confluence.py +0 -0
  249. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/delta_table.py +0 -0
  250. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/discord.py +0 -0
  251. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/elasticsearch.py +0 -0
  252. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/__init__.py +0 -0
  253. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/azure.py +0 -0
  254. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/box.py +0 -0
  255. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/dropbox.py +0 -0
  256. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/fsspec.py +0 -0
  257. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/gcs.py +0 -0
  258. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/s3.py +0 -0
  259. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/fsspec/sftp.py +0 -0
  260. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/github.py +0 -0
  261. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/gitlab.py +0 -0
  262. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/google_drive.py +0 -0
  263. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/hubspot.py +0 -0
  264. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/jira.py +0 -0
  265. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/kafka.py +0 -0
  266. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/local.py +0 -0
  267. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/mongodb.py +0 -0
  268. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/notion.py +0 -0
  269. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/onedrive.py +0 -0
  270. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/opensearch.py +0 -0
  271. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/outlook.py +0 -0
  272. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/reddit.py +0 -0
  273. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/salesforce.py +0 -0
  274. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/sharepoint.py +0 -0
  275. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/slack.py +0 -0
  276. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/utils.py +0 -0
  277. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/wikipedia.py +0 -0
  278. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/__init__.py +0 -0
  279. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/astra.py +0 -0
  280. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -0
  281. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/base_writer.py +0 -0
  282. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/chroma.py +0 -0
  283. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/clarifai.py +0 -0
  284. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/databricks_volumes.py +0 -0
  285. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/delta_table.py +0 -0
  286. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/elasticsearch.py +0 -0
  287. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  288. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/azure.py +0 -0
  289. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/box.py +0 -0
  290. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -0
  291. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/gcs.py +0 -0
  292. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/fsspec/s3.py +0 -0
  293. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/kafka.py +0 -0
  294. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/mongodb.py +0 -0
  295. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/opensearch.py +0 -0
  296. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/pinecone.py +0 -0
  297. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/qdrant.py +0 -0
  298. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/sql.py +0 -0
  299. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/vectara.py +0 -0
  300. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/runner/writers/weaviate.py +0 -0
  301. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/__init__.py +0 -0
  302. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/compression.py +0 -0
  303. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/data_prep.py +0 -0
  304. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/dep_check.py +0 -0
  305. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  306. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/utils/table.py +0 -0
  307. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/__init__.py +0 -0
  308. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/__init__.py +0 -0
  309. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/base/__init__.py +0 -0
  310. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/base/dest.py +0 -0
  311. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/base/importer.py +0 -0
  312. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cli.py +0 -0
  313. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/__init__.py +0 -0
  314. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/astra.py +0 -0
  315. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -0
  316. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/chroma.py +0 -0
  317. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -0
  318. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -0
  319. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  320. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -0
  321. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -0
  322. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -0
  323. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -0
  324. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -0
  325. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -0
  326. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/google_drive.py +0 -0
  327. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/milvus.py +0 -0
  328. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/mongodb.py +0 -0
  329. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/onedrive.py +0 -0
  330. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/opensearch.py +0 -0
  331. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/pinecone.py +0 -0
  332. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/salesforce.py +0 -0
  333. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -0
  334. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/singlestore.py +0 -0
  335. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/sql.py +0 -0
  336. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/cmds/weaviate.py +0 -0
  337. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/configs/chunk.py +0 -0
  338. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/configs/embed.py +0 -0
  339. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/configs/partition.py +0 -0
  340. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/configs/processor.py +0 -0
  341. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/interfaces.py +0 -0
  342. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/cli/utils.py +0 -0
  343. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/example.py +0 -0
  344. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/connector.py +0 -0
  345. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/indexer.py +0 -0
  346. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/processor.py +0 -0
  347. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/upload_stager.py +0 -0
  348. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/interfaces/uploader.py +0 -0
  349. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/logger.py +0 -0
  350. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/main.py +0 -0
  351. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/__init__.py +0 -0
  352. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  353. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/chunk.py +0 -0
  354. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/embed.py +0 -0
  355. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/index.py +0 -0
  356. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/partition.py +0 -0
  357. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/stage.py +0 -0
  358. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/uncompress.py +0 -0
  359. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/steps/upload.py +0 -0
  360. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/pipeline/utils.py +0 -0
  361. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/__init__.py +0 -0
  362. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/chunker.py +0 -0
  363. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connector_registry.py +0 -0
  364. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/__init__.py +0 -0
  365. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +0 -0
  366. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/fsspec/utils.py +0 -0
  367. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/milvus.py +0 -0
  368. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/singlestore.py +0 -0
  369. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/connectors/utils.py +0 -0
  370. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/embedder.py +0 -0
  371. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/partitioner.py +0 -0
  372. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest/v2/processes/uncompress.py +0 -0
  373. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest.egg-info/dependency_links.txt +0 -0
  374. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest.egg-info/entry_points.txt +0 -0
  375. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest.egg-info/requires.txt +14 -14
  376. {unstructured-ingest-0.0.2.dev0 → unstructured-ingest-0.0.3}/unstructured_ingest.egg-info/top_level.txt +0 -0
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.1
2
+ Name: unstructured-ingest
3
+ Version: 0.0.3
4
+ Summary: A library that prepares raw documents for downstream ML tasks.
5
+ Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
+ Author: Unstructured Technologies
7
+ Author-email: devops@unstructuredai.io
8
+ License: Apache-2.0
9
+ Keywords: NLP PDF HTML CV XML parsing preprocessing
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Education
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.9.0,<3.13
23
+ Description-Content-Type: text/markdown
24
+ Provides-Extra: csv
25
+ Provides-Extra: doc
26
+ Provides-Extra: docx
27
+ Provides-Extra: epub
28
+ Provides-Extra: md
29
+ Provides-Extra: msg
30
+ Provides-Extra: odt
31
+ Provides-Extra: org
32
+ Provides-Extra: pdf
33
+ Provides-Extra: ppt
34
+ Provides-Extra: pptx
35
+ Provides-Extra: rtf
36
+ Provides-Extra: rst
37
+ Provides-Extra: tsv
38
+ Provides-Extra: xlsx
39
+ Provides-Extra: airtable
40
+ Provides-Extra: astra
41
+ Provides-Extra: azure
42
+ Provides-Extra: azure-cognitive-search
43
+ Provides-Extra: biomed
44
+ Provides-Extra: box
45
+ Provides-Extra: chroma
46
+ Provides-Extra: clarifai
47
+ Provides-Extra: confluence
48
+ Provides-Extra: delta-table
49
+ Provides-Extra: discord
50
+ Provides-Extra: dropbox
51
+ Provides-Extra: elasticsearch
52
+ Provides-Extra: gcs
53
+ Provides-Extra: github
54
+ Provides-Extra: gitlab
55
+ Provides-Extra: google-drive
56
+ Provides-Extra: hubspot
57
+ Provides-Extra: jira
58
+ Provides-Extra: kafka
59
+ Provides-Extra: milvus
60
+ Provides-Extra: mongodb
61
+ Provides-Extra: notion
62
+ Provides-Extra: onedrive
63
+ Provides-Extra: opensearch
64
+ Provides-Extra: outlook
65
+ Provides-Extra: pinecone
66
+ Provides-Extra: postgres
67
+ Provides-Extra: qdrant
68
+ Provides-Extra: reddit
69
+ Provides-Extra: s3
70
+ Provides-Extra: sharepoint
71
+ Provides-Extra: salesforce
72
+ Provides-Extra: sftp
73
+ Provides-Extra: slack
74
+ Provides-Extra: wikipedia
75
+ Provides-Extra: weaviate
76
+ Provides-Extra: databricks-volumes
77
+ Provides-Extra: singlestore
78
+ Provides-Extra: embed-huggingface
79
+ Provides-Extra: embed-octoai
80
+ Provides-Extra: embed-vertexai
81
+ Provides-Extra: embed-voyageai
82
+ Provides-Extra: openai
83
+ Provides-Extra: bedrock
84
+
85
+ # Unstructured Ingest
86
+
87
+ For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -0,0 +1,3 @@
1
+ # Unstructured Ingest
2
+
3
+ For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -0,0 +1 @@
1
+ __version__ = "0.0.3" # pragma: no cover
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
24
24
  )
25
25
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
26
26
  from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
27
+ from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
27
28
  from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
28
29
 
29
30
  CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
75
76
  }
76
77
  if chunker := self.get_chunker(options=source_options):
77
78
  pipeline_kwargs["chunker"] = chunker
79
+ if filterer := self.get_filterer(options=source_options):
80
+ pipeline_kwargs["filterer"] = filterer
78
81
  if embedder := self.get_embeder(options=source_options):
79
82
  pipeline_kwargs["embedder"] = embedder
80
83
  if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
105
108
  return None
106
109
  return Chunker(config=chunker_config)
107
110
 
111
+ @staticmethod
112
+ def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
113
+ filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
114
+ if not filterer_configs.to_dict():
115
+ return None
116
+ return Filterer(config=filterer_configs)
117
+
108
118
  @staticmethod
109
119
  def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
110
120
  embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
8
  from unstructured_ingest.v2.cli.configs import (
9
9
  ChunkerCliConfig,
10
10
  EmbedderCliConfig,
11
+ FilterCliConfig,
11
12
  PartitionerCliConfig,
12
13
  ProcessorCliConfig,
13
14
  )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
26
27
  ProcessorCliConfig,
27
28
  PartitionerCliConfig,
28
29
  EmbedderCliConfig,
30
+ FilterCliConfig,
29
31
  ChunkerCliConfig,
30
32
  ]
31
33
  )
@@ -3,7 +3,6 @@ from dataclasses import dataclass
3
3
  import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.interfaces import CliConfig
6
- from unstructured_ingest.v2.cli.utils import DelimitedString
7
6
 
8
7
 
9
8
  @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
14
13
  click.Option(
15
14
  ["--download-dir"],
16
15
  help="Where files are downloaded to, defaults to a location at"
17
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
16
+ "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
18
17
  ),
19
18
  ]
20
19
 
@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
65
64
  help="Recursively download files in their respective folders "
66
65
  "otherwise stop at the files in provided folder level.",
67
66
  ),
68
- click.Option(
69
- ["--file-glob"],
70
- default=None,
71
- type=DelimitedString(),
72
- help="A comma-separated list of file globs to limit which types of "
73
- "local files are accepted, e.g. '*.html,*.txt'",
74
- ),
75
67
  ]
76
68
  )
77
69
  return options
@@ -4,7 +4,6 @@ import click
4
4
 
5
5
  from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
6
  from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
7
  from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
9
8
 
10
9
 
@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
19
18
  type=click.Path(file_okay=True, dir_okay=True, exists=True),
20
19
  help="Path to the location in the local file system that will be processed.",
21
20
  ),
22
- click.Option(
23
- ["--file-glob"],
24
- default=None,
25
- type=DelimitedString(),
26
- help="A comma-separated list of file globs to limit which types of "
27
- "local files are accepted, e.g. '*.html,*.txt'",
28
- ),
29
21
  click.Option(
30
22
  ["--recursive"],
31
23
  is_flag=True,
@@ -0,0 +1,13 @@
1
+ from .chunk import ChunkerCliConfig
2
+ from .embed import EmbedderCliConfig
3
+ from .filter import FilterCliConfig
4
+ from .partition import PartitionerCliConfig
5
+ from .processor import ProcessorCliConfig
6
+
7
+ __all__ = [
8
+ "ChunkerCliConfig",
9
+ "ProcessorCliConfig",
10
+ "PartitionerCliConfig",
11
+ "EmbedderCliConfig",
12
+ "FilterCliConfig",
13
+ ]
@@ -0,0 +1,28 @@
1
+ from dataclasses import dataclass
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
6
+ from unstructured_ingest.v2.cli.utils import DelimitedString
7
+
8
+
9
+ @dataclass
10
+ class FilterCliConfig(CliConfig):
11
+ @staticmethod
12
+ def get_cli_options() -> list[click.Option]:
13
+ options = [
14
+ click.Option(
15
+ ["--file-glob"],
16
+ default=None,
17
+ type=DelimitedString(),
18
+ help="A comma-separated list of file globs to limit which types of "
19
+ "local files are accepted, e.g. '*.html,*.txt'",
20
+ ),
21
+ click.Option(
22
+ ["--max-file-size"],
23
+ default=None,
24
+ type=click.IntRange(min=1),
25
+ help="Max file size to process in bytes",
26
+ ),
27
+ ]
28
+ return options
@@ -1,6 +1,6 @@
1
1
  from .connector import AccessConfig, BaseConnector, ConnectionConfig
2
2
  from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
3
- from .file_data import FileData, SourceIdentifiers
3
+ from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
4
4
  from .indexer import Indexer, IndexerConfig
5
5
  from .process import BaseProcess
6
6
  from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
26
26
  "AccessConfig",
27
27
  "ConnectionConfig",
28
28
  "BaseConnector",
29
+ "FileDataSourceMetadata",
29
30
  ]
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
30
30
  connector_type: str
31
31
  download_config: DownloaderConfigT
32
32
 
33
+ def get_download_path(self, file_data: FileData) -> Optional[Path]:
34
+ if not file_data.source_identifiers:
35
+ return None
36
+ rel_path = file_data.source_identifiers.relative_path
37
+ if not rel_path:
38
+ return None
39
+ rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
40
+ return self.download_dir / Path(rel_path)
41
+
33
42
  @staticmethod
34
43
  def is_float(value: str):
35
44
  try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
68
77
  def is_async(self) -> bool:
69
78
  return True
70
79
 
71
- def get_download_path(self, file_data: FileData) -> Optional[Path]:
72
- return None
73
-
74
80
  @abstractmethod
75
81
  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
76
82
  pass
@@ -22,13 +22,18 @@ class SourceIdentifiers:
22
22
  return self.rel_path or self.fullpath
23
23
 
24
24
 
25
+ @dataclass
26
+ class FileDataSourceMetadata(DataSourceMetadata):
27
+ filesize_bytes: Optional[int] = None
28
+
29
+
25
30
  @dataclass
26
31
  class FileData(DataClassJsonMixin):
27
32
  identifier: str
28
33
  connector_type: str
29
34
  source_identifiers: Optional[SourceIdentifiers] = None
30
35
  doc_type: Literal["file", "batch"] = field(default="file")
31
- metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
36
+ metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
32
37
  additional_metadata: dict[str, Any] = field(default_factory=dict)
33
38
  reprocess: bool = False
34
39
 
@@ -8,13 +8,12 @@ class BaseProcess(ABC):
8
8
  def is_async(self) -> bool:
9
9
  return False
10
10
 
11
+ def precheck(self) -> None:
12
+ pass
13
+
11
14
  @abstractmethod
12
15
  def run(self, **kwargs: Any) -> Any:
13
16
  pass
14
17
 
15
18
  async def run_async(self, **kwargs: Any) -> Any:
16
19
  return self.run(**kwargs)
17
-
18
- def check_connection(self):
19
- # If the process requires external connections, run a quick check
20
- pass
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
92
92
 
93
93
  if iterable:
94
94
  if len(iterable) == 1:
95
- return [self.process_serially(iterable)]
95
+ return self.process_serially(iterable)
96
96
  if self.context.num_processes == 1:
97
97
  return self.process_serially(iterable)
98
98
  with mp.Pool(
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
126
126
  logger.info(
127
127
  f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
128
128
  )
129
+ else:
130
+ logger.info(f"Calling {self.__class__.__name__} with no inputs")
129
131
  if self.context.async_supported and self.process.is_async():
130
132
  return self.process_async(iterable=iterable)
131
133
  if self.context.mp_supported:
@@ -146,8 +148,6 @@ class PipelineStep(ABC):
146
148
  logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
147
149
  if "file_data_path" in kwargs:
148
150
  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
149
- else:
150
- self.context.status[self.identifier] = {"step_error": str(e)}
151
151
  if self.context.raise_on_error:
152
152
  raise e
153
153
  return None
@@ -160,8 +160,6 @@ class PipelineStep(ABC):
160
160
  logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
161
161
  if "file_data_path" in kwargs:
162
162
  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
163
- else:
164
- self.context.status[self.identifier] = {"step_error": str(e)}
165
163
  if self.context.raise_on_error:
166
164
  raise e
167
165
  return None
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
9
9
  from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
10
10
  from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
11
11
  from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
12
+ from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
12
13
  from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
13
14
  from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
14
15
  from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
27
28
  )
28
29
  from unstructured_ingest.v2.processes.connectors.local import LocalUploader
29
30
  from unstructured_ingest.v2.processes.embedder import EmbedderConfig
31
+ from unstructured_ingest.v2.processes.filter import FiltererConfig
30
32
  from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
31
33
 
32
34
 
@@ -37,22 +39,33 @@ class PipelineError(Exception):
37
39
  @dataclass
38
40
  class Pipeline:
39
41
  context: ProcessorConfig
42
+
40
43
  indexer: InitVar[IndexerT]
41
44
  indexer_step: IndexStep = field(init=False)
45
+
42
46
  downloader: InitVar[DownloaderT]
43
47
  downloader_step: DownloadStep = field(init=False)
48
+
44
49
  partitioner: InitVar[Partitioner]
45
50
  partitioner_step: PartitionStep = field(init=False)
51
+
46
52
  chunker: InitVar[Optional[Chunker]] = None
47
53
  chunker_step: ChunkStep = field(init=False, default=None)
54
+
48
55
  embedder: InitVar[Optional[Embedder]] = None
49
56
  embedder_step: EmbedStep = field(init=False, default=None)
57
+
50
58
  stager: InitVar[Optional[UploadStager]] = None
51
59
  stager_step: UploadStageStep = field(init=False, default=None)
60
+
52
61
  uploader: InitVar[Uploader] = field(default=LocalUploader())
53
62
  uploader_step: UploadStep = field(init=False, default=None)
63
+
54
64
  uncompress_step: UncompressStep = field(init=False, default=None)
55
65
 
66
+ filterer: InitVar[Optional[Filterer]] = None
67
+ filter_step: FilterStep = field(init=False, default=None)
68
+
56
69
  def __post_init__(
57
70
  self,
58
71
  indexer: IndexerT,
@@ -62,10 +75,12 @@ class Pipeline:
62
75
  embedder: Embedder = None,
63
76
  stager: UploadStager = None,
64
77
  uploader: Uploader = None,
78
+ filterer: Filterer = None,
65
79
  ):
66
80
  make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
67
81
  self.indexer_step = IndexStep(process=indexer, context=self.context)
68
82
  self.downloader_step = DownloadStep(process=downloader, context=self.context)
83
+ self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
69
84
  self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
70
85
  self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
71
86
 
@@ -109,6 +124,7 @@ class Pipeline:
109
124
  def run(self):
110
125
  try:
111
126
  start_time = time()
127
+ self._run_prechecks()
112
128
  self._run()
113
129
  logger.info(f"Finished ingest process in {time() - start_time}s")
114
130
  finally:
@@ -130,6 +146,37 @@ class Pipeline:
130
146
  final = [f for f in flat if f]
131
147
  return final or None
132
148
 
149
+ def _run_prechecks(self):
150
+ steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
151
+ if self.chunker_step:
152
+ steps.append(self.chunker_step)
153
+ if self.embedder_step:
154
+ steps.append(self.embedder_step)
155
+ if self.uncompress_step:
156
+ steps.append(self.uncompress_step)
157
+ if self.stager_step:
158
+ steps.append(self.stager_step)
159
+ failures = {}
160
+ for step in steps:
161
+ try:
162
+ step.process.precheck()
163
+ except Exception as e:
164
+ failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
165
+ if failures:
166
+ for k, v in failures.items():
167
+ logger.error(f"Step precheck failure: {k}: {v}")
168
+ raise PipelineError("Precheck failed")
169
+
170
+ def apply_filter(self, records: list[dict]) -> list[dict]:
171
+ if not self.filter_step:
172
+ return records
173
+ data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
174
+ filtered_data = self.filter_step(data_to_filter)
175
+ filtered_data = [f for f in filtered_data if f is not None]
176
+ filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
177
+ filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
178
+ return filtered_records
179
+
133
180
  def _run(self):
134
181
  logger.info(
135
182
  f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@ class Pipeline:
147
194
  if not indices_inputs:
148
195
  return
149
196
 
197
+ # Initial filtering on indexed content
198
+ indices_inputs = self.apply_filter(records=indices_inputs)
199
+ if not indices_inputs:
200
+ return
201
+
150
202
  # Download associated content to local file system
151
203
  downloaded_data = self.downloader_step(indices_inputs)
152
204
  downloaded_data = self.clean_results(results=downloaded_data)
153
205
  if not downloaded_data:
154
206
  return
155
207
 
208
+ # Post download filtering
209
+ downloaded_data = self.apply_filter(records=downloaded_data)
210
+ if not downloaded_data:
211
+ return
212
+
156
213
  # Run uncompress if available
157
214
  if self.uncompress_step:
158
215
  downloaded_data = self.uncompress_step(downloaded_data)
159
216
  # Flatten list of lists
160
217
  downloaded_data = self.clean_results(results=downloaded_data)
161
218
 
219
+ # Post uncompress filtering
220
+ downloaded_data = self.apply_filter(records=downloaded_data)
221
+ if not downloaded_data:
222
+ return
223
+
162
224
  if not downloaded_data:
163
225
  return
164
226
 
@@ -179,9 +241,14 @@ class Pipeline:
179
241
  self.uploader_step(iterable=elements)
180
242
 
181
243
  def __str__(self):
182
- s = [str(self.indexer_step), str(self.downloader_step)]
244
+ s = [str(self.indexer_step)]
245
+ if filter_step := self.filter_step:
246
+ s.append(str(filter_step))
247
+ s.append(str(self.downloader_step))
248
+ if filter_step := self.filter_step:
249
+ s.append(str(filter_step))
183
250
  if uncompress_step := self.uncompress_step:
184
- s.append(str(uncompress_step))
251
+ s.extend([str(uncompress_step), str(filter_step)])
185
252
  s.append(str(self.partitioner_step))
186
253
  if chunker_step := self.chunker_step:
187
254
  s.append(str(chunker_step))
@@ -200,6 +267,7 @@ class Pipeline:
200
267
  downloader_config: DownloaderConfigT,
201
268
  source_connection_config: ConnectionConfig,
202
269
  partitioner_config: PartitionerConfig,
270
+ filterer_config: FiltererConfig = None,
203
271
  chunker_config: Optional[ChunkerConfig] = None,
204
272
  embedder_config: Optional[EmbedderConfig] = None,
205
273
  destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@ class Pipeline:
235
303
  ),
236
304
  "partitioner": Partitioner(config=partitioner_config),
237
305
  }
306
+ if filterer_config:
307
+ pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
238
308
  if chunker_config:
239
309
  pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
240
310
  if embedder_config:
@@ -2,6 +2,7 @@ import asyncio
2
2
  import hashlib
3
3
  import json
4
4
  from dataclasses import dataclass
5
+ from pathlib import Path
5
6
  from typing import Callable, Optional, TypedDict, TypeVar
6
7
 
7
8
  from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -70,11 +71,40 @@ class DownloadStep(PipelineStep):
70
71
  return True
71
72
  return False
72
73
 
74
def update_file_data(
    self, file_data: FileData, file_data_path: Path, download_path: Path
) -> None:
    """Reconcile the recorded file size with the file on disk.

    The size of ``download_path`` is read via ``stat()``. If the file data
    has no (or a zero) recorded size and the local file is non-empty, the
    size is filled in. If a size is recorded but differs from the local
    file, a warning is logged and the recorded size is overwritten. When
    either happens, the file data is re-serialised as JSON to
    ``file_data_path``; otherwise nothing is written.

    Args:
        file_data: File data whose ``metadata.filesize_bytes`` may be updated
            in place.
        file_data_path: Location the updated file data is written to.
        download_path: Local file whose size is treated as authoritative.
    """
    actual_size = download_path.stat().st_size
    recorded_size = file_data.metadata.filesize_bytes
    needs_rewrite = False

    if not recorded_size and actual_size:
        # Nothing recorded yet: adopt the size observed on disk.
        file_data.metadata.filesize_bytes = actual_size
        needs_rewrite = True
    elif recorded_size and recorded_size != actual_size:
        logger.warning(
            f"file size in original file data "
            f"({recorded_size}) doesn't "
            f"match size of local file: {actual_size}, updating"
        )
        file_data.metadata.filesize_bytes = actual_size
        needs_rewrite = True

    if not needs_rewrite:
        return

    logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
    with file_data_path.open("w") as file:
        json.dump(file_data.to_dict(), file, indent=2)
97
+
73
98
  async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
74
99
  file_data = FileData.from_file(path=file_data_path)
75
100
  download_path = self.process.get_download_path(file_data=file_data)
76
101
  if not self.should_download(file_data=file_data, file_data_path=file_data_path):
77
102
  logger.debug(f"Skipping download, file already exists locally: {download_path}")
103
+ self.update_file_data(
104
+ file_data=file_data,
105
+ file_data_path=Path(file_data_path),
106
+ download_path=download_path,
107
+ )
78
108
  return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
79
109
  fn_kwargs = {"file_data": file_data}
80
110
  if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +115,60 @@ class DownloadStep(PipelineStep):
85
115
  else:
86
116
  download_results = await fn(**fn_kwargs)
87
117
  return self.create_step_results(
88
- current_file_data_path=file_data_path, download_results=download_results
118
+ current_file_data_path=file_data_path,
119
+ download_results=download_results,
120
+ current_file_data=file_data,
89
121
  )
90
122
 
91
123
  def create_step_results(
92
- self, current_file_data_path: str, download_results: download_responses
124
+ self,
125
+ current_file_data_path: str,
126
+ current_file_data: FileData,
127
+ download_results: download_responses,
93
128
  ) -> list[DownloadStepResponse]:
129
+ responses = []
94
130
  if not isinstance(download_results, list):
95
- return [
96
- DownloadStepResponse(
97
- file_data_path=current_file_data_path, path=str(download_results["path"])
131
+ file_data = current_file_data
132
+ file_data_path = current_file_data_path
133
+ download_path = download_results["path"]
134
+ if download_results["file_data"].identifier == current_file_data.identifier:
135
+ self.update_file_data(
136
+ file_data=file_data,
137
+ file_data_path=Path(file_data_path),
138
+ download_path=download_path,
139
+ )
140
+ responses = [
141
+ DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
142
+ ]
143
+ else:
144
+ file_data = download_results["file_data"]
145
+ file_data_path = self.persist_new_file_data(file_data=file_data)
146
+ self.update_file_data(
147
+ file_data=file_data,
148
+ file_data_path=Path(file_data_path),
149
+ download_path=download_path,
98
150
  )
99
- ]
151
+ responses = [
152
+ DownloadStepResponse(
153
+ file_data_path=current_file_data_path, path=str(download_results["path"])
154
+ )
155
+ ]
156
+ else:
100
157
  # Supplemental results generated as part of the download process
101
- download_step_results = []
102
- for res in download_results:
103
- file_data_path = self.persist_new_file_data(file_data=res["file_data"])
104
- download_step_results.append(
105
- DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
106
- )
107
- return download_step_results
158
+ for res in download_results:
159
+ file_data = res["file_data"]
160
+ file_data_path = self.persist_new_file_data(file_data=file_data)
161
+ download_path = res["path"]
162
+ self.update_file_data(
163
+ file_data=file_data,
164
+ file_data_path=Path(file_data_path),
165
+ download_path=download_path,
166
+ )
167
+ responses.append(
168
+ DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
169
+ )
170
+
171
+ return responses
108
172
 
109
173
  def persist_new_file_data(self, file_data: FileData) -> str:
110
174
  record_hash = self.get_hash(extras=[file_data.identifier])