unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
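
The largest block of removals below is the examples/ directory. Every deleted script follows the same Pipeline.from_configs pattern: wire up source, partitioner, chunker, optional embedder, and destination configs, then call .run(). As a reference while reading the hunks, here is a minimal sketch of that shared pattern, condensed from the deleted examples/local.py; the input path and work_dir are placeholders, and this reflects the 0.7.x API these examples were written against, not a supported 1.0.x recipe.

# Minimal sketch of the pattern shared by the deleted examples/*.py scripts
# (condensed from examples/local.py below; paths are placeholders).
from pathlib import Path

from unstructured_ingest.interfaces import ProcessorConfig
from unstructured_ingest.pipeline.pipeline import Pipeline
from unstructured_ingest.processes.chunker import ChunkerConfig
from unstructured_ingest.processes.connectors.local import (
    LocalConnectionConfig,
    LocalDownloaderConfig,
    LocalIndexerConfig,
    LocalUploaderConfig,
)
from unstructured_ingest.processes.embedder import EmbedderConfig
from unstructured_ingest.processes.partitioner import PartitionerConfig

work_dir = Path("tmp_ingest/local")  # all intermediate artifacts land here

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(work_dir=str(work_dir.resolve())),
        # source: where documents come from (swap the Local* configs for any
        # connector's Indexer/Downloader/Connection configs, as the hunks below do)
        indexer_config=LocalIndexerConfig(input_path="example-docs/fake-text.txt"),
        downloader_config=LocalDownloaderConfig(download_dir=work_dir / "download"),
        source_connection_config=LocalConnectionConfig(),
        # transforms: partition -> chunk -> (optionally) embed
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        # destination: where the processed elements are written
        uploader_config=LocalUploaderConfig(output_dir=str((work_dir / "output").resolve())),
    ).run()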
examples/azure_cognitive_search.py DELETED
@@ -1,55 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.azure_ai_search import (
-     CONNECTOR_TYPE,
-     AzureAISearchAccessConfig,
-     AzureAISearchConnectionConfig,
-     AzureAISearchUploaderConfig,
-     AzureAISearchUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     index_name = "ingest-test-destination"
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="by_title", chunk_include_orig_elements=False
-         ),
-         embedder_config=EmbedderConfig(
-             embedding_provider="openai", embedding_api_key=os.getenv("OPENAI_API_KEY")
-         ),
-         destination_connection_config=AzureAISearchConnectionConfig(
-             access_config=AzureAISearchAccessConfig(
-                 azure_ai_search_key=os.getenv("AZURE_SEARCH_API_KEY")
-             ),
-             index=os.getenv("AZURE_SEARCH_INDEX"),
-             endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
-         ),
-         uploader_config=AzureAISearchUploaderConfig(batch_size=10),
-         stager_config=AzureAISearchUploadStagerConfig(),
-     ).run()
examples/chroma.py DELETED
@@ -1,54 +0,0 @@
- import random
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.chroma import (
-     CONNECTOR_TYPE,
-     ChromaAccessConfig,
-     ChromaConnectionConfig,
-     ChromaUploaderConfig,
-     ChromaUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=docs_path.resolve() / "multisimple"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="by_title",
-             chunk_include_orig_elements=False,
-             chunk_max_characters=1500,
-             chunk_multipage_sections=True,
-         ),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=ChromaConnectionConfig(
-             access_config=ChromaAccessConfig(settings=None, headers=None),
-             host="localhost",
-             port=8047,
-             collection_name=f"test-collection-{random.randint(1000, 9999)}",
-             tenant="default_tenant",
-             database="default_database",
-         ),
-         stager_config=ChromaUploadStagerConfig(),
-         uploader_config=ChromaUploaderConfig(batch_size=10),
-     ).run()
examples/couchbase.py DELETED
@@ -1,55 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.couchbase import (
-     CONNECTOR_TYPE,
-     CouchbaseAccessConfig,
-     CouchbaseConnectionConfig,
-     CouchbaseUploaderConfig,
-     CouchbaseUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="by_title",
-             chunk_include_orig_elements=False,
-             chunk_max_characters=1500,
-             chunk_multipage_sections=True,
-         ),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=CouchbaseConnectionConfig(
-             access_config=CouchbaseAccessConfig(
-                 connection_string="couchbase://localhost",
-                 username="Administrator",
-                 password="password",
-             ),
-             bucket="example_bucket",
-             scope="example_scope",
-             collection="example_collection",
-         ),
-         stager_config=CouchbaseUploadStagerConfig(),
-         uploader_config=CouchbaseUploaderConfig(batch_size=10),
-     ).run()
examples/databricks_volumes_dest.py DELETED
@@ -1,55 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.databricks.volumes_native import (
-     CONNECTOR_TYPE,
-     DatabricksNativeVolumesAccessConfig,
-     DatabricksNativeVolumesConnectionConfig,
-     DatabricksNativeVolumesUploaderConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/fake-text.txt"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="basic",
-         ),
-         embedder_config=None,
-         destination_connection_config=DatabricksNativeVolumesConnectionConfig(
-             access_config=DatabricksNativeVolumesAccessConfig(
-                 client_id=os.environ["DATABRICKS_CLIENT_ID"],
-                 client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
-             ),
-             host=os.environ["DATABRICKS_HOST"],
-             catalog=os.environ["DATABRICKS_CATALOG"],
-             volume=os.environ["DATABRICKS_VOLUME"],
-             volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
-         ),
-         uploader_config=DatabricksNativeVolumesUploaderConfig(
-             overwrite=True,
-             catalog=os.environ["DATABRICKS_CATALOG"],
-             volume=os.environ["DATABRICKS_VOLUME"],
-             volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
-         ),
-     ).run()
examples/databricks_volumes_source.py DELETED
@@ -1,53 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.databricks.volumes_native import (
-     CONNECTOR_TYPE,
-     DatabricksNativeVolumesAccessConfig,
-     DatabricksNativeVolumesConnectionConfig,
-     DatabricksNativeVolumesDownloaderConfig,
-     DatabricksNativeVolumesIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalUploaderConfig,
- )
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=DatabricksNativeVolumesIndexerConfig(
-             host=os.environ["DATABRICKS_HOST"],
-             catalog=os.environ["DATABRICKS_CATALOG"],
-             volume=os.environ["DATABRICKS_VOLUME"],
-             volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
-         ),
-         downloader_config=DatabricksNativeVolumesDownloaderConfig(download_dir=download_path),
-         source_connection_config=DatabricksNativeVolumesConnectionConfig(
-             access_config=DatabricksNativeVolumesAccessConfig(
-                 client_id=os.environ["DATABRICKS_CLIENT_ID"],
-                 client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
-             ),
-             host=os.environ["DATABRICKS_HOST"],
-             catalog=os.environ["DATABRICKS_CATALOG"],
-             volume=os.environ["DATABRICKS_VOLUME"],
-             volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
-         ),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="basic",
-         ),
-         embedder_config=None,
-         uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
-     ).run()
examples/delta_table.py DELETED
@@ -1,45 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.delta_table import (
-     CONNECTOR_TYPE,
-     DeltaTableAccessConfig,
-     DeltaTableConnectionConfig,
-     DeltaTableUploaderConfig,
-     DeltaTableUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt",
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=DeltaTableConnectionConfig(
-             access_config=DeltaTableAccessConfig(), table_uri="example_uri"
-         ),
-         stager_config=DeltaTableUploadStagerConfig(),
-         uploader_config=DeltaTableUploaderConfig(),
-     ).run()
examples/discord_example.py DELETED
@@ -1,36 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.connectors.discord import (
-     CONNECTOR_TYPE,
-     DiscordAccessConfig,
-     DiscordConnectionConfig,
-     DiscordDownloaderConfig,
-     DiscordIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.local import LocalUploaderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
-         indexer_config=DiscordIndexerConfig(channels=os.environ["DISCORD_CHANNELS"].split(",")),
-         downloader_config=DiscordDownloaderConfig(limit=int(os.getenv("DISCORD_LIMIT", 100))),
-         source_connection_config=DiscordConnectionConfig(
-             access_config=DiscordAccessConfig(token=os.environ["DISCORD_TOKEN"])
-         ),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         # chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         # embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
-     ).run()
examples/elasticsearch.py DELETED
@@ -1,49 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.elasticsearch import (
-     CONNECTOR_TYPE,
-     ElasticsearchAccessConfig,
-     ElasticsearchConnectionConfig,
-     ElasticsearchUploaderConfig,
-     ElasticsearchUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     index_name = "ingest-test-destination"
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=ElasticsearchConnectionConfig(
-             access_config=ElasticsearchAccessConfig(password=os.getenv("ELASTIC_PASSWORD")),
-             username=os.getenv("ELASTIC_USERNAME"),
-             hosts=["http://localhost:9200"],
-         ),
-         uploader_config=ElasticsearchUploaderConfig(index_name=index_name),
-         stager_config=ElasticsearchUploadStagerConfig(index_name=index_name),
-     ).run()
examples/google_drive.py DELETED
@@ -1,45 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.google_drive import (
-     CONNECTOR_TYPE,
-     GoogleDriveAccessConfig,
-     GoogleDriveConnectionConfig,
-     GoogleDriveDownloaderConfig,
-     GoogleDriveIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalUploaderConfig,
- )
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
-
-
- if __name__ == "__main__":
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         # You'll need to set the GOOGLE_DRIVE_SERVICE_KEY and GOOGLE_DRIVE_DRIVE_ID
-         # environment variables to run this example
-         source_connection_config=GoogleDriveConnectionConfig(
-             access_config=GoogleDriveAccessConfig(
-                 service_account_key=os.environ.get("GOOGLE_DRIVE_SERVICE_KEY")
-             ),
-             drive_id=os.environ.get("GOOGLE_DRIVE_DRIVE_ID"),
-         ),
-         indexer_config=GoogleDriveIndexerConfig(
-             recursive=True,
-         ),
-         downloader_config=GoogleDriveDownloaderConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="basic",
-         ),
-         embedder_config=None,
-         uploader_config=LocalUploaderConfig(output_dir=output_path),
-     ).run()
examples/kdbai.py DELETED
@@ -1,54 +0,0 @@
- import os
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.kdbai import (
-     CONNECTOR_TYPE,
-     KdbaiConnectionConfig,
-     KdbaiUploaderConfig,
-     KdbaiUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
- input_path = docs_path.resolve() / "pdf" / "fake-memo.pdf"
-
- os.environ["KDBAI_API_KEY"] = "key"
- os.environ["KDBAI_ENDPOINT"] = "http://localhost"
- os.environ["KDBAI_DATABASE"] = "default"
- os.environ["KDBAI_TABLE"] = "table"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     logger.info(f"processing file(s): {input_path.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
-         indexer_config=LocalIndexerConfig(
-             input_path=docs_path.resolve() / "book-war-and-peace-1p.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=KdbaiConnectionConfig(
-             endpoint=os.environ["KDBAI_ENDPOINT"],
-         ),
-         stager_config=KdbaiUploadStagerConfig(),
-         uploader_config=KdbaiUploaderConfig(
-             database_name=os.environ["KDBAI_DATABASE"], table_name=os.environ["KDBAI_TABLE"]
-         ),
-     ).run()
examples/local.py DELETED
@@ -1,36 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     CONNECTOR_TYPE,
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
-     LocalUploaderConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/language-docs/UDHR_first_article_all.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
-     ).run()
examples/milvus.py DELETED
@@ -1,44 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.milvus import (
-     CONNECTOR_TYPE,
-     MilvusConnectionConfig,
-     MilvusUploaderConfig,
-     MilvusUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=MilvusConnectionConfig(
-             uri="http://localhost:19530", db_name="milvus"
-         ),
-         stager_config=MilvusUploadStagerConfig(),
-         uploader_config=MilvusUploaderConfig(collection_name="ingest_test"),
-     ).run()
examples/mongodb.py DELETED
@@ -1,53 +0,0 @@
- import random
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.mongodb import (
-     CONNECTOR_TYPE,
-     MongoDBAccessConfig,
-     MongoDBConnectionConfig,
-     MongoDBUploaderConfig,
-     MongoDBUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="by_title",
-             chunk_include_orig_elements=False,
-             chunk_max_characters=1500,
-             chunk_multipage_sections=True,
-         ),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=MongoDBConnectionConfig(
-             access_config=MongoDBAccessConfig(uri=None),
-             host="localhost",
-             port=27017,
-             collection=f"test-collection-{random.randint(1000, 9999)}",
-             database="testDatabase",
-         ),
-         stager_config=MongoDBUploadStagerConfig(),
-         uploader_config=MongoDBUploaderConfig(batch_size=10),
-     ).run()