unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
examples/opensearch.py DELETED
@@ -1,50 +0,0 @@
1
- from pathlib import Path
2
-
3
- from unstructured_ingest.interfaces import ProcessorConfig
4
- from unstructured_ingest.logger import logger
5
- from unstructured_ingest.pipeline.pipeline import Pipeline
6
- from unstructured_ingest.processes.chunker import ChunkerConfig
7
- from unstructured_ingest.processes.connectors.local import (
8
- LocalConnectionConfig,
9
- LocalDownloaderConfig,
10
- LocalIndexerConfig,
11
- )
12
- from unstructured_ingest.processes.connectors.opensearch import (
13
- CONNECTOR_TYPE,
14
- OpenSearchAccessConfig,
15
- OpenSearchConnectionConfig,
16
- OpenSearchUploaderConfig,
17
- OpenSearchUploadStagerConfig,
18
- )
19
- from unstructured_ingest.processes.embedder import EmbedderConfig
20
- from unstructured_ingest.processes.partitioner import PartitionerConfig
21
-
22
- base_path = Path(__file__).parent.parent.parent.parent
23
- docs_path = base_path / "example-docs"
24
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
- output_path = work_dir / "output"
26
- download_path = work_dir / "download"
27
-
28
- if __name__ == "__main__":
29
- logger.info(f"writing all content in: {work_dir.resolve()}")
30
- Pipeline.from_configs(
31
- context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
- indexer_config=LocalIndexerConfig(
33
- input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
34
- ),
35
- downloader_config=LocalDownloaderConfig(download_dir=download_path),
36
- source_connection_config=LocalConnectionConfig(),
37
- partitioner_config=PartitionerConfig(strategy="fast"),
38
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
39
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
40
- destination_connection_config=OpenSearchConnectionConfig(
41
- hosts="http://localhost:9247",
42
- username="admin",
43
- use_ssl=True,
44
- access_config=OpenSearchAccessConfig(password="admin"),
45
- ),
46
- stager_config=OpenSearchUploadStagerConfig(index_name="ingest-test-destination"),
47
- uploader_config=OpenSearchUploaderConfig(
48
- index_name="ingest-test-destination", batch_size_bytes=150
49
- ),
50
- ).run()
examples/pinecone.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- from unstructured_ingest.interfaces import ProcessorConfig
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.pipeline.pipeline import Pipeline
7
- from unstructured_ingest.processes.chunker import ChunkerConfig
8
- from unstructured_ingest.processes.connectors.local import (
9
- LocalConnectionConfig,
10
- LocalDownloaderConfig,
11
- LocalIndexerConfig,
12
- )
13
- from unstructured_ingest.processes.connectors.pinecone import (
14
- CONNECTOR_TYPE,
15
- PineconeAccessConfig,
16
- PineconeConnectionConfig,
17
- PineconeUploaderConfig,
18
- PineconeUploadStagerConfig,
19
- )
20
- from unstructured_ingest.processes.embedder import EmbedderConfig
21
- from unstructured_ingest.processes.partitioner import PartitionerConfig
22
-
23
- base_path = Path(__file__).parent.parent.parent.parent
24
- docs_path = base_path / "example-docs"
25
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
- output_path = work_dir / "output"
27
- download_path = work_dir / "download"
28
-
29
- if __name__ == "__main__":
30
- logger.info(f"writing all content in: {work_dir.resolve()}")
31
- Pipeline.from_configs(
32
- context=ProcessorConfig(work_dir=str(work_dir.resolve())),
33
- indexer_config=LocalIndexerConfig(
34
- input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
35
- ),
36
- downloader_config=LocalDownloaderConfig(download_dir=download_path),
37
- source_connection_config=LocalConnectionConfig(),
38
- partitioner_config=PartitionerConfig(strategy="fast"),
39
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
40
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
41
- destination_connection_config=PineconeConnectionConfig(
42
- # You'll need to set PINECONE_API_KEY environment variable to run this example
43
- access_config=PineconeAccessConfig(pinecone_api_key=os.getenv("PINECONE_API_KEY")),
44
- index_name=os.getenv(
45
- "PINECONE_INDEX",
46
- default="your index name here. e.g. my-index,"
47
- "or define in environment variable PINECONE_INDEX",
48
- ),
49
- environment=os.getenv(
50
- "PINECONE_ENVIRONMENT",
51
- default="your environment name here. e.g. us-east-1,"
52
- "or define in environment variable PINECONE_ENVIRONMENT",
53
- ),
54
- ),
55
- stager_config=PineconeUploadStagerConfig(),
56
- uploader_config=PineconeUploaderConfig(batch_size=10, num_processes=2),
57
- ).run()
examples/s3.py DELETED
@@ -1,38 +0,0 @@
1
- from pathlib import Path
2
-
3
- from unstructured_ingest.interfaces import ProcessorConfig
4
- from unstructured_ingest.logger import logger
5
- from unstructured_ingest.pipeline.pipeline import Pipeline
6
- from unstructured_ingest.processes.chunker import ChunkerConfig
7
- from unstructured_ingest.processes.connectors.fsspec.s3 import (
8
- CONNECTOR_TYPE,
9
- S3ConnectionConfig,
10
- S3DownloaderConfig,
11
- S3IndexerConfig,
12
- )
13
- from unstructured_ingest.processes.connectors.local import (
14
- LocalUploaderConfig,
15
- )
16
- from unstructured_ingest.processes.embedder import EmbedderConfig
17
- from unstructured_ingest.processes.filter import FiltererConfig
18
- from unstructured_ingest.processes.partitioner import PartitionerConfig
19
-
20
- base_path = Path(__file__).parent.parent.parent.parent
21
- docs_path = base_path / "example-docs"
22
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
23
- output_path = work_dir / "output"
24
- download_path = work_dir / "download"
25
-
26
- if __name__ == "__main__":
27
- logger.info(f"writing all content in: {work_dir.resolve()}")
28
- Pipeline.from_configs(
29
- context=ProcessorConfig(work_dir=str(work_dir.resolve()), verbose=True, iter_delete=True),
30
- indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
31
- downloader_config=S3DownloaderConfig(download_dir=download_path),
32
- source_connection_config=S3ConnectionConfig(anonymous=True),
33
- partitioner_config=PartitionerConfig(strategy="fast"),
34
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
35
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
36
- uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
37
- filterer_config=FiltererConfig(max_file_size=900000),
38
- ).run()
examples/salesforce.py DELETED
@@ -1,44 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- from unstructured_ingest.interfaces import ProcessorConfig
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.pipeline.pipeline import Pipeline
7
- from unstructured_ingest.processes.chunker import ChunkerConfig
8
- from unstructured_ingest.processes.connectors.local import (
9
- LocalUploaderConfig,
10
- )
11
- from unstructured_ingest.processes.connectors.salesforce import (
12
- CONNECTOR_TYPE,
13
- SalesforceAccessConfig,
14
- SalesforceConnectionConfig,
15
- SalesforceDownloaderConfig,
16
- SalesforceIndexerConfig,
17
- )
18
- from unstructured_ingest.processes.embedder import EmbedderConfig
19
- from unstructured_ingest.processes.partitioner import PartitionerConfig
20
-
21
- base_path = Path(__file__).parent.parent.parent.parent
22
- docs_path = base_path / "example-docs"
23
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
- output_path = work_dir / "output"
25
- download_path = work_dir / "download"
26
-
27
- if __name__ == "__main__":
28
- logger.info(f"writing all content in: {work_dir.resolve()}")
29
- Pipeline.from_configs(
30
- context=ProcessorConfig(work_dir=str(work_dir.resolve())),
31
- indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]),
32
- downloader_config=SalesforceDownloaderConfig(download_dir=download_path),
33
- source_connection_config=SalesforceConnectionConfig(
34
- SalesforceAccessConfig(
35
- consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
36
- private_key=os.getenv("SALESFORCE_PRIVATE_KEY"),
37
- ),
38
- username=os.getenv("SALESFORCE_USERNAME"),
39
- ),
40
- partitioner_config=PartitionerConfig(strategy="fast"),
41
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
42
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
- uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
44
- ).run()
examples/sharepoint.py DELETED
@@ -1,47 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- from unstructured_ingest.interfaces import ProcessorConfig
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.pipeline.pipeline import Pipeline
7
- from unstructured_ingest.processes.connectors.local import (
8
- LocalUploaderConfig,
9
- )
10
- from unstructured_ingest.processes.connectors.sharepoint import (
11
- CONNECTOR_TYPE,
12
- SharepointAccessConfig,
13
- SharepointConnectionConfig,
14
- SharepointDownloaderConfig,
15
- SharepointIndexerConfig,
16
- SharepointPermissionsConfig,
17
- )
18
- from unstructured_ingest.processes.partitioner import PartitionerConfig
19
-
20
- base_path = Path(__file__).parent.parent.parent.parent
21
- docs_path = base_path / "example-docs"
22
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
23
- output_path = work_dir / "output"
24
- download_path = work_dir / "download"
25
-
26
-
27
- if __name__ == "__main__":
28
- logger.info(f"writing all content in: {work_dir.resolve()}")
29
- Pipeline.from_configs(
30
- context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
31
- indexer_config=SharepointIndexerConfig(),
32
- downloader_config=SharepointDownloaderConfig(download_dir=download_path),
33
- source_connection_config=SharepointConnectionConfig(
34
- client_id=os.getenv("SHAREPOINT_CLIENT_ID"),
35
- site=os.getenv("SHAREPOINT_SITE"),
36
- access_config=SharepointAccessConfig(client_cred=os.getenv("SHAREPOINT_CRED")),
37
- permissions_config=SharepointPermissionsConfig(
38
- permissions_application_id=os.getenv("SHAREPOINT_PERMISSIONS_APP_ID"),
39
- permissions_client_cred=os.getenv("SHAREPOINT_PERMISSIONS_APP_CRED"),
40
- permissions_tenant=os.getenv("SHAREPOINT_PERMISSIONS_TENANT"),
41
- ),
42
- ),
43
- partitioner_config=PartitionerConfig(strategy="fast"),
44
- # chunker_config=ChunkerConfig(chunking_strategy="by_title"),
45
- # embedder_config=EmbedderConfig(embedding_provider="huggingface"),
46
- uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
47
- ).run()
examples/singlestore.py DELETED
@@ -1,49 +0,0 @@
1
- from pathlib import Path
2
-
3
- from unstructured_ingest.interfaces import ProcessorConfig
4
- from unstructured_ingest.logger import logger
5
- from unstructured_ingest.pipeline.pipeline import Pipeline
6
- from unstructured_ingest.processes.chunker import ChunkerConfig
7
- from unstructured_ingest.processes.connectors.local import (
8
- LocalConnectionConfig,
9
- LocalDownloaderConfig,
10
- LocalIndexerConfig,
11
- )
12
- from unstructured_ingest.processes.connectors.singlestore import (
13
- CONNECTOR_TYPE,
14
- SingleStoreAccessConfig,
15
- SingleStoreConnectionConfig,
16
- SingleStoreUploaderConfig,
17
- SingleStoreUploadStagerConfig,
18
- )
19
- from unstructured_ingest.processes.embedder import EmbedderConfig
20
- from unstructured_ingest.processes.partitioner import PartitionerConfig
21
-
22
- base_path = Path(__file__).parent.parent.parent.parent
23
- docs_path = base_path / "example-docs"
24
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
- output_path = work_dir / "output"
26
- download_path = work_dir / "download"
27
-
28
- if __name__ == "__main__":
29
- logger.info(f"writing all content in: {work_dir.resolve()}")
30
- Pipeline.from_configs(
31
- context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
32
- indexer_config=LocalIndexerConfig(
33
- input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
34
- ),
35
- downloader_config=LocalDownloaderConfig(download_dir=download_path),
36
- source_connection_config=LocalConnectionConfig(),
37
- partitioner_config=PartitionerConfig(strategy="fast"),
38
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
39
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
40
- destination_connection_config=SingleStoreConnectionConfig(
41
- access_config=SingleStoreAccessConfig(password="password"),
42
- host="localhost",
43
- port=3306,
44
- database="ingest_test",
45
- user="root",
46
- ),
47
- stager_config=SingleStoreUploadStagerConfig(),
48
- uploader_config=SingleStoreUploaderConfig(table_name="elements"),
49
- ).run()
examples/sql.py DELETED
@@ -1,90 +0,0 @@
1
- import os
2
- import sqlite3
3
- from pathlib import Path
4
-
5
- from unstructured_ingest.interfaces import ProcessorConfig
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.pipeline.pipeline import Pipeline
8
- from unstructured_ingest.processes.chunker import ChunkerConfig
9
- from unstructured_ingest.processes.connectors.local import (
10
- LocalConnectionConfig,
11
- LocalDownloaderConfig,
12
- LocalIndexerConfig,
13
- )
14
- from unstructured_ingest.processes.connectors.sql import (
15
- CONNECTOR_TYPE,
16
- POSTGRESQL_DB,
17
- SQLITE_DB,
18
- SQLAccessConfig,
19
- SQLConnectionConfig,
20
- SQLUploaderConfig,
21
- SQLUploadStagerConfig,
22
- )
23
- from unstructured_ingest.processes.embedder import EmbedderConfig
24
- from unstructured_ingest.processes.partitioner import PartitionerConfig
25
-
26
- base_path = Path(__file__).parent.parent.parent.parent
27
- docs_path = base_path / "example-docs"
28
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
29
- output_path = work_dir / "output"
30
- download_path = work_dir / "download"
31
-
32
- SQLITE_DB_PATH = "test-sql-db.sqlite"
33
-
34
- if __name__ == "__main__":
35
- logger.info(f"writing all content in: {work_dir.resolve()}")
36
-
37
- configs = {
38
- "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
39
- "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
40
- "downloader_config": LocalDownloaderConfig(download_dir=download_path),
41
- "source_connection_config": LocalConnectionConfig(),
42
- "partitioner_config": PartitionerConfig(strategy="fast"),
43
- "chunker_config": ChunkerConfig(
44
- chunking_strategy="by_title",
45
- chunk_include_orig_elements=False,
46
- chunk_max_characters=1500,
47
- chunk_multipage_sections=True,
48
- ),
49
- "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
50
- "stager_config": SQLUploadStagerConfig(),
51
- "uploader_config": SQLUploaderConfig(batch_size=10),
52
- }
53
-
54
- if os.path.exists(SQLITE_DB):
55
- os.remove(SQLITE_DB)
56
-
57
- connection = sqlite3.connect(database=SQLITE_DB)
58
-
59
- query = None
60
- script_path = (
61
- Path(__file__).parent.parent.parent.parent.parent
62
- / Path("test_e2e/env_setup/sql/sqlite-schema.sql")
63
- ).resolve()
64
- with open(script_path) as f:
65
- query = f.read()
66
- cursor = connection.cursor()
67
- cursor.executescript(query)
68
- connection.close()
69
-
70
- # sqlite test first
71
- Pipeline.from_configs(
72
- destination_connection_config=SQLConnectionConfig(
73
- db_type=SQLITE_DB,
74
- database=SQLITE_DB_PATH,
75
- access_config=SQLAccessConfig(),
76
- ),
77
- **configs,
78
- ).run()
79
-
80
- # now, pg with pgvector
81
- Pipeline.from_configs(
82
- destination_connection_config=SQLConnectionConfig(
83
- db_type=POSTGRESQL_DB,
84
- database="elements",
85
- host="localhost",
86
- port=5433,
87
- access_config=SQLAccessConfig(username="unstructured", password="test"),
88
- ),
89
- **configs,
90
- ).run()
examples/vectara.py DELETED
@@ -1,54 +0,0 @@
1
- from pathlib import Path
2
-
3
- from unstructured_ingest.interfaces import ProcessorConfig
4
- from unstructured_ingest.logger import logger
5
- from unstructured_ingest.pipeline.pipeline import Pipeline
6
- from unstructured_ingest.processes.chunker import ChunkerConfig
7
- from unstructured_ingest.processes.connectors.local import (
8
- LocalConnectionConfig,
9
- LocalDownloaderConfig,
10
- LocalIndexerConfig,
11
- )
12
- from unstructured_ingest.processes.connectors.vectara import (
13
- CONNECTOR_TYPE,
14
- VectaraAccessConfig,
15
- VectaraConnectionConfig,
16
- VectaraUploaderConfig,
17
- VectaraUploadStagerConfig,
18
- )
19
- from unstructured_ingest.processes.embedder import EmbedderConfig
20
- from unstructured_ingest.processes.partitioner import PartitionerConfig
21
-
22
- base_path = Path(__file__).parent.parent.parent.parent
23
- docs_path = base_path / "example-docs"
24
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
- output_path = work_dir / "output"
26
- download_path = work_dir / "download"
27
-
28
- if __name__ == "__main__":
29
- logger.info(f"writing all content in: {work_dir.resolve()}")
30
- Pipeline.from_configs(
31
- context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
- indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
33
- downloader_config=LocalDownloaderConfig(download_dir=download_path),
34
- source_connection_config=LocalConnectionConfig(),
35
- partitioner_config=PartitionerConfig(strategy="fast"),
36
- chunker_config=ChunkerConfig(
37
- chunking_strategy="by_title",
38
- chunk_include_orig_elements=False,
39
- chunk_max_characters=1500,
40
- chunk_multipage_sections=True,
41
- ),
42
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
- destination_connection_config=VectaraConnectionConfig(
44
- access_config=VectaraAccessConfig(
45
- oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
46
- ),
47
- customer_id="fill customer_id",
48
- corpus_name="fill corpus_name",
49
- corpus_key="fill corpus_key",
50
- token_url="fill token_url",
51
- ),
52
- stager_config=VectaraUploadStagerConfig(batch_size=10),
53
- uploader_config=VectaraUploaderConfig(),
54
- ).run()
examples/weaviate.py DELETED
@@ -1,44 +0,0 @@
1
- from pathlib import Path
2
-
3
- from unstructured_ingest.interfaces import ProcessorConfig
4
- from unstructured_ingest.logger import logger
5
- from unstructured_ingest.pipeline.pipeline import Pipeline
6
- from unstructured_ingest.processes.chunker import ChunkerConfig
7
- from unstructured_ingest.processes.connectors.local import (
8
- LocalConnectionConfig,
9
- LocalDownloaderConfig,
10
- LocalIndexerConfig,
11
- )
12
- from unstructured_ingest.processes.connectors.weaviate.local import (
13
- CONNECTOR_TYPE,
14
- LocalWeaviateConnectionConfig,
15
- LocalWeaviateUploaderConfig,
16
- LocalWeaviateUploadStagerConfig,
17
- )
18
- from unstructured_ingest.processes.embedder import EmbedderConfig
19
- from unstructured_ingest.processes.partitioner import PartitionerConfig
20
-
21
- base_path = Path(__file__).parent.parent.parent.parent
22
- docs_path = base_path / "example-docs"
23
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
- output_path = work_dir / "output"
25
- download_path = work_dir / "download"
26
-
27
- if __name__ == "__main__":
28
- logger.info(f"writing all content in: {work_dir.resolve()}")
29
- Pipeline.from_configs(
30
- context=ProcessorConfig(work_dir=str(work_dir.resolve())),
31
- indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
32
- downloader_config=LocalDownloaderConfig(download_dir=download_path),
33
- source_connection_config=LocalConnectionConfig(),
34
- partitioner_config=PartitionerConfig(strategy="fast"),
35
- chunker_config=ChunkerConfig(chunking_strategy="by_title"),
36
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
37
- destination_connection_config=LocalWeaviateConnectionConfig(
38
- # Connects to http://localhost:8080
39
- ),
40
- stager_config=LocalWeaviateUploadStagerConfig(),
41
- uploader_config=LocalWeaviateUploaderConfig(
42
- collection="elements", batch_size=10, dynamic_batch=False
43
- ),
44
- ).run()
test/__init__.py DELETED
File without changes
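test/integration/__init__.py DELETED
File without changes
test/integration/chunkers/__init__.py DELETED
File without changes
test/integration/chunkers/test_chunkers.py DELETED
@@ -1,31 +0,0 @@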
1
- import os
2
- from pathlib import Path
3
-
4
- import pytest
5
-
6
- from test.integration.utils import requires_env
7
- from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
8
-
9
- int_test_dir = Path(__file__).parent
10
- assets_dir = int_test_dir / "assets"
11
-
12
- chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]
13
-
14
-
15
- @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
16
- @pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
17
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
18
- @pytest.mark.asyncio
19
- async def test_chunker_api(chunker_file: Path, strategy: str):
20
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
21
- api_url = os.getenv("UNSTRUCTURED_API_URL")
22
-
23
- chunker_config = ChunkerConfig(
24
- chunking_strategy=strategy,
25
- chunk_by_api=True,
26
- chunk_api_key=api_key,
27
- chunking_endpoint=api_url,
28
- )
29
- chunker = Chunker(config=chunker_config)
30
- results = await chunker.run_async(elements_filepath=chunker_file)
31
- assert results
test/integration/connectors/__init__.py DELETED
File without changes
test/integration/connectors/conftest.py DELETED
@@ -1,38 +0,0 @@
1
- import tempfile
2
- from pathlib import Path
3
- from typing import Generator
4
-
5
- import pytest
6
-
7
- from unstructured_ingest.logger import logger
8
-
9
- FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
10
-
11
-
12
- @pytest.fixture
13
- def upload_file() -> Path:
14
- int_test_dir = Path(__file__).parent
15
- assets_dir = int_test_dir / "assets"
16
- upload_file = assets_dir / FILENAME
17
- assert upload_file.exists()
18
- assert upload_file.is_file()
19
- return upload_file
20
-
21
-
22
- @pytest.fixture
23
- def upload_file_ndjson() -> Path:
24
- int_test_dir = Path(__file__).parent
25
- assets_dir = int_test_dir / "assets"
26
- upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
27
- assert upload_file.exists()
28
- assert upload_file.is_file()
29
- return upload_file
30
-
31
-
32
- @pytest.fixture
33
- def temp_dir() -> Generator[Path, None, None]:
34
- with tempfile.TemporaryDirectory() as temp_dir:
35
- temp_path = Path(temp_dir)
36
- logger.info(f"Created temp dir '{temp_path}'")
37
- yield temp_path
38
- logger.info(f"Removing temp dir '{temp_path}'")
test/integration/connectors/databricks/__init__.py DELETED
File without changes